reglibcpp  1.3.0
(Naïve) C++ implementation of models for regular languages
expression.cpp
Go to the documentation of this file.
1 #include "expression.h"
3 
4 using std::make_pair;
5 using std::pair;
6 using std::string;
7 using std::vector;
8 using std::unique_ptr;
9 using std::make_unique;
10 using std::u32string;
11 
12 #include <algorithm>
13 
14 #include <array>
15 using std::array;
16 
17 #include <bitset>
18 using std::bitset;
19 
20 #include "gnfa.h"
21 #include "nfa.h"
22 #include "dfa.h"
23 
24 namespace reg {
25 expression::exptr expression::empty = expression::exptr(new expression);
26 std::unordered_map<char32_t, expression::exptr> expression::symbols;
27 
29 
34  return expression::empty;
35 }
36 
38 
43  return expression::spawnSymbol(U'\0');
44 }
45 
47 
52 expression::exptr const& expression::spawnSymbol(char32_t symbol) {
53  return expression::symbols.insert(make_pair(symbol, exptr(new expression(symbol)))).first->second;
54 }
55 
57 expression::exptr const& expression::spawnSymbol(string const& utf8Symbol) {
58  char32_t u32Symbol(converter.from_bytes(utf8Symbol)[0]);
59  return expression::symbols.insert(make_pair(u32Symbol, exptr(new expression(u32Symbol)))).first->second;
60 }
61 
63 
71 expression::exptr expression::spawnConcatenation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
72  if (optimized) {
73  if (l == expression::spawnEmptyString()) {
74  return r;
75  }
76  if (r == expression::spawnEmptyString()) {
77  return l;
78  }
81  }
82  if (aggressive) {
83  if (*l == *expression::spawnEmptyString()) {
84  return r;
85  }
86  if (*r == *expression::spawnEmptyString()) {
87  return l;
88  }
89  if (*r == *expression::spawnEmptySet() || *l == *expression::spawnEmptySet()) {
91  }
92  }
93  }
94  return expression::exptr(new expression(l, r, operation::concatenation));
95 }
96 
98 
106 expression::exptr expression::spawnAlternation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
107  if (optimized) {
108  if (l == expression::spawnEmptySet()) {
109  return r;
110  }
111  if (r == expression::spawnEmptySet()) {
112  return l;
113  }
114  if (l == r) {
115  return l;
116  }
117  if (aggressive) {
118  if (*l == *expression::spawnEmptySet()) {
119  return r;
120  }
121  if (*r == *expression::spawnEmptySet()) {
122  return l;
123  }
124  if (*l == *r) {
125  return l->size() > r->size() ? r : l;
126  }
127  }
128  }
129  return expression::exptr(new expression(l, r, operation::alternation));
130 }
131 
133 
140 expression::exptr expression::spawnKleene(expression::exptr const& b, bool optimized, bool aggressive) {
141  if (optimized) {
144  }
145  if (aggressive) {
146  if (*b == *expression::spawnEmptySet() || *b == *expression::spawnEmptyString()) {
148  }
149  if (*b == expression(b)) {
150  return b;
151  }
152  }
153  }
154  return expression::exptr(new expression(b));
155 }
156 
158 
168 size_t expression::size() const {
169  size_t s(op != operation::empty);
170  for (exptr const& re : subExpressions) {
171  s += re->size();
172  }
173  return s;
174 }
175 
177 
182  return op;
183 }
184 
186 
189 bool expression::operator==(expression const& r) const {
190  if (!acceptingDfa) {
191  acceptingDfa = make_unique<dfa const>(dfa::builder(gnfa(*this).splitAllTransitions()).minimize());
192  }
193  if (!r.acceptingDfa) {
194  r.acceptingDfa = make_unique<dfa const>(dfa::builder(gnfa(r).splitAllTransitions()).minimize());
195  }
196  return *acceptingDfa == *r.acceptingDfa;
197 }
198 
200 
203 bool expression::operator!=(expression const& r) const {
204  return !operator==(r);
205 }
206 
208 
212 char32_t expression::extractSymbol() const {
213  auto it = std::find_if(
214  expression::symbols.begin(),
215  expression::symbols.end(),
216  [&](pair<char32_t,exptr> const& entry)->bool{return entry.second.get() == this;}
217  );
218  #ifdef __EXCEPTIONS
219  if (it == expression::symbols.end()) {
220  throw std::logic_error("This RE does not seem to be a valid symbol expression.");
221  }
222  #endif
223  return it->first;
224 }
225 
227 
232  char32_t symbol(extractSymbol());
233  if (symbol == U'\0') {
234  return string();
235  } else {
236  return converter.to_bytes(extractSymbol());
237  }
238 }
239 
241 u32string expression::to_u32string() const {
242  switch (op) {
243  case operation::alternation : return subExpressions[0]->to_u32string().append(U"+").append(subExpressions[1]->to_u32string());
244  case operation::concatenation : {
245  u32string concat;
246  if (subExpressions[0]->op >= operation::alternation) {
247  concat.append(1, '(');
248  concat.append(subExpressions[0]->to_u32string());
249  concat.append(1, ')');
250  } else {
251  concat.append(subExpressions[0]->to_u32string());
252  }
253  if (subExpressions[1]->op >= operation::alternation) {
254  concat.append(1, '(');
255  concat.append(subExpressions[1]->to_u32string());
256  concat.append(1, ')');
257  } else {
258  concat.append(subExpressions[1]->to_u32string());
259  }
260  return concat;
261  }
262  case operation::kleene : {
263  if (subExpressions[0]->op >= operation::concatenation) {
264  return u32string(1, U'(').append(subExpressions[0]->to_u32string()).append(U")*");
265  } else {
266  return subExpressions[0]->to_u32string().append(1, '*');
267  }
268  }
269  case operation::symbol : {
270  char32_t symbol = extractSymbol();
271  return symbol == U'\0' ? u32string(U"ε") : u32string(1, symbol);
272  }
273  case operation::empty : return u32string(U"∅");
274  default : return u32string();
275  }
276 }
277 
279 string expression::to_string() const {
280  return converter.to_bytes(to_u32string());
281 }
282 
284 vector<expression::exptr>::const_iterator expression::begin() const {
285  return subExpressions.cbegin();
286 }
287 
289 vector<expression::exptr>::const_iterator expression::end() const {
290  return subExpressions.cend();
291 }
292 
294 
321  enum token : unsigned char {
322  A,
323  B,
324  C,
325  K,
326  E,
327  F,
328  Σ,
329  P,
330  L,
331  R,
332  S,
334  };
335 
337  typedef bitset<token::END> tokens;
338 
339  static array<token,token::END> const inverseUnitGraph;
341 
343  static array<array<tokens,token::END>,token::END> const inverseBinaryRules;
345 
346 
348 
354  static tokens getUClosure(tokens const& m) {
355  tokens closure;
356  for (unsigned char c(0); c < token::END; c++) {
357  if (m[c]) {
358  for (token s(static_cast<token>(c)); s != token::END; s = inverseUnitGraph[s]) {
359  closure.set(s);
360  }
361  }
362  } // Going through the tokens in m.
363  return closure;
364  }
365 
367 
373  static bool canDerive(token symbol, tokens const& left, tokens const& right) {
374  tokens leftClosure(getUClosure(left));
375  tokens rightClosure(getUClosure(right));
376  for (unsigned char li(0); li < token::END; li++) {
377  if (leftClosure[li]) {
378  for (unsigned char ri(0); ri < token::END; ri++) {
379  if (rightClosure[ri]) {
380  for (unsigned char ci(0); ci < token::END; ci++) {
381  if (inverseBinaryRules[li][ri][ci]) {
382  for (unsigned char successor(ci); successor != token::END; successor = inverseUnitGraph[successor]) {
383  if (symbol == successor) {
384  return true;
385  }
386  }
387  }
388  }
389  }
390  }
391  }
392  }
393  return false;
394  }
395 
396 
398 
406  static void compileTableEntry(size_t row, size_t diag, vector<vector<tokens>>& table) {
407  for (size_t i(0); i < diag; i++) {
408  tokens first(getUClosure(table[row][i]));
409  for (unsigned char fi(0); fi < first.size(); fi++) {
410  if (first[fi]) {
411  tokens second(getUClosure(table[row+1+i][diag-i-1]));
412  for (unsigned char si(0); si < second.size(); si++) {
413  if (second[si]) {
414  table[row][diag] |= inverseBinaryRules[fi][si];
415  }
416  } // Going through the tokens in second.
417  }
418  } // Going through the tokens in first.
419  }
420  }
421 
423  vector<char32_t> symbolMapping;
424  size_t mutable symbolMappingIndex;
425  vector<vector<tokens>> table;
426  bool badRegularExpression = false;
427 
429  parser(u32string const& re, literals const& lits) {
430  if (re.empty()) {
431  badRegularExpression = true;
432  return;
433  }
434  symbolMappingIndex = 0;
435  size_t numberOfTokens(re.length());
436  symbolMapping.reserve(numberOfTokens);
437  table.reserve(numberOfTokens);
438  size_t row(0);
439  for (char32_t symbol : re) {
440  table.push_back(vector<tokens>(numberOfTokens-row, tokens()));
441  if (symbol == lits.L) { table[row][0].set(token::L); }
442  else if (symbol == lits.R) { table[row][0].set(token::R); }
443  else if (symbol == lits.P) { table[row][0].set(token::P); }
444  else if (symbol == lits.S) { table[row][0].set(token::S); }
445  else {
446  table[row][0].set(token::Σ);
447  symbolMapping.push_back(symbol);
448  }
449  row++;
450  }
451  symbolMapping.shrink_to_fit();
452  for (size_t diag(1); diag < numberOfTokens; diag++) {
453  for (size_t row(0); row < numberOfTokens - diag; row++) {
454  compileTableEntry(row, diag, table);
455  }
456  }
457  if (!getUClosure(table[0][table[0].size()-1])[token::A]) {
458  badRegularExpression = true;
459  }
460  }
461 
463  struct tree {
464  parser const* p;
466  pair<unique_ptr<tree>,unique_ptr<tree>> children;
468 
470 
478  pair<unique_ptr<tree>,unique_ptr<tree>> findNextPair(token symbol, size_t row, size_t diag, parser const* p) {
479  for (size_t i = 0; i < diag; i++) {
480  size_t leftDiag = diag-i-1;
481  size_t rightDiag = i;
482  size_t rightRow = diag+row-rightDiag;
483  if (canDerive(symbol, p->table[row][leftDiag], p->table[rightRow][rightDiag])) {
484  return make_pair(make_unique<tree>(row, leftDiag, p), make_unique<tree>(rightRow, rightDiag, p));
485  }
486  }
487  return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
488  }
489 
491 
496  tree(size_t row, size_t diag, parser const* p) :
497  p(p),
498  symbol([&]()->token{
499  for (unsigned char i(token::END); i > 0; i--) {
500  if (p->table[row][diag][i-1]) {
501  return static_cast<token>(i-1);
502  }
503  }
504  return token::END;
505  }()),
506  children(findNextPair(symbol, row, diag, p)) {}
507 
509  exptr operator()(bool optimized, bool aggressive) {
510  switch (symbol) {
511  case token::A:
512  return spawnAlternation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
513  case token::B:
514  return (*children.second)(optimized, aggressive);
515  case token::C:
516  return spawnConcatenation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
517  case token::K:
518  return spawnKleene((*children.first)(optimized, aggressive), optimized, aggressive);
519  case token::E:
520  return (*children.second)(optimized, aggressive);
521  case token::F:
522  return (*children.first)(optimized, aggressive);
523  case token::Σ: {
524  char32_t symbol = p->symbolMapping[p->symbolMappingIndex++];
525  if (p->symbolMappingIndex >= p->symbolMapping.size()) { p->symbolMappingIndex = 0; }
526  if (symbol == p->lits.EPSILON) { return spawnEmptyString(); }
527  else if (symbol == p->lits.EMPTY) { return spawnEmptySet(); }
528  else { return spawnSymbol(symbol); }
529  }
530  default:
531  return exptr();
532  }
533  }
534  };
535 
537 
540  exptr operator()(bool optimized, bool aggressive) {
541  if (badRegularExpression) {
542  #ifdef __EXCEPTIONS
543  throw std::bad_function_call();
544  #else
545  return exptr();
546  #endif
547  }
548  return tree(0, table.size()-1, this)(optimized, aggressive);
549  }
550 };
551 
553  []()->array<expression::parser::token,expression::parser::token::END>{
554  array<token,token::END> graph;
555  graph.fill(token::END);
556  graph[token::Σ] = token::E;
557  graph[token::E] = token::K;
558  graph[token::K] = token::C;
559  graph[token::C] = token::A;
560  return graph;
561 }();
562 
563 
565  []()->array<array<expression::parser::tokens,expression::parser::token::END>,expression::parser::token::END>{
566  array<tokens,token::END> noPredecessor;
567  noPredecessor.fill(tokens());
568  array<array<tokens,token::END>,token::END> rules;
569  rules.fill(noPredecessor);
570  rules[token::A][token::B].set(token::A);
571  rules[token::P][token::C].set(token::B);
572  rules[token::C][token::K].set(token::C);
573  rules[token::E][token::S].set(token::K);
574  rules[token::L][token::F].set(token::E);
575  rules[token::A][token::R].set(token::F);
576  return rules;
577 }();
578 
580 
587 expression::exptr expression::spawnFromString(std::u32string const& re, literals lits, bool optimized, bool aggressive) {
588  parser stringParser(re, lits);
589  #ifdef __EXCEPTIONS
590  try {
591  return stringParser(optimized, aggressive);
592  } catch (std::bad_function_call e) {
593  throw std::invalid_argument("Invalid regular expression.");
594  }
595  #else
596  if (stringParser.badRegularExpression) {
597  return exptr();
598  }
599  return stringParser(optimized, aggressive);
600  #endif
601 }
602 
603 
605 expression::exptr expression::spawnFromString(string const& utf8Re, literals lits, bool optimized, bool aggressive) {
606  return spawnFromString(converter.from_bytes(utf8Re), lits, optimized, aggressive);
607 }
608 
609 expression::expression() : op(operation::empty) {}
610 expression::expression(char32_t symbol) : op(operation::symbol) {}
611 expression::expression(exptr const& l, exptr const& r, operation op) : subExpressions({l, r}), op(op) {}
612 expression::expression(exptr const& b) : subExpressions({b}), op(operation::kleene) {}
613 }
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
Definition: expression.cpp:333
Represents the table entries as binary trees.
Definition: expression.cpp:463
static array< array< tokens, token::END >, token::END > const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
Definition: expression.cpp:343
An alternation symbol.
Definition: expression.cpp:329
char32_t const S
The Kleene star.
Definition: expression.h:49
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Definition: expression.cpp:373
A concatenation expression.
Definition: expression.cpp:324
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
Definition: expression.cpp:354
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
Definition: expression.cpp:478
token symbol
This tree&#39;s root symbol.
Definition: expression.cpp:465
A left parenthesis.
Definition: expression.cpp:330
bool operator!=(expression const &r) const
Checks whether this RE is semantically different from another one.
Definition: expression.cpp:203
Token literals as used in Introduction to Automata Theory, Languages, and Computation by Hopcroft...
Definition: expression.h:48
char32_t const EMPTY
Neutral element of alternation and annihilating element of concatenation, a.k.a. empty set...
Definition: expression.h:49
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
Definition: expression.cpp:284
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
Definition: expression.cpp:42
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
Definition: gnfa.cpp:287
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
Definition: expression.cpp:425
builder & minimize()
Convenience method for chaining purge and merge to achieve proper minimization.
Definition: dfa.cpp:773
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
Definition: expression.cpp:509
static exptr spawnFromString(std::u32string const &re, literals lits=literals(), bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given string.
Definition: expression.cpp:587
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Definition: expression.cpp:106
Represents formal regular expressions.
Definition: expression.h:27
Beginning of a new subexpression.
Definition: expression.cpp:326
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:241
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
Definition: expression.cpp:426
char32_t extractSymbol() const
Reports this symbol expression's UTF-32-encoded symbol.
Definition: expression.cpp:212
literals lits
Stores interpretations for characters encountered in the parsed string.
Definition: expression.cpp:422
A symbol expression.
Definition: expression.cpp:328
static exptr const & spawnSymbol(char32_t symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:52
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
Definition: expression.cpp:466
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
Definition: expression.cpp:423
operation
The different purposes an RE may fulfill.
Definition: expression.h:79
Parses regular expressions.
Definition: expression.cpp:319
Contains the reg::nfa class definition.
A Kleene star symbol.
Definition: expression.cpp:332
token
Tokens the grammar deals with.
Definition: expression.cpp:321
operation getOperation() const
Reports this RE's function.
Definition: expression.cpp:181
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
Definition: expression.cpp:289
Represents generalized nondeterministic finite automata.
Definition: gnfa.h:18
std::string to_string() const
Describes this RE in UTF-8-encoded human-readable form.
Definition: expression.cpp:279
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Definition: expression.cpp:424
Second part of an alternation expression.
Definition: expression.cpp:323
bool operator==(expression const &r) const
Checks whether this RE is semantically equivalent to another one.
Definition: expression.cpp:189
char32_t const L
The left parenthesis.
Definition: expression.h:49
A right parenthesis.
Definition: expression.cpp:331
Constructs DFAs step by step.
Definition: dfa.h:60
parser const * p
Points to the parser this tree belongs to.
Definition: expression.cpp:464
Beginning of an alternation expression.
Definition: expression.cpp:322
Contains the reg::gnfa class definition.
char32_t const P
The alternation symbol.
Definition: expression.h:49
size_t size() const
Reports the size of this RE's tree representation.
Definition: expression.cpp:168
Definition: dfa.cpp:29
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Definition: expression.cpp:406
char32_t const EPSILON
Neutral element of concatenation, a.k.a. empty string.
Definition: expression.h:49
Kleene expression.
Definition: expression.cpp:325
bitset< token::END > tokens
Tokens don't usually come alone.
Definition: expression.cpp:337
parser(u32string const &re, literals const &lits)
Initializes with a string to parse and literals to parse for.
Definition: expression.cpp:429
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
Definition: expression.cpp:540
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
Definition: expression.cpp:496
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
Definition: expression.h:39
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Definition: expression.cpp:33
char32_t const R
The right parenthesis.
Definition: expression.h:49
Contains the reg::dfa class definition.
std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
Definition: dfa.cpp:1001
std::string extractUtf8Symbol() const
Reports this symbol expression's UTF-8-encoded symbol.
Definition: expression.cpp:231
static array< token, token::END > const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
Definition: expression.cpp:339
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
Definition: expression.cpp:140
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Definition: expression.cpp:71
Second part of a new subexpression.
Definition: expression.cpp:327