reglibcpp  1.0.0
(Naïve) C++ implementation of models for regular languages
expression.cpp
Go to the documentation of this file.
1 #include "expression.h"
3 
4 using std::shared_ptr;
5 using std::make_pair;
6 using std::pair;
7 using std::string;
8 using std::vector;
9 using std::unique_ptr;
10 using std::u32string;
11 
12 #include <algorithm>
13 using std::find_if;
14 using std::distance;
15 
16 #include <array>
17 using std::array;
18 
19 #include <bitset>
20 using std::bitset;
21 
22 #include "gnfa.h"
23 #include "nfa.h"
24 #include "dfa.h"
25 
26 namespace reg {
27 auto const expression::converter
28  = unique_ptr<std::wstring_convert<std::codecvt_utf8<char32_t>,char32_t>>(
29  new std::wstring_convert<std::codecvt_utf8<char32_t>,char32_t>()
30  );
31 expression::exptr expression::empty = expression::exptr(new expression);
32 std::unordered_map<char32_t, expression::exptr> expression::symbols;
33 
35 
41  return expression::empty;
42 }
43 
45 
51  return expression::spawnSymbol(U'\0');
52 }
53 
55 
61 expression::exptr const& expression::spawnSymbol(char32_t symbol) {
62  return expression::symbols.insert(pair<char32_t,exptr>(symbol, exptr(new expression(symbol)))).first->second;
63 }
64 
66 expression::exptr const& expression::spawnSymbol(string utf8Symbol) {
67  char32_t u32Symbol(converter->from_bytes(utf8Symbol)[0]);
68  return expression::symbols.insert(pair<char32_t,exptr>(u32Symbol, exptr(new expression(u32Symbol)))).first->second;
69 }
70 
72 
81 expression::exptr expression::spawnConcatenation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
82  if (optimized) {
83  if (l == expression::spawnEmptyString()) {
84  return r;
85  }
86  if (r == expression::spawnEmptyString()) {
87  return l;
88  }
91  }
92  if (aggressive) {
93  if (*l == *expression::spawnEmptyString()) {
94  return r;
95  }
96  if (*r == *expression::spawnEmptyString()) {
97  return l;
98  }
99  if (*r == *expression::spawnEmptySet() || *l == *expression::spawnEmptySet()) {
100  return expression::spawnEmptySet();
101  }
102  }
103  }
104  return expression::exptr(new expression(l, r, operation::concatenation));
105 }
106 
108 
117 expression::exptr expression::spawnAlternation(expression::exptr const& l, expression::exptr const& r, bool optimized, bool aggressive) {
118  if (optimized) {
119  if (l == expression::spawnEmptySet()) {
120  return r;
121  }
122  if (r == expression::spawnEmptySet()) {
123  return l;
124  }
125  if (l == r) {
126  return l;
127  }
128  if (aggressive) {
129  if (*l == *expression::spawnEmptySet()) {
130  return r;
131  }
132  if (*r == *expression::spawnEmptySet()) {
133  return l;
134  }
135  if (*l == *r) {
136  return l->size() > r->size() ? r : l;
137  }
138  }
139  }
140  return expression::exptr(new expression(l, r, operation::alternation));
141 }
142 
144 
152 expression::exptr expression::spawnKleene(expression::exptr const& b, bool optimized, bool aggressive) {
153  if (optimized) {
156  }
157  if (aggressive) {
158  if (*b == *expression::spawnEmptySet() || *b == *expression::spawnEmptyString()) {
160  }
161  if (*b == expression(b)) {
162  return b;
163  }
164  }
165  }
166  return expression::exptr(new expression(b));
167 }
168 
170 
181 size_t expression::size() const {
182  size_t s(op != operation::empty);
183  for (exptr const& re : subExpressions) {
184  s += re->size();
185  }
186  return s;
187 }
188 
190 
196  return op;
197 }
198 
200 
204 bool expression::operator==(expression const& r) const {
205  if (!acceptingDfa) {
206  acceptingDfa = unique_ptr<dfa const>(new dfa(dfa::builder(gnfa(*this).splitAllTransitions()).purge().merge()));
207  }
208  if (!r.acceptingDfa) {
209  r.acceptingDfa = unique_ptr<dfa const>(new dfa(dfa::builder(gnfa(r).splitAllTransitions()).purge().merge()));
210  }
211  return *acceptingDfa == *r.acceptingDfa;
212 }
213 
215 
219 bool expression::operator!=(expression const& r) const {
220  return !operator==(r);
221 }
222 
224 
229 char32_t expression::extractSymbol() const {
230  auto it = find_if(
231  expression::symbols.begin(),
232  expression::symbols.end(),
233  [&](pair<char32_t,exptr> const& entry)->bool{return entry.second.get() == this;}
234  );
235  #ifdef __EXCEPTIONS
236  if (it == expression::symbols.end()) {
237  throw std::logic_error("This RE does not seem to be a valid symbol expression.");
238  }
239  #endif
240  return it->first;
241 }
242 
244 
250  char32_t symbol(extractSymbol());
251  if (symbol == U'\0') {
252  return string();
253  } else {
254  return converter->to_bytes(extractSymbol());
255  }
256 }
257 
259 u32string expression::to_u32string() const {
260  switch (op) {
261  case operation::alternation : return subExpressions[0]->to_u32string().append(U"+").append(subExpressions[1]->to_u32string());
262  case operation::concatenation : {
263  u32string concat;
264  if (subExpressions[0]->op >= operation::alternation) {
265  concat.append(1, '(');
266  concat.append(subExpressions[0]->to_u32string());
267  concat.append(1, ')');
268  } else {
269  concat.append(subExpressions[0]->to_u32string());
270  }
271  if (subExpressions[1]->op >= operation::alternation) {
272  concat.append(1, '(');
273  concat.append(subExpressions[1]->to_u32string());
274  concat.append(1, ')');
275  } else {
276  concat.append(subExpressions[1]->to_u32string());
277  }
278  return concat;
279  }
280  case operation::kleene : {
281  if (subExpressions[0]->op >= operation::concatenation) {
282  return u32string(1, U'(').append(subExpressions[0]->to_u32string()).append(U")*");
283  } else {
284  return subExpressions[0]->to_u32string().append(1, '*');
285  }
286  }
287  case operation::symbol : {
288  char32_t symbol = extractSymbol();
289  return symbol == U'\0' ? u32string(U"ε") : u32string(1, symbol);
290  }
291  case operation::empty : return u32string(U"∅");
292  default : return u32string();
293  }
294 }
295 
297 string expression::to_string() const {
298  return converter->to_bytes(to_u32string());
299 }
300 
302 vector<expression::exptr>::const_iterator expression::begin() const {
303  return subExpressions.cbegin();
304 }
305 
307 vector<expression::exptr>::const_iterator expression::end() const {
308  return subExpressions.cend();
309 }
310 
312 
340  enum token : unsigned char {
341  A,
342  B,
343  C,
344  K,
345  E,
346  F,
347  Σ,
348  P,
349  L,
350  R,
351  S,
353  };
354 
356  typedef bitset<token::END> tokens;
357 
358  static array<token,token::END> const inverseUnitGraph;
360 
362  static array<array<tokens,token::END>,token::END> const inverseBinaryRules;
364 
365 
367 
374  static tokens getUClosure(tokens const& m) {
375  tokens closure;
376  for (unsigned char c(0); c < token::END; c++) {
377  if (m[c]) {
378  for (token s(static_cast<token>(c)); s != token::END; s = inverseUnitGraph[s]) {
379  closure.set(s);
380  }
381  }
382  } // Going through the tokens in m.
383  return closure;
384  }
385 
387 
394  static bool canDerive(token symbol, tokens const& left, tokens const& right) {
395  tokens leftClosure(getUClosure(left));
396  tokens rightClosure(getUClosure(right));
397  for (unsigned char li(0); li < token::END; li++) {
398  if (leftClosure[li]) {
399  for (unsigned char ri(0); ri < token::END; ri++) {
400  if (rightClosure[ri]) {
401  for (unsigned char ci(0); ci < token::END; ci++) {
402  if (inverseBinaryRules[li][ri][ci]) {
403  for (unsigned char successor(ci); successor != token::END; successor = inverseUnitGraph[successor]) {
404  if (symbol == successor) {
405  return true;
406  }
407  }
408  }
409  }
410  }
411  }
412  }
413  }
414  return false;
415  }
416 
417 
419 
428  static void compileTableEntry(size_t row, size_t diag, vector<vector<tokens>>& table) {
429  for (size_t i(0); i < diag; i++) {
430  tokens first(getUClosure(table[row][i]));
431  for (unsigned char fi(0); fi < first.size(); fi++) {
432  if (first[fi]) {
433  tokens second(getUClosure(table[row+1+i][diag-i-1]));
434  for (unsigned char si(0); si < second.size(); si++) {
435  if (second[si]) {
436  table[row][diag] |= inverseBinaryRules[fi][si];
437  }
438  } // Going through the tokens in second.
439  }
440  } // Going through the tokens in first.
441  }
442  }
443 
445  vector<char32_t> symbolMapping;
446  size_t mutable symbolMappingIndex;
447  vector<vector<tokens>> table;
448  bool badRegularExpression = false;
449 
451  parser(u32string const& re, literals const& lits) {
452  if (re.empty()) {
453  badRegularExpression = true;
454  return;
455  }
456  symbolMappingIndex = 0;
457  size_t numberOfTokens(re.length());
458  symbolMapping.reserve(numberOfTokens);
459  table.reserve(numberOfTokens);
460  size_t row(0);
461  for (char32_t symbol : re) {
462  table.push_back(vector<tokens>(numberOfTokens-row, tokens()));
463  if (symbol == lits.L) { table[row][0].set(token::L); }
464  else if (symbol == lits.R) { table[row][0].set(token::R); }
465  else if (symbol == lits.P) { table[row][0].set(token::P); }
466  else if (symbol == lits.S) { table[row][0].set(token::S); }
467  else {
468  table[row][0].set(token::Σ);
469  symbolMapping.push_back(symbol);
470  }
471  row++;
472  }
473  symbolMapping.shrink_to_fit();
474  for (size_t diag(1); diag < numberOfTokens; diag++) {
475  for (size_t row(0); row < numberOfTokens - diag; row++) {
476  compileTableEntry(row, diag, table);
477  }
478  }
479  if (!getUClosure(table[0][table[0].size()-1])[token::A]) {
480  badRegularExpression = true;
481  }
482  }
483 
485  struct tree {
486  parser const* p;
488  pair<unique_ptr<tree>,unique_ptr<tree>> children;
490 
492 
501  pair<unique_ptr<tree>,unique_ptr<tree>> findNextPair(token symbol, size_t row, size_t diag, parser const* p) {
502  for (size_t i = 0; i < diag; i++) {
503  size_t leftDiag = diag-i-1;
504  size_t rightDiag = i;
505  size_t rightRow = diag+row-rightDiag;
506  if (canDerive(symbol, p->table[row][leftDiag], p->table[rightRow][rightDiag])) {
507  return make_pair(unique_ptr<tree>(new tree(row, leftDiag, p)), unique_ptr<tree>(new tree(rightRow, rightDiag, p)));
508  }
509  }
510  return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
511  }
512 
514 
520  tree(size_t row, size_t diag, parser const* p) :
521  p(p),
522  symbol([&]()->token{
523  for (unsigned char i(token::END); i > 0; i--) {
524  if (p->table[row][diag][i-1]) {
525  return static_cast<token>(i-1);
526  }
527  }
528  return token::END;
529  }()),
530  children(findNextPair(symbol, row, diag, p)) {}
531 
533  exptr operator()(bool optimized, bool aggressive) {
534  switch (symbol) {
535  case token::A:
536  return spawnAlternation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
537  case token::B:
538  return (*children.second)(optimized, aggressive);
539  case token::C:
540  return spawnConcatenation((*children.first)(optimized, aggressive), (*children.second)(optimized, aggressive), optimized, aggressive);
541  case token::K:
542  return spawnKleene((*children.first)(optimized, aggressive), optimized, aggressive);
543  case token::E:
544  return (*children.second)(optimized, aggressive);
545  case token::F:
546  return (*children.first)(optimized, aggressive);
547  case token::Σ: {
548  char32_t symbol = p->symbolMapping[p->symbolMappingIndex++];
549  if (p->symbolMappingIndex >= p->symbolMapping.size()) { p->symbolMappingIndex = 0; }
550  if (symbol == p->lits.EPSILON) { return spawnEmptyString(); }
551  else if (symbol == p->lits.EMPTY) { return spawnEmptySet(); }
552  else { return spawnSymbol(symbol); }
553  }
554  default:
555  return exptr();
556  }
557  }
558  };
559 
561 
565  exptr operator()(bool optimized, bool aggressive) {
566  if (badRegularExpression) {
567  #ifdef __EXCEPTIONS
568  throw std::bad_function_call();
569  #else
570  return exptr();
571  #endif
572  }
573  return tree(0, table.size()-1, this)(optimized, aggressive);
574  }
575 };
576 
578  []()->array<expression::parser::token,expression::parser::token::END>{
579  array<token,token::END> graph;
580  graph.fill(token::END);
581  graph[token::Σ] = token::E;
582  graph[token::E] = token::K;
583  graph[token::K] = token::C;
584  graph[token::C] = token::A;
585  return graph;
586 }();
587 
588 
590  []()->array<array<expression::parser::tokens,expression::parser::token::END>,expression::parser::token::END>{
591  array<tokens,token::END> noPredecessor;
592  noPredecessor.fill(tokens());
593  array<array<tokens,token::END>,token::END> rules;
594  rules.fill(noPredecessor);
595  rules[token::A][token::B].set(token::A);
596  rules[token::P][token::C].set(token::B);
597  rules[token::C][token::K].set(token::C);
598  rules[token::E][token::S].set(token::K);
599  rules[token::L][token::F].set(token::E);
600  rules[token::A][token::R].set(token::F);
601  return rules;
602 }();
603 
605 
613 expression::exptr expression::spawnFromString(std::u32string const& re, literals lits, bool optimized, bool aggressive) {
614  parser stringParser(re, lits);
615  #ifdef __EXCEPTIONS
616  try {
617  return stringParser(optimized, aggressive);
618  } catch (std::bad_function_call e) {
619  throw std::invalid_argument("Invalid regular expression.");
620  }
621  #else
622  if (stringParser.badRegularExpression) {
623  return exptr();
624  }
625  return stringParser(optimized, aggressive);
626  #endif
627 }
628 
629 
631 expression::exptr expression::spawnFromString(string const& utf8Re, literals lits, bool optimized, bool aggressive) {
632  return spawnFromString(converter->from_bytes(utf8Re), lits, optimized, aggressive);
633 }
634 
635 expression::expression() : op(operation::empty) {}
636 expression::expression(char32_t symbol) : op(operation::symbol) {}
637 expression::expression(exptr const& l, exptr const& r, operation op) : subExpressions({l, r}), op(op) {}
638 expression::expression(exptr const& b) : subExpressions({b}), op(operation::kleene) {}
639 }
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
Definition: expression.cpp:352
Represents the table entries as binary trees.
Definition: expression.cpp:485
static array< array< tokens, token::END >, token::END > const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
Definition: expression.cpp:362
static std::unique_ptr< std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > > const converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
Definition: expression.h:97
An alternation symbol.
Definition: expression.cpp:348
char32_t const S
The Kleene star.
Definition: expression.h:53
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Definition: expression.cpp:394
A concatenation expression.
Definition: expression.cpp:343
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
Definition: expression.cpp:374
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
Definition: expression.cpp:501
token symbol
This tree&#39;s root symbol.
Definition: expression.cpp:487
A left parenthesis.
Definition: expression.cpp:349
bool operator!=(expression const &r) const
Checks whether this RE is semantically different from another one.
Definition: expression.cpp:219
Token literals as used in Introduction to Automata Theory, Languages, and Computation by Hopcroft...
Definition: expression.h:52
Represents deterministic finite automata.
Definition: dfa.h:22
char32_t const EMPTY
Neutral element of alternation and annihilating element of concatenation, a.k.a. empty set...
Definition: expression.h:53
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE&#39;s first subexpression.
Definition: expression.cpp:302
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
Definition: expression.cpp:50
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
Definition: gnfa.cpp:300
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
Definition: expression.cpp:447
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
Definition: expression.cpp:533
static exptr spawnFromString(std::u32string const &re, literals lits=literals(), bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given string.
Definition: expression.cpp:613
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Definition: expression.cpp:117
Represents formal regular expressions.
Definition: expression.h:31
Beginning of a new subexpression.
Definition: expression.cpp:345
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:259
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
Definition: expression.cpp:448
char32_t extractSymbol() const
Reports this symbol expression&#39;s UTF-32-encoded symbol.
Definition: expression.cpp:229
literals lits
Stores interpretations for characters encountered in the parsed string.
Definition: expression.cpp:444
A symbol expression.
Definition: expression.cpp:347
static exptr const & spawnSymbol(char32_t symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:61
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry&#39;s derived pair as root.
Definition: expression.cpp:488
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
Definition: expression.cpp:445
operation
The different purposes an RE may fulfill.
Definition: expression.h:84
Parses regular expressions.
Definition: expression.cpp:338
Contains the reg::nfa class definition.
A Kleene star symbol.
Definition: expression.cpp:351
token
Tokens the grammar deals with.
Definition: expression.cpp:340
operation getOperation() const
Reports this RE&#39;s function.
Definition: expression.cpp:195
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE&#39;s last subexpression.
Definition: expression.cpp:307
Represents generalized nondeterministic finite automata.
Definition: gnfa.h:18
std::string to_string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:297
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Definition: expression.cpp:446
Second part of an alternation expression.
Definition: expression.cpp:342
bool operator==(expression const &r) const
Checks whether this RE is semantically equivalent to another one.
Definition: expression.cpp:204
char32_t const L
The left parenthesis.
Definition: expression.h:53
A right parenthesis.
Definition: expression.cpp:350
builder & merge()
Merges the prospective DFA&#39;s indistinguishable states, allowing for minimization. ...
Definition: dfa.cpp:611
Constructs DFAs step by step.
Definition: dfa.h:49
parser const * p
Points to the parser this tree belongs to.
Definition: expression.cpp:486
Beginning of an alternation expression.
Definition: expression.cpp:341
Contains the reg::gnfa class definition.
char32_t const P
The alternation symbol.
Definition: expression.h:53
size_t size() const
Reports the size of this RE&#39;s tree representation.
Definition: expression.cpp:181
Definition: dfa.cpp:32
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Definition: expression.cpp:428
char32_t const EPSILON
Neutral element of concatenation, a.k.a. empty string.
Definition: expression.h:53
Kleene expression.
Definition: expression.cpp:344
bitset< token::END > tokens
Tokens don't usually come alone.
Definition: expression.cpp:356
parser(u32string const &re, literals const &lits)
Initializes with a string to parse and literals to parse for.
Definition: expression.cpp:451
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
Definition: expression.cpp:565
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
Definition: expression.cpp:520
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
Definition: expression.h:43
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Definition: expression.cpp:40
char32_t const R
The right parenthesis.
Definition: expression.h:53
Contains the reg::dfa class definition.
std::string extractUtf8Symbol() const
Reports this symbol expression&#39;s UTF-8-encoded symbol.
Definition: expression.cpp:249
static array< token, token::END > const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
Definition: expression.cpp:358
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
Definition: expression.cpp:152
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Definition: expression.cpp:81
Second part of a new subexpression.
Definition: expression.cpp:346