reglibcpp  2.0.0
A C++ implementation of models for regular languages
expression.cpp
Go to the documentation of this file.
1 // Copyright 2017, 2018 Tom Kranz
2 //
3 // This file is part of reglibcpp.
4 //
5 // reglibcpp is free software: you can redistribute it and/or modify
6 // it under the terms of the GNU General Public License as published by
7 // the Free Software Foundation, either version 3 of the License, or
8 // (at your option) any later version.
9 //
10 // reglibcpp is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 // GNU General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with reglibcpp. If not, see <https://www.gnu.org/licenses/>.
17 
19 #include "expression.h"
20 
21 using std::make_pair;
22 using std::make_unique;
23 using std::pair;
24 using std::string;
25 using std::u32string;
26 using std::unique_ptr;
27 using std::vector;
28 
29 #include <algorithm>
30 
31 #include <array>
32 using std::array;
33 
34 #include <bitset>
35 using std::bitset;
36 
37 #include "dfa.h"
38 #include "fabuilder.h"
39 #include "gnfa.h"
40 #include "nfa.h"
41 
42 namespace reg {
43 expression::exptr expression::empty = expression::exptr(new expression);
45 
46 char32_t expression::L =
47  U'(';
48 char32_t expression::R =
50  U')';
51 char32_t expression::K =
53  U'*';
54 char32_t expression::A =
56  U'+';
57 char32_t expression::E =
59  U'ε';
60 char32_t expression::N =
62  U'∅';
63 
67  L = U'(';
68  R = U')';
69  K = U'*';
70  A = U'+';
71  E = U'ε';
72  N = U'∅';
73 }
74 
76 
81  return expression::empty;
82 }
83 
85 
90  return expression::spawnSymbol_(U'\0');
91 }
92 
94 
101  return expression::symbols.emplace(symbol, new expression(symbol))
102  .first->second;
103 }
104 
106 
116 expression::exptr const& expression::spawnSymbol(string const& symbol) {
117  char32_t u32Symbol(converter.from_bytes(symbol)[0]);
118  return expression::symbols.emplace(u32Symbol, new expression(u32Symbol))
119  .first->second;
120 }
121 
123 
136  expression::exptr const& r,
137  bool optimized,
138  bool aggressive) {
139  if (optimized) {
140  if (l == expression::spawnEmptyString()) {
141  return r;
142  }
143  if (r == expression::spawnEmptyString()) {
144  return l;
145  }
147  return expression::spawnEmptySet();
148  }
149  if (aggressive) {
150  if (*l == *expression::spawnEmptyString()) {
151  return r;
152  }
153  if (*r == *expression::spawnEmptyString()) {
154  return l;
155  }
156  if (*r == *expression::spawnEmptySet() ||
157  *l == *expression::spawnEmptySet()) {
158  return expression::spawnEmptySet();
159  }
160  }
161  }
162  return expression::exptr(new expression(l, r, operation::concatenation));
163 }
164 
166 
179  expression::exptr const& r,
180  bool optimized,
181  bool aggressive) {
182  if (optimized) {
183  if (l == expression::spawnEmptySet()) {
184  return r;
185  }
186  if (r == expression::spawnEmptySet()) {
187  return l;
188  }
189  if (l == r) {
190  return l;
191  }
192  if (aggressive) {
193  if (*l == *expression::spawnEmptySet()) {
194  return r;
195  }
196  if (*r == *expression::spawnEmptySet()) {
197  return l;
198  }
199  if (*l == *r) {
200  return l->size() > r->size() ? r : l;
201  }
202  static dfa empty;
203  if (empty == nfa::subtract(*l, *r).buildDfa(true)) {
204  return r;
205  }
206  if (empty == nfa::subtract(*r, *l).buildDfa(true)) {
207  return l;
208  }
209  }
210  }
211  return expression::exptr(new expression(l, r, operation::alternation));
212 }
213 
215 
225  bool optimized, bool aggressive) {
226  if (optimized) {
227  if (b == expression::spawnEmptySet() ||
230  }
231  if (aggressive) {
232  if (*b == *expression::spawnEmptySet() ||
233  *b == *expression::spawnEmptyString()) {
235  }
236  if (*b == expression(b)) {
237  return b;
238  }
239  }
240  }
241  return expression::exptr(new expression(b));
242 }
243 
245 
255 size_t expression::size() const {
256  size_t s(1);
257  for (exptr const& re : *this) {
258  s += re->size();
259  }
260  return s;
261 }
262 
264 
270 
271 #ifndef DOXYGEN_SHOULD_SKIP_THIS
272 expression::operator nfa const&() const {
273  if (!acceptingNfa) {
274  acceptingNfa = make_unique<nfa const>(gnfa(*this).splitAllTransitions());
275  }
276  return *acceptingNfa;
277 }
278 #endif // DOXYGEN_SHOULD_SKIP_THIS
279 
282 
286 bool expression::operator==(nfa const& other) const {
287  return other.operator==(*this);
288 }
289 
292 
296 bool expression::operator!=(nfa const& other) const {
297  return !operator==(other);
298 }
299 
302 
309 char32_t expression::extractSymbol_() const {
310  auto it = std::find_if(expression::symbols.begin(), expression::symbols.end(),
311  [&](pair<char32_t, exptr> const& entry) -> bool {
312  return entry.second.get() == this;
313  });
314  if (it == expression::symbols.end()) {
315  throw std::logic_error(
316  "This RE does not seem to be a valid symbol expression.");
317  }
318  return it->first;
319 }
320 
323 
330  char32_t symbol(extractSymbol_());
331  return converter.to_bytes(u32string(!!symbol, symbol));
332 }
333 
336  switch (op) {
337  case operation::alternation:
338  return subExpressions[0]->to_u32string() + A +
339  subExpressions[1]->to_u32string();
340  case operation::concatenation: {
341  u32string concat;
342  if (subExpressions[0]->op >= operation::alternation) {
343  concat.append(L + subExpressions[0]->to_u32string() + R);
344  } else {
345  concat.append(subExpressions[0]->to_u32string());
346  }
347  if (subExpressions[1]->op >= operation::alternation) {
348  concat.append(L + subExpressions[1]->to_u32string() + R);
349  } else {
350  concat.append(subExpressions[1]->to_u32string());
351  }
352  return concat;
353  }
354  case operation::kleene: {
355  if (subExpressions[0]->op >= operation::concatenation) {
356  return (L + subExpressions[0]->to_u32string() + R) + K;
357  } else {
358  return subExpressions[0]->to_u32string() + K;
359  }
360  }
361  case operation::symbol: {
362  char32_t symbol = extractSymbol_();
363  return u32string(1, symbol + (!symbol * E));
364  }
365  case operation::empty:
366  return u32string(1, N);
367  default:
368  return u32string();
369  }
370 }
371 
373 string expression::to_string() const {
374  return converter.to_bytes(to_u32string());
375 }
376 
379  return subExpressions.cbegin();
380 }
381 
384  return subExpressions.cend();
385 }
386 
388 
419  enum struct token : unsigned char {
420  A,
421  B,
422  C,
423  K,
424  E,
425  F,
426  Σ,
427  P,
428  L,
429  R,
430  S,
431  END
432  };
433 
435 #define TOKEN(T) static_cast<unsigned char>(reg::expression::parser::token::T)
436 
439 
442 
446 
449 
456  static tokens getUClosure(tokens const& m) {
457  tokens closure;
458  for (unsigned char c(0); c < TOKEN(END); c++) {
459  if (m[c]) {
460  for (token s(static_cast<token>(c)); s != token::END;
461  s = inverseUnitGraph[static_cast<unsigned char>(s)]) {
462  closure.set(static_cast<unsigned char>(s));
463  }
464  }
465  } // Going through the tokens in m.
466  return closure;
467  }
468 
470 
479  static bool canDerive(token symbol, tokens const& left, tokens const& right) {
480  tokens leftClosure(getUClosure(left));
481  tokens rightClosure(getUClosure(right));
482  for (unsigned char li(0); li < TOKEN(END); li++) {
483  if (leftClosure[li]) {
484  for (unsigned char ri(0); ri < TOKEN(END); ri++) {
485  if (rightClosure[ri]) {
486  for (unsigned char ci(0); ci < TOKEN(END); ci++) {
487  if (inverseBinaryRules[li][ri][ci]) {
488  for (unsigned char successor(ci); successor != TOKEN(END);
489  successor = static_cast<unsigned char>(
490  inverseUnitGraph[successor])) {
491  if (static_cast<unsigned char>(symbol) == successor) {
492  return true;
493  }
494  }
495  }
496  }
497  }
498  }
499  }
500  }
501  return false;
502  }
503 
505 
514  static void compileTableEntry(size_t row, size_t diag,
516  for (size_t i(0); i < diag; i++) {
517  tokens first(getUClosure(table[row][i]));
518  for (unsigned char fi(0); fi < first.size(); fi++) {
519  if (first[fi]) {
520  tokens second(getUClosure(table[row + 1 + i][diag - i - 1]));
521  for (unsigned char si(0); si < second.size(); si++) {
522  if (second[si]) {
523  table[row][diag] |= inverseBinaryRules[fi][si];
524  }
525  } // Going through the tokens in second.
526  }
527  } // Going through the tokens in first.
528  }
529  }
530 
532  size_t mutable symbolMappingIndex;
538  false;
539 
542  parser(u32string const& re) {
543  if (re.empty()) {
544  badRegularExpression = true;
545  return;
546  }
547  symbolMappingIndex = 0;
548  size_t numberOfTokens(re.length());
549  symbolMapping.reserve(numberOfTokens);
550  table.reserve(numberOfTokens);
551  size_t row(0);
552  for (char32_t symbol : re) {
553  table.push_back(vector<tokens>(numberOfTokens - row, tokens()));
554  if (symbol == L) {
555  table[row][0].set(TOKEN(L));
556  } else if (symbol == R) {
557  table[row][0].set(TOKEN(R));
558  } else if (symbol == A) {
559  table[row][0].set(TOKEN(P));
560  } else if (symbol == K) {
561  table[row][0].set(TOKEN(S));
562  } else {
563  table[row][0].set(TOKEN(Σ));
564  symbolMapping.push_back(symbol);
565  }
566  row++;
567  }
568  symbolMapping.shrink_to_fit();
569  for (size_t diag(1); diag < numberOfTokens; diag++) {
570  for (size_t row(0); row < numberOfTokens - diag; row++) {
571  compileTableEntry(row, diag, table);
572  }
573  }
574  if (!getUClosure(table[0][table[0].size() - 1])[TOKEN(A)]) {
575  badRegularExpression = true;
576  }
577  }
578 
580  struct tree {
581  parser const* p;
585 
587 
596  size_t row,
597  size_t diag,
598  parser const* p) {
599  for (size_t i = 0; i < diag; i++) {
600  size_t leftDiag = diag - i - 1;
601  size_t rightDiag = i;
602  size_t rightRow = diag + row - rightDiag;
603  if (canDerive(symbol, p->table[row][leftDiag],
604  p->table[rightRow][rightDiag])) {
605  return make_pair(make_unique<tree>(row, leftDiag, p),
606  make_unique<tree>(rightRow, rightDiag, p));
607  }
608  }
609  return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
610  }
611 
613 
618  tree(size_t row, size_t diag, parser const* p)
619  : p(p),
620  symbol([&]() -> token {
621  for (unsigned char i(TOKEN(END)); i > 0; i--) {
622  if (p->table[row][diag][i - 1]) {
623  return static_cast<token>(i - 1);
624  }
625  }
626  return token::END;
627  }()),
628  children(findNextPair(symbol, row, diag, p)) {}
629 
631  exptr operator()(bool optimized, bool aggressive) {
632  switch (symbol) {
633  case token::A:
634  return spawnAlternation((*children.first)(optimized, aggressive),
635  (*children.second)(optimized, aggressive),
636  optimized, aggressive);
637  case token::B:
638  return (*children.second)(optimized, aggressive);
639  case token::C:
640  return spawnConcatenation((*children.first)(optimized, aggressive),
641  (*children.second)(optimized, aggressive),
642  optimized, aggressive);
643  case token::K:
644  return spawnKleene((*children.first)(optimized, aggressive),
645  optimized, aggressive);
646  case token::E:
647  return (*children.second)(optimized, aggressive);
648  case token::F:
649  return (*children.first)(optimized, aggressive);
650  case token::Σ: {
651  char32_t symbol = p->symbolMapping[p->symbolMappingIndex++];
652  if (p->symbolMappingIndex >= p->symbolMapping.size()) {
653  p->symbolMappingIndex = 0;
654  }
655  if (symbol == E) {
656  return spawnEmptyString();
657  } else if (symbol == N) {
658  return spawnEmptySet();
659  } else {
660  return spawnSymbol_(symbol);
661  }
662  }
663  default:
664  return exptr();
665  }
666  }
667  };
668 
670 
675  exptr operator()(bool optimized, bool aggressive) {
676  if (badRegularExpression) {
677  throw std::bad_function_call();
678  }
679  return tree(0, table.size() - 1, this)(optimized, aggressive);
680  }
681 };
682 
684  []() -> array<expression::parser::token, TOKEN(END)> {
686  graph.fill(token::END);
687  graph[TOKEN(Σ)] = token::E;
688  graph[TOKEN(E)] = token::K;
689  graph[TOKEN(K)] = token::C;
690  graph[TOKEN(C)] = token::A;
691  return graph;
692 }();
693 
695  []() -> array<array<expression::parser::tokens, TOKEN(END)>, TOKEN(END)> {
696  array<tokens, TOKEN(END)> noPredecessor;
697  noPredecessor.fill(tokens());
699  rules.fill(noPredecessor);
700  rules[TOKEN(A)][TOKEN(B)].set(TOKEN(A));
701  rules[TOKEN(P)][TOKEN(C)].set(TOKEN(B));
702  rules[TOKEN(C)][TOKEN(K)].set(TOKEN(C));
703  rules[TOKEN(E)][TOKEN(S)].set(TOKEN(K));
704  rules[TOKEN(L)][TOKEN(F)].set(TOKEN(E));
705  rules[TOKEN(A)][TOKEN(R)].set(TOKEN(F));
706  return rules;
707 }();
708 #undef TOKEN
709 
711 
721  bool optimized,
722  bool aggressive) {
723  parser stringParser(u32re);
724  try {
725  return stringParser(optimized, aggressive);
726  } catch (std::bad_function_call e) {
727  throw std::invalid_argument("Malformed regular expression.");
728  }
729 }
730 
732 
746 expression::exptr expression::spawnFromString(string const& re, bool optimized,
747  bool aggressive) {
748  return spawnFromString_(converter.from_bytes(re), optimized, aggressive);
749 }
750 
751 expression::expression() : op(operation::empty) {}
752 expression::expression(char32_t symbol) : op(operation::symbol) {}
753 expression::expression(exptr const& l, exptr const& r, operation op)
754  : subExpressions({l, r}), op(op) {}
755 expression::expression(exptr const& b)
756  : subExpressions({b}), op(operation::kleene) {}
757 } // namespace reg
Represents the table entries as binary trees.
Definition: expression.cpp:580
static array< array< tokens, TOKEN(END)>, TOKEN(END)> const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
Definition: expression.cpp:444
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Definition: expression.cpp:479
Represents nondeterministic finite automata with ε-moves.
Definition: nfa.h:38
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
Definition: expression.cpp:456
token symbol
This tree&#39;s root symbol.
Definition: expression.cpp:582
static fabuilder subtract(nfa const &n1, nfa const &n2)
Creates a builder for an NFA accepting the set difference of the languages of two NFAs...
Definition: nfa.cpp:750
static char32_t N
The symbol used to represent the Null/empty set in a regular expression.
Definition: expression.h:59
Represents deterministic finite automata.
Definition: dfa.h:41
bool operator==(nfa const &other) const
Checks whether this RE describes the same regular language as another object.
Definition: expression.cpp:286
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
Definition: expression.cpp:378
Beginning of a new subexpression.
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
Definition: expression.cpp:89
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
Definition: gnfa.cpp:378
#define TOKEN(T)
Gives casting to base type back to scoped enums, as God intended.
Definition: expression.cpp:435
bool operator!=(nfa const &other) const
Checks whether this RE describes a different regular language than another object.
Definition: expression.cpp:296
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
Definition: expression.cpp:536
static exptr const & spawnSymbol(std::string const &symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:116
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
Definition: expression.cpp:631
std::string extractSymbol() const
Reports this symbol expression's UTF-8-encoded symbol.
Definition: expression.cpp:329
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Definition: expression.cpp:178
Second part of an alternation expression.
Represents formal regular expressions.
Definition: expression.h:46
static exptr spawnFromString_(std::u32string const &u32re, bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given UTF-32 string.
Definition: expression.cpp:720
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
Definition: expression.cpp:335
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
Definition: expression.cpp:537
static exptr const & spawnSymbol_(char32_t u32symbol)
Gives an RE representing the given UTF-32-encoded symbol.
Definition: expression.cpp:100
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
Definition: expression.cpp:595
static exptr spawnFromString(std::string const &re, bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given UTF-8 string.
Definition: expression.cpp:746
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
Definition: expression.cpp:531
operation
The different purposes an RE may fulfill.
Definition: expression.h:84
Parses regular expressions.
Definition: expression.cpp:417
Contains the reg::nfa class definition.
token
Tokens the grammar deals with.
Definition: expression.cpp:419
operation getOperation() const
Reports this RE's function.
Definition: expression.cpp:269
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
Definition: expression.cpp:383
bitset< TOKEN(END)> tokens
Tokens don't usually come alone.
Definition: expression.cpp:438
static char32_t K
The symbol used to represent the Kleene star in a regular expression.
Definition: expression.h:59
Represents generalized nondeterministic finite automata.
Definition: gnfa.h:35
std::string to_string() const
Describes this RE in UTF-8-encoded human-readable form.
Definition: expression.cpp:373
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Definition: expression.cpp:533
std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
Definition: utils.h:52
parser const * p
Points to the parser this tree belongs to.
Definition: expression.cpp:581
Contains the reg::gnfa class definition.
parser(u32string const &re)
Initializes with a string to parse.
Definition: expression.cpp:542
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
size_t size() const
Reports the size of this RE's tree representation.
Definition: expression.cpp:255
static char32_t L
The symbol used to represent the Left parenthesis in a regular expression.
Definition: expression.h:59
A Kleene star symbol.
Where this library lives.
Definition: dfa.cpp:48
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Definition: expression.cpp:514
Second part of a new subexpression.
static void reset()
Resets the symbols used for RE operators to their defaults.
Definition: expression.cpp:66
static char32_t E
The symbol used to represent the Empty string in a regular expression.
Definition: expression.h:59
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
Definition: expression.cpp:675
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
Definition: expression.cpp:618
char32_t extractSymbol_() const
Reports this symbol expression's UTF-32-encoded symbol.
Definition: expression.cpp:309
Beginning of an alternation expression.
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
Definition: expression.h:58
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Definition: expression.cpp:80
An alternation symbol.
Contains the reg::dfa class definition.
static array< token, TOKEN(END)> const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
Definition: expression.cpp:440
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
Definition: expression.cpp:583
Contains the reg::fabuilder class definition.
static char32_t R
The symbol used to represent the Right parenthesis in a regular expression.
Definition: expression.h:59
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
Definition: expression.cpp:224
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Definition: expression.cpp:135
static char32_t A
The symbol used to represent the Alternation in a regular expression.
Definition: expression.h:59
A concatenation expression.