28 = unique_ptr<std::wstring_convert<std::codecvt_utf8<char32_t>,char32_t>>(
29 new std::wstring_convert<std::codecvt_utf8<char32_t>,char32_t>()
32 std::unordered_map<char32_t, expression::exptr> expression::symbols;
41 return expression::empty;
62 return expression::symbols.insert(pair<char32_t,exptr>(symbol,
exptr(
new expression(symbol)))).first->second;
67 char32_t u32Symbol(
converter->from_bytes(utf8Symbol)[0]);
68 return expression::symbols.insert(pair<char32_t,exptr>(u32Symbol,
exptr(
new expression(u32Symbol)))).first->second;
136 return l->size() > r->size() ? r : l;
182 size_t s(op != operation::empty);
183 for (
exptr const& re : subExpressions) {
208 if (!r.acceptingDfa) {
209 r.acceptingDfa = unique_ptr<dfa const>(
new dfa(
dfa::builder(
gnfa(r).splitAllTransitions()).purge().merge()));
211 return *acceptingDfa == *r.acceptingDfa;
231 expression::symbols.
begin(),
232 expression::symbols.
end(),
233 [&](pair<char32_t,exptr>
const& entry)->
bool{
return entry.second.get() ==
this;}
236 if (it == expression::symbols.
end()) {
237 throw std::logic_error(
"This RE does not seem to be a valid symbol expression.");
251 if (symbol == U
'\0') {
261 case operation::alternation :
return subExpressions[0]->to_u32string().append(U
"+").append(subExpressions[1]->
to_u32string());
262 case operation::concatenation : {
264 if (subExpressions[0]->op >= operation::alternation) {
265 concat.append(1,
'(');
267 concat.append(1,
')');
271 if (subExpressions[1]->op >= operation::alternation) {
272 concat.append(1,
'(');
274 concat.append(1,
')');
280 case operation::kleene : {
281 if (subExpressions[0]->op >= operation::concatenation) {
282 return u32string(1, U
'(').append(subExpressions[0]->
to_u32string()).append(U
")*");
284 return subExpressions[0]->to_u32string().append(1,
'*');
287 case operation::symbol : {
289 return symbol == U
'\0' ? u32string(U
"ε") : u32string(1, symbol);
291 case operation::empty :
return u32string(U
"∅");
292 default :
return u32string();
303 return subExpressions.cbegin();
308 return subExpressions.cend();
376 for (
unsigned char c(0); c < token::END; c++) {
397 for (
unsigned char li(0); li < token::END; li++) {
398 if (leftClosure[li]) {
399 for (
unsigned char ri(0); ri < token::END; ri++) {
400 if (rightClosure[ri]) {
401 for (
unsigned char ci(0); ci < token::END; ci++) {
403 for (
unsigned char successor(ci); successor != token::END; successor =
inverseUnitGraph[successor]) {
404 if (symbol == successor) {
429 for (
size_t i(0); i < diag; i++) {
431 for (
unsigned char fi(0); fi < first.size(); fi++) {
434 for (
unsigned char si(0); si < second.size(); si++) {
457 size_t numberOfTokens(re.length());
459 table.reserve(numberOfTokens);
461 for (char32_t symbol : re) {
462 table.push_back(vector<tokens>(numberOfTokens-row,
tokens()));
463 if (symbol ==
lits.
L) {
table[row][0].set(token::L); }
464 else if (symbol ==
lits.
R) {
table[row][0].set(token::R); }
465 else if (symbol ==
lits.
P) {
table[row][0].set(token::P); }
466 else if (symbol ==
lits.
S) {
table[row][0].set(token::S); }
468 table[row][0].set(token::Σ);
474 for (
size_t diag(1); diag < numberOfTokens; diag++) {
475 for (
size_t row(0); row < numberOfTokens - diag; row++) {
502 for (
size_t i = 0; i < diag; i++) {
503 size_t leftDiag = diag-i-1;
504 size_t rightDiag = i;
505 size_t rightRow = diag+row-rightDiag;
507 return make_pair(unique_ptr<tree>(
new tree(row, leftDiag,
p)), unique_ptr<tree>(
new tree(rightRow, rightDiag,
p)));
510 return make_pair(unique_ptr<tree>(), unique_ptr<tree>());
523 for (
unsigned char i(token::END); i > 0; i--) {
524 if (
p->
table[row][diag][i-1]) {
525 return static_cast<token>(i-1);
538 return (*
children.second)(optimized, aggressive);
544 return (*
children.second)(optimized, aggressive);
546 return (*
children.first)(optimized, aggressive);
568 throw std::bad_function_call();
573 return tree(0,
table.size()-1,
this)(optimized, aggressive);
578 []()->array<expression::parser::token,expression::parser::token::END>{
579 array<token,token::END> graph;
580 graph.fill(token::END);
581 graph[token::Σ] = token::E;
582 graph[token::E] = token::K;
583 graph[token::K] = token::C;
584 graph[token::C] = token::A;
590 []()->array<array<expression::parser::tokens,expression::parser::token::END>,expression::parser::token::END>{
591 array<tokens,token::END> noPredecessor;
592 noPredecessor.fill(tokens());
593 array<array<tokens,token::END>,token::END> rules;
594 rules.fill(noPredecessor);
595 rules[token::A][token::B].set(token::A);
596 rules[token::P][token::C].set(token::B);
597 rules[token::C][token::K].set(token::C);
598 rules[token::E][token::S].set(token::K);
599 rules[token::L][token::F].set(token::E);
600 rules[token::A][token::R].set(token::F);
614 parser stringParser(re, lits);
617 return stringParser(optimized, aggressive);
618 }
catch (std::bad_function_call e) {
619 throw std::invalid_argument(
"Invalid regular expression.");
625 return stringParser(optimized, aggressive);
635 expression::expression() : op(operation::empty) {}
636 expression::expression(char32_t symbol) : op(operation::symbol) {}
637 expression::expression(exptr
const& l, exptr
const& r, operation op) : subExpressions({l, r}), op(op) {}
638 expression::expression(exptr
const& b) : subExpressions({b}), op(operation::kleene) {}
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
Represents the table entries as binary trees.
static array< array< tokens, token::END >, token::END > const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
static std::unique_ptr< std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > > const converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
char32_t const S
The Kleene star.
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
A concatenation expression.
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
token symbol
This tree's root symbol.
bool operator!=(expression const &r) const
Checks whether this RE is semantically different from another one.
Token literals as used in Introduction to Automata Theory, Languages, and Computation by Hopcroft...
Represents deterministic finite automata.
char32_t const EMPTY
Neutral element of alternation and annihilating element of concatenation, a.k.a. empty set...
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
static exptr spawnFromString(std::u32string const &re, literals lits=literals(), bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given string.
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Represents formal regular expressions.
Beginning of a new subexpression.
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
char32_t extractSymbol() const
Reports this symbol expression's UTF-32-encoded symbol.
literals lits
Stores interpretations for characters encountered in the parsed string.
static exptr const & spawnSymbol(char32_t symbol)
Gives an RE representing the given UTF-32-encoded symbol.
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
operation
The different purposes an RE may fulfill.
Parses regular expressions.
Contains the reg::nfa class definition.
token
Tokens the grammar deals with.
operation getOperation() const
Reports this RE's function.
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
Represents generalized nondeterministic finite automata.
std::string to_string() const
Describes this RE in UTF-32-encoded human-readable form.
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
Second part of an alternation expression.
bool operator==(expression const &r) const
Checks whether this RE is semantically equivalent to another one.
char32_t const L
The left parenthesis.
builder & merge()
Merges the prospective DFA's indistinguishable states, allowing for minimization. ...
Constructs DFAs step by step.
parser const * p
Points to the parser this tree belongs to.
Beginning of an alternation expression.
Contains the reg::gnfa class definition.
char32_t const P
The alternation symbol.
size_t size() const
Reports the size of this RE's tree representation.
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
char32_t const EPSILON
Neutral element of concatenation, a.k.a. empty string.
bitset< token::END > tokens
Tokens don't usually come alone.
parser(u32string const &re, literals const &lits)
Initializes with a string to parse and literals to parse for.
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
char32_t const R
The right parenthesis.
Contains the reg::dfa class definition.
std::string extractUtf8Symbol() const
Reports this symbol expression's UTF-8-encoded symbol.
static array< token, token::END > const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
Second part of a new subexpression.