22 using std::make_unique;
81 return expression::empty;
101 return expression::symbols.emplace(symbol,
new expression(symbol))
117 char32_t u32Symbol(
converter.from_bytes(symbol)[0]);
118 return expression::symbols.emplace(u32Symbol,
new expression(u32Symbol))
200 return l->size() > r->size() ? r : l;
225 bool optimized,
bool aggressive) {
257 for (
exptr const& re : *
this) {
271 #ifndef DOXYGEN_SHOULD_SKIP_THIS 272 expression::operator
nfa const&()
const {
276 return *acceptingNfa;
278 #endif // DOXYGEN_SHOULD_SKIP_THIS 287 return other.operator==(*this);
310 auto it = std::find_if(expression::symbols.
begin(), expression::symbols.
end(),
312 return entry.second.get() ==
this;
314 if (it == expression::symbols.
end()) {
316 "This RE does not seem to be a valid symbol expression.");
337 case operation::alternation:
338 return subExpressions[0]->to_u32string() +
A +
339 subExpressions[1]->to_u32string();
340 case operation::concatenation: {
342 if (subExpressions[0]->op >= operation::alternation) {
347 if (subExpressions[1]->op >= operation::alternation) {
354 case operation::kleene: {
355 if (subExpressions[0]->op >= operation::concatenation) {
358 return subExpressions[0]->to_u32string() +
K;
361 case operation::symbol: {
365 case operation::empty:
379 return subExpressions.cbegin();
384 return subExpressions.cend();
435 #define TOKEN(T) static_cast<unsigned char>(reg::expression::parser::token::T) 458 for (
unsigned char c(0); c <
TOKEN(END); c++) {
462 closure.set(static_cast<unsigned char>(s));
482 for (
unsigned char li(0); li <
TOKEN(END); li++) {
483 if (leftClosure[li]) {
484 for (
unsigned char ri(0); ri <
TOKEN(END); ri++) {
485 if (rightClosure[ri]) {
486 for (
unsigned char ci(0); ci <
TOKEN(END); ci++) {
488 for (
unsigned char successor(ci); successor !=
TOKEN(END);
489 successor =
static_cast<unsigned char>(
491 if (static_cast<unsigned char>(symbol) == successor) {
516 for (
size_t i(0); i < diag; i++) {
518 for (
unsigned char fi(0); fi < first.size(); fi++) {
521 for (
unsigned char si(0); si < second.size(); si++) {
548 size_t numberOfTokens(re.length());
550 table.reserve(numberOfTokens);
552 for (char32_t symbol : re) {
556 }
else if (symbol == R) {
558 }
else if (symbol == A) {
560 }
else if (symbol == K) {
569 for (
size_t diag(1); diag < numberOfTokens; diag++) {
570 for (
size_t row(0); row < numberOfTokens - diag; row++) {
599 for (
size_t i = 0; i < diag; i++) {
600 size_t leftDiag = diag - i - 1;
601 size_t rightDiag = i;
602 size_t rightRow = diag + row - rightDiag;
604 p->
table[rightRow][rightDiag])) {
605 return make_pair(make_unique<tree>(row, leftDiag,
p),
606 make_unique<tree>(rightRow, rightDiag,
p));
621 for (
unsigned char i(
TOKEN(END)); i > 0; i--) {
622 if (
p->
table[row][diag][i - 1]) {
623 return static_cast<token>(i - 1);
635 (*
children.second)(optimized, aggressive),
636 optimized, aggressive);
638 return (*
children.second)(optimized, aggressive);
641 (*
children.second)(optimized, aggressive),
642 optimized, aggressive);
645 optimized, aggressive);
647 return (*
children.second)(optimized, aggressive);
649 return (*
children.first)(optimized, aggressive);
679 return tree(0,
table.size() - 1,
this)(optimized, aggressive);
686 graph.fill(token::END);
687 graph[
TOKEN(Σ)] = token::E;
688 graph[
TOKEN(E)] = token::K;
689 graph[
TOKEN(K)] = token::C;
690 graph[
TOKEN(C)] = token::A;
697 noPredecessor.fill(tokens());
699 rules.fill(noPredecessor);
723 parser stringParser(u32re);
725 return stringParser(optimized, aggressive);
751 expression::expression() : op(operation::empty) {}
752 expression::expression(char32_t symbol) : op(operation::symbol) {}
753 expression::expression(exptr
const& l, exptr
const& r, operation op)
754 : subExpressions({l, r}), op(op) {}
755 expression::expression(exptr
const& b)
756 : subExpressions({b}), op(operation::kleene) {}
Represents the table entries as binary trees.
static array< array< tokens, TOKEN(END)>, TOKEN(END)> const inverseBinaryRules
Maps pairs of symbols to the symbols that derive them.
static bool canDerive(token symbol, tokens const &left, tokens const &right)
Checks if a token could derive a pair of tokens from two other entries.
Represents nondeterministic finite automata with ε-moves.
static tokens getUClosure(tokens const &m)
Constructs the reflexive-transitive closure of the inverse unit relation for a given set of symbols...
token symbol
This tree's root symbol.
static fabuilder subtract(nfa const &n1, nfa const &n2)
Creates a builder for an NFA accepting the set difference of the languages of two NFAs...
static char32_t N
The symbol used to represent the Null/empty set in a regular expression.
Represents deterministic finite automata.
bool operator==(nfa const &other) const
Checks whether this RE describes the same regular language as another object.
std::vector< exptr >::const_iterator begin() const
Returns an iterator pointing to this RE's first subexpression.
Beginning of a new subexpression.
static exptr const & spawnEmptyString()
Gives an RE representing the empty string ε.
nfa splitAllTransitions()
Splits all transitions until only ∅, ε, and symbol REs remain and builds the resulting NFA...
#define TOKEN(T)
Gives casting to base type back to scoped enums, as God intended.
bool operator!=(nfa const &other) const
Checks whether this RE describes a different regular language than another object.
vector< vector< tokens > > table
The table of sets of symbols that derive a subsentence.
static exptr const & spawnSymbol(std::string const &symbol)
Gives an RE representing the given UTF-32-encoded symbol.
exptr operator()(bool optimized, bool aggressive)
Gives the RE encoded in this tree.
std::string extractSymbol() const
Reports this symbol expression's UTF-8-encoded symbol.
static exptr spawnAlternation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the alternation of two given REs.
Second part of an alternation expression.
Represents formal regular expressions.
static exptr spawnFromString_(std::u32string const &u32re, bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given UTF-32 string.
std::u32string to_u32string() const
Describes this RE in UTF-32-encoded human-readable form.
bool badRegularExpression
Signifies that the RE used to initialize this object was invalid.
static exptr const & spawnSymbol_(char32_t u32symbol)
Gives an RE representing the given UTF-32-encoded symbol.
pair< unique_ptr< tree >, unique_ptr< tree > > findNextPair(token symbol, size_t row, size_t diag, parser const *p)
Finds the child trees that can be derived from a given entry.
static exptr spawnFromString(std::string const &re, bool optimized=false, bool aggressive=false)
Gives an RE encoded in a given UTF-8 string.
vector< char32_t > symbolMapping
Stores the actual symbols encountered in the RE while parsing.
operation
The different purposes an RE may fulfill.
Parses regular expressions.
Contains the reg::nfa class definition.
token
Tokens the grammar deals with.
operation getOperation() const
Reports this RE's function.
Contains the reg::expression class defintion.
std::vector< exptr >::const_iterator end() const
Returns an iterator pointing behind this RE's last subexpression.
bitset< TOKEN(END)> tokens
Tokens don't usually come alone.
static char32_t K
The symbol used to represent the Kleene star in a regular expression.
Represents generalized nondeterministic finite automata.
std::string to_string() const
Describes this RE in UTF-8-encoded human-readable form.
size_t symbolMappingIndex
Index for when symbols have to be extracted from the mapping.
std::wstring_convert< std::codecvt_utf8< char32_t >, char32_t > converter
Converts between UTF-8-encoded and UTF-32-encoded strings.
parser const * p
Points to the parser this tree belongs to.
Contains the reg::gnfa class definition.
parser(u32string const &re)
Initializes with a string to parse.
Number of elements in this enumeration, NOT AN ACTUAL TOKEN!
size_t size() const
Reports the size of this RE's tree representation.
static char32_t L
The symbol used to represent the Left parenthesis in a regular expression.
Where this library lives.
static void compileTableEntry(size_t row, size_t diag, vector< vector< tokens >> &table)
Fills a table entry.
Second part of a new subexpression.
static void reset()
Resets the symbols used for RE operators to their defaults.
static char32_t E
The symbol used to represent the Empty string in a regular expression.
exptr operator()(bool optimized, bool aggressive)
Gives the RE resulting from parsing.
tree(size_t row, size_t diag, parser const *p)
Initializes a tree with a given table entry as root.
char32_t extractSymbol_() const
Reports this symbol expression's UTF-32-encoded symbol.
Beginning of an alternation expression.
std::shared_ptr< expression const > exptr
This is the type used to handle regular expressions.
static exptr const & spawnEmptySet()
Gives an RE representing the empty set ∅.
Contains the reg::dfa class definition.
static array< token, TOKEN(END)> const inverseUnitGraph
Maps symbols that may be derived in-place to their predecessing symbols.
pair< unique_ptr< tree >, unique_ptr< tree > > children
Trees with the symbols of the entry's derived pair as root.
Contains the reg::fabuilder class definition.
static char32_t R
The symbol used to represent the Right parenthesis in a regular expression.
static exptr spawnKleene(exptr const &b, bool optimized=true, bool aggressive=false)
Gives an RE representing the Kleene closure of a given RE.
static exptr spawnConcatenation(exptr const &l, exptr const &r, bool optimized=true, bool aggressive=false)
Gives an RE representing the concatenation of two given REs.
static char32_t A
The symbol used to represent the Alternation in a regular expression.
A concatenation expression.