117 typedef typename Encoding::Ch
Ch;
121 ownAllocator_(allocator ? 0 :
RAPIDJSON_NEW(
Allocator)()), allocator_(allocator ? allocator : ownAllocator_),
122 states_(allocator_, 256), ranges_(allocator_, 256), root_(kRegexInvalidState), stateCount_(), rangeCount_(),
123 anchorBegin_(), anchorEnd_()
136 return root_ != kRegexInvalidState;
149 static const unsigned kAnyCharacterClass = 0xFFFFFFFF;
150 static const unsigned kRangeCharacterClass = 0xFFFFFFFE;
151 static const unsigned kRangeNegationFlag = 0x80000000;
175 return states_.template Bottom<State>()[index];
178 const State& GetState(
SizeType index)
const {
180 return states_.template Bottom<State>()[index];
185 return ranges_.template Bottom<Range>()[index];
190 return ranges_.template Bottom<Range>()[index];
193 template <
typename InputStream>
194 void Parse(DecodedStream<InputStream, Encoding>& ds) {
195 Stack<Allocator> operandStack(allocator_, 256);
196 Stack<Allocator> operatorStack(allocator_, 256);
197 Stack<Allocator> atomCountStack(allocator_, 256);
199 *atomCountStack.template Push<unsigned>() = 0;
202 while (
ds.Peek() != 0) {
203 switch (codepoint =
ds.Take()) {
213 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() < kAlternation)
214 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
216 *operatorStack.template Push<Operator>() = kAlternation;
217 *atomCountStack.template Top<unsigned>() = 0;
221 *operatorStack.template Push<Operator>() = kLeftParenthesis;
222 *atomCountStack.template Push<unsigned>() = 0;
226 while (!operatorStack.Empty() && *operatorStack.template Top<Operator>() != kLeftParenthesis)
227 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
229 if (operatorStack.Empty())
231 operatorStack.template Pop<Operator>(1);
232 atomCountStack.template Pop<unsigned>(1);
233 ImplicitConcatenation(atomCountStack, operatorStack);
237 if (!Eval(operandStack, kZeroOrOne))
242 if (!Eval(operandStack, kZeroOrMore))
247 if (!Eval(operandStack, kOneOrMore))
254 if (!ParseUnsigned(ds, &n))
257 if (
ds.Peek() ==
',') {
259 if (
ds.Peek() ==
'}')
260 m = kInfinityQuantifier;
261 else if (!ParseUnsigned(ds, &m) || m < n)
267 if (!EvalQuantifier(operandStack, n, m) ||
ds.Peek() !=
'}')
274 PushOperand(operandStack, kAnyCharacterClass);
275 ImplicitConcatenation(atomCountStack, operatorStack);
281 if (!ParseRange(ds, &range))
283 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, kRangeCharacterClass);
284 GetState(
s).rangeStart = range;
285 *operandStack.template Push<Frag>() = Frag(
s,
s,
s);
287 ImplicitConcatenation(atomCountStack, operatorStack);
291 if (!CharacterEscape(ds, &codepoint))
296 PushOperand(operandStack, codepoint);
297 ImplicitConcatenation(atomCountStack, operatorStack);
301 while (!operatorStack.Empty())
302 if (!Eval(operandStack, *operatorStack.template Pop<Operator>(1)))
306 if (operandStack.GetSize() ==
sizeof(Frag)) {
307 Frag* e = operandStack.template Pop<Frag>(1);
308 Patch(e->out, NewState(kRegexInvalidState, kRegexInvalidState, 0));
311#if RAPIDJSON_REGEX_VERBOSE
312 printf(
"root: %d\n", root_);
313 for (
SizeType i = 0; i < stateCount_ ; i++) {
314 State&
s = GetState(i);
315 printf(
"[%2d] out: %2d out1: %2d c: '%c'\n", i,
s.out,
s.out1, (
char)
s.codepoint);
323 State*
s = states_.template Push<State>();
326 s->codepoint = codepoint;
327 s->rangeStart = kRegexInvalidRange;
328 return stateCount_++;
331 void PushOperand(Stack<Allocator>& operandStack,
unsigned codepoint) {
332 SizeType s = NewState(kRegexInvalidState, kRegexInvalidState, codepoint);
333 *operandStack.template Push<Frag>() = Frag(
s,
s,
s);
336 void ImplicitConcatenation(Stack<Allocator>& atomCountStack, Stack<Allocator>& operatorStack) {
337 if (*atomCountStack.template Top<unsigned>())
338 *operatorStack.template Push<Operator>() = kConcatenation;
339 (*atomCountStack.template Top<unsigned>())++;
344 while (GetState(l1).out != kRegexInvalidState)
345 l1 = GetState(l1).out;
346 GetState(l1).out = l2;
352 next = GetState(
l).out;
357 bool Eval(Stack<Allocator>& operandStack, Operator
op) {
362 Frag e2 = *operandStack.template Pop<Frag>(1);
363 Frag e1 = *operandStack.template Pop<Frag>(1);
364 Patch(e1.out, e2.start);
365 *operandStack.template Push<Frag>() = Frag(e1.start, e2.out, Min(e1.minIndex, e2.minIndex));
370 if (operandStack.GetSize() >=
sizeof(Frag) * 2) {
371 Frag e2 = *operandStack.template Pop<Frag>(1);
372 Frag e1 = *operandStack.template Pop<Frag>(1);
373 SizeType s = NewState(e1.start, e2.start, 0);
374 *operandStack.template Push<Frag>() = Frag(
s, Append(e1.out, e2.out), Min(e1.minIndex, e2.minIndex));
380 if (operandStack.GetSize() >=
sizeof(Frag)) {
381 Frag e = *operandStack.template Pop<Frag>(1);
382 SizeType s = NewState(kRegexInvalidState, e.start, 0);
383 *operandStack.template Push<Frag>() = Frag(
s, Append(e.out,
s), e.minIndex);
389 if (operandStack.GetSize() >=
sizeof(Frag)) {
390 Frag e = *operandStack.template Pop<Frag>(1);
391 SizeType s = NewState(kRegexInvalidState, e.start, 0);
393 *operandStack.template Push<Frag>() = Frag(
s,
s, e.minIndex);
399 if (operandStack.GetSize() >=
sizeof(Frag)) {
400 Frag e = *operandStack.template Pop<Frag>(1);
401 SizeType s = NewState(kRegexInvalidState, e.start, 0);
403 *operandStack.template Push<Frag>() = Frag(e.start,
s, e.minIndex);
414 bool EvalQuantifier(Stack<Allocator>& operandStack,
unsigned n,
unsigned m) {
421 else if (m == kInfinityQuantifier)
422 Eval(operandStack, kZeroOrMore);
424 Eval(operandStack, kZeroOrOne);
425 for (
unsigned i = 0; i < m - 1; i++)
426 CloneTopOperand(operandStack);
427 for (
unsigned i = 0; i < m - 1; i++)
428 Eval(operandStack, kConcatenation);
433 for (
unsigned i = 0; i < n - 1; i++)
434 CloneTopOperand(operandStack);
436 if (m == kInfinityQuantifier)
437 Eval(operandStack, kOneOrMore);
439 CloneTopOperand(operandStack);
440 Eval(operandStack, kZeroOrOne);
441 for (
unsigned i = n; i < m - 1; i++)
442 CloneTopOperand(operandStack);
443 for (
unsigned i = n; i < m; i++)
444 Eval(operandStack, kConcatenation);
447 for (
unsigned i = 0; i < n - 1; i++)
448 Eval(operandStack, kConcatenation);
455 void CloneTopOperand(Stack<Allocator>& operandStack) {
456 const Frag src = *operandStack.template Top<Frag>();
458 State*
s = states_.template Push<State>(
count);
459 memcpy(
s, &GetState(src.minIndex),
count *
sizeof(State));
461 if (
s[
j].out != kRegexInvalidState)
463 if (
s[
j].out1 != kRegexInvalidState)
466 *operandStack.template Push<Frag>() = Frag(src.start +
count, src.out +
count, src.minIndex +
count);
467 stateCount_ +=
count;
470 template <
typename InputStream>
471 bool ParseUnsigned(DecodedStream<InputStream, Encoding>& ds,
unsigned* u) {
473 if (
ds.Peek() <
'0' ||
ds.Peek() >
'9')
475 while (
ds.Peek() >=
'0' &&
ds.Peek() <=
'9') {
476 if (
r >= 429496729 &&
ds.Peek() >
'5')
478 r =
r * 10 + (
ds.Take() -
'0');
484 template <
typename InputStream>
485 bool ParseRange(DecodedStream<InputStream, Encoding>& ds,
SizeType* range) {
490 SizeType current = kRegexInvalidRange;
492 while ((codepoint =
ds.Take()) != 0) {
495 if (codepoint ==
'^') {
503 if (start == kRegexInvalidRange)
508 GetRange(current).next =
r;
511 GetRange(start).start |= kRangeNegationFlag;
516 if (
ds.Peek() ==
'b') {
520 else if (!CharacterEscape(ds, &codepoint))
527 if (codepoint ==
'-') {
536 if (current != kRegexInvalidRange)
537 GetRange(current).next =
r;
538 if (start == kRegexInvalidRange)
547 GetRange(current).end = codepoint;
555 SizeType NewRange(
unsigned codepoint) {
556 Range*
r = ranges_.template Push<Range>();
557 r->start =
r->end = codepoint;
558 r->next = kRegexInvalidRange;
559 return rangeCount_++;
562 template <
typename InputStream>
563 bool CharacterEscape(DecodedStream<InputStream, Encoding>& ds,
unsigned* escapedCodepoint) {
565 switch (codepoint =
ds.Take()) {
580 *escapedCodepoint = codepoint;
return true;
581 case 'f': *escapedCodepoint = 0x000C;
return true;
582 case 'n': *escapedCodepoint = 0x000A;
return true;
583 case 'r': *escapedCodepoint = 0x000D;
return true;
584 case 't': *escapedCodepoint = 0x0009;
return true;
585 case 'v': *escapedCodepoint = 0x000B;
return true;
593 Stack<Allocator> states_;
594 Stack<Allocator> ranges_;
599 static const unsigned kInfinityQuantifier = ~0u;
610 typedef typename Encoding::Ch
Ch;
613 regex_(regex), allocator_(allocator), ownAllocator_(0),
614 state0_(allocator, 0), state1_(allocator, 0), stateSet_()
619 stateSet_ =
static_cast<unsigned*
>(allocator_->Malloc(GetStateSetSize()));
620 state0_.template Reserve<SizeType>(regex_.stateCount_);
621 state1_.template Reserve<SizeType>(regex_.stateCount_);
625 Allocator::Free(stateSet_);
629 template <
typename InputStream>
631 return SearchWithAnchoring(is,
true,
true);
639 template <
typename InputStream>
641 return SearchWithAnchoring(is, regex_.anchorBegin_, regex_.anchorEnd_);
650 typedef typename RegexType::State State;
651 typedef typename RegexType::Range Range;
653 template <
typename InputStream>
654 bool SearchWithAnchoring(InputStream& is,
bool anchorBegin,
bool anchorEnd) {
659 const size_t stateSetSize = GetStateSetSize();
660 std::memset(stateSet_, 0, stateSetSize);
662 bool matched = AddState(*current, regex_.root_);
664 while (!current->Empty() && (codepoint = ds.Take()) != 0) {
665 std::memset(stateSet_, 0, stateSetSize);
668 for (
const SizeType*
s = current->template Bottom<SizeType>();
s != current->template End<SizeType>(); ++
s) {
669 const State& sr = regex_.GetState(*
s);
670 if (sr.codepoint == codepoint ||
671 sr.codepoint == RegexType::kAnyCharacterClass ||
672 (sr.codepoint == RegexType::kRangeCharacterClass && MatchRange(sr.rangeStart, codepoint)))
674 matched = AddState(*next, sr.out) || matched;
675 if (!anchorEnd && matched)
679 AddState(*next, regex_.root_);
687 size_t GetStateSetSize()
const {
688 return (regex_.stateCount_ + 31) / 32 * 4;
692 bool AddState(Stack<Allocator>&
l,
SizeType index) {
695 const State&
s = regex_.GetState(index);
696 if (
s.out1 != kRegexInvalidState) {
697 bool matched = AddState(
l,
s.out);
698 return AddState(
l,
s.out1) || matched;
700 else if (!(stateSet_[index >> 5] & (1u << (index & 31)))) {
701 stateSet_[index >> 5] |= (1u << (index & 31));
702 *
l.template PushUnsafe<SizeType>() = index;
704 return s.out == kRegexInvalidState;
707 bool MatchRange(
SizeType rangeIndex,
unsigned codepoint)
const {
708 bool yes = (regex_.GetRange(rangeIndex).start & RegexType::kRangeNegationFlag) == 0;
709 while (rangeIndex != kRegexInvalidRange) {
710 const Range&
r = regex_.GetRange(rangeIndex);
711 if (codepoint >= (
r.start & ~RegexType::kRangeNegationFlag) && codepoint <=
r.end)
718 const RegexType& regex_;
721 Stack<Allocator> state0_;
722 Stack<Allocator> state1_;