#393, Bug 1135377 - Part 6: Support ignoreCase for BMP in RegExp with unicode flag. r=till, f=anba

This commit is contained in:
Tooru Fujisawa 2015-08-07 08:12:01 +09:00 committed by Cameron Kaiser
parent 45a4712b08
commit cf744e9d35
20 changed files with 4708 additions and 97 deletions

View File

@ -783,9 +783,10 @@ NativeRegExpMacroAssembler::CheckGreedyLoop(Label* on_tos_equals_current_positio
}
void
NativeRegExpMacroAssembler::CheckNotBackReference(int start_reg, Label* on_no_match)
NativeRegExpMacroAssembler::CheckNotBackReference(int start_reg, Label* on_no_match,
bool unicode)
{
JitSpew(SPEW_PREFIX "CheckNotBackReference(%d)", start_reg);
JitSpew(SPEW_PREFIX "CheckNotBackReferenceIgnoreCase(%d, %d)", start_reg, unicode);
Label fallthrough;
Label success;
@ -1034,8 +1035,13 @@ NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, Label
masm.passABIArg(current_character);
masm.passABIArg(current_position);
masm.passABIArg(temp1);
int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareStrings;
masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
if (!unicode) {
int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareStrings;
masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
} else {
int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareUCStrings;
masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
}
masm.storeCallResult(temp0);
masm.PopRegsInMask(volatileRegs);
@ -1047,7 +1053,9 @@ NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, Label
// PowerPC specific version, somewhat more efficient (fixes issue 308)
Register ppc0 = (temp1 == r6) ? r7 : r6;
Register ppc1 = (temp1 == r8) ? r9 : r8;
int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareStrings;
int (*fun)(const char16_t*, const char16_t*, size_t) = (unicode)
? CaseInsensitiveCompareUCStrings
: CaseInsensitiveCompareStrings ;
// This is lazy, but only incurs one extra x_subi.
masm.x_mflr(r0);

View File

@ -0,0 +1,50 @@
diff a/js/src/irregexp/NativeRegExpMacroAssembler.cpp b/js/src/irregexp/NativeRegExpMacroAssembler.cpp (rejected hunks)
@@ -714,19 +714,20 @@ NativeRegExpMacroAssembler::CheckNotBack
// Restore backtrack stack pointer.
masm.pop(backtrack_stack_pointer);
masm.bind(&fallthrough);
}
void
-NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, Label* on_no_match)
+NativeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, Label* on_no_match,
+ bool unicode)
{
- JitSpew(SPEW_PREFIX "CheckNotBackReferenceIgnoreCase(%d)", start_reg);
+ JitSpew(SPEW_PREFIX "CheckNotBackReferenceIgnoreCase(%d, %d)", start_reg, unicode);
Label fallthrough;
masm.loadPtr(register_location(start_reg), current_character); // Index of start of capture
masm.loadPtr(register_location(start_reg + 1), temp1); // Index of end of capture
masm.subPtr(current_character, temp1); // Length of capture.
// The length of a capture should not be negative. This can only happen
@@ -828,18 +829,23 @@ NativeRegExpMacroAssembler::CheckNotBack
// Parameters are
// Address byte_offset1 - Address captured substring's start.
// Address byte_offset2 - Address of current character position.
// size_t byte_length - length of capture in bytes(!)
masm.setupUnalignedABICall(temp0);
masm.passABIArg(current_character);
masm.passABIArg(current_position);
masm.passABIArg(temp1);
- int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareStrings;
- masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
+ if (!unicode) {
+ int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareStrings;
+ masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
+ } else {
+ int (*fun)(const char16_t*, const char16_t*, size_t) = CaseInsensitiveCompareUCStrings;
+ masm.callWithABI(JS_FUNC_TO_DATA_PTR(void*, fun));
+ }
masm.storeCallResult(temp0);
masm.PopRegsInMask(volatileRegs);
// Check if function returned non-zero for success or zero for failure.
masm.branchTest32(Assembler::Zero, temp0, temp0, BranchOrBacktrack(on_no_match));
// On success, increment position by length of capture.

View File

@ -104,7 +104,7 @@ class MOZ_STACK_CLASS NativeRegExpMacroAssembler : public RegExpMacroAssembler
void CheckGreedyLoop(jit::Label* on_tos_equals_current_position);
void CheckNotAtStart(jit::Label* on_not_at_start);
void CheckNotBackReference(int start_reg, jit::Label* on_no_match);
void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match);
void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match, bool unicode);
void CheckNotCharacter(unsigned c, jit::Label* on_not_equal);
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, jit::Label* on_not_equal);
void CheckNotCharacterAfterMinusAnd(char16_t c, char16_t minus, char16_t and_with,

View File

@ -90,7 +90,8 @@ V(CHECK_AT_START, 43, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 44, 8) /* bc8 pad24 addr32 */ \
V(CHECK_GREEDY, 45, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 46, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 47, 4) /* bc8 idx24 */
V(SET_CURRENT_POSITION_FROM_END, 47, 4) /* bc8 idx24 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 48, 8) /* bc8 reg_idx24 addr32 */
#define DECLARE_BYTECODES(name, code, length) \
static const int BC_##name = code;

View File

@ -82,11 +82,26 @@ static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRa
static const int kWordRanges[] = {
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
static const int kWordRangeCount = ArrayLength(kWordRanges);
// Word-character ranges for \w in a unicode regexp that also has the
// ignoreCase flag. Contains the same entries as kWordRanges plus two
// single-code-point ranges:
//   U+017F LATIN SMALL LETTER LONG S
//   U+212A KELVIN SIGN
// NOTE(review): presumably included because these two characters case-fold
// to 's' and 'k' respectively, which are word characters -- confirm against
// the spec text for \w with /iu.
// Entries appear to be laid out as [from, to + 1) pairs followed by a
// terminating 0x10000, like the other range tables here.
static const int kIgnoreCaseWordRanges[] = {
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
0x017F, 0x017F + 1, 0x212A, 0x212A + 1,
0x10000 };
// Number of ints in kIgnoreCaseWordRanges (including the trailing 0x10000).
static const int kIgnoreCaseWordCount = ArrayLength(kIgnoreCaseWordRanges);
static const int kWordAndSurrogateRanges[] = {
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
0x10000 };
static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
// Ranges for \W in a unicode regexp that also has the ignoreCase flag.
// Unlike kWordAndSurrogateRanges, this table is already written in negated
// form: it lists the BMP code points that are NOT [0-9A-Z_a-z] and not
// surrogates, and then additionally lists 'K', 'S', 'k' and 's' as
// single-code-point ranges.
// NOTE(review): 'K'/'k' and 'S'/'s' are presumably listed because they are
// case-fold equivalents of U+212A (KELVIN SIGN) and U+017F (LONG S), which
// are themselves non-word characters -- confirm against the spec.
// Entries appear to be [from, to + 1) pairs; the last pair ends at 0x10000
// and is followed by the terminating 0x10000, hence the value appears twice.
static const int kNegatedIgnoreCaseWordAndSurrogateRanges[] = {
0, '0', '9' + 1, 'A',
'K', 'K' + 1, 'S', 'S' + 1,
'Z' + 1, '_', '_' + 1, 'a',
'k', 'k' + 1, 's', 's' + 1,
'z' + 1, unicode::LeadSurrogateMin,
unicode::TrailSurrogateMax + 1, 0x10000,
0x10000 };
// Number of ints in the table above (including the trailing terminator).
static const int kNegatedIgnoreCaseWordAndSurrogateRangeCount =
ArrayLength(kNegatedIgnoreCaseWordAndSurrogateRanges);
static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
static const int kDigitRangeCount = ArrayLength(kDigitRanges);
static const int kDigitAndSurrogateRanges[] = {
@ -186,14 +201,29 @@ CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type,
// Add class escape, excluding surrogate pair range.
void
CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
CharacterRangeVector* ranges)
CharacterRangeVector* ranges, bool ignore_case)
{
switch (type) {
case 's':
case 'd':
return AddClassEscape(alloc, type, ranges);
break;
case 'S':
AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
break;
case 'w':
if (ignore_case)
AddClass(kIgnoreCaseWordRanges, kIgnoreCaseWordCount, ranges);
else
AddClassEscape(alloc, type, ranges);
break;
case 'W':
AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
if (ignore_case) {
AddClass(kNegatedIgnoreCaseWordAndSurrogateRanges,
kNegatedIgnoreCaseWordAndSurrogateRangeCount, ranges);
} else {
AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
}
break;
case 'D':
AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
@ -203,20 +233,39 @@ CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
}
}
// X-macro table: expands |macro(C, F)| once per entry, where C is a code
// point above 0xFF and F is the code point at or below 0xFF that it is
// paired with for case folding.
// Note that, despite the macro's name, not every F is ASCII: 0x00FF, 0x00DF
// and 0x00E5 are in the Latin-1 range but above 0x7F; only 0x0073 ('s') and
// 0x006B ('k') are ASCII.
#define FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(macro) \
/* LATIN CAPITAL LETTER Y WITH DIAERESIS */ \
macro(0x0178, 0x00FF) \
/* LATIN SMALL LETTER LONG S */ \
macro(0x017F, 0x0073) \
/* LATIN CAPITAL LETTER SHARP S */ \
macro(0x1E9E, 0x00DF) \
/* KELVIN SIGN */ \
macro(0x212A, 0x006B) \
/* ANGSTROM SIGN */ \
macro(0x212B, 0x00E5)
// We need to check for the following characters: 0x39c 0x3bc 0x178.
static inline bool
RangeContainsLatin1Equivalents(CharacterRange range)
RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)
{
// TODO(dcarney): this could be a lot more efficient.
/* TODO(dcarney): this could be a lot more efficient. */
if (unicode) {
#define CHECK_RANGE(C, F) \
if (range.Contains(C)) return true;
FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CHECK_RANGE)
#undef CHECK_RANGE
}
return range.Contains(0x39c) || range.Contains(0x3bc) || range.Contains(0x178);
}
static bool
RangesContainLatin1Equivalents(const CharacterRangeVector& ranges)
RangesContainLatin1Equivalents(const CharacterRangeVector& ranges, bool unicode)
{
for (size_t i = 0; i < ranges.length(); i++) {
// TODO(dcarney): this could be a lot more efficient.
if (RangeContainsLatin1Equivalents(ranges[i]))
if (RangeContainsLatin1Equivalents(ranges[i], unicode))
return true;
}
return false;
@ -229,27 +278,24 @@ static const size_t kEcma262UnCanonicalizeMaxWidth = 4;
static int
GetCaseIndependentLetters(char16_t character,
bool ascii_subject,
bool unicode,
const char16_t* choices,
size_t choices_length,
char16_t* letters)
{
const char16_t choices[] = {
character,
unicode::ToLowerCase(character),
unicode::ToUpperCase(character)
};
size_t count = 0;
for (size_t i = 0; i < ArrayLength(choices); i++) {
for (size_t i = 0; i < choices_length; i++) {
char16_t c = choices[i];
// The standard requires that non-ASCII characters cannot have ASCII
// character codes in their equivalence class, even though this
// situation occurs multiple times in the unicode tables.
static const unsigned kMaxAsciiCharCode = 127;
if (character > kMaxAsciiCharCode && c <= kMaxAsciiCharCode)
if (!unicode && character > kMaxAsciiCharCode && c <= kMaxAsciiCharCode)
continue;
// Skip characters that can't appear in one byte strings.
if (ascii_subject && c > kMaxOneByteCharCode)
if (!unicode && ascii_subject && c > kMaxOneByteCharCode)
continue;
// Watch for duplicates.
@ -269,10 +315,45 @@ GetCaseIndependentLetters(char16_t character,
return count;
}
// Convenience overload: builds the list of candidate case variants for
// |character| and hands it to the array-taking overload, which fills
// |letters| and returns how many entries it wrote.
//
// The candidate list depends on |unicode|:
//   - false: the character plus its lower-case and upper-case mappings;
//   - true:  the character plus its case folding and the three reverse
//            case-folding lookups.
static int
GetCaseIndependentLetters(char16_t character,
                          bool ascii_subject,
                          bool unicode,
                          char16_t* letters)
{
    if (!unicode) {
        const char16_t caseVariants[] = {
            character,
            unicode::ToLowerCase(character),
            unicode::ToUpperCase(character)
        };
        return GetCaseIndependentLetters(character, ascii_subject, unicode,
                                         caseVariants, ArrayLength(caseVariants), letters);
    }

    // NOTE(review): this list has five candidates; callers size |letters|
    // with kEcma262UnCanonicalizeMaxWidth, so this relies on the callee
    // dropping duplicates -- confirm the distinct count can never exceed it.
    const char16_t foldVariants[] = {
        character,
        unicode::FoldCase(character),
        unicode::ReverseFoldCase1(character),
        unicode::ReverseFoldCase2(character),
        unicode::ReverseFoldCase3(character),
    };
    return GetCaseIndependentLetters(character, ascii_subject, unicode,
                                     foldVariants, ArrayLength(foldVariants), letters);
}
static char16_t
ConvertNonLatin1ToLatin1(char16_t c)
ConvertNonLatin1ToLatin1(char16_t c, bool unicode)
{
MOZ_ASSERT(c > kMaxOneByteCharCode);
if (unicode) {
switch (c) {
#define CONVERT(C, F) case C: return F;
FOR_EACH_NON_ASCII_TO_ASCII_FOLDING(CONVERT)
#undef CONVERT
}
}
switch (c) {
// These are equivalent characters in unicode.
case 0x39c:
@ -287,12 +368,12 @@ ConvertNonLatin1ToLatin1(char16_t c)
}
void
CharacterRange::AddCaseEquivalents(bool is_ascii, CharacterRangeVector* ranges)
CharacterRange::AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges)
{
char16_t bottom = from();
char16_t top = to();
if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
if (is_ascii && !RangeContainsLatin1Equivalents(*this, unicode)) {
if (bottom > kMaxOneByteCharCode)
return;
if (top > kMaxOneByteCharCode)
@ -301,7 +382,7 @@ CharacterRange::AddCaseEquivalents(bool is_ascii, CharacterRangeVector* ranges)
for (char16_t c = bottom;; c++) {
char16_t chars[kEcma262UnCanonicalizeMaxWidth];
size_t length = GetCaseIndependentLetters(c, is_ascii, chars);
size_t length = GetCaseIndependentLetters(c, is_ascii, unicode, chars);
for (size_t i = 0; i < length; i++) {
char16_t other = chars[i];
@ -581,7 +662,7 @@ SeqRegExpNode::FillInBMInfo(int offset,
}
RegExpNode*
SeqRegExpNode::FilterASCII(int depth, bool ignore_case)
SeqRegExpNode::FilterASCII(int depth, bool ignore_case, bool unicode)
{
if (info()->replacement_calculated)
return replacement();
@ -591,13 +672,13 @@ SeqRegExpNode::FilterASCII(int depth, bool ignore_case)
MOZ_ASSERT(!info()->visited);
VisitMarker marker(info());
return FilterSuccessor(depth - 1, ignore_case);
return FilterSuccessor(depth - 1, ignore_case, unicode);
}
RegExpNode*
SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case)
SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case, bool unicode)
{
RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case, unicode);
if (next == nullptr)
return set_replacement(nullptr);
@ -740,7 +821,7 @@ TextNode::GreedyLoopTextLength()
}
RegExpNode*
TextNode::FilterASCII(int depth, bool ignore_case)
TextNode::FilterASCII(int depth, bool ignore_case, bool unicode)
{
if (info()->replacement_calculated)
return replacement();
@ -764,7 +845,7 @@ TextNode::FilterASCII(int depth, bool ignore_case)
// Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range.
char16_t converted = ConvertNonLatin1ToLatin1(c);
char16_t converted = ConvertNonLatin1ToLatin1(c, unicode);
if (converted == 0) {
// Character is outside Latin-1 completely
return set_replacement(nullptr);
@ -789,7 +870,7 @@ TextNode::FilterASCII(int depth, bool ignore_case)
ranges[0].to() >= kMaxOneByteCharCode)
{
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges))
if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
continue;
return set_replacement(nullptr);
}
@ -798,14 +879,14 @@ TextNode::FilterASCII(int depth, bool ignore_case)
ranges[0].from() > kMaxOneByteCharCode)
{
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges))
if (ignore_case && RangesContainLatin1Equivalents(ranges, unicode))
continue;
return set_replacement(nullptr);
}
}
}
}
return FilterSuccessor(depth - 1, ignore_case);
return FilterSuccessor(depth - 1, ignore_case, unicode);
}
void
@ -823,7 +904,7 @@ TextNode::CalculateOffsets()
}
}
void TextNode::MakeCaseIndependent(bool is_ascii)
void TextNode::MakeCaseIndependent(bool is_ascii, bool unicode)
{
int element_count = elements().length();
for (int i = 0; i < element_count; i++) {
@ -839,7 +920,7 @@ void TextNode::MakeCaseIndependent(bool is_ascii)
CharacterRangeVector& ranges = cc->ranges(alloc());
int range_count = ranges.length();
for (int j = 0; j < range_count; j++)
ranges[j].AddCaseEquivalents(is_ascii, &ranges);
ranges[j].AddCaseEquivalents(is_ascii, unicode, &ranges);
}
}
}
@ -988,7 +1069,7 @@ ChoiceNode::FillInBMInfo(int offset,
}
RegExpNode*
ChoiceNode::FilterASCII(int depth, bool ignore_case)
ChoiceNode::FilterASCII(int depth, bool ignore_case, bool unicode)
{
if (info()->replacement_calculated)
return replacement();
@ -1012,7 +1093,7 @@ ChoiceNode::FilterASCII(int depth, bool ignore_case)
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives()[i];
RegExpNode* replacement =
alternative.node()->FilterASCII(depth - 1, ignore_case);
alternative.node()->FilterASCII(depth - 1, ignore_case, unicode);
MOZ_ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
if (replacement != nullptr) {
alternatives()[i].set_node(replacement);
@ -1033,7 +1114,7 @@ ChoiceNode::FilterASCII(int depth, bool ignore_case)
new_alternatives.reserve(surviving);
for (int i = 0; i < choice_count; i++) {
RegExpNode* replacement =
alternatives()[i].node()->FilterASCII(depth - 1, ignore_case);
alternatives()[i].node()->FilterASCII(depth - 1, ignore_case, unicode);
if (replacement != nullptr) {
alternatives()[i].set_node(replacement);
AutoEnterOOMUnsafeRegion oomUnsafe;
@ -1090,7 +1171,7 @@ NegativeLookaheadChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
}
RegExpNode*
NegativeLookaheadChoiceNode::FilterASCII(int depth, bool ignore_case)
NegativeLookaheadChoiceNode::FilterASCII(int depth, bool ignore_case, bool unicode)
{
if (info()->replacement_calculated)
return replacement();
@ -1104,14 +1185,14 @@ NegativeLookaheadChoiceNode::FilterASCII(int depth, bool ignore_case)
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = alternatives()[1].node();
RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case, unicode);
if (replacement == nullptr)
return set_replacement(nullptr);
alternatives()[1].set_node(replacement);
RegExpNode* neg_node = alternatives()[0].node();
RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case, unicode);
// If the negative lookahead is always going to fail then
// we don't need to check it.
@ -1192,7 +1273,7 @@ LoopChoiceNode::FillInBMInfo(int offset,
}
RegExpNode*
LoopChoiceNode::FilterASCII(int depth, bool ignore_case)
LoopChoiceNode::FilterASCII(int depth, bool ignore_case, bool unicode)
{
if (info()->replacement_calculated)
return replacement();
@ -1205,7 +1286,7 @@ LoopChoiceNode::FilterASCII(int depth, bool ignore_case)
VisitMarker marker(info());
RegExpNode* continue_replacement =
continue_node_->FilterASCII(depth - 1, ignore_case);
continue_node_->FilterASCII(depth - 1, ignore_case, unicode);
// If we can't continue after the loop then there is no sense in doing the
// loop.
@ -1213,7 +1294,7 @@ LoopChoiceNode::FilterASCII(int depth, bool ignore_case)
return set_replacement(nullptr);
}
return ChoiceNode::FilterASCII(depth - 1, ignore_case);
return ChoiceNode::FilterASCII(depth - 1, ignore_case, unicode);
}
// -------------------------------------------------------------------
@ -1242,7 +1323,7 @@ void
Analysis::VisitText(TextNode* that)
{
if (ignore_case_)
that->MakeCaseIndependent(is_ascii_);
that->MakeCaseIndependent(is_ascii_, unicode_);
EnsureAnalyzed(that->on_success());
if (!has_failed()) {
that->CalculateOffsets();
@ -1534,7 +1615,7 @@ class irregexp::RegExpCompiler
{
public:
RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
bool ignore_case, bool is_ascii, bool match_only);
bool ignore_case, bool is_ascii, bool match_only, bool unicode);
int AllocateRegister() {
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
@ -1571,6 +1652,7 @@ class irregexp::RegExpCompiler
inline bool ignore_case() { return ignore_case_; }
inline bool ascii() { return ascii_; }
inline bool unicode() { return unicode_; }
FrequencyCollator* frequency_collator() { return &frequency_collator_; }
int current_expansion_factor() { return current_expansion_factor_; }
@ -1592,6 +1674,7 @@ class irregexp::RegExpCompiler
bool ignore_case_;
bool ascii_;
bool match_only_;
bool unicode_;
bool reg_exp_too_big_;
int current_expansion_factor_;
FrequencyCollator frequency_collator_;
@ -1614,12 +1697,13 @@ class RecursionCheck
// Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(JSContext* cx, LifoAlloc* alloc, int capture_count,
bool ignore_case, bool ascii, bool match_only)
bool ignore_case, bool ascii, bool match_only, bool unicode)
: next_register_(2 * (capture_count + 1)),
recursion_depth_(0),
ignore_case_(ignore_case),
ascii_(ascii),
match_only_(match_only),
unicode_(unicode),
reg_exp_too_big_(false),
current_expansion_factor_(1),
frequency_collator_(),
@ -1692,7 +1776,8 @@ IsNativeRegExpEnabled(JSContext* cx)
RegExpCode
irregexp::CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData* data,
HandleLinearString sample, bool is_global, bool ignore_case,
bool is_ascii, bool match_only, bool force_bytecode, bool sticky)
bool is_ascii, bool match_only, bool force_bytecode, bool sticky,
bool unicode)
{
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
JS_ReportError(cx, "regexp too big");
@ -1700,7 +1785,8 @@ irregexp::CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData*
}
LifoAlloc& alloc = cx->tempLifoAlloc();
RegExpCompiler compiler(cx, &alloc, data->capture_count, ignore_case, is_ascii, match_only);
RegExpCompiler compiler(cx, &alloc, data->capture_count, ignore_case, is_ascii, match_only,
unicode);
// Sample some characters from the middle of the string.
if (sample->hasLatin1Chars()) {
@ -1746,18 +1832,18 @@ irregexp::CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData*
}
}
if (is_ascii) {
node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case, unicode);
}
}
if (node == nullptr)
node = alloc.newInfallible<EndNode>(&alloc, EndNode::BACKTRACK);
Analysis analysis(cx, ignore_case, is_ascii);
Analysis analysis(cx, ignore_case, is_ascii, unicode);
analysis.EnsureAnalyzed(node);
if (analysis.has_failed()) {
JS_ReportError(cx, analysis.errorMessage());
@ -3597,7 +3683,7 @@ EmitAtomNonLetter(RegExpCompiler* compiler,
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool ascii = compiler->ascii();
char16_t chars[kEcma262UnCanonicalizeMaxWidth];
int length = GetCaseIndependentLetters(c, ascii, chars);
int length = GetCaseIndependentLetters(c, ascii, compiler->unicode(), chars);
if (length < 1) {
// This can't match. Must be an ASCII subject and a non-ASCII character.
// We do not need to do anything since the ASCII pass already handled this.
@ -3673,7 +3759,7 @@ EmitAtomLetter(RegExpCompiler* compiler,
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool ascii = compiler->ascii();
char16_t chars[kEcma262UnCanonicalizeMaxWidth];
int length = GetCaseIndependentLetters(c, ascii, chars);
int length = GetCaseIndependentLetters(c, ascii, compiler->unicode(), chars);
if (length <= 1) return false;
// We may not need to check against the end of the input string
// if this character lies before a character that matched.
@ -4538,7 +4624,8 @@ BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace)
MOZ_ASSERT(start_reg_ + 1 == end_reg_);
if (compiler->ignore_case()) {
assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
trace->backtrack());
trace->backtrack(),
compiler->unicode());
} else {
assembler->CheckNotBackReference(start_reg_, trace->backtrack());
}
@ -4684,6 +4771,7 @@ TextNode::FillInBMInfo(int initial_offset,
char16_t chars[kEcma262UnCanonicalizeMaxWidth];
int length = GetCaseIndependentLetters(character,
bm->max_char() == kMaxOneByteCharCode,
bm->compiler()->unicode(),
chars);
for (int j = 0; j < length; j++)
bm->Set(offset, chars[j]);
@ -4775,7 +4863,8 @@ TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
}
if (compiler->ignore_case()) {
char16_t chars[kEcma262UnCanonicalizeMaxWidth];
size_t length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
size_t length = GetCaseIndependentLetters(c, compiler->ascii(),
compiler->unicode(), chars);
MOZ_ASSERT(length != 0); // Can only happen if c > char_mask (see above).
if (length == 1) {
// This letter has no case equivalents, so it's nice and simple

View File

@ -88,7 +88,8 @@ struct RegExpCode
RegExpCode
CompilePattern(JSContext* cx, RegExpShared* shared, RegExpCompileData* data,
HandleLinearString sample, bool is_global, bool ignore_case,
bool is_ascii, bool match_only, bool force_bytecode, bool sticky);
bool is_ascii, bool match_only, bool force_bytecode, bool sticky,
bool unicode);
// Note: this may return RegExpRunStatus_Error if an interrupt was requested
// while the code was executing.
@ -145,7 +146,7 @@ class CharacterRange
static void AddClassEscape(LifoAlloc* alloc, char16_t type, CharacterRangeVector* ranges);
static void AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
CharacterRangeVector* ranges);
CharacterRangeVector* ranges, bool ignoreCase);
static inline CharacterRange Singleton(char16_t value) {
return CharacterRange(value, value);
@ -165,7 +166,7 @@ class CharacterRange
bool is_valid() { return from_ <= to_; }
bool IsEverything(char16_t max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
void AddCaseEquivalents(bool is_ascii, CharacterRangeVector* ranges);
void AddCaseEquivalents(bool is_ascii, bool unicode, CharacterRangeVector* ranges);
static void Split(const LifoAlloc* alloc,
CharacterRangeVector base,
@ -518,7 +519,7 @@ class RegExpNode
// If we know that the input is ASCII then there are some nodes that can
// never match. This method returns a node that can be substituted for
// itself, or nullptr if the node can never match.
virtual RegExpNode* FilterASCII(int depth, bool ignore_case) { return this; }
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode) { return this; }
// Helper for FilterASCII.
RegExpNode* replacement() {
@ -625,14 +626,14 @@ class SeqRegExpNode : public RegExpNode
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode);
virtual bool FillInBMInfo(int offset,
int budget,
BoyerMooreLookahead* bm,
bool not_at_start);
protected:
RegExpNode* FilterSuccessor(int depth, bool ignore_case);
RegExpNode* FilterSuccessor(int depth, bool ignore_case, bool unicode);
private:
RegExpNode* on_success_;
@ -750,7 +751,7 @@ class TextNode : public SeqRegExpNode
int characters_filled_in,
bool not_at_start);
TextElementVector& elements() { return *elements_; }
void MakeCaseIndependent(bool is_ascii);
void MakeCaseIndependent(bool is_ascii, bool unicode);
virtual int GreedyLoopTextLength();
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler);
@ -759,7 +760,7 @@ class TextNode : public SeqRegExpNode
BoyerMooreLookahead* bm,
bool not_at_start);
void CalculateOffsets();
virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode);
private:
enum TextEmitPassType {
@ -1013,7 +1014,7 @@ class ChoiceNode : public RegExpNode
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode);
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@ -1066,7 +1067,7 @@ class NegativeLookaheadChoiceNode : public ChoiceNode
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode);
};
class LoopChoiceNode : public ChoiceNode
@ -1095,7 +1096,7 @@ class LoopChoiceNode : public ChoiceNode
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual void Accept(NodeVisitor* visitor);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
virtual RegExpNode* FilterASCII(int depth, bool ignore_case, bool unicode);
private:
// AddAlternative is made private for loop nodes because alternatives
@ -1466,10 +1467,11 @@ class NodeVisitor
class Analysis : public NodeVisitor
{
public:
Analysis(JSContext* cx, bool ignore_case, bool is_ascii)
Analysis(JSContext* cx, bool ignore_case, bool is_ascii, bool unicode)
: cx(cx),
ignore_case_(ignore_case),
is_ascii_(is_ascii),
unicode_(unicode),
error_message_(nullptr)
{}
@ -1494,6 +1496,7 @@ class Analysis : public NodeVisitor
JSContext* cx;
bool ignore_case_;
bool is_ascii_;
bool unicode_;
const char* error_message_;
Analysis(Analysis&) = delete;

View File

@ -442,6 +442,27 @@ irregexp::InterpretCode(JSContext* cx, const uint8_t* byteCode, const CharT* cha
}
break;
}
// Case-insensitive back reference check that compares via
// CaseInsensitiveCompareUCStrings. The instruction's register operand names
// a pair of consecutive registers; the 32-bit operand at pc + 4 is the
// address to jump to when the check fails.
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) {
// First register of the pair is the capture's start index; the length is
// the difference between the second register and the first.
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
// Negative start or non-positive length: nothing to compare, so continue
// with the next instruction without consuming any input.
if (from < 0 || len <= 0) {
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_LENGTH;
break;
}
// Not enough input left to hold |len| characters: take the failure branch.
if (current + len > length) {
pc = byteCode + Load32Aligned(pc + 4);
break;
}
// The helper takes its length argument in bytes, not characters.
if (CaseInsensitiveCompareUCStrings(chars + from, chars + current,
len * sizeof(CharT)))
{
// Match: consume the compared characters and fall through to the next
// instruction.
current += len;
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_LENGTH;
} else {
// Mismatch: take the failure branch.
pc = byteCode + Load32Aligned(pc + 4);
}
break;
}
BYTECODE(CHECK_AT_START)
if (current == 0)
pc = byteCode + Load32Aligned(pc + 4);

View File

@ -65,6 +65,38 @@ template int
irregexp::CaseInsensitiveCompareStrings(const char16_t* substring1, const char16_t* substring2,
size_t byteLength);
// Compares two equally long character runs for equality under
// unicode::FoldCase.
//
// |byteLength| is the length of each run in BYTES and must be a multiple of
// sizeof(CharT). Returns 1 when every pair of characters is either identical
// or has the same case folding, and 0 as soon as a pair differs.
template <typename CharT>
int
irregexp::CaseInsensitiveCompareUCStrings(const CharT* substring1, const CharT* substring2,
                                          size_t byteLength)
{
    MOZ_ASSERT(byteLength % sizeof(CharT) == 0);

    const size_t charCount = byteLength / sizeof(CharT);
    const CharT* const stop = substring1 + charCount;
    const CharT* lhs = substring1;
    const CharT* rhs = substring2;
    for (; lhs != stop; ++lhs, ++rhs) {
        const char16_t left = *lhs;
        const char16_t right = *rhs;

        // Fast path: identical code units need no folding.
        if (left == right)
            continue;

        const char16_t foldedLeft = unicode::FoldCase(left);
        const char16_t foldedRight = unicode::FoldCase(right);
        if (foldedLeft != foldedRight)
            return 0;
    }
    return 1;
}
template int
irregexp::CaseInsensitiveCompareUCStrings(const Latin1Char* substring1,
const Latin1Char* substring2,
size_t byteLength);
template int
irregexp::CaseInsensitiveCompareUCStrings(const char16_t* substring1,
const char16_t* substring2,
size_t byteLength);
InterpretedRegExpMacroAssembler::InterpretedRegExpMacroAssembler(LifoAlloc* alloc, RegExpShared* shared,
size_t numSavedRegisters)
: RegExpMacroAssembler(*alloc, shared, numSavedRegisters),
@ -210,11 +242,16 @@ InterpretedRegExpMacroAssembler::CheckNotBackReference(int start_reg, jit::Label
}
void
InterpretedRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match)
InterpretedRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(int start_reg,
jit::Label* on_no_match,
bool unicode)
{
MOZ_ASSERT(start_reg >= 0);
MOZ_ASSERT(start_reg <= kMaxRegister);
Emit(BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg);
if (unicode)
Emit(BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE, start_reg);
else
Emit(BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg);
EmitOrLink(on_no_match);
}

View File

@ -112,7 +112,8 @@ class MOZ_STACK_CLASS RegExpMacroAssembler
virtual void CheckGreedyLoop(jit::Label* on_tos_equals_current_position) = 0;
virtual void CheckNotAtStart(jit::Label* on_not_at_start) = 0;
virtual void CheckNotBackReference(int start_reg, jit::Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match,
bool unicode) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
@ -221,6 +222,11 @@ template <typename CharT>
int
CaseInsensitiveCompareStrings(const CharT* substring1, const CharT* substring2, size_t byteLength);
template <typename CharT>
int
CaseInsensitiveCompareUCStrings(const CharT* substring1, const CharT* substring2,
size_t byteLength);
class MOZ_STACK_CLASS InterpretedRegExpMacroAssembler : public RegExpMacroAssembler
{
public:
@ -241,7 +247,7 @@ class MOZ_STACK_CLASS InterpretedRegExpMacroAssembler : public RegExpMacroAssemb
void CheckGreedyLoop(jit::Label* on_tos_equals_current_position);
void CheckNotAtStart(jit::Label* on_not_at_start);
void CheckNotBackReference(int start_reg, jit::Label* on_no_match);
void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match);
void CheckNotBackReferenceIgnoreCase(int start_reg, jit::Label* on_no_match, bool unicode);
void CheckNotCharacter(unsigned c, jit::Label* on_not_equal);
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, jit::Label* on_not_equal);
void CheckNotCharacterAfterMinusAnd(char16_t c, char16_t minus, char16_t and_with,

View File

@ -206,7 +206,7 @@ RegExpBuilder::AddQuantifierToAtom(int min, int max,
template <typename CharT>
RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
const CharT* chars, const CharT* end, bool multiline_mode,
bool unicode)
bool unicode, bool ignore_case)
: ts(ts),
alloc(alloc),
captures_(nullptr),
@ -217,6 +217,7 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
has_more_(true),
multiline_(multiline_mode),
unicode_(unicode),
ignore_case_(ignore_case),
simple_(false),
contains_anchor_(false),
is_scanned_for_captures_(false)
@ -609,10 +610,11 @@ AddCharOrEscapeUnicode(LifoAlloc* alloc,
CharacterRangeVector* trail_ranges,
WideCharRangeVector* wide_ranges,
char16_t char_class,
widechar c)
widechar c,
bool ignore_case)
{
if (char_class != kNoCharClass) {
CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges);
CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges, ignore_case);
switch (char_class) {
case 'S':
case 'W':
@ -896,7 +898,7 @@ RegExpParser<CharT>::ParseCharacterClass()
} else if (current() == ']') {
if (unicode_) {
AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
char_class, first);
char_class, first, ignore_case_);
} else {
AddCharOrEscape(alloc, ranges, char_class, first);
}
@ -926,7 +928,7 @@ RegExpParser<CharT>::ParseCharacterClass()
} else {
if (unicode_) {
AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges,
char_class, first);
char_class, first, ignore_case_);
} else {
AddCharOrEscape(alloc, ranges, char_class, first);
}
@ -1228,13 +1230,14 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
}
RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class)
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class, bool ignore_case)
{
CharacterRangeVector* ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
CharacterRangeVector* lead_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
CharacterRangeVector* trail_ranges = alloc->newInfallible<CharacterRangeVector>(*alloc);
WideCharRangeVector* wide_ranges = alloc->newInfallible<WideCharRangeVector>(*alloc);
AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0);
AddCharOrEscapeUnicode(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, char_class, 0,
ignore_case);
return UnicodeRangesAtom(alloc, ranges, lead_ranges, trail_ranges, wide_ranges, false);
}
@ -1406,7 +1409,8 @@ RegExpParser<CharT>::ParseDisjunction()
case 'D': case 'S': case 'W':
if (unicode_) {
Advance();
builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current()));
builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current(),
ignore_case_));
Advance();
break;
}
@ -1416,7 +1420,10 @@ RegExpParser<CharT>::ParseDisjunction()
Advance(2);
CharacterRangeVector* ranges =
alloc->newInfallible<CharacterRangeVector>(*alloc);
CharacterRange::AddClassEscape(alloc, c, ranges);
if (unicode_)
CharacterRange::AddClassEscapeUnicode(alloc, c, ranges, ignore_case_);
else
CharacterRange::AddClassEscape(alloc, c, ranges);
RegExpTree* atom = alloc->newInfallible<RegExpCharacterClass>(ranges, false);
builder->AddAtom(atom);
break;
@ -1628,7 +1635,8 @@ template class irregexp::RegExpParser<char16_t>;
template <typename CharT>
static bool
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, size_t length,
bool multiline, bool match_only, bool unicode, RegExpCompileData* data)
bool multiline, bool match_only, bool unicode, bool ignore_case,
RegExpCompileData* data)
{
if (match_only) {
// Try to strip a leading '.*' from the RegExp, but only if it is not
@ -1651,7 +1659,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
}
}
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, multiline, unicode, ignore_case);
data->tree = parser.ParsePattern();
if (!data->tree)
return false;
@ -1664,15 +1672,15 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
bool
irregexp::ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool multiline, bool match_only, bool unicode,
bool multiline, bool match_only, bool unicode, bool ignore_case,
RegExpCompileData* data)
{
JS::AutoCheckCannotGC nogc;
return str->hasLatin1Chars()
? ::ParsePattern(ts, alloc, str->latin1Chars(nogc), str->length(),
multiline, match_only, unicode, data)
multiline, match_only, unicode, ignore_case, data)
: ::ParsePattern(ts, alloc, str->twoByteChars(nogc), str->length(),
multiline, match_only, unicode, data);
multiline, match_only, unicode, ignore_case, data);
}
template <typename CharT>
@ -1682,7 +1690,7 @@ ParsePatternSyntax(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* cha
{
LifoAllocScope scope(&alloc);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode);
RegExpParser<CharT> parser(ts, &alloc, chars, chars + length, false, unicode, false);
return parser.ParsePattern() != nullptr;
}

View File

@ -43,7 +43,7 @@ namespace irregexp {
bool
ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, JSAtom* str,
bool multiline, bool match_only, bool unicode,
bool multiline, bool match_only, bool unicode, bool ignore_case,
RegExpCompileData* data);
bool
@ -175,7 +175,8 @@ class RegExpParser
{
public:
RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
const CharT* chars, const CharT* end, bool multiline_mode, bool unicode);
const CharT* chars, const CharT* end, bool multiline_mode, bool unicode,
bool ignore_case);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
@ -296,6 +297,7 @@ class RegExpParser
bool has_more_;
bool multiline_;
bool unicode_;
bool ignore_case_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;

View File

@ -0,0 +1,45 @@
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag with non-ascii to ascii map.";

print(BUGNUMBER + ": " + summary);

// Every case below expects the pattern to match the entire one-character
// input, so the expected exec() result is always [input].
function checkWholeMatch(re, input) {
    assertEqArray(re.exec(input), [input]);
}

// LATIN CAPITAL LETTER Y WITH DIAERESIS
checkWholeMatch(/\u0178/iu, "\u00FF");
checkWholeMatch(/\u00FF/iu, "\u0178");

// LATIN SMALL LETTER LONG S
checkWholeMatch(/\u017F/iu, "S");
checkWholeMatch(/\u017F/iu, "s");
checkWholeMatch(/S/iu, "\u017F");
checkWholeMatch(/s/iu, "\u017F");

// LATIN CAPITAL LETTER SHARP S
checkWholeMatch(/\u1E9E/iu, "\u00DF");
checkWholeMatch(/\u00DF/iu, "\u1E9E");

// KELVIN SIGN
checkWholeMatch(/\u212A/iu, "K");
checkWholeMatch(/\u212A/iu, "k");
checkWholeMatch(/K/iu, "\u212A");
checkWholeMatch(/k/iu, "\u212A");

// ANGSTROM SIGN
checkWholeMatch(/\u212B/iu, "\u00E5");
checkWholeMatch(/\u00E5/iu, "\u212B");

if (typeof reportCompare === "function")
    reportCompare(true, true);

View File

@ -0,0 +1,39 @@
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag with character class escape.";

print(BUGNUMBER + ": " + summary);

// With /iu a character class matches when any member is case-equivalent to
// the input character. U+017F and U+212A are not in [A-Za-z0-9_] but are
// case-equivalent to "s" and "k", so both \w and \W are expected to match
// the ASCII letters and the non-ASCII code point in each group below.

// LATIN SMALL LETTER LONG S
assertEqArray(/\w/iu.exec("S"),
              ["S"]);
assertEqArray(/\w/iu.exec("s"),
              ["s"]);
assertEqArray(/\w/iu.exec("\u017F"),
              ["\u017F"]);

assertEqArray(/\W/iu.exec("S"),
              ["S"]);
assertEqArray(/\W/iu.exec("s"),
              ["s"]);
assertEqArray(/\W/iu.exec("\u017F"),
              ["\u017F"]);

// KELVIN SIGN
// The uppercase "K" case mirrors the "S"/"s" pair above; previously "k" was
// asserted twice and "K" was never exercised.
assertEqArray(/\w/iu.exec("K"),
              ["K"]);
assertEqArray(/\w/iu.exec("k"),
              ["k"]);
assertEqArray(/\w/iu.exec("\u212A"),
              ["\u212A"]);

assertEqArray(/\W/iu.exec("K"),
              ["K"]);
assertEqArray(/\W/iu.exec("k"),
              ["k"]);
assertEqArray(/\W/iu.exec("\u212A"),
              ["\u212A"]);

if (typeof reportCompare === "function")
    reportCompare(true, true);

View File

@ -0,0 +1,19 @@
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag with negated character class.";

print(BUGNUMBER + ": " + summary);

// A negated class of either case of the letter must reject both cases.
for (var input of ["A", "a"]) {
    for (var re of [/[^A]/iu, /[^a]/iu])
        assertEq(re.exec(input), null);
}

// An unrelated letter is still accepted by the negated class.
assertEqArray(/[^A]/iu.exec("b"), ["b"]);

if (typeof reportCompare === "function")
    reportCompare(true, true);

File diff suppressed because it is too large Load Diff

1414
js/src/vm/CaseFolding.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -519,7 +519,7 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu
/* Parse the pattern. */
irregexp::RegExpCompileData data;
if (!irregexp::ParsePattern(dummyTokenStream, cx->tempLifoAlloc(), pattern,
multiline(), mode == MatchOnly, unicode(), &data))
multiline(), mode == MatchOnly, unicode(), ignoreCase(), &data))
{
return false;
}
@ -532,7 +532,7 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu
input->hasLatin1Chars(),
mode == MatchOnly,
force == ForceByteCode,
sticky());
sticky(), unicode());
if (code.empty())
return false;

View File

@ -772,4 +772,439 @@ const uint8_t unicode::index2[] = {
5, 5, 5, 0, 0, 0,
};
const FoldingInfo unicode::js_foldinfo[] = {
{0, 0, 0, 0},
{32, 0, 0, 0},
{32, 8415, 0, 0},
{32, 300, 0, 0},
{0, 65504, 0, 0},
{0, 65504, 8383, 0},
{0, 65504, 268, 0},
{775, 743, 0, 0},
{32, 8294, 0, 0},
{0, 7615, 0, 0},
{0, 65504, 8262, 0},
{0, 121, 0, 0},
{1, 0, 0, 0},
{0, 65535, 0, 0},
{65415, 0, 0, 0},
{65268, 65236, 0, 0},
{0, 195, 0, 0},
{210, 0, 0, 0},
{206, 0, 0, 0},
{205, 0, 0, 0},
{79, 0, 0, 0},
{202, 0, 0, 0},
{203, 0, 0, 0},
{207, 0, 0, 0},
{0, 97, 0, 0},
{211, 0, 0, 0},
{209, 0, 0, 0},
{0, 163, 0, 0},
{213, 0, 0, 0},
{0, 130, 0, 0},
{214, 0, 0, 0},
{218, 0, 0, 0},
{217, 0, 0, 0},
{219, 0, 0, 0},
{0, 56, 0, 0},
{2, 1, 0, 0},
{1, 65535, 0, 0},
{0, 65534, 65535, 0},
{0, 65457, 0, 0},
{65439, 0, 0, 0},
{65480, 0, 0, 0},
{65406, 0, 0, 0},
{10795, 0, 0, 0},
{65373, 0, 0, 0},
{10792, 0, 0, 0},
{0, 10815, 0, 0},
{65341, 0, 0, 0},
{69, 0, 0, 0},
{71, 0, 0, 0},
{0, 10783, 0, 0},
{0, 10780, 0, 0},
{0, 10782, 0, 0},
{0, 65326, 0, 0},
{0, 65330, 0, 0},
{0, 65331, 0, 0},
{0, 65334, 0, 0},
{0, 65333, 0, 0},
{0, 42319, 0, 0},
{0, 42315, 0, 0},
{0, 65329, 0, 0},
{0, 42280, 0, 0},
{0, 42308, 0, 0},
{0, 65327, 0, 0},
{0, 65325, 0, 0},
{0, 10743, 0, 0},
{0, 42305, 0, 0},
{0, 10749, 0, 0},
{0, 65323, 0, 0},
{0, 65322, 0, 0},
{0, 10727, 0, 0},
{0, 65318, 0, 0},
{0, 42282, 0, 0},
{0, 65467, 0, 0},
{0, 65319, 0, 0},
{0, 65465, 0, 0},
{0, 65317, 0, 0},
{0, 42261, 0, 0},
{0, 42258, 0, 0},
{116, 84, 7289, 0},
{116, 0, 0, 0},
{38, 0, 0, 0},
{37, 0, 0, 0},
{64, 0, 0, 0},
{63, 0, 0, 0},
{32, 62, 0, 0},
{32, 96, 0, 0},
{32, 57, 92, 0},
{32, 65452, 7205, 0},
{32, 86, 0, 0},
{32, 64793, 0, 0},
{32, 54, 0, 0},
{32, 80, 0, 0},
{32, 31, 0, 0},
{32, 47, 0, 0},
{32, 7549, 0, 0},
{0, 65498, 0, 0},
{0, 65499, 0, 0},
{0, 65504, 30, 0},
{0, 65504, 64, 0},
{0, 65504, 25, 60},
{0, 65420, 65504, 7173},
{0, 65504, 54, 0},
{0, 64761, 65504, 0},
{0, 65504, 22, 0},
{0, 65504, 48, 0},
{1, 65505, 0, 0},
{0, 65504, 65535, 0},
{0, 65504, 15, 0},
{0, 65504, 7517, 0},
{0, 65472, 0, 0},
{0, 65473, 0, 0},
{8, 0, 0, 0},
{65506, 65474, 0, 0},
{65511, 65479, 35, 0},
{65521, 65489, 0, 0},
{65514, 65482, 0, 0},
{0, 65528, 0, 0},
{65482, 65450, 0, 0},
{65488, 65456, 0, 0},
{0, 7, 0, 0},
{0, 65420, 0, 0},
{65476, 65444, 65501, 0},
{65472, 65440, 0, 0},
{65529, 0, 0, 0},
{80, 0, 0, 0},
{0, 65456, 0, 0},
{15, 0, 0, 0},
{0, 65521, 0, 0},
{48, 0, 0, 0},
{0, 65488, 0, 0},
{7264, 0, 0, 0},
{0, 38864, 0, 0},
{0, 8, 0, 0},
{65528, 0, 0, 0},
{0, 35332, 0, 0},
{0, 3814, 0, 0},
{1, 59, 0, 0},
{0, 65535, 58, 0},
{65478, 65477, 0, 0},
{57921, 0, 0, 0},
{0, 74, 0, 0},
{0, 86, 0, 0},
{0, 100, 0, 0},
{0, 128, 0, 0},
{0, 112, 0, 0},
{0, 126, 0, 0},
{0, 9, 0, 0},
{65462, 0, 0, 0},
{65527, 0, 0, 0},
{58363, 58247, 58331, 0},
{65450, 0, 0, 0},
{65436, 0, 0, 0},
{65424, 0, 0, 0},
{65408, 0, 0, 0},
{65410, 0, 0, 0},
{58019, 57987, 0, 0},
{57153, 57121, 0, 0},
{57274, 57242, 0, 0},
{28, 0, 0, 0},
{0, 65508, 0, 0},
{16, 0, 0, 0},
{0, 65520, 0, 0},
{26, 0, 0, 0},
{0, 65510, 0, 0},
{54793, 0, 0, 0},
{61722, 0, 0, 0},
{54809, 0, 0, 0},
{0, 54741, 0, 0},
{0, 54744, 0, 0},
{54756, 0, 0, 0},
{54787, 0, 0, 0},
{54753, 0, 0, 0},
{54754, 0, 0, 0},
{54721, 0, 0, 0},
{0, 58272, 0, 0},
{30204, 0, 0, 0},
{23256, 0, 0, 0},
{23228, 0, 0, 0},
{23217, 0, 0, 0},
{23221, 0, 0, 0},
{23231, 0, 0, 0},
{23278, 0, 0, 0},
{23254, 0, 0, 0},
{23275, 0, 0, 0},
{928, 0, 0, 0},
{0, 64608, 0, 0},
{26672, 0, 0, 0},
};
const uint8_t unicode::folding_index1[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 22, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 23, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0, 26, 27, 28, 26, 29, 30,
31, 32, 0, 0, 0, 0, 33, 34, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 36, 37, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 39, 26, 40,
41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42,
43, 0, 44, 45, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 47, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 50, 0, 0,
};
const uint8_t unicode::folding_index2[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
1, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 8,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 1, 1, 1, 1, 1, 9, 4, 4, 4, 4, 4, 10, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4,
4, 4, 4, 11, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 0,
12, 13, 12, 13, 12, 13, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12,
13, 12, 13, 12, 13, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 14, 12,
13, 12, 13, 12, 13, 15, 16, 17, 12, 13, 12, 13, 18, 12, 13, 19, 19, 12,
13, 0, 20, 21, 22, 12, 13, 19, 23, 24, 25, 26, 12, 13, 27, 0, 25, 28,
29, 30, 12, 13, 12, 13, 12, 13, 31, 12, 13, 31, 0, 0, 12, 13, 31, 12,
13, 32, 32, 12, 13, 12, 13, 33, 12, 13, 0, 0, 12, 13, 0, 34, 0, 0,
0, 0, 35, 36, 37, 35, 36, 37, 35, 36, 37, 12, 13, 12, 13, 12, 13, 12,
13, 12, 13, 12, 13, 12, 13, 12, 13, 38, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 35, 36, 37, 12, 13, 39, 40,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 41, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 0, 0, 0, 0, 0, 0, 42, 12, 13, 43, 44, 45,
45, 12, 13, 46, 47, 48, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 49, 50,
51, 52, 53, 0, 54, 54, 0, 55, 0, 56, 57, 0, 0, 0, 54, 58, 0, 59,
0, 60, 61, 0, 62, 63, 0, 64, 65, 0, 0, 63, 0, 66, 67, 0, 0, 68,
0, 0, 0, 0, 0, 0, 0, 69, 0, 0, 70, 0, 0, 70, 0, 0, 0, 71,
70, 72, 73, 73, 74, 0, 0, 0, 0, 0, 75, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 76, 77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 78, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 12, 13,
0, 0, 12, 13, 0, 0, 0, 29, 29, 29, 0, 79, 0, 0, 0, 0, 0, 0,
80, 0, 81, 81, 81, 0, 82, 0, 83, 83, 0, 1, 84, 1, 1, 85, 1, 1,
86, 87, 88, 1, 89, 1, 1, 1, 90, 91, 0, 92, 1, 1, 93, 1, 1, 94,
1, 1, 95, 96, 96, 96, 0, 4, 97, 4, 4, 98, 4, 4, 99, 100, 101, 4,
102, 4, 4, 4, 103, 104, 105, 106, 4, 4, 107, 4, 4, 108, 4, 4, 109, 110,
110, 111, 112, 113, 0, 0, 0, 114, 115, 116, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 117, 118,
119, 120, 121, 122, 0, 12, 13, 123, 12, 13, 0, 41, 41, 41, 124, 124, 124, 124,
124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125, 125,
125, 125, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 126, 12, 13, 12, 13, 12, 13, 12, 13, 12,
13, 12, 13, 12, 13, 127, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
129, 129, 129, 129, 129, 129, 129, 129, 129, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 130,
130, 130, 130, 130, 130, 130, 130, 130, 130, 130, 0, 130, 0, 0, 0, 0, 0, 130,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 131, 131, 131, 131, 131, 131, 131, 131,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, 131,
132, 132, 132, 132, 132, 132, 0, 0, 133, 133, 133, 133, 133, 133, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 134, 0, 0, 0, 135, 0, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 136, 137, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 0, 0, 0,
0, 138, 0, 0, 139, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133,
132, 132, 132, 132, 132, 132, 0, 0, 133, 133, 133, 133, 133, 133, 0, 0, 132, 132,
132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 132, 132, 132, 132,
132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 132, 132, 132, 132, 132, 132,
0, 0, 133, 133, 133, 133, 133, 133, 0, 0, 0, 132, 0, 132, 0, 132, 0, 132,
0, 133, 0, 133, 0, 133, 0, 133, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133,
133, 133, 133, 133, 133, 133, 140, 140, 141, 141, 141, 141, 142, 142, 143, 143, 144, 144,
145, 145, 0, 0, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133,
133, 133, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133,
132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 132, 132,
0, 146, 0, 0, 0, 0, 133, 133, 147, 147, 148, 0, 149, 0, 0, 0, 0, 146,
0, 0, 0, 0, 150, 150, 150, 150, 148, 0, 0, 0, 132, 132, 0, 0, 0, 0,
0, 0, 133, 133, 151, 151, 0, 0, 0, 0, 132, 132, 0, 0, 0, 119, 0, 0,
133, 133, 152, 152, 123, 0, 0, 0, 0, 0, 0, 146, 0, 0, 0, 0, 153, 153,
154, 154, 148, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 155, 0, 0, 0, 156, 157, 0, 0, 0, 0,
0, 0, 158, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 159, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160, 160, 160, 160, 160, 160,
160, 160, 160, 160, 160, 160, 160, 160, 160, 160, 161, 161, 161, 161, 161, 161, 161, 161,
161, 161, 161, 161, 161, 161, 161, 161, 0, 0, 0, 12, 13, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162, 162,
162, 162, 162, 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163,
163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 0, 129, 129, 129, 129,
129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129,
129, 129, 129, 129, 129, 129, 129, 0, 12, 13, 164, 165, 166, 167, 168, 12, 13, 12,
13, 12, 13, 169, 170, 171, 172, 0, 12, 13, 0, 12, 13, 0, 0, 0, 0, 0,
0, 0, 173, 173, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 0, 0, 0, 0, 0, 0, 0, 12, 13, 12, 13, 0, 0, 0,
12, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 174, 174, 174, 174,
174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174,
174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 174, 0, 174,
0, 0, 0, 0, 0, 174, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 0, 0, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 0, 0, 0, 0, 0, 0,
0, 0, 0, 12, 13, 12, 13, 175, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13,
0, 0, 0, 12, 13, 176, 0, 0, 12, 13, 12, 13, 0, 0, 12, 13, 12, 13,
12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 12, 13, 177, 178,
179, 180, 0, 0, 181, 182, 183, 184, 12, 13, 12, 13, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 185, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186,
186, 186, 186, 186, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
};

View File

@ -234,6 +234,55 @@ CanLowerCase(char16_t ch)
return CharInfo(ch).lowerCase != 0;
}
// One row of the case-folding table (js_foldinfo). Every field is a delta
// that the accessor functions add to a code unit, with the sum narrowed back
// to 16 bits; a delta of 0 therefore yields the code unit itself.
class FoldingInfo {
public:
// Delta from the code unit to its case-folded form (see FoldCase).
uint16_t folding;
// Deltas to up to three other code units that take part in the same folding
// (see ReverseFoldCase1..3); unused slots are 0.
uint16_t reverse1;
uint16_t reverse2;
uint16_t reverse3;
};
// Two-level lookup tables read by CaseFoldInfo: folding_index1 is indexed by
// the high bits of a code unit, its entry selects a block in folding_index2,
// and the folding_index2 entry is an index into js_foldinfo.
extern const uint8_t folding_index1[];
extern const uint8_t folding_index2[];
extern const FoldingInfo js_foldinfo[];
// Returns the case-folding table row for |code| via a two-level lookup:
// the bits above the low 6 select a 64-entry block through folding_index1,
// and the low 6 bits select the entry of folding_index2 inside that block.
inline const FoldingInfo&
CaseFoldInfo(char16_t code)
{
    const size_t blockBits = 6;
    const size_t blockMask = (size_t(1) << blockBits) - 1;

    const size_t block = folding_index1[code >> blockBits];
    const size_t slot = (block << blockBits) + (code & blockMask);
    return js_foldinfo[folding_index2[slot]];
}
inline char16_t
FoldCase(char16_t ch)
{
const FoldingInfo& info = CaseFoldInfo(ch);
return uint16_t(ch) + info.folding;
}
inline char16_t
ReverseFoldCase1(char16_t ch)
{
const FoldingInfo& info = CaseFoldInfo(ch);
return uint16_t(ch) + info.reverse1;
}
inline char16_t
ReverseFoldCase2(char16_t ch)
{
const FoldingInfo& info = CaseFoldInfo(ch);
return uint16_t(ch) + info.reverse2;
}
inline char16_t
ReverseFoldCase3(char16_t ch)
{
const FoldingInfo& info = CaseFoldInfo(ch);
return uint16_t(ch) + info.reverse3;
}
const size_t LeadSurrogateMin = 0xD800;
const size_t LeadSurrogateMax = 0xDBFF;
const size_t TrailSurrogateMin = 0xDC00;

View File

@ -84,13 +84,33 @@ def read_unicode_data(unicode_file):
row[0] = int(row[0], 16)
yield row
def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
def read_case_folding(case_folding):
    """Yield the single-code-point folding rows of a CaseFolding.txt stream.

    case_folding is any iterable of text lines in the '; '-separated
    CaseFolding.txt layout (code; status; mapping; # name).

    Blank lines (including whitespace-only or CRLF-terminated ones) and
    comment lines are skipped. Rows whose status is 'F' or 'T' (full and
    Turkic-specific foldings in the Unicode file format) are skipped as well,
    since their mapping is not a single code point usable for the tables.

    Each yielded row is the split line with row[0] (code) and row[2]
    (mapping) converted from hexadecimal strings to ints.
    """
    for line in case_folding:
        # Decide on the stripped text so that '\r\n', trailing blanks or an
        # indented '#' do not fall through and fail on row[1] below.
        stripped = line.strip()
        if not stripped or stripped.startswith('#'):
            continue
        row = line.split('; ')
        if row[1] in ['F', 'T']:
            continue
        row[0] = int(row[0], 16)
        row[2] = int(row[2], 16)
        yield row
def generate_unicode_stuff(unicode_data, case_folding,
data_file, test_mapping, test_space, test_icase):
dummy = (0, 0, 0)
table = [dummy]
cache = {dummy: 0}
index = [0] * (MAX + 1)
folding_map = {}
rev_folding_map = {}
folding_dummy = (0, 0, 0, 0)
folding_table = [folding_dummy]
folding_cache = {folding_dummy: 0}
folding_index = [0] * (MAX + 1)
test_table = {}
test_space_table = []
folding_tests = []
folding_codes = set()
for row in read_unicode_data(unicode_data):
code = row[0]
@ -143,6 +163,64 @@ def generate_unicode_stuff(unicode_data, data_file, test_mapping, test_space):
table.append(item)
index[code] = i
for row in read_case_folding(case_folding):
code = row[0]
mapping = row[2]
folding_map[code] = mapping
if mapping not in rev_folding_map:
rev_folding_map[mapping] = [code]
else:
rev_folding_map[mapping].append(code)
folding_codes.add(code)
folding_codes.add(mapping)
for code in sorted(folding_codes):
if code > MAX:
continue
if code in folding_map:
folding = folding_map[code]
else:
folding = code
if code in rev_folding_map:
rev_folding = rev_folding_map[code]
elif folding in rev_folding_map:
rev_folding = [c for c in rev_folding_map[folding] if c != code]
else:
rev_folding = []
assert len(rev_folding) <= 3
if folding != code or len(rev_folding):
item = [code]
if folding != code:
item.append(folding)
folding_tests.append(item + rev_folding)
folding_d = folding - code
rev_folding_ds = [v - code for v in rev_folding]
assert folding_d > -65535 and folding_d < 65535
assert all([v > -65535 and v < 65535 for v in rev_folding])
folding = folding_d & 0xffff
rev_folding = [v & 0xffff for v in rev_folding_ds]
rev_folding_0 = rev_folding[0] if len(rev_folding) >= 1 else 0
rev_folding_1 = rev_folding[1] if len(rev_folding) >= 2 else 0
rev_folding_2 = rev_folding[2] if len(rev_folding) >= 3 else 0
item = (folding, rev_folding_0, rev_folding_1, rev_folding_2)
i = folding_cache.get(item)
if i is None:
assert item not in folding_table
folding_cache[item] = i = len(folding_table)
folding_table.append(item)
folding_index[code] = i
test_mapping.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
test_mapping.write(public_domain)
test_mapping.write('var mapping = [\n')
@ -180,6 +258,29 @@ assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
# Emit the generated ignoreCase test file: banner line, then the public_domain text.
test_icase.write('/* Generated by make_unicode.py DO NOT MODIFY */\n')
test_icase.write(public_domain)
# JavaScript prologue defining test(code, ...equivs); the string is written verbatim.
test_icase.write("""
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag.";
print(BUGNUMBER + ": " + summary);
function test(code, ...equivs) {
var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
}
""")
# One test(...) call per folding_tests entry, with each argument rendered by hex().
for args in folding_tests:
test_icase.write('test(' + ','.join([hex(c) for c in args]) + ');\n')
# JavaScript epilogue, written verbatim.
test_icase.write("""
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
@ -189,6 +290,11 @@ if (typeof reportCompare === "function")
# Don't forget to update CharInfo in Unicode.cpp if you need to change this
assert shift == 5
# Split folding_index into two lookup tables plus the shift used to index them.
folding_index1, folding_index2, folding_shift = splitbins(folding_index)
# Don't forget to update CharInfo in Unicode.cpp if you need to change this
assert folding_shift == 6
# verify correctness
for char in index:
test = table[index[char]]
@ -198,6 +304,14 @@ if (typeof reportCompare === "function")
assert test == table[idx]
# Verify that the two-level lookup (folding_index1 -> folding_index2) yields the
# same folding_table entry as the flat folding_index for every code point.
# Iterate over positions in folding_index: looping over the list itself would
# yield the stored table indices, so only a few small values would be checked.
folding_mask = (1 << folding_shift) - 1
for char in range(len(folding_index)):
    test = folding_table[folding_index[char]]

    idx = folding_index1[char >> folding_shift]
    idx = folding_index2[(idx << folding_shift) + (char & folding_mask)]

    assert test == folding_table[idx]
comment = """
/*
@ -284,6 +398,19 @@ if (typeof reportCompare === "function")
dump(index2, 'index2', data_file)
data_file.write('\n')
# Emit the js_foldinfo array, one brace-wrapped row per folding_table entry,
# followed by the two index tables produced for the folding lookup.
data_file.write('const FoldingInfo unicode::js_foldinfo[] = {\n')
for d in folding_table:
    data_file.writelines((' {', ', '.join(map(str, d)), '},\n'))
data_file.writelines(('};\n', '\n'))

dump(folding_index1, 'folding_index1', data_file)
data_file.write('\n')
dump(folding_index2, 'folding_index2', data_file)
data_file.writelines(('\n', '\n'))
def getsize(data):
@ -362,7 +489,7 @@ if __name__ == '__main__':
print('Always make sure you have the newest UnicodeData.txt!')
unicode_data = open(sys.argv[1], 'r')
else:
print('Downloading...')
print('Downloading UnicodeData.txt...')
reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/UnicodeData.txt')
data = reader.read()
reader.close()
@ -370,8 +497,21 @@ if __name__ == '__main__':
unicode_data.write(data)
unicode_data.seek(0)
# Obtain CaseFolding.txt: use the path given as the second command-line
# argument if present, otherwise download it and keep a local copy.
if len(sys.argv) > 2:
    print('Always make sure you have the newest CaseFolding.txt!')
    case_folding = open(sys.argv[2], 'r')
else:
    print('Downloading CaseFolding.txt...')
    reader = urllib2.urlopen('http://unicode.org/Public/UNIDATA/CaseFolding.txt')
    try:
        data = reader.read()
    finally:
        # Release the connection even if the read fails part-way.
        reader.close()
    # 'w+' so the freshly written copy can be rewound and read back.
    case_folding = open('CaseFolding.txt', 'w+')
    case_folding.write(data)
    case_folding.seek(0)
print('Generating...')
generate_unicode_stuff(unicode_data,
generate_unicode_stuff(unicode_data, case_folding,
open('Unicode.cpp', 'w'),
open('../tests/ecma_5/String/string-upper-lower-mapping.js', 'w'),
open('../tests/ecma_5/String/string-space-trim.js', 'w'))
open('../tests/ecma_5/String/string-space-trim.js', 'w'),
open('../tests/ecma_6/RegExp/unicode-ignoreCase.js', 'w'))