mirror of
https://github.com/flutter/flutter.git
synced 2026-02-20 02:29:02 +08:00
428 lines
15 KiB
C++
428 lines
15 KiB
C++
/*
|
|
* Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
|
|
* rights reserved.
|
|
* Copyright (C) 2005 Alexey Proskuryakov.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
|
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "flutter/sky/engine/platform/text/UnicodeUtilities.h"
|
|
|
|
#include <unicode/unorm.h>
|
|
#include "flutter/sky/engine/wtf/text/StringBuffer.h"
|
|
#include "flutter/sky/engine/wtf/unicode/CharacterNames.h"
|
|
|
|
using namespace WTF::Unicode;
|
|
|
|
namespace blink {
|
|
|
|
// Describes which voicing mark, if any, is already part of a precomposed kana
// letter. Used by the kana workaround to tell apart letters that primary
// collation strength would otherwise treat as equal.
enum VoicedSoundMarkType {
  // The letter carries no voicing mark.
  NoVoicedSoundMark = 0,
  // The letter is composed with the voiced sound mark (e.g. GA, ZA, BA).
  VoicedSoundMark = 1,
  // The letter is composed with the semi-voiced sound mark (e.g. PA, PI).
  SemiVoicedSoundMark = 2
};
|
|
|
|
template <typename CharType>
|
|
static inline CharType foldQuoteMarkOrSoftHyphen(CharType c) {
|
|
switch (static_cast<UChar>(c)) {
|
|
case hebrewPunctuationGershayim:
|
|
case leftDoubleQuotationMark:
|
|
case rightDoubleQuotationMark:
|
|
return '"';
|
|
case hebrewPunctuationGeresh:
|
|
case leftSingleQuotationMark:
|
|
case rightSingleQuotationMark:
|
|
return '\'';
|
|
case softHyphen:
|
|
// Replace soft hyphen with an ignorable character so that their presence
|
|
// or absence will not affect string comparison.
|
|
return 0;
|
|
default:
|
|
return c;
|
|
}
|
|
}
|
|
|
|
// Folds quote marks and soft hyphens in place over the |length| code units
// starting at |data|. Each code unit is rewritten with the result of
// foldQuoteMarkOrSoftHyphen().
void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) {
  UChar* const end = data + length;
  for (UChar* cursor = data; cursor != end; ++cursor)
    *cursor = foldQuoteMarkOrSoftHyphen(*cursor);
}
|
|
|
|
// Folds quote marks and soft hyphens in |s| in place, using the same mapping
// as the UChar buffer overload: typographic and Hebrew quotes become their
// ASCII equivalents and soft hyphens become 0.
void foldQuoteMarksAndSoftHyphens(String& s) {
  struct Folding {
    UChar from;
    UChar to;
  };

  // Applied in order, one String::replace() call per entry.
  static const Folding foldings[] = {
      {hebrewPunctuationGeresh, '\''},
      {hebrewPunctuationGershayim, '"'},
      {leftDoubleQuotationMark, '"'},
      {leftSingleQuotationMark, '\''},
      {rightDoubleQuotationMark, '"'},
      {rightSingleQuotationMark, '\''},
      // Replace soft hyphen with an ignorable character so that their
      // presence or absence will not affect string comparison.
      {softHyphen, 0},
  };

  const size_t foldingCount = sizeof(foldings) / sizeof(foldings[0]);
  for (size_t i = 0; i < foldingCount; ++i)
    s.replace(foldings[i].from, foldings[i].to);
}
|
|
|
|
// Returns true when |character|, which must lie outside Latin-1 (>= 256),
// has a Unicode general category in the symbol (S), punctuation (P),
// separator (Z) or format (Cf) groups.
static bool isNonLatin1Separator(UChar32 character) {
  ASSERT_ARG(character, character >= 256);

  const bool inSeparatorCategory =
      (U_GET_GC_MASK(character) &
       (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK)) != 0;
  return inSeparatorCategory;
}
|
|
|
|
// Returns true when |character| should be treated as a separator.
//
// Latin-1 code points (0..255) are answered from a precomputed table; all
// other code points are classified by isNonLatin1Separator(). Negative values
// are not code points and are reported as non-separators.
bool isSeparator(UChar32 character) {
  // One entry per Latin-1 code point, 16 entries per row.
  static const bool latin1SeparatorTable[256] = {
      // 0x00 - 0x0F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x10 - 0x1F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x20 - 0x2F: space ! " # $ % & ' ( ) * + , - . /
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // 0x30 - 0x3F: digits, then : ; < = > ?
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
      // 0x40 - 0x4F: @
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x50 - 0x5F: [ \ ] ^ _
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
      // 0x60 - 0x6F: `
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x70 - 0x7F: { | } ~
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
      // 0x80 - 0x8F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x90 - 0x9F
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0xA0 - 0xAF
      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
      // 0xB0 - 0xBF
      1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
      // 0xC0 - 0xCF
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0xD0 - 0xDF (only 0xD7 is set)
      0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0xE0 - 0xEF
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0xF0 - 0xFF (only 0xF7 is set)
      0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};

  // UChar32 is a signed type, so a negative value (for example U_SENTINEL)
  // would satisfy "character < 256" and index before the start of the table.
  // Such values are not code points; treat them as non-separators.
  if (character < 0)
    return false;

  if (character < 256)
    return latin1SeparatorTable[character];

  return isNonLatin1Separator(character);
}
|
|
|
|
// ICU's search ignores the distinction between small kana letters and ones
|
|
// that are not small, and also characters that differ only in the voicing
|
|
// marks when considering only primary collation strength differences.
|
|
// This is not helpful for end users, since these differences make words
|
|
// distinct, so for our purposes we need these to be considered.
|
|
// The Unicode folks do not think the collation algorithm should be
|
|
// changed. To work around this, we would like to tailor the ICU searcher,
|
|
// but we can't get that to work yet. So instead, we check for cases where
|
|
// these differences occur, and skip those matches.
|
|
|
|
// We refer to the above technique as the "kana workaround". The next few
|
|
// functions are helper functions for the kana workaround.
|
|
|
|
// Returns true when |character| is a hiragana letter, a katakana letter
// (either of the two katakana ranges below), or a halfwidth katakana letter.
bool isKanaLetter(UChar character) {
  // Hiragana letters.
  const bool inHiraganaRange = character >= 0x3041 && character <= 0x3096;

  // Katakana letters: the main range plus the 0x31F0..0x31FF range.
  const bool inKatakanaRange = character >= 0x30A1 && character <= 0x30FA;
  const bool inKatakanaExtensionRange =
      character >= 0x31F0 && character <= 0x31FF;

  // Halfwidth katakana letters; 0xFF70 inside this range is excluded.
  const bool inHalfwidthKatakanaRange =
      character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70;

  return inHiraganaRange || inKatakanaRange || inKatakanaExtensionRange ||
         inHalfwidthKatakanaRange;
}
|
|
|
|
// Returns true when |character| is one of the small kana letters. The caller
// must pass a kana letter (see isKanaLetter()); this is asserted.
bool isSmallKanaLetter(UChar character) {
  ASSERT(isKanaLetter(character));

  // Two contiguous runs consist entirely of small letters:
  //   0x31F0..0x31FF  KATAKANA LETTER SMALL KU .. SMALL RO
  //   0xFF67..0xFF6F  HALFWIDTH KATAKANA LETTER SMALL A .. SMALL TU
  if (character >= 0x31F0 && character <= 0x31FF)
    return true;
  if (character >= 0xFF67 && character <= 0xFF6F)
    return true;

  // The remaining small letters are interleaved with full-size ones.
  switch (character) {
    // Hiragana.
    case 0x3041:  // SMALL A
    case 0x3043:  // SMALL I
    case 0x3045:  // SMALL U
    case 0x3047:  // SMALL E
    case 0x3049:  // SMALL O
    case 0x3063:  // SMALL TU
    case 0x3083:  // SMALL YA
    case 0x3085:  // SMALL YU
    case 0x3087:  // SMALL YO
    case 0x308E:  // SMALL WA
    case 0x3095:  // SMALL KA
    case 0x3096:  // SMALL KE
    // Katakana.
    case 0x30A1:  // SMALL A
    case 0x30A3:  // SMALL I
    case 0x30A5:  // SMALL U
    case 0x30A7:  // SMALL E
    case 0x30A9:  // SMALL O
    case 0x30C3:  // SMALL TU
    case 0x30E3:  // SMALL YA
    case 0x30E5:  // SMALL YU
    case 0x30E7:  // SMALL YO
    case 0x30EE:  // SMALL WA
    case 0x30F5:  // SMALL KA
    case 0x30F6:  // SMALL KE
      return true;
    default:
      return false;
  }
}
|
|
|
|
// Reports which voicing mark is already composed into the kana letter
// |character|: VoicedSoundMark for letters such as GA/ZA/DA/BA/VU,
// SemiVoicedSoundMark for the PA row, and NoVoicedSoundMark otherwise. The
// caller must pass a kana letter (see isKanaLetter()); this is asserted.
static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) {
  ASSERT(isKanaLetter(character));

  // KATAKANA LETTER VA, VI, VE and VO (0x30F7..0x30FA) are voiced and have no
  // hiragana counterpart in the table below.
  if (character >= 0x30F7 && character <= 0x30FA)
    return VoicedSoundMark;

  // Each katakana letter in 0x30A1..0x30F6 sits exactly 0x60 above the
  // hiragana letter of the same name, so map it down and classify both
  // scripts with a single table.
  UChar hiraganaForm = character;
  if (character >= 0x30A1 && character <= 0x30F6)
    hiraganaForm = static_cast<UChar>(character - 0x60);

  switch (hiraganaForm) {
    case 0x304C:  // GA
    case 0x304E:  // GI
    case 0x3050:  // GU
    case 0x3052:  // GE
    case 0x3054:  // GO
    case 0x3056:  // ZA
    case 0x3058:  // ZI
    case 0x305A:  // ZU
    case 0x305C:  // ZE
    case 0x305E:  // ZO
    case 0x3060:  // DA
    case 0x3062:  // DI
    case 0x3065:  // DU
    case 0x3067:  // DE
    case 0x3069:  // DO
    case 0x3070:  // BA
    case 0x3073:  // BI
    case 0x3076:  // BU
    case 0x3079:  // BE
    case 0x307C:  // BO
    case 0x3094:  // VU
      return VoicedSoundMark;
    case 0x3071:  // PA
    case 0x3074:  // PI
    case 0x3077:  // PU
    case 0x307A:  // PE
    case 0x307D:  // PO
      return SemiVoicedSoundMark;
    default:
      return NoVoicedSoundMark;
  }
}
|
|
|
|
// Returns true when |character| is one of the two combining kana sound marks.
static inline bool isCombiningVoicedSoundMark(UChar character) {
  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  const bool isVoicedMark = character == 0x3099;
  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  const bool isSemiVoicedMark = character == 0x309A;
  return isVoicedMark || isSemiVoicedMark;
}
|
|
|
|
bool containsKanaLetters(const String& pattern) {
|
|
const unsigned length = pattern.length();
|
|
for (unsigned i = 0; i < length; ++i) {
|
|
if (isKanaLetter(pattern[i]))
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void normalizeCharactersIntoNFCForm(const UChar* characters,
|
|
unsigned length,
|
|
Vector<UChar>& buffer) {
|
|
ASSERT(length);
|
|
|
|
buffer.resize(length);
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0,
|
|
buffer.data(), length, &status);
|
|
ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING ||
|
|
status == U_BUFFER_OVERFLOW_ERROR);
|
|
ASSERT(bufferSize);
|
|
|
|
buffer.resize(bufferSize);
|
|
|
|
if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
|
|
return;
|
|
|
|
status = U_ZERO_ERROR;
|
|
unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize,
|
|
&status);
|
|
ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
|
|
}
|
|
|
|
// This function returns kNotFound if |first| and |second| contain different
|
|
// Kana letters. If |first| and |second| contain the same Kana letter then
|
|
// function returns offset in characters from |first|. Pointers to both strings
|
|
// increase simultaneously so so it is possible to use one offset value.
|
|
static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(
|
|
const UChar* first,
|
|
const UChar* firstEnd,
|
|
const UChar* second,
|
|
const UChar* secondEnd) {
|
|
const UChar* start = first;
|
|
// Check for differences in the kana letter character itself.
|
|
if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second))
|
|
return kNotFound;
|
|
if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second))
|
|
return kNotFound;
|
|
++first;
|
|
++second;
|
|
|
|
// Check for differences in combining voiced sound marks found after the
|
|
// letter.
|
|
while (true) {
|
|
const bool secondIsNotSoundMark =
|
|
second == secondEnd || !isCombiningVoicedSoundMark(*second);
|
|
if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) {
|
|
return secondIsNotSoundMark ? first - start : kNotFound;
|
|
}
|
|
if (secondIsNotSoundMark)
|
|
return kNotFound;
|
|
if (*first != *second)
|
|
return kNotFound;
|
|
++first;
|
|
++second;
|
|
}
|
|
}
|
|
|
|
bool checkOnlyKanaLettersInStrings(const UChar* firstData,
|
|
unsigned firstLength,
|
|
const UChar* secondData,
|
|
unsigned secondLength) {
|
|
const UChar* a = firstData;
|
|
const UChar* aEnd = firstData + firstLength;
|
|
|
|
const UChar* b = secondData;
|
|
const UChar* bEnd = secondData + secondLength;
|
|
while (true) {
|
|
// Skip runs of non-kana-letter characters. This is necessary so we can
|
|
// correctly handle strings where the |firstData| and |secondData| have
|
|
// different-length runs of characters that match, while still double
|
|
// checking the correctness of matches of kana letters with other kana
|
|
// letters.
|
|
while (a != aEnd && !isKanaLetter(*a))
|
|
++a;
|
|
while (b != bEnd && !isKanaLetter(*b))
|
|
++b;
|
|
|
|
// If we reached the end of either the target or the match, we should have
|
|
// reached the end of both; both should have the same number of kana
|
|
// letters.
|
|
if (a == aEnd || b == bEnd) {
|
|
return a == aEnd && b == bEnd;
|
|
}
|
|
|
|
// Check that single Kana letters in |a| and |b| are the same.
|
|
const size_t offset =
|
|
compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
|
|
if (offset == kNotFound)
|
|
return false;
|
|
|
|
// Update values of |a| and |b| after comparing.
|
|
a += offset;
|
|
b += offset;
|
|
}
|
|
}
|
|
|
|
bool checkKanaStringsEqual(const UChar* firstData,
|
|
unsigned firstLength,
|
|
const UChar* secondData,
|
|
unsigned secondLength) {
|
|
const UChar* a = firstData;
|
|
const UChar* aEnd = firstData + firstLength;
|
|
|
|
const UChar* b = secondData;
|
|
const UChar* bEnd = secondData + secondLength;
|
|
while (true) {
|
|
// Check for non-kana-letter characters.
|
|
while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) {
|
|
if (*a++ != *b++)
|
|
return false;
|
|
}
|
|
|
|
// If we reached the end of either the target or the match, we should have
|
|
// reached the end of both; both should have the same number of kana
|
|
// letters.
|
|
if (a == aEnd || b == bEnd) {
|
|
return a == aEnd && b == bEnd;
|
|
}
|
|
|
|
if (isKanaLetter(*a) != isKanaLetter(*b))
|
|
return false;
|
|
|
|
// Check that single Kana letters in |a| and |b| are the same.
|
|
const size_t offset =
|
|
compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
|
|
if (offset == kNotFound)
|
|
return false;
|
|
|
|
// Update values of |a| and |b| after comparing.
|
|
a += offset;
|
|
b += offset;
|
|
}
|
|
}
|
|
|
|
} // namespace blink
|