flutter_flutter/sky/engine/platform/text/UnicodeUtilities.cpp

/*
 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Apple Inc. All
 * rights reserved.
 * Copyright (C) 2005 Alexey Proskuryakov.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "flutter/sky/engine/platform/text/UnicodeUtilities.h"

#include <unicode/unorm.h>
#include "flutter/sky/engine/wtf/text/StringBuffer.h"
#include "flutter/sky/engine/wtf/unicode/CharacterNames.h"

using namespace WTF::Unicode;

namespace blink {

enum VoicedSoundMarkType {
  NoVoicedSoundMark,
  VoicedSoundMark,
  SemiVoicedSoundMark
};

template <typename CharType>
static inline CharType foldQuoteMarkOrSoftHyphen(CharType c) {
  switch (static_cast<UChar>(c)) {
    case hebrewPunctuationGershayim:
    case leftDoubleQuotationMark:
    case rightDoubleQuotationMark:
      return '"';
    case hebrewPunctuationGeresh:
    case leftSingleQuotationMark:
    case rightSingleQuotationMark:
      return '\'';
    case softHyphen:
      // Replace soft hyphen with an ignorable character so that their presence
      // or absence will not affect string comparison.
      return 0;
    default:
      return c;
  }
}

void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length) {
  for (size_t i = 0; i < length; ++i)
    data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
}

void foldQuoteMarksAndSoftHyphens(String& s) {
  s.replace(hebrewPunctuationGeresh, '\'');
  s.replace(hebrewPunctuationGershayim, '"');
  s.replace(leftDoubleQuotationMark, '"');
  s.replace(leftSingleQuotationMark, '\'');
  s.replace(rightDoubleQuotationMark, '"');
  s.replace(rightSingleQuotationMark, '\'');
  // Replace soft hyphen with an ignorable character so that their presence or
  // absence will not affect string comparison.
  s.replace(softHyphen, 0);
}

static bool isNonLatin1Separator(UChar32 character) {
  ASSERT_ARG(character, character >= 256);

  return U_GET_GC_MASK(character) &
         (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
}

bool isSeparator(UChar32 character) {
  static const bool latin1SeparatorTable[256] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1,  // space ! " # $ % & ' ( ) * + , -
          // . /
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
      1,  //                         : ; < = > ?
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0,  //   @
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
      1,  //                         [ \ ] ^ _
      1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0,  //   `
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
      0,  //                           { | } ~
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
      0, 0, 0};

  if (character < 256)
    return latin1SeparatorTable[character];

  return isNonLatin1Separator(character);
}

// ICU's search ignores the distinction between small kana letters and ones
// that are not small, and also characters that differ only in the voicing
// marks when considering only primary collation strength differences.
// This is not helpful for end users, since these differences make words
// distinct, so for our purposes we need these to be considered.
// The Unicode folks do not think the collation algorithm should be
// changed. To work around this, we would like to tailor the ICU searcher,
// but we can't get that to work yet. So instead, we check for cases where
// these differences occur, and skip those matches.

// We refer to the above technique as the "kana workaround". The next few
// functions are helper functinos for the kana workaround.

bool isKanaLetter(UChar character) {
  // Hiragana letters.
  if (character >= 0x3041 && character <= 0x3096)
    return true;

  // Katakana letters.
  if (character >= 0x30A1 && character <= 0x30FA)
    return true;
  if (character >= 0x31F0 && character <= 0x31FF)
    return true;

  // Halfwidth katakana letters.
  if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
    return true;

  return false;
}

bool isSmallKanaLetter(UChar character) {
  ASSERT(isKanaLetter(character));

  switch (character) {
    case 0x3041:  // HIRAGANA LETTER SMALL A
    case 0x3043:  // HIRAGANA LETTER SMALL I
    case 0x3045:  // HIRAGANA LETTER SMALL U
    case 0x3047:  // HIRAGANA LETTER SMALL E
    case 0x3049:  // HIRAGANA LETTER SMALL O
    case 0x3063:  // HIRAGANA LETTER SMALL TU
    case 0x3083:  // HIRAGANA LETTER SMALL YA
    case 0x3085:  // HIRAGANA LETTER SMALL YU
    case 0x3087:  // HIRAGANA LETTER SMALL YO
    case 0x308E:  // HIRAGANA LETTER SMALL WA
    case 0x3095:  // HIRAGANA LETTER SMALL KA
    case 0x3096:  // HIRAGANA LETTER SMALL KE
    case 0x30A1:  // KATAKANA LETTER SMALL A
    case 0x30A3:  // KATAKANA LETTER SMALL I
    case 0x30A5:  // KATAKANA LETTER SMALL U
    case 0x30A7:  // KATAKANA LETTER SMALL E
    case 0x30A9:  // KATAKANA LETTER SMALL O
    case 0x30C3:  // KATAKANA LETTER SMALL TU
    case 0x30E3:  // KATAKANA LETTER SMALL YA
    case 0x30E5:  // KATAKANA LETTER SMALL YU
    case 0x30E7:  // KATAKANA LETTER SMALL YO
    case 0x30EE:  // KATAKANA LETTER SMALL WA
    case 0x30F5:  // KATAKANA LETTER SMALL KA
    case 0x30F6:  // KATAKANA LETTER SMALL KE
    case 0x31F0:  // KATAKANA LETTER SMALL KU
    case 0x31F1:  // KATAKANA LETTER SMALL SI
    case 0x31F2:  // KATAKANA LETTER SMALL SU
    case 0x31F3:  // KATAKANA LETTER SMALL TO
    case 0x31F4:  // KATAKANA LETTER SMALL NU
    case 0x31F5:  // KATAKANA LETTER SMALL HA
    case 0x31F6:  // KATAKANA LETTER SMALL HI
    case 0x31F7:  // KATAKANA LETTER SMALL HU
    case 0x31F8:  // KATAKANA LETTER SMALL HE
    case 0x31F9:  // KATAKANA LETTER SMALL HO
    case 0x31FA:  // KATAKANA LETTER SMALL MU
    case 0x31FB:  // KATAKANA LETTER SMALL RA
    case 0x31FC:  // KATAKANA LETTER SMALL RI
    case 0x31FD:  // KATAKANA LETTER SMALL RU
    case 0x31FE:  // KATAKANA LETTER SMALL RE
    case 0x31FF:  // KATAKANA LETTER SMALL RO
    case 0xFF67:  // HALFWIDTH KATAKANA LETTER SMALL A
    case 0xFF68:  // HALFWIDTH KATAKANA LETTER SMALL I
    case 0xFF69:  // HALFWIDTH KATAKANA LETTER SMALL U
    case 0xFF6A:  // HALFWIDTH KATAKANA LETTER SMALL E
    case 0xFF6B:  // HALFWIDTH KATAKANA LETTER SMALL O
    case 0xFF6C:  // HALFWIDTH KATAKANA LETTER SMALL YA
    case 0xFF6D:  // HALFWIDTH KATAKANA LETTER SMALL YU
    case 0xFF6E:  // HALFWIDTH KATAKANA LETTER SMALL YO
    case 0xFF6F:  // HALFWIDTH KATAKANA LETTER SMALL TU
      return true;
  }
  return false;
}

static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character) {
  ASSERT(isKanaLetter(character));

  switch (character) {
    case 0x304C:  // HIRAGANA LETTER GA
    case 0x304E:  // HIRAGANA LETTER GI
    case 0x3050:  // HIRAGANA LETTER GU
    case 0x3052:  // HIRAGANA LETTER GE
    case 0x3054:  // HIRAGANA LETTER GO
    case 0x3056:  // HIRAGANA LETTER ZA
    case 0x3058:  // HIRAGANA LETTER ZI
    case 0x305A:  // HIRAGANA LETTER ZU
    case 0x305C:  // HIRAGANA LETTER ZE
    case 0x305E:  // HIRAGANA LETTER ZO
    case 0x3060:  // HIRAGANA LETTER DA
    case 0x3062:  // HIRAGANA LETTER DI
    case 0x3065:  // HIRAGANA LETTER DU
    case 0x3067:  // HIRAGANA LETTER DE
    case 0x3069:  // HIRAGANA LETTER DO
    case 0x3070:  // HIRAGANA LETTER BA
    case 0x3073:  // HIRAGANA LETTER BI
    case 0x3076:  // HIRAGANA LETTER BU
    case 0x3079:  // HIRAGANA LETTER BE
    case 0x307C:  // HIRAGANA LETTER BO
    case 0x3094:  // HIRAGANA LETTER VU
    case 0x30AC:  // KATAKANA LETTER GA
    case 0x30AE:  // KATAKANA LETTER GI
    case 0x30B0:  // KATAKANA LETTER GU
    case 0x30B2:  // KATAKANA LETTER GE
    case 0x30B4:  // KATAKANA LETTER GO
    case 0x30B6:  // KATAKANA LETTER ZA
    case 0x30B8:  // KATAKANA LETTER ZI
    case 0x30BA:  // KATAKANA LETTER ZU
    case 0x30BC:  // KATAKANA LETTER ZE
    case 0x30BE:  // KATAKANA LETTER ZO
    case 0x30C0:  // KATAKANA LETTER DA
    case 0x30C2:  // KATAKANA LETTER DI
    case 0x30C5:  // KATAKANA LETTER DU
    case 0x30C7:  // KATAKANA LETTER DE
    case 0x30C9:  // KATAKANA LETTER DO
    case 0x30D0:  // KATAKANA LETTER BA
    case 0x30D3:  // KATAKANA LETTER BI
    case 0x30D6:  // KATAKANA LETTER BU
    case 0x30D9:  // KATAKANA LETTER BE
    case 0x30DC:  // KATAKANA LETTER BO
    case 0x30F4:  // KATAKANA LETTER VU
    case 0x30F7:  // KATAKANA LETTER VA
    case 0x30F8:  // KATAKANA LETTER VI
    case 0x30F9:  // KATAKANA LETTER VE
    case 0x30FA:  // KATAKANA LETTER VO
      return VoicedSoundMark;
    case 0x3071:  // HIRAGANA LETTER PA
    case 0x3074:  // HIRAGANA LETTER PI
    case 0x3077:  // HIRAGANA LETTER PU
    case 0x307A:  // HIRAGANA LETTER PE
    case 0x307D:  // HIRAGANA LETTER PO
    case 0x30D1:  // KATAKANA LETTER PA
    case 0x30D4:  // KATAKANA LETTER PI
    case 0x30D7:  // KATAKANA LETTER PU
    case 0x30DA:  // KATAKANA LETTER PE
    case 0x30DD:  // KATAKANA LETTER PO
      return SemiVoicedSoundMark;
  }
  return NoVoicedSoundMark;
}

static inline bool isCombiningVoicedSoundMark(UChar character) {
  switch (character) {
    case 0x3099:  // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
    case 0x309A:  // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
      return true;
  }
  return false;
}

bool containsKanaLetters(const String& pattern) {
  const unsigned length = pattern.length();
  for (unsigned i = 0; i < length; ++i) {
    if (isKanaLetter(pattern[i]))
      return true;
  }
  return false;
}

void normalizeCharactersIntoNFCForm(const UChar* characters,
                                    unsigned length,
                                    Vector<UChar>& buffer) {
  ASSERT(length);

  buffer.resize(length);

  UErrorCode status = U_ZERO_ERROR;
  size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0,
                                      buffer.data(), length, &status);
  ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING ||
         status == U_BUFFER_OVERFLOW_ERROR);
  ASSERT(bufferSize);

  buffer.resize(bufferSize);

  if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
    return;

  status = U_ZERO_ERROR;
  unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize,
                  &status);
  ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
}

// This function returns kNotFound if |first| and |second| contain different
// Kana letters. If |first| and |second| contain the same Kana letter then
// function returns offset in characters from |first|. Pointers to both strings
// increase simultaneously so so it is possible to use one offset value.
static inline size_t compareKanaLetterAndComposedVoicedSoundMarks(
    const UChar* first,
    const UChar* firstEnd,
    const UChar* second,
    const UChar* secondEnd) {
  const UChar* start = first;
  // Check for differences in the kana letter character itself.
  if (isSmallKanaLetter(*first) != isSmallKanaLetter(*second))
    return kNotFound;
  if (composedVoicedSoundMark(*first) != composedVoicedSoundMark(*second))
    return kNotFound;
  ++first;
  ++second;

  // Check for differences in combining voiced sound marks found after the
  // letter.
  while (true) {
    const bool secondIsNotSoundMark =
        second == secondEnd || !isCombiningVoicedSoundMark(*second);
    if (first == firstEnd || !isCombiningVoicedSoundMark(*first)) {
      return secondIsNotSoundMark ? first - start : kNotFound;
    }
    if (secondIsNotSoundMark)
      return kNotFound;
    if (*first != *second)
      return kNotFound;
    ++first;
    ++second;
  }
}

bool checkOnlyKanaLettersInStrings(const UChar* firstData,
                                   unsigned firstLength,
                                   const UChar* secondData,
                                   unsigned secondLength) {
  const UChar* a = firstData;
  const UChar* aEnd = firstData + firstLength;

  const UChar* b = secondData;
  const UChar* bEnd = secondData + secondLength;
  while (true) {
    // Skip runs of non-kana-letter characters. This is necessary so we can
    // correctly handle strings where the |firstData| and |secondData| have
    // different-length runs of characters that match, while still double
    // checking the correctness of matches of kana letters with other kana
    // letters.
    while (a != aEnd && !isKanaLetter(*a))
      ++a;
    while (b != bEnd && !isKanaLetter(*b))
      ++b;

    // If we reached the end of either the target or the match, we should have
    // reached the end of both; both should have the same number of kana
    // letters.
    if (a == aEnd || b == bEnd) {
      return a == aEnd && b == bEnd;
    }

    // Check that single Kana letters in |a| and |b| are the same.
    const size_t offset =
        compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
    if (offset == kNotFound)
      return false;

    // Update values of |a| and |b| after comparing.
    a += offset;
    b += offset;
  }
}

bool checkKanaStringsEqual(const UChar* firstData,
                           unsigned firstLength,
                           const UChar* secondData,
                           unsigned secondLength) {
  const UChar* a = firstData;
  const UChar* aEnd = firstData + firstLength;

  const UChar* b = secondData;
  const UChar* bEnd = secondData + secondLength;
  while (true) {
    // Check for non-kana-letter characters.
    while (a != aEnd && !isKanaLetter(*a) && b != bEnd && !isKanaLetter(*b)) {
      if (*a++ != *b++)
        return false;
    }

    // If we reached the end of either the target or the match, we should have
    // reached the end of both; both should have the same number of kana
    // letters.
    if (a == aEnd || b == bEnd) {
      return a == aEnd && b == bEnd;
    }

    if (isKanaLetter(*a) != isKanaLetter(*b))
      return false;

    // Check that single Kana letters in |a| and |b| are the same.
    const size_t offset =
        compareKanaLetterAndComposedVoicedSoundMarks(a, aEnd, b, bEnd);
    if (offset == kNotFound)
      return false;

    // Update values of |a| and |b| after comparing.
    a += offset;
    b += offset;
  }
}

}  // namespace blink