[web][3/3] Handle surrogate during line break detection (flutter/engine#19745)

2026-02-20 02:29:02 +08:00 · 2020-07-15 12:29:52 -07:00 · 2020-07-15 12:29:52 -07:00 · 6f800ecb40
commit 6f800ecb40
parent 80963c291d
5 changed files with 1108 additions and 18 deletions
--- a/engine/src/flutter/lib/web_ui/lib/src/engine/text/line_breaker.dart
+++ b/engine/src/flutter/lib/web_ui/lib/src/engine/text/line_breaker.dart
@ -28,7 +28,12 @@ class LineBreakResult {
 }

 /// Normalizes properties that behave the same way into one common property.
-LineCharProperty? _normalizeLineProperty(LineCharProperty? prop) {
+LineCharProperty _normalizeLineProperty(int? codePoint) {
+  if (codePoint == null) {
+    return LineCharProperty.AL;
+  }
+
+  final LineCharProperty? prop = lineLookup.findForChar(codePoint);
  // NL behaves exactly the same as BK.
  // See: https://www.unicode.org/reports/tr14/tr14-45.html#NL
  if (prop == LineCharProperty.NL) {
@ -37,7 +42,8 @@ LineCharProperty? _normalizeLineProperty(LineCharProperty? prop) {
  // In the absence of extra data (ICU data and language dictionaries), the
  // following properties will be treated as AL (alphabetic): AI, SA, SG and XX.
  // See LB1: https://www.unicode.org/reports/tr14/tr14-45.html#LB1
-  if (prop == LineCharProperty.AI ||
+  if (prop == null ||
+      prop == LineCharProperty.AI ||
      prop == LineCharProperty.SA ||
      prop == LineCharProperty.SG ||
      prop == LineCharProperty.XX) {
@ -91,7 +97,8 @@ bool _hasEastAsianWidthFWH(int charCode) {
 /// * https://www.unicode.org/reports/tr14/tr14-45.html#Algorithm
 /// * https://www.unicode.org/Public/11.0.0/ucd/LineBreak.txt
 LineBreakResult nextLineBreak(String text, int index) {
-  LineCharProperty? curr = _normalizeLineProperty(lineLookup.find(text, index));
+  int? codePoint = getCodePoint(text, index);
+  LineCharProperty curr = _normalizeLineProperty(codePoint);

  LineCharProperty? prev1;

@ -117,9 +124,23 @@ LineBreakResult nextLineBreak(String text, int index) {
    curr = LineCharProperty.AL;
  }

+  int regionalIndicatorCount = 0;
+
  // Always break at the end of text.
  // LB3: ! eot
  while (index < text.length) {
+    // Keep count of the RI (regional indicator) sequence.
+    if (curr == LineCharProperty.RI) {
+      regionalIndicatorCount++;
+    } else {
+      regionalIndicatorCount = 0;
+    }
+
+    if (codePoint != null && codePoint > 0xFFFF) {
+      // Advance `index` one extra step when handling a surrogate pair in the
+      // string.
+      index++;
+    }
    index++;
    prev2 = prev1;
    prev1 = curr;
@ -131,7 +152,9 @@ LineBreakResult nextLineBreak(String text, int index) {
      baseOfSpaceSequence = null;
    }

-    curr = _normalizeLineProperty(lineLookup.find(text, index));
+    codePoint = getCodePoint(text, index);
+    curr = _normalizeLineProperty(codePoint);
+
    isCurrZWJ = curr == LineCharProperty.ZWJ;

    // Always break after hard line breaks.
@ -208,6 +231,10 @@ LineBreakResult nextLineBreak(String text, int index) {
        // LB10: Treat any remaining combining mark or ZWJ as AL.
        curr = LineCharProperty.AL;
      } else {
+        if (prev1 == LineCharProperty.RI) {
+          // Prevent the previous RI from being double-counted.
+          regionalIndicatorCount--;
+        }
        // Preserve the property of the previous character to treat the sequence
        // as if it were X.
        curr = prev1;
@ -466,6 +493,24 @@ LineBreakResult nextLineBreak(String text, int index) {
      continue;
    }

+    // Break between two regional indicator symbols if and only if there are an
+    // even number of regional indicators preceding the position of the break.
+    // LB30a: sot (RI RI)* RI × RI
+    //        [^RI] (RI RI)* RI × RI
+    if (curr == LineCharProperty.RI) {
+      if (regionalIndicatorCount.isOdd) {
+        continue;
+      } else {
+        return LineBreakResult(index, LineBreakType.opportunity);
+      }
+    }
+
+    // Do not break between an emoji base and an emoji modifier.
+    // LB30b: EB × EM
+    if (prev1 == LineCharProperty.EB && curr == LineCharProperty.EM) {
+      continue;
+    }
+
    // Break everywhere else.
    // LB31: ALL ÷
    //       ÷ ALL
--- a/engine/src/flutter/lib/web_ui/lib/src/engine/text/measurement.dart
+++ b/engine/src/flutter/lib/web_ui/lib/src/engine/text/measurement.dart
@ -16,8 +16,7 @@ const double _baselineRatioHack = 1.1662499904632568;
 typedef CharPredicate = bool Function(int char);

 bool _whitespacePredicate(int char) {
-  final LineCharProperty? prop =
-      _normalizeLineProperty(lineLookup.findForChar(char));
+  final LineCharProperty? prop = _normalizeLineProperty(char);
  return prop == LineCharProperty.SP ||
      prop == LineCharProperty.BK ||
      prop == LineCharProperty.LF ||
@ -25,8 +24,7 @@ bool _whitespacePredicate(int char) {
 }

 bool _newlinePredicate(int char) {
-  final LineCharProperty? prop =
-      _normalizeLineProperty(lineLookup.findForChar(char));
+  final LineCharProperty? prop = _normalizeLineProperty(char);
  return prop == LineCharProperty.BK || prop == LineCharProperty.LF || prop == LineCharProperty.CR;
 }

--- a/engine/src/flutter/lib/web_ui/lib/src/engine/text/unicode_range.dart
+++ b/engine/src/flutter/lib/web_ui/lib/src/engine/text/unicode_range.dart
@ -57,6 +57,44 @@ class UnicodeRange<P> {
  }
 }

+/// Whether [char] is a UTF-16 surrogate code unit (either half of a pair).
+///
+/// Only bits 11-15 of [char] are inspected: they must equal `11011`, which for
+/// a 16-bit code unit corresponds to the range 0xD800-0xDFFF. Higher bits are
+/// ignored by the mask.
+///
+/// See:
+/// - http://www.unicode.org/faq/utf_bom.html#utf16-2
+bool isUtf16Surrogate(int char) {
+  const int topFiveBitsMask = 0xF800;
+  const int surrogatePrefix = 0xD800;
+  return (char & topFiveBitsMask) == surrogatePrefix;
+}
+
+/// Combines a pair of UTF-16 surrogates into a single code point.
+///
+/// The pair is read from [text] as the code units at [index] (the high
+/// surrogate) and `index + 1` (the low surrogate). No validation is done
+/// here: callers must make sure both positions exist and hold a pair.
+///
+/// See:
+/// - http://www.unicode.org/faq/utf_bom.html#utf16-3
+int combineSurrogatePair(String text, int index) {
+  final int lead = text.codeUnitAt(index);
+  final int trail = text.codeUnitAt(index + 1);
+
+  // Bits 6-10 of the lead unit store the plane number minus one.
+  final int plane = ((lead >> 6) & 0x1F) + 1;
+  // The low 6 bits of the lead unit and the low 10 bits of the trail unit
+  // form the 16-bit offset inside that plane.
+  final int offsetHigh = (lead & 0x3F) << 10;
+  final int offsetLow = trail & 0x3FF;
+  return (plane << 16) | offsetHigh | offsetLow;
+}
+
+/// Returns the code point in [text] at [index], decoding a UTF-16 surrogate
+/// pair into a single code point when a well-formed pair starts at [index].
+///
+/// Returns null when [index] is outside of [text].
+///
+/// A surrogate that does not start a well-formed pair (a high surrogate at the
+/// end of the text or not followed by a low surrogate, or a low surrogate at
+/// [index]) is returned unchanged as a single code unit. Combining such units
+/// would fabricate a code point above 0xFFFF and make callers that advance by
+/// two code units for supplementary code points skip a real character.
+int? getCodePoint(String text, int index) {
+  if (index < 0 || index >= text.length) {
+    return null;
+  }
+
+  final int char = text.codeUnitAt(index);
+  // Only a high (lead) surrogate, 0xD800-0xDBFF, can start a pair.
+  final bool isHighSurrogate = (char & 0xFC00) == 0xD800;
+  if (isHighSurrogate && index + 1 < text.length) {
+    final int next = text.codeUnitAt(index + 1);
+    // The pair is well formed only if a low (trail) surrogate,
+    // 0xDC00-0xDFFF, follows.
+    if ((next & 0xFC00) == 0xDC00) {
+      return combineSurrogatePair(text, index);
+    }
+  }
+  return char;
+}
+
 /// Given a list of [UnicodeRange]s, this class performs efficient lookup
 /// to find which range a value falls into.
 ///
@ -88,11 +126,9 @@ class UnicodePropertyLookup<P> {
  /// located at that [index].
  ///
  /// If the [index] is out of range, null will be returned.
-  P? find(String? text, int index) {
-    if (index < 0 || index >= text!.length) {
-      return null;
-    }
-    return findForChar(text.codeUnitAt(index));
+  P? find(String text, int index) {
+    final int? codePoint = getCodePoint(text, index);
+    return codePoint == null ? null : findForChar(codePoint);
  }

  /// Takes one character as an integer code unit and returns its property.
--- a/engine/src/flutter/lib/web_ui/test/text/line_breaker_test.dart
+++ b/engine/src/flutter/lib/web_ui/test/text/line_breaker_test.dart
@ -168,8 +168,16 @@ void main() {
        final String text = testCase.toText();

        int lastLineBreak = 0;
-        for (int i = 0; i < testCase.signs.length; i++) {
-          final Sign sign = testCase.signs[i];
+        int surrogateCount = 0;
+        // `s` is the index in the `testCase.signs` list.
+        for (int s = 0; s < testCase.signs.length; s++) {
+          // `i` is the index in the `text`.
+          final int i = s + surrogateCount;
+          if (s < testCase.chars.length && testCase.chars[s].isSurrogatePair) {
+            surrogateCount++;
+          }
+
+          final Sign sign = testCase.signs[s];
          final LineBreakResult result = nextLineBreak(text, lastLineBreak);
          if (sign.isBreakOpportunity) {
            // The line break should've been found at index `i`.
--- a/engine/src/flutter/lib/web_ui/test/text/line_breaker_test_data.dart
+++ b/engine/src/flutter/lib/web_ui/test/text/line_breaker_test_data.dart