[web] Use v8BreakIterator where possible (flutter/engine#37317)

* [web] Use v8BreakIterator where possible
* Address review comments
2026-02-20 02:29:02 +08:00 · 2022-11-08 13:24:07 -05:00 · 2022-11-08 13:24:07 -05:00 · 45ad44de85
commit 45ad44de85
parent c3260418f6
4 changed files with 269 additions and 46 deletions
--- a/engine/src/flutter/lib/web_ui/lib/src/engine/dom.dart
+++ b/engine/src/flutter/lib/web_ui/lib/src/engine/dom.dart
@ -66,6 +66,9 @@ extension DomWindowExtension on DomWindow {
  /// The Trusted Types API (when available).
  /// See: https://developer.mozilla.org/en-US/docs/Web/API/Trusted_Types_API
  external DomTrustedTypePolicyFactory? get trustedTypes;
+
+  // ignore: non_constant_identifier_names
+  external DomIntl get Intl;
 }

 typedef DomRequestAnimationFrameCallback = void Function(num highResTime);
@ -1659,3 +1662,42 @@ class _DomListWrapper<T> extends Iterable<T> {
 /// `toList` on the `Iterable`.
 Iterable<T> createDomListWrapper<T>(_DomList list) =>
    _DomListWrapper<T>._(list).cast<T>();
+
+/// Static-interop type for the object exposed by [DomWindowExtension.Intl].
+///
+/// The class body is empty; its members are declared on [DomIntlExtension].
+@JS()
+@staticInterop
+class DomIntl {}
+
+/// Members available on a [DomIntl] object.
+extension DomIntlExtension on DomIntl {
+  /// This is a V8-only API for segmenting text.
+  ///
+  /// The value is typed as `Object?`; callers such as [createV8BreakIterator]
+  /// null-check it and treat `null` as "not supported".
+  ///
+  /// See: https://code.google.com/archive/p/v8-i18n/wikis/BreakIterator.wiki
+  external Object? get v8BreakIterator;
+}
+
+
+/// Static-interop type for the iterator returned by [createV8BreakIterator].
+///
+/// The class body is empty; its methods are declared on
+/// [DomV8BreakIteratorExtension].
+@JS()
+@staticInterop
+class DomV8BreakIterator {}
+
+/// Methods callable on a [DomV8BreakIterator].
+///
+/// NOTE(review): the member descriptions below are inferred from how
+/// `V8LineBreakFragmenter` calls these methods; confirm them against the
+/// v8BreakIterator documentation.
+extension DomV8BreakIteratorExtension on DomV8BreakIterator {
+  /// Hands [text] to the iterator; called once before iteration begins.
+  external void adoptText(String text);
+  /// Called once after [adoptText] and before the first call to [next];
+  /// presumably positions the iterator at the start of the text.
+  external int first();
+  /// Advances to the following break. Callers stop iterating once this
+  /// returns `-1`.
+  external int next();
+  /// The index of the current break, used by callers as the exclusive end
+  /// offset of a fragment within the adopted text.
+  external int current();
+  /// The kind of the current break. Callers compare it against the string
+  /// `'none'` and treat every other value as a mandatory break.
+  external String breakType();
+}
+
+/// Builds a new [DomV8BreakIterator] configured with `{'type': 'line'}`.
+///
+/// The constructor is read from `domWindow.Intl.v8BreakIterator`.
+///
+/// Throws an [UnimplementedError] when that property is `null`.
+DomV8BreakIterator createV8BreakIterator() {
+  final Object? constructor = domWindow.Intl.v8BreakIterator;
+  if (constructor != null) {
+    // Reads the `undefined` property off the window object; presumably this is
+    // a way to pass JavaScript's `undefined` as the first constructor argument.
+    final Object? firstArgument = js_util.getProperty(domWindow, 'undefined');
+    final Object? options =
+        js_util.jsify(const <String, String>{'type': 'line'});
+
+    return js_util.callConstructor<DomV8BreakIterator>(
+      constructor,
+      <Object?>[firstArgument, options],
+    );
+  }
+
+  throw UnimplementedError('v8BreakIterator is not supported.');
+}
--- a/engine/src/flutter/lib/web_ui/lib/src/engine/text/line_breaker.dart
+++ b/engine/src/flutter/lib/web_ui/lib/src/engine/text/line_breaker.dart
@ -2,10 +2,25 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

+import '../dom.dart';
 import 'fragmenter.dart';
 import 'line_break_properties.dart';
 import 'unicode_range.dart';

+/// Code units that `V8LineBreakFragmenter` counts as both a trailing newline
+/// and a trailing space.
+///
+/// The trailing comments appear to name each character's Unicode line break
+/// class (see UAX #14).
+const Set<int> _kNewlines = <int>{
+  0x000A, // LF
+  0x000B, // BK
+  0x000C, // BK
+  0x000D, // CR
+  0x0085, // NL
+  0x2028, // BK
+  0x2029, // BK
+};
+/// Code units that `V8LineBreakFragmenter` counts as a trailing space but not
+/// as a trailing newline.
+const Set<int> _kSpaces = <int>{
+  0x0020, // SP
+  0x200B, // ZW
+};
+
 /// Various types of line breaks as defined by the Unicode spec.
 enum LineBreakType {
  /// Indicates that a line break is possible but not mandatory.
@ -25,8 +40,21 @@ enum LineBreakType {
 }

 /// Splits [text] into fragments based on line breaks.
-class LineBreakFragmenter extends TextFragmenter {
-  const LineBreakFragmenter(super.text);
+abstract class LineBreakFragmenter extends TextFragmenter {
+  /// Picks the implementation to use for [text].
+  ///
+  /// Returns a [V8LineBreakFragmenter] when `Intl.v8BreakIterator` is present
+  /// on the window, and an [FWLineBreakFragmenter] otherwise.
+  factory LineBreakFragmenter(String text) {
+    final bool hasV8BreakIterator = domWindow.Intl.v8BreakIterator != null;
+    if (!hasV8BreakIterator) {
+      return FWLineBreakFragmenter(text);
+    }
+    return V8LineBreakFragmenter(text);
+  }
+
+  /// Returns the line-break fragments of the text.
+  @override
+  List<LineBreakFragment> fragment();
+}
+
+/// Flutter web's custom implementation of [LineBreakFragmenter].
+class FWLineBreakFragmenter extends TextFragmenter implements LineBreakFragmenter {
+  FWLineBreakFragmenter(super.text);

  @override
  List<LineBreakFragment> fragment() {
@ -34,6 +62,85 @@ class LineBreakFragmenter extends TextFragmenter {
  }
 }

+/// A [LineBreakFragmenter] that delegates line-break detection to V8's
+/// `v8BreakIterator` API.
+///
+/// Every break reported by the iterator closes a fragment. A fragment is also
+/// split wherever a run of spaces or newlines is followed by another
+/// character, so the counted whitespace sits at the end of its fragment.
+class V8LineBreakFragmenter extends TextFragmenter implements LineBreakFragmenter {
+  /// Creates a fragmenter for [text].
+  ///
+  /// Asserts that `Intl.v8BreakIterator` is available.
+  V8LineBreakFragmenter(super.text)
+      : assert(domWindow.Intl.v8BreakIterator != null);
+
+  @override
+  List<LineBreakFragment> fragment() {
+    final DomV8BreakIterator iterator = createV8BreakIterator();
+    iterator.adoptText(text);
+    iterator.first();
+
+    final List<LineBreakFragment> fragments = <LineBreakFragment>[];
+    int start = 0;
+
+    while (iterator.next() != -1) {
+      // The break type is resolved before the end offset is read, matching the
+      // order in which the iterator is queried.
+      final LineBreakType type = _getBreakType(iterator);
+      final int end = iterator.current();
+
+      int newlines = 0;
+      int spaces = 0;
+
+      // Walk the fragment, counting whitespace and splitting after each run of
+      // whitespace that is followed by a non-whitespace code unit.
+      for (int index = start; index < end; index++) {
+        final int codeUnit = text.codeUnitAt(index);
+        final bool isNewline = _kNewlines.contains(codeUnit);
+
+        if (isNewline || _kSpaces.contains(codeUnit)) {
+          // Newlines count towards both tallies; plain spaces only towards
+          // the space tally.
+          spaces++;
+          if (isNewline) {
+            newlines++;
+          }
+          continue;
+        }
+
+        if (spaces == 0) {
+          continue;
+        }
+
+        // Always break after a sequence of spaces.
+        fragments.add(LineBreakFragment(
+          start,
+          index,
+          LineBreakType.opportunity,
+          trailingNewlines: newlines,
+          trailingSpaces: spaces,
+        ));
+        start = index;
+        newlines = 0;
+        spaces = 0;
+      }
+
+      fragments.add(LineBreakFragment(
+        start,
+        end,
+        type,
+        trailingNewlines: newlines,
+        trailingSpaces: spaces,
+      ));
+      start = end;
+    }
+
+    // An empty end-of-text fragment is appended when nothing was produced or
+    // when the last fragment ends with a mandatory break.
+    final bool needsEndOfText =
+        fragments.isEmpty || fragments.last.type == LineBreakType.mandatory;
+    if (needsEndOfText) {
+      final int length = text.length;
+      fragments.add(LineBreakFragment(
+        length,
+        length,
+        LineBreakType.endOfText,
+        trailingNewlines: 0,
+        trailingSpaces: 0,
+      ));
+    }
+
+    return fragments;
+  }
+
+  /// Maps the iterator's current break to a [LineBreakType].
+  ///
+  /// Any break type other than `'none'` is mandatory. A `'none'` break is
+  /// [LineBreakType.endOfText] when it falls at the end of the text and
+  /// [LineBreakType.opportunity] otherwise.
+  LineBreakType _getBreakType(DomV8BreakIterator iterator) {
+    final int position = iterator.current();
+
+    // v8BreakIterator reports soft breaks with the type "none"; the reason for
+    // that naming is not known.
+    final bool isSoftBreak = iterator.breakType() == 'none';
+    if (!isSoftBreak) {
+      return LineBreakType.mandatory;
+    }
+    return position == text.length
+        ? LineBreakType.endOfText
+        : LineBreakType.opportunity;
+  }
+}
+
 class LineBreakFragment extends TextFragment {
  const LineBreakFragment(super.start, super.end, this.type, {
    required this.trailingNewlines,
--- a/engine/src/flutter/lib/web_ui/test/text/line_breaker_test.dart
+++ b/engine/src/flutter/lib/web_ui/test/text/line_breaker_test.dart
@ -17,7 +17,16 @@ void main() {
 }

 void testMain() {
-  group('$LineBreakFragmenter', () {
+  groupForEachFragmenter(({required bool isV8}) {
+    List<Line> split(String text) {
+      final LineBreakFragmenter fragmenter =
+          isV8 ? V8LineBreakFragmenter(text) : FWLineBreakFragmenter(text);
+      return <Line>[
+        for (final LineBreakFragment fragment in fragmenter.fragment())
+          Line.fromLineBreakFragment(text, fragment)
+      ];
+    }
+
    test('empty string', () {
      expect(split(''), <Line>[
        Line('', endOfText),
@ -316,13 +325,15 @@ void testMain() {
    });

    test('comprehensive test', () {
-      final List<TestCase> testCollection =
-          parseRawTestData(rawLineBreakTestData);
+      final List<TestCase> testCollection = parseRawTestData(rawLineBreakTestData, isV8: isV8);
      for (int t = 0; t < testCollection.length; t++) {
        final TestCase testCase = testCollection[t];

        final String text = testCase.toText();
-        final List<LineBreakFragment> fragments = LineBreakFragmenter(text).fragment();
+        final LineBreakFragmenter fragmenter = isV8
+            ? V8LineBreakFragmenter(text)
+            : FWLineBreakFragmenter(text);
+        final List<LineBreakFragment> fragments = fragmenter.fragment();

        // `f` is the index in the `fragments` list.
        int f = 0;
@ -401,6 +412,23 @@ void testMain() {
  });
 }

+typedef CreateLineBreakFragmenter = LineBreakFragmenter Function(String text);
+typedef GroupBody = void Function({required bool isV8});
+
+/// Registers the tests declared by [callback] once per fragmenter
+/// implementation.
+///
+/// The [FWLineBreakFragmenter] group is always registered. The
+/// [V8LineBreakFragmenter] group is registered only when
+/// `Intl.v8BreakIterator` is present on the window.
+void groupForEachFragmenter(GroupBody callback) {
+  group('$FWLineBreakFragmenter', () {
+    callback(isV8: false);
+  });
+
+  final bool hasV8BreakIterator = domWindow.Intl.v8BreakIterator != null;
+  if (!hasV8BreakIterator) {
+    return;
+  }
+
+  group('$V8LineBreakFragmenter', () {
+    callback(isV8: true);
+  });
+}
+
 /// Holds information about how a line was split from a string.
 class Line {
  Line(this.text, this.breakType, {this.nl = 0, this.sp = 0});
@ -447,10 +475,3 @@ class Line {
    return '"$escapedText" ($breakType, nl: $nl, sp: $sp)';
  }
 }
-
-List<Line> split(String text) {
-  return <Line>[
-    for (final LineBreakFragment fragment in LineBreakFragmenter(text).fragment())
-      Line.fromLineBreakFragment(text, fragment)
-  ];
-}
--- a/engine/src/flutter/lib/web_ui/test/text/line_breaker_test_helper.dart
+++ b/engine/src/flutter/lib/web_ui/test/text/line_breaker_test_helper.dart
@ -3,11 +3,11 @@
 // found in the LICENSE file.

 /// Parses raw test data into a list of [TestCase] objects.
-List<TestCase> parseRawTestData(String rawTestData) {
+List<TestCase> parseRawTestData(String rawTestData, {required bool isV8}) {
  return rawTestData
      .split('\n')
      .where(isValidTestCase)
-      .map(_checkReplacement)
+      .map((String line) => _checkReplacement(line, isV8: isV8))
      .map(_parse)
      .toList();
 }
@ -16,7 +16,7 @@ bool isValidTestCase(String line) {
  return line.startsWith('×');
 }

-String _checkReplacement(String line) {
+String _checkReplacement(String line, {required bool isV8}) {
  String replacement = line;

  // Special cases for rules LB8, LB11, LB13, LB14, LB15, LB16, LB17 to allow
@ -28,38 +28,91 @@ String _checkReplacement(String line) {
        .replaceAllMapped(spacesRegex, (Match m) => 'SPACE (SP) ÷ [${m.group(1)}.');
  }

-  // Some test cases contradict rule LB25, so we are fixing them with the few
-  // regexes below.
+  if (!isV8) {
+    // Some test cases contradict rule LB25, so we are fixing them with the few
+    // regexes below.

-  final RegExp lb25Regex1 = RegExp(r'\((CP_CP30|CL)\)(.*?) ÷ \[999\.0\] (PERCENT|DOLLAR)');
-  if (replacement.contains(lb25Regex1)) {
-    replacement = replacement
-        .replaceAll(' ÷ 0024', ' × 0024') // DOLLAR SIGN (PR)
-        .replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
-        .replaceAllMapped(
-          lb25Regex1,
-          (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
-        );
+    final RegExp lb25Regex1 = RegExp(r'\((CP_CP30|CL)\)(.*?) ÷ \[999\.0\] (PERCENT|DOLLAR)');
+    if (replacement.contains(lb25Regex1)) {
+      replacement = replacement
+          .replaceAll(' ÷ 0024', ' × 0024') // DOLLAR SIGN (PR)
+          .replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
+          .replaceAllMapped(
+            lb25Regex1,
+            (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
+          );
+    }
+    final RegExp lb25Regex2 = RegExp(r'\((IS|SY)\)(.*?) ÷ \[999\.0\] (DIGIT)');
+    if (replacement.contains(lb25Regex2)) {
+      replacement = replacement
+          .replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
+          .replaceAllMapped(
+            lb25Regex2,
+            (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
+          );
+    }
+    final RegExp lb25Regex3 = RegExp(r'\((PR|PO)\)(.*?) ÷ \[999\.0\] (LEFT)');
+    if (replacement.contains(lb25Regex3)) {
+      replacement = replacement
+          .replaceAll(' ÷ 0028', ' × 0028') // LEFT PARENTHESIS (OP_OP30)
+          .replaceAll(' ÷ 007B', ' × 007B') // LEFT CURLY BRACKET (OP_OP30)
+          .replaceAll(' ÷ 2329', ' × 2329') // LEFT-POINTING ANGLE BRACKET (OP)
+          .replaceAllMapped(
+            lb25Regex3,
+            (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
+          );
+    }
  }
-  final RegExp lb25Regex2 = RegExp(r'\((IS|SY)\)(.*?) ÷ \[999\.0\] (DIGIT)');
-  if (replacement.contains(lb25Regex2)) {
-    replacement = replacement
-        .replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
-        .replaceAllMapped(
-          lb25Regex2,
-          (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
-        );
-  }
-  final RegExp lb25Regex3 = RegExp(r'\((PR|PO)\)(.*?) ÷ \[999\.0\] (LEFT)');
-  if (replacement.contains(lb25Regex3)) {
-    replacement = replacement
-        .replaceAll(' ÷ 0028', ' × 0028') // LEFT PARENTHESIS (OP_OP30)
-        .replaceAll(' ÷ 007B', ' × 007B') // LEFT CURLY BRACKET (OP_OP30)
-        .replaceAll(' ÷ 2329', ' × 2329') // LEFT-POINTING ANGLE BRACKET (OP)
-        .replaceAllMapped(
-          lb25Regex3,
-          (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
-        );
+
+  if (isV8) {
+    // v8BreakIterator deviates from the spec around Hiragana and Katakana
+    // letters.
+
+    final RegExp hiragana21Regex = RegExp(r' × \[21\.03\] (HIRAGANA LETTER|KATAKANA LETTER|KATAKANA-HIRAGANA)');
+    if (replacement.contains(hiragana21Regex) && !replacement.contains('(BB)') && !replacement.contains('(PR)')) {
+      replacement = replacement
+          .replaceAll(' × 3041', ' ÷ 3041') // HIRAGANA LETTER (CJ)
+          .replaceAll(' × 30E5', ' ÷ 30E5') // KATAKANA LETTER (CJ)
+          .replaceAll(' × 30FC', ' ÷ 30FC') // KATAKANA-HIRAGANA PROLONGED SOUND MARK (CJ)
+          .replaceAllMapped(
+            hiragana21Regex,
+            (Match m) => ' ÷ [21.03] ${m.group(1)}',
+          );
+    }
+    if (replacement.contains(' × [16.0] HIRAGANA LETTER')) {
+      replacement = replacement
+          .replaceAll(' × 3041', ' ÷ 3041') // HIRAGANA LETTER (CJ)
+          .replaceAll(
+            ' × [16.0] HIRAGANA LETTER',
+            ' ÷ [16.0] HIRAGANA LETTER',
+          );
+    }
+    final RegExp hiraganaPercentRegex = RegExp(r'HIRAGANA .*? ÷ \[999\.0\] PERCENT');
+    if (replacement.contains(hiraganaPercentRegex)) {
+      replacement = replacement
+          .replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
+          .replaceAll(
+            ' ÷ [999.0] PERCENT',
+            ' × [999.0] PERCENT',
+          );
+    }
+
+    // v8BreakIterator also deviates from the spec around hyphens, commas and
+    // full stops.
+
+    final RegExp hyphenRegex = RegExp(r'\((HY|IS)\)(.*?) ÷ \[999\.0\] (DIGIT|NUMBER|SECTION|THAI|<reserved-50005>)');
+    if (replacement.contains(hyphenRegex)) {
+      replacement = replacement
+          .replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
+          .replaceAll(' ÷ 0023', ' × 0023') // NUMBER SIGN (AL)
+          .replaceAll(' ÷ 00A7', ' × 00A7') // SECTION SIGN (AI_AL)
+          .replaceAll(' ÷ 0E01', ' × 0E01') // THAI CHARACTER KO KAI (SA_AL)
+          .replaceAll(' ÷ 50005', ' × 50005') // <reserved-50005> (XX_AL)
+          .replaceAllMapped(
+            hyphenRegex,
+            (Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
+          );
+    }
  }

  return replacement;