[web] Use v8BreakIterator where possible (flutter/engine#37317)

* [web] Use v8BreakIterator where possible

* address review comments
This commit is contained in:
Mouad Debbar 2022-11-08 13:24:07 -05:00 committed by GitHub
parent c3260418f6
commit 45ad44de85
4 changed files with 269 additions and 46 deletions

View File

@ -66,6 +66,9 @@ extension DomWindowExtension on DomWindow {
/// The Trusted Types API (when available).
/// See: https://developer.mozilla.org/en-US/docs/Web/API/Trusted_Types_API
external DomTrustedTypePolicyFactory? get trustedTypes;
// ignore: non_constant_identifier_names
external DomIntl get Intl;
}
typedef DomRequestAnimationFrameCallback = void Function(num highResTime);
@ -1659,3 +1662,42 @@ class _DomListWrapper<T> extends Iterable<T> {
/// `toList` on the `Iterable`.
Iterable<T> createDomListWrapper<T>(_DomList list) =>
_DomListWrapper<T>._(list).cast<T>();
/// Static-interop type for the object returned by the `Intl` getter on
/// [DomWindow].
@JS()
@staticInterop
class DomIntl {}
/// Members available on [DomIntl].
extension DomIntlExtension on DomIntl {
/// This is a V8-only API for segmenting text.
///
/// Typed as a nullable [Object] because the property may be absent; callers
/// in this codebase treat a `null` value as "not supported".
///
/// See: https://code.google.com/archive/p/v8-i18n/wikis/BreakIterator.wiki
external Object? get v8BreakIterator;
}
/// Static-interop type for instances constructed from `Intl.v8BreakIterator`.
@JS()
@staticInterop
class DomV8BreakIterator {}
/// JavaScript methods exposed on [DomV8BreakIterator].
///
/// NOTE(review): the semantics described below are inferred from the method
/// names and from how the line break fragmenter uses them; confirm against the
/// v8BreakIterator documentation.
extension DomV8BreakIteratorExtension on DomV8BreakIterator {
/// Hands [text] to the iterator as the text to be segmented.
external void adoptText(String text);
/// Presumably moves the iterator to the first boundary and returns its index.
external int first();
/// Advances to the next boundary and returns it. The line break fragmenter
/// stops iterating when this returns `-1`.
external int next();
/// Returns the iterator's current position. The line break fragmenter uses it
/// as the end index of the fragment being built.
external int current();
/// Returns the type of the current break as a string. The line break
/// fragmenter treats the value `'none'` as a soft break.
external String breakType();
}
/// Constructs a [DomV8BreakIterator] configured with `{'type': 'line'}`.
///
/// Throws an [UnimplementedError] when `Intl.v8BreakIterator` is `null`.
DomV8BreakIterator createV8BreakIterator() {
  final Object? constructor = domWindow.Intl.v8BreakIterator;
  if (constructor == null) {
    throw UnimplementedError('v8BreakIterator is not supported.');
  }

  // Reading the `undefined` property off the window is how the JavaScript
  // `undefined` value is obtained here. It is passed as the first constructor
  // argument (presumably the locales argument - confirm against the
  // v8BreakIterator documentation).
  final Object? firstArgument = js_util.getProperty(domWindow, 'undefined');
  final Object? options = js_util.jsify(const <String, String>{'type': 'line'});

  return js_util.callConstructor<DomV8BreakIterator>(
    constructor,
    <Object?>[firstArgument, options],
  );
}

View File

@ -2,10 +2,25 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import '../dom.dart';
import 'fragmenter.dart';
import 'line_break_properties.dart';
import 'unicode_range.dart';
/// Code units that [V8LineBreakFragmenter] counts as trailing newlines.
///
/// Each of these is also counted as a trailing space by that fragmenter.
/// The two-letter tags next to each entry are presumably Unicode line
/// breaking classes (UAX #14) - confirm against `line_break_properties.dart`.
const Set<int> _kNewlines = <int>{
0x000A, // LF
0x000B, // BK
0x000C, // BK
0x000D, // CR
0x0085, // NL
0x2028, // BK
0x2029, // BK
};
/// Code units that [V8LineBreakFragmenter] counts as trailing spaces (but not
/// as trailing newlines).
const Set<int> _kSpaces = <int>{
0x0020, // SP
0x200B, // ZW
};
/// Various types of line breaks as defined by the Unicode spec.
enum LineBreakType {
/// Indicates that a line break is possible but not mandatory.
@ -25,8 +40,21 @@ enum LineBreakType {
}
/// Splits [text] into fragments based on line breaks.
class LineBreakFragmenter extends TextFragmenter {
const LineBreakFragmenter(super.text);
abstract class LineBreakFragmenter extends TextFragmenter {
  /// Picks the implementation to use for [text].
  ///
  /// Returns a [V8LineBreakFragmenter] when `Intl.v8BreakIterator` is
  /// non-null, and a [FWLineBreakFragmenter] otherwise.
  factory LineBreakFragmenter(String text) {
    final bool hasV8BreakIterator = domWindow.Intl.v8BreakIterator != null;
    return hasV8BreakIterator
        ? V8LineBreakFragmenter(text)
        : FWLineBreakFragmenter(text);
  }

  /// Narrows the return type to a list of [LineBreakFragment]s.
  @override
  List<LineBreakFragment> fragment();
}
/// Flutter web's custom implementation of [LineBreakFragmenter].
class FWLineBreakFragmenter extends TextFragmenter implements LineBreakFragmenter {
FWLineBreakFragmenter(super.text);
@override
List<LineBreakFragment> fragment() {
@ -34,6 +62,85 @@ class LineBreakFragmenter extends TextFragmenter {
}
}
/// An implementation of [LineBreakFragmenter] that uses V8's
/// `v8BreakIterator` API to find line breaks in the given [text].
class V8LineBreakFragmenter extends TextFragmenter implements LineBreakFragmenter {
  V8LineBreakFragmenter(super.text)
      : assert(domWindow.Intl.v8BreakIterator != null);

  /// Converts the boundaries reported by a new `v8BreakIterator` into
  /// [LineBreakFragment]s.
  ///
  /// Every reported segment is additionally split right after any run of
  /// spaces/newlines that is followed by another character. If no fragment was
  /// produced, or the last one is a mandatory break, an empty
  /// [LineBreakType.endOfText] fragment positioned at `text.length` is
  /// appended.
  @override
  List<LineBreakFragment> fragment() {
    final List<LineBreakFragment> fragments = <LineBreakFragment>[];

    final DomV8BreakIterator iterator = createV8BreakIterator();
    iterator.adoptText(text);
    iterator.first();

    int start = 0;
    // `next()` returning -1 ends the iteration.
    while (iterator.next() != -1) {
      final LineBreakType breakType = _getBreakType(iterator);
      final int end = iterator.current();

      // Counters for the whitespace seen since `start`. A newline counts
      // towards both.
      int newlineCount = 0;
      int spaceCount = 0;

      for (int index = start; index < end; index++) {
        final int codeUnit = text.codeUnitAt(index);

        if (_kNewlines.contains(codeUnit)) {
          newlineCount++;
          spaceCount++;
          continue;
        }
        if (_kSpaces.contains(codeUnit)) {
          spaceCount++;
          continue;
        }
        if (spaceCount == 0) {
          // No pending whitespace before this character; keep scanning.
          continue;
        }

        // A non-space character right after a run of spaces: close the
        // fragment here as a break opportunity and start a new one.
        fragments.add(LineBreakFragment(
          start,
          index,
          LineBreakType.opportunity,
          trailingNewlines: newlineCount,
          trailingSpaces: spaceCount,
        ));
        start = index;
        newlineCount = 0;
        spaceCount = 0;
      }

      fragments.add(LineBreakFragment(
        start,
        end,
        breakType,
        trailingNewlines: newlineCount,
        trailingSpaces: spaceCount,
      ));
      start = end;
    }

    final bool needsEndOfText =
        fragments.isEmpty || fragments.last.type == LineBreakType.mandatory;
    if (needsEndOfText) {
      fragments.add(LineBreakFragment(
        text.length,
        text.length,
        LineBreakType.endOfText,
        trailingNewlines: 0,
        trailingSpaces: 0,
      ));
    }

    return fragments;
  }

  /// Maps the iterator's current break to a [LineBreakType].
  ///
  /// A `breakType()` other than `'none'` is a mandatory break. Otherwise the
  /// break is [LineBreakType.endOfText] when the iterator sits at
  /// `text.length`, and [LineBreakType.opportunity] anywhere else.
  LineBreakType _getBreakType(DomV8BreakIterator iterator) {
    final int position = iterator.current();

    // v8BreakIterator reports a soft break with the type string "none".
    final bool isSoftBreak = iterator.breakType() == 'none';
    if (!isSoftBreak) {
      return LineBreakType.mandatory;
    }

    return position == text.length
        ? LineBreakType.endOfText
        : LineBreakType.opportunity;
  }
}
class LineBreakFragment extends TextFragment {
const LineBreakFragment(super.start, super.end, this.type, {
required this.trailingNewlines,

View File

@ -17,7 +17,16 @@ void main() {
}
void testMain() {
group('$LineBreakFragmenter', () {
groupForEachFragmenter(({required bool isV8}) {
// Fragments [text] with the implementation selected by `isV8` and converts
// each resulting fragment into a [Line].
List<Line> split(String text) {
  final LineBreakFragmenter fragmenter;
  if (isV8) {
    fragmenter = V8LineBreakFragmenter(text);
  } else {
    fragmenter = FWLineBreakFragmenter(text);
  }

  return fragmenter
      .fragment()
      .map((LineBreakFragment fragment) => Line.fromLineBreakFragment(text, fragment))
      .toList();
}
test('empty string', () {
expect(split(''), <Line>[
Line('', endOfText),
@ -316,13 +325,15 @@ void testMain() {
});
test('comprehensive test', () {
final List<TestCase> testCollection =
parseRawTestData(rawLineBreakTestData);
final List<TestCase> testCollection = parseRawTestData(rawLineBreakTestData, isV8: isV8);
for (int t = 0; t < testCollection.length; t++) {
final TestCase testCase = testCollection[t];
final String text = testCase.toText();
final List<LineBreakFragment> fragments = LineBreakFragmenter(text).fragment();
final LineBreakFragmenter fragmenter = isV8
? V8LineBreakFragmenter(text)
: FWLineBreakFragmenter(text);
final List<LineBreakFragment> fragments = fragmenter.fragment();
// `f` is the index in the `fragments` list.
int f = 0;
@ -401,6 +412,23 @@ void testMain() {
});
}
/// Signature of a function that builds a [LineBreakFragmenter] for [text].
typedef CreateLineBreakFragmenter = LineBreakFragmenter Function(String text);
/// Signature of the callback given to [groupForEachFragmenter].
///
/// `isV8` is `true` when the callback runs inside the group for
/// [V8LineBreakFragmenter] and `false` inside the group for
/// [FWLineBreakFragmenter].
typedef GroupBody = void Function({required bool isV8});
/// Runs [callback] once per fragmenter implementation, each inside its own
/// test group.
///
/// The [FWLineBreakFragmenter] group is always declared. The
/// [V8LineBreakFragmenter] group is only declared when `Intl.v8BreakIterator`
/// is non-null.
void groupForEachFragmenter(GroupBody callback) {
  group('$FWLineBreakFragmenter', () {
    callback(isV8: false);
  });

  final bool hasV8BreakIterator = domWindow.Intl.v8BreakIterator != null;
  if (!hasV8BreakIterator) {
    return;
  }

  group('$V8LineBreakFragmenter', () {
    callback(isV8: true);
  });
}
/// Holds information about how a line was split from a string.
class Line {
Line(this.text, this.breakType, {this.nl = 0, this.sp = 0});
@ -447,10 +475,3 @@ class Line {
return '"$escapedText" ($breakType, nl: $nl, sp: $sp)';
}
}
/// Fragments [text] with [LineBreakFragmenter] and converts each resulting
/// fragment into a [Line].
List<Line> split(String text) {
  final List<LineBreakFragment> fragments = LineBreakFragmenter(text).fragment();
  return fragments
      .map((LineBreakFragment fragment) => Line.fromLineBreakFragment(text, fragment))
      .toList();
}

View File

@ -3,11 +3,11 @@
// found in the LICENSE file.
/// Parses raw test data into a list of [TestCase] objects.
List<TestCase> parseRawTestData(String rawTestData) {
List<TestCase> parseRawTestData(String rawTestData, {required bool isV8}) {
return rawTestData
.split('\n')
.where(isValidTestCase)
.map(_checkReplacement)
.map((String line) => _checkReplacement(line, isV8: isV8))
.map(_parse)
.toList();
}
@ -16,7 +16,7 @@ bool isValidTestCase(String line) {
return line.startsWith('×');
}
String _checkReplacement(String line) {
String _checkReplacement(String line, {required bool isV8}) {
String replacement = line;
// Special cases for rules LB8, LB11, LB13, LB14, LB15, LB16, LB17 to allow
@ -28,38 +28,91 @@ String _checkReplacement(String line) {
.replaceAllMapped(spacesRegex, (Match m) => 'SPACE (SP) ÷ [${m.group(1)}.');
}
// Some test cases contradict rule LB25, so we are fixing them with the few
// regexes below.
if (!isV8) {
// Some test cases contradict rule LB25, so we are fixing them with the few
// regexes below.
final RegExp lb25Regex1 = RegExp(r'\((CP_CP30|CL)\)(.*?) ÷ \[999\.0\] (PERCENT|DOLLAR)');
if (replacement.contains(lb25Regex1)) {
replacement = replacement
.replaceAll(' ÷ 0024', ' × 0024') // DOLLAR SIGN (PR)
.replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
.replaceAllMapped(
lb25Regex1,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
final RegExp lb25Regex1 = RegExp(r'\((CP_CP30|CL)\)(.*?) ÷ \[999\.0\] (PERCENT|DOLLAR)');
if (replacement.contains(lb25Regex1)) {
replacement = replacement
.replaceAll(' ÷ 0024', ' × 0024') // DOLLAR SIGN (PR)
.replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
.replaceAllMapped(
lb25Regex1,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
}
final RegExp lb25Regex2 = RegExp(r'\((IS|SY)\)(.*?) ÷ \[999\.0\] (DIGIT)');
if (replacement.contains(lb25Regex2)) {
replacement = replacement
.replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
.replaceAllMapped(
lb25Regex2,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
}
final RegExp lb25Regex3 = RegExp(r'\((PR|PO)\)(.*?) ÷ \[999\.0\] (LEFT)');
if (replacement.contains(lb25Regex3)) {
replacement = replacement
.replaceAll(' ÷ 0028', ' × 0028') // LEFT PARENTHESIS (OP_OP30)
.replaceAll(' ÷ 007B', ' × 007B') // LEFT CURLY BRACKET (OP_OP30)
.replaceAll(' ÷ 2329', ' × 2329') // LEFT-POINTING ANGLE BRACKET (OP)
.replaceAllMapped(
lb25Regex3,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
}
}
final RegExp lb25Regex2 = RegExp(r'\((IS|SY)\)(.*?) ÷ \[999\.0\] (DIGIT)');
if (replacement.contains(lb25Regex2)) {
replacement = replacement
.replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
.replaceAllMapped(
lb25Regex2,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
}
final RegExp lb25Regex3 = RegExp(r'\((PR|PO)\)(.*?) ÷ \[999\.0\] (LEFT)');
if (replacement.contains(lb25Regex3)) {
replacement = replacement
.replaceAll(' ÷ 0028', ' × 0028') // LEFT PARENTHESIS (OP_OP30)
.replaceAll(' ÷ 007B', ' × 007B') // LEFT CURLY BRACKET (OP_OP30)
.replaceAll(' ÷ 2329', ' × 2329') // LEFT-POINTING ANGLE BRACKET (OP)
.replaceAllMapped(
lb25Regex3,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
if (isV8) {
// v8BreakIterator deviates from the spec around Hiragana and Katakana
// letters.
final RegExp hiragana21Regex = RegExp(r' × \[21\.03\] (HIRAGANA LETTER|KATAKANA LETTER|KATAKANA-HIRAGANA)');
if (replacement.contains(hiragana21Regex) && !replacement.contains('(BB)') && !replacement.contains('(PR)')) {
replacement = replacement
.replaceAll(' × 3041', ' ÷ 3041') // HIRAGANA LETTER (CJ)
.replaceAll(' × 30E5', ' ÷ 30E5') // KATAKANA LETTER (CJ)
.replaceAll(' × 30FC', ' ÷ 30FC') // KATAKANA-HIRAGANA PROLONGED SOUND MARK (CJ)
.replaceAllMapped(
hiragana21Regex,
(Match m) => ' ÷ [21.03] ${m.group(1)}',
);
}
if (replacement.contains(' × [16.0] HIRAGANA LETTER')) {
replacement = replacement
.replaceAll(' × 3041', ' ÷ 3041') // HIRAGANA LETTER (CJ)
.replaceAll(
' × [16.0] HIRAGANA LETTER',
' ÷ [16.0] HIRAGANA LETTER',
);
}
final RegExp hiraganaPercentRegex = RegExp(r'HIRAGANA .*? ÷ \[999\.0\] PERCENT');
if (replacement.contains(hiraganaPercentRegex)) {
replacement = replacement
.replaceAll(' ÷ 0025', ' × 0025') // PERCENT SIGN (PO)
.replaceAll(
' ÷ [999.0] PERCENT',
' × [999.0] PERCENT',
);
}
// v8BreakIterator also deviates from the spec around hyphens, commas and
// full stops.
final RegExp hyphenRegex = RegExp(r'\((HY|IS)\)(.*?) ÷ \[999\.0\] (DIGIT|NUMBER|SECTION|THAI|<reserved-50005>)');
if (replacement.contains(hyphenRegex)) {
replacement = replacement
.replaceAll(' ÷ 0030', ' × 0030') // DIGIT ZERO (NU)
.replaceAll(' ÷ 0023', ' × 0023') // NUMBER SIGN (AL)
.replaceAll(' ÷ 00A7', ' × 00A7') // SECTION SIGN (AI_AL)
.replaceAll(' ÷ 0E01', ' × 0E01') // THAI CHARACTER KO KAI (SA_AL)
.replaceAll(' ÷ 50005', ' × 50005') // <reserved-50005> (XX_AL)
.replaceAllMapped(
hyphenRegex,
(Match m) => '(${m.group(1)})${m.group(2)} × [999.0] ${m.group(3)}',
);
}
}
return replacement;