[web][3/3] Handle surrogate during line break detection (flutter/engine#19745)

This commit is contained in:
Mouad Debbar 2020-07-15 12:29:52 -07:00 committed by GitHub
parent 80963c291d
commit 6f800ecb40
5 changed files with 1108 additions and 18 deletions

View File

@ -28,7 +28,12 @@ class LineBreakResult {
}
/// Normalizes properties that behave the same way into one common property.
LineCharProperty? _normalizeLineProperty(LineCharProperty? prop) {
LineCharProperty _normalizeLineProperty(int? codePoint) {
if (codePoint == null) {
return LineCharProperty.AL;
}
final LineCharProperty? prop = lineLookup.findForChar(codePoint);
// NL behaves exactly the same as BK.
// See: https://www.unicode.org/reports/tr14/tr14-45.html#NL
if (prop == LineCharProperty.NL) {
@ -37,7 +42,8 @@ LineCharProperty? _normalizeLineProperty(LineCharProperty? prop) {
// In the absence of extra data (ICU data and language dictionaries), the
// following properties will be treated as AL (alphabetic): AI, SA, SG and XX.
// See LB1: https://www.unicode.org/reports/tr14/tr14-45.html#LB1
if (prop == LineCharProperty.AI ||
if (prop == null ||
prop == LineCharProperty.AI ||
prop == LineCharProperty.SA ||
prop == LineCharProperty.SG ||
prop == LineCharProperty.XX) {
@ -91,7 +97,8 @@ bool _hasEastAsianWidthFWH(int charCode) {
/// * https://www.unicode.org/reports/tr14/tr14-45.html#Algorithm
/// * https://www.unicode.org/Public/11.0.0/ucd/LineBreak.txt
LineBreakResult nextLineBreak(String text, int index) {
LineCharProperty? curr = _normalizeLineProperty(lineLookup.find(text, index));
int? codePoint = getCodePoint(text, index);
LineCharProperty curr = _normalizeLineProperty(codePoint);
LineCharProperty? prev1;
@ -117,9 +124,23 @@ LineBreakResult nextLineBreak(String text, int index) {
curr = LineCharProperty.AL;
}
int regionalIndicatorCount = 0;
// Always break at the end of text.
// LB3: ! eot
while (index < text.length) {
// Keep count of the RI (regional indicator) sequence.
if (curr == LineCharProperty.RI) {
regionalIndicatorCount++;
} else {
regionalIndicatorCount = 0;
}
if (codePoint != null && codePoint > 0xFFFF) {
// Advance `index` one extra step when handling a surrogate pair in the
// string.
index++;
}
index++;
prev2 = prev1;
prev1 = curr;
@ -131,7 +152,9 @@ LineBreakResult nextLineBreak(String text, int index) {
baseOfSpaceSequence = null;
}
curr = _normalizeLineProperty(lineLookup.find(text, index));
codePoint = getCodePoint(text, index);
curr = _normalizeLineProperty(codePoint);
isCurrZWJ = curr == LineCharProperty.ZWJ;
// Always break after hard line breaks.
@ -208,6 +231,10 @@ LineBreakResult nextLineBreak(String text, int index) {
// LB10: Treat any remaining combining mark or ZWJ as AL.
curr = LineCharProperty.AL;
} else {
if (prev1 == LineCharProperty.RI) {
// Prevent the previous RI from being double-counted.
regionalIndicatorCount--;
}
// Preserve the property of the previous character to treat the sequence
// as if it were X.
curr = prev1;
@ -466,6 +493,24 @@ LineBreakResult nextLineBreak(String text, int index) {
continue;
}
// Break between two regional indicator symbols if and only if there are an
// even number of regional indicators preceding the position of the break.
// LB30a: sot (RI RI)* RI × RI
// [^RI] (RI RI)* RI × RI
if (curr == LineCharProperty.RI) {
if (regionalIndicatorCount.isOdd) {
continue;
} else {
return LineBreakResult(index, LineBreakType.opportunity);
}
}
// Do not break between an emoji base and an emoji modifier.
// LB30b: EB × EM
if (prev1 == LineCharProperty.EB && curr == LineCharProperty.EM) {
continue;
}
// Break everywhere else.
// LB31: ALL ÷
// ÷ ALL

View File

@ -16,8 +16,7 @@ const double _baselineRatioHack = 1.1662499904632568;
typedef CharPredicate = bool Function(int char);
bool _whitespacePredicate(int char) {
final LineCharProperty? prop =
_normalizeLineProperty(lineLookup.findForChar(char));
final LineCharProperty? prop = _normalizeLineProperty(char);
return prop == LineCharProperty.SP ||
prop == LineCharProperty.BK ||
prop == LineCharProperty.LF ||
@ -25,8 +24,7 @@ bool _whitespacePredicate(int char) {
}
bool _newlinePredicate(int char) {
final LineCharProperty? prop =
_normalizeLineProperty(lineLookup.findForChar(char));
final LineCharProperty? prop = _normalizeLineProperty(char);
return prop == LineCharProperty.BK || prop == LineCharProperty.LF || prop == LineCharProperty.CR;
}

View File

@ -57,6 +57,44 @@ class UnicodeRange<P> {
}
}
/// Whether [char] is a UTF-16 surrogate code unit (lead or trail).
///
/// Surrogates occupy the range 0xD800-0xDFFF, i.e. every 16-bit value whose
/// top five bits are `11011`. The check is done with a mask, so only bits
/// 11 through 15 of [char] are inspected.
///
/// See:
/// - http://www.unicode.org/faq/utf_bom.html#utf16-2
bool isUtf16Surrogate(int char) {
  // Selects the five high-order bits shared by all surrogate code units.
  const int surrogateTagMask = 0xF800;
  // Value of those five bits (`11011`) for any surrogate.
  const int surrogateTag = 0xD800;

  final int tag = char & surrogateTagMask;
  return tag == surrogateTag;
}
/// Combines a pair of UTF-16 surrogates into a single code point.
///
/// The surrogate pair is expected to start at [index] in the [text]: the code
/// unit at [index] is used as the lead (high) surrogate and the code unit at
/// `index + 1` as the trail (low) surrogate. No validation is performed on
/// either code unit; both positions must exist in [text].
///
/// See:
/// - http://www.unicode.org/faq/utf_bom.html#utf16-3
int combineSurrogatePair(String text, int index) {
  // Number of payload bits contributed by the trail surrogate.
  const int trailBits = 10;
  const int trailMask = (1 << trailBits) - 1;
  // The low 11 bits of the lead are kept. For a real lead surrogate bit 10 is
  // always zero, so this amounts to its 10 payload bits.
  const int leadMask = (1 << 11) - 1;
  // Supplementary code points start right after the Basic Multilingual Plane.
  const int supplementaryBase = 1 << 16;

  final int lead = text.codeUnitAt(index);
  final int trail = text.codeUnitAt(index + 1);

  final int upper = (lead & leadMask) << trailBits;
  final int lower = trail & trailMask;
  return supplementaryBase + upper + lower;
}
/// Returns the code point that starts at [index] in [text], handling
/// surrogate pairs for characters that are encoded as two UTF-16 code units.
///
/// Returns `null` when [index] is outside of [text].
///
/// When the code unit at [index] is a lead (high) surrogate that is
/// immediately followed by a trail (low) surrogate, the two are combined into
/// one supplementary code point (a value above 0xFFFF). In every other case,
/// including an unpaired surrogate, a trail surrogate that appears first, or a
/// lead surrogate at the very end of [text], the raw UTF-16 code unit at
/// [index] is returned unchanged.
int? getCodePoint(String text, int index) {
  if (index < 0 || index >= text.length) {
    return null;
  }

  final int char = text.codeUnitAt(index);

  // Only a lead surrogate (0xD800-0xDBFF) can open a pair, and it needs a
  // following code unit to pair with.
  final bool isLead = (char & 0xFC00) == 0xD800;
  if (!isLead || index + 1 >= text.length) {
    return char;
  }

  // The lead must be followed by a trail surrogate (0xDC00-0xDFFF). Combining
  // anything else would fabricate a code point out of malformed text and make
  // callers skip a code unit that is not part of a pair.
  final int next = text.codeUnitAt(index + 1);
  final bool isTrail = (next & 0xFC00) == 0xDC00;
  if (!isTrail) {
    return char;
  }

  // Each surrogate carries 10 payload bits; supplementary code points start
  // at 0x10000.
  return 0x10000 + ((char & 0x3FF) << 10) + (next & 0x3FF);
}
/// Given a list of [UnicodeRange]s, this class performs efficient lookup
/// to find which range a value falls into.
///
@ -88,11 +126,9 @@ class UnicodePropertyLookup<P> {
/// located at that [index].
///
/// If the [index] is out of range, null will be returned.
P? find(String? text, int index) {
if (index < 0 || index >= text!.length) {
return null;
}
return findForChar(text.codeUnitAt(index));
P? find(String text, int index) {
final int? codePoint = getCodePoint(text, index);
return codePoint == null ? null : findForChar(codePoint);
}
/// Takes one character as an integer code unit and returns its property.

View File

@ -168,8 +168,16 @@ void main() {
final String text = testCase.toText();
int lastLineBreak = 0;
for (int i = 0; i < testCase.signs.length; i++) {
final Sign sign = testCase.signs[i];
int surrogateCount = 0;
// `s` is the index in the `testCase.signs` list.
for (int s = 0; s < testCase.signs.length; s++) {
// `i` is the index in the `text`.
final int i = s + surrogateCount;
if (s < testCase.chars.length && testCase.chars[s].isSurrogatePair) {
surrogateCount++;
}
final Sign sign = testCase.signs[s];
final LineBreakResult result = nextLineBreak(text, lastLineBreak);
if (sign.isBreakOpportunity) {
// The line break should've been found at index `i`.