[web] LRU cache for text segmentation (flutter/engine#40782)

[web] LRU cache for text segmentation
This commit is contained in:
Mouad Debbar 2023-03-30 14:01:08 -04:00 committed by GitHub
parent 2e9cdc9bd8
commit 89becd1e69
7 changed files with 430 additions and 13 deletions

View File

@ -5,7 +5,6 @@
import 'package:meta/meta.dart';
import 'dom.dart';
import 'safe_browser_api.dart';
// iOS 15 launched WebGL 2.0, but there's something broken about it, which
// leads to apps failing to load. For now, we're forcing WebGL 1 on iOS.
@ -269,5 +268,4 @@ int _detectWebGLVersion() {
}
/// Whether the current browser supports the Chromium variant of CanvasKit.
bool get browserSupportsCanvaskitChromium =>
browserSupportsImageDecoder && domIntl.v8BreakIterator != null;
bool get browserSupportsCanvaskitChromium => domIntl.v8BreakIterator != null;

View File

@ -17,6 +17,8 @@ import 'skia_object_cache.dart';
import 'text_fragmenter.dart';
import 'util.dart';
final bool _ckRequiresClientICU = canvasKit.ParagraphBuilder.RequiresClientICU();
final List<String> _testFonts = <String>['FlutterTest', 'Ahem'];
String? _effectiveFontFamily(String? fontFamily) {
return ui.debugEmulateFlutterTesterEnvironment && !_testFonts.contains(fontFamily)
@ -887,7 +889,7 @@ class CkParagraphBuilder implements ui.ParagraphBuilder {
/// Builds the CkParagraph with the builder and deletes the builder.
SkParagraph _buildSkParagraph() {
if (canvasKit.ParagraphBuilder.RequiresClientICU()) {
if (_ckRequiresClientICU) {
injectClientICU(_paragraphBuilder);
}
final SkParagraph result = _paragraphBuilder.build();

View File

@ -6,8 +6,77 @@ import 'dart:typed_data';
import '../dom.dart';
import '../text/line_breaker.dart';
import '../util.dart';
import 'canvaskit_api.dart';
/// The outcome of segmenting a piece of text, as returned by [segmentText].
///
/// * `words` is produced by [fragmentUsingIntlSegmenter] at word granularity.
/// * `graphemes` is produced by [fragmentUsingIntlSegmenter] at grapheme
///   granularity.
/// * `breaks` is produced by [fragmentUsingV8LineBreaker].
typedef SegmentationResult = ({
Uint32List words,
Uint32List graphemes,
Uint32List breaks,
});
// The cache numbers below were picked based on the following logic.
//
// Most paragraphs in an app are small (e.g. icons, button labels, etc). These
// paragraphs are also cheap to cache. So we cache a lot of them. 100,000 of
// them amounts to a worst case of 5MB (10-character long text + words uint list
// + graphemes uint list + breaks uint list).
//
// Large paragraphs are less common (a handful per page), but are expensive to
// cache. So we cache fewer of them. 20 of them at a length of 50,000 characters
// amount to a memory usage of 5MB (50,000-character long text + words uint list
// + graphemes uint list + breaks uint list).
//
// Medium paragraphs are somewhere in between. 10,000 of them amount to a worst
// case of 5MB (100-character long text + words uint list + graphemes uint list
// + breaks uint list).
/// Describes one tier of the segmentation cache.
///
/// `cacheSize` is the maximum number of entries given to the tier's
/// [LruCache], and `maxTextLength` is the largest `String.length` (inclusive)
/// of a text that is routed to the tier.
typedef SegmentationCacheSpec = ({int cacheSize, int maxTextLength});
/// Tier for small paragraphs: many entries, very short texts.
const SegmentationCacheSpec kSmallParagraphCacheSpec = (cacheSize: 100000, maxTextLength: 10);
/// Tier for medium paragraphs.
const SegmentationCacheSpec kMediumParagraphCacheSpec = (cacheSize: 10000, maxTextLength: 100);
/// Tier for large paragraphs: few entries, long texts.
const SegmentationCacheSpec kLargeParagraphCacheSpec = (cacheSize: 20, maxTextLength: 50000);
/// The three LRU caches (one per paragraph-size tier) that hold
/// [SegmentationResult]s keyed by the segmented text.
typedef SegmentationCache = ({
LruCache<String, SegmentationResult> small,
LruCache<String, SegmentationResult> medium,
LruCache<String, SegmentationResult> large,
});
/// Caches segmentation results for small, medium and large paragraphs.
///
/// Paragraphs are frequently re-created because of style or font changes, while
/// their text contents remain the same. This cache is effective at
/// short-circuiting the segmentation of such paragraphs.
///
/// Each tier is sized from the `cacheSize` of its corresponding spec constant.
final SegmentationCache segmentationCache = (
small: LruCache<String, SegmentationResult>(kSmallParagraphCacheSpec.cacheSize),
medium: LruCache<String, SegmentationResult>(kMediumParagraphCacheSpec.cacheSize),
large: LruCache<String, SegmentationResult>(kLargeParagraphCacheSpec.cacheSize),
);
/// Convenience operations over the three paragraph-size tiers of a
/// [SegmentationCache].
extension SegmentationCacheExtensions on SegmentationCache {
  /// Picks the tier whose `maxTextLength` can accommodate the given [text].
  ///
  /// Tiers are tried from smallest to largest. Returns null when [text] is
  /// longer than what the large tier accepts.
  LruCache<String, SegmentationResult>? getCacheForText(String text) {
    final int length = text.length;
    final LruCache<String, SegmentationResult>? tier;
    if (length <= kSmallParagraphCacheSpec.maxTextLength) {
      tier = small;
    } else if (length <= kMediumParagraphCacheSpec.maxTextLength) {
      tier = medium;
    } else if (length <= kLargeParagraphCacheSpec.maxTextLength) {
      tier = large;
    } else {
      tier = null;
    }
    return tier;
  }

  /// Empties every tier of the cache.
  void clear() {
    final List<LruCache<String, SegmentationResult>> tiers =
        <LruCache<String, SegmentationResult>>[small, medium, large];
    for (final LruCache<String, SegmentationResult> tier in tiers) {
      tier.clear();
    }
  }
}
/// Injects required ICU data into the [builder].
///
/// This should only be used with the CanvasKit Chromium variant that's compiled
@ -18,14 +87,33 @@ void injectClientICU(SkParagraphBuilder builder) {
'This method should only be used with the CanvasKit Chromium variant.',
);
final String text = builder.getText();
builder.setWordsUtf16(
fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.word),
);
builder.setGraphemeBreaksUtf16(
fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.grapheme),
);
builder.setLineBreaksUtf16(fragmentUsingV8LineBreaker(text));
final SegmentationResult result = segmentText(builder.getText());
builder.setWordsUtf16(result.words);
builder.setGraphemeBreaksUtf16(result.graphemes);
builder.setLineBreaksUtf16(result.breaks);
}
/// Computes the word, grapheme and line-break segmentation of [text].
///
/// Results are looked up in, and stored into, the tier of [segmentationCache]
/// selected by `getCacheForText`. When no tier accepts [text], the
/// segmentation is computed every time and nothing is cached.
SegmentationResult segmentText(String text) {
  final LruCache<String, SegmentationResult>? tier =
      segmentationCache.getCacheForText(text);

  // Reuse the cached segmentation when there is one; otherwise run the
  // segmenters now.
  final SegmentationResult segmentation = tier?[text] ??
      (
        words: fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.word),
        graphemes: fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.grapheme),
        breaks: fragmentUsingV8LineBreaker(text),
      );

  // Stores a fresh result, or promotes an existing one to most recently used.
  tier?.cache(text, segmentation);
  return segmentation;
}
/// The granularity at which to segment text.

View File

@ -4,9 +4,11 @@
import 'dart:async';
import 'dart:collection';
import 'dart:math' as math;
import 'dart:typed_data';
import 'package:meta/meta.dart';
import 'package:ui/ui.dart' as ui;
import 'browser_detection.dart';
@ -737,3 +739,102 @@ extension FirstWhereOrNull<T> on Iterable<T> {
return null;
}
}
/// A key-value pair as stored in the queue of an [LruCache].
typedef _LruCacheEntry<K extends Object, V extends Object> = ({K key, V value});

/// Caches up to [maximumSize] key-value pairs, evicting the least recently
/// used pair once that limit is exceeded.
///
/// Call [cache] to add a key-value pair, or to promote an existing pair to the
/// most recently used position. Reading a value with the `[]` operator does
/// not change the usage order.
class LruCache<K extends Object, V extends Object> {
  LruCache(this.maximumSize);

  /// The maximum number of key/value pairs this cache can contain.
  ///
  /// To avoid exceeding this limit the cache removes the least recently used
  /// items.
  final int maximumSize;

  /// A doubly linked list of the entries in the cache, most recently used
  /// first.
  ///
  /// This makes it fast to move a recently used entry to the front.
  final DoubleLinkedQueue<_LruCacheEntry<K, V>> _itemQueue =
      DoubleLinkedQueue<_LruCacheEntry<K, V>>();

  @visibleForTesting
  DoubleLinkedQueue<_LruCacheEntry<K, V>> get debugItemQueue => _itemQueue;

  /// A map of keys to their associated node in the [_itemQueue].
  ///
  /// This makes it fast to find the node in the queue when we need to
  /// move the entry to the front of the queue.
  final Map<K, DoubleLinkedQueueEntry<_LruCacheEntry<K, V>>> _itemMap =
      <K, DoubleLinkedQueueEntry<_LruCacheEntry<K, V>>>{};

  @visibleForTesting
  Map<K, DoubleLinkedQueueEntry<_LruCacheEntry<K, V>>> get itemMap => _itemMap;

  /// The number of key/value pairs in the cache.
  int get length => _itemQueue.length;

  /// Whether or not [object] is stored as a value in the cache.
  ///
  /// This is only for testing.
  @visibleForTesting
  bool debugContainsValue(V object) {
    // The values of `_itemMap` are queue nodes, not cached values, so
    // `_itemMap.containsValue(object)` can never match. Compare against the
    // value stored in each entry instead.
    return _itemQueue.any(
      (_LruCacheEntry<K, V> entry) => entry.value == object,
    );
  }

  /// Whether or not [key] is in the cache.
  ///
  /// This is only for testing.
  @visibleForTesting
  bool debugContainsKey(K key) {
    return _itemMap.containsKey(key);
  }

  /// Returns the cached value associated with the [key].
  ///
  /// If the value is not in the cache, returns null.
  V? operator [](K key) {
    return _itemMap[key]?.element.value;
  }

  /// Caches the given [key]/[value] pair in this cache.
  ///
  /// If the pair is not already in the cache, adds it to the cache as the most
  /// recently used pair.
  ///
  /// If the [key] is already in the cache, moves it to the most recently used
  /// position. If the [value] corresponding to the [key] is different from
  /// what's in the cache, updates the value.
  void cache(K key, V value) {
    final DoubleLinkedQueueEntry<_LruCacheEntry<K, V>>? item = _itemMap[key];
    if (item == null) {
      // New key-value pair, just add.
      _add(key, value);
    } else if (item.element.value != value) {
      // Key already in the cache, but value is new. Re-add.
      item.remove();
      _add(key, value);
    } else {
      // Key-value pair already in the cache, move to most recently used.
      item.remove();
      _itemQueue.addFirst(item.element);
      _itemMap[key] = _itemQueue.firstEntry()!;
    }
  }

  /// Removes every key/value pair from the cache.
  void clear() {
    _itemQueue.clear();
    _itemMap.clear();
  }

  /// Inserts the pair as most recently used, then evicts the least recently
  /// used pair if the cache grew past [maximumSize].
  void _add(K key, V value) {
    _itemQueue.addFirst((key: key, value: value));
    _itemMap[key] = _itemQueue.firstEntry()!;
    if (_itemQueue.length > maximumSize) {
      _removeLeastRecentlyUsedValue();
    }
  }

  /// Drops the pair at the back of the queue from both the map and the queue.
  void _removeLeastRecentlyUsedValue() {
    final bool didRemove = _itemMap.remove(_itemQueue.last.key) != null;
    assert(didRemove);
    _itemQueue.removeLast();
  }
}

View File

@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'dart:math';
import 'dart:typed_data';
import 'package:test/bootstrap/browser.dart';
@ -101,4 +102,130 @@ void testMain() {
);
});
}, skip: !browserSupportsCanvaskitChromium);
group('segmentText', () {
setUp(() {
segmentationCache.clear();
});
tearDown(() {
segmentationCache.clear();
});
test('segments correctly', () {
const String text = 'Lorem-ipsum 你好🙂\nDolor sit';
final SegmentationResult segmentation = segmentText(text);
expect(
segmentation.words,
fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.word),
);
expect(
segmentation.graphemes,
fragmentUsingIntlSegmenter(text, IntlSegmenterGranularity.grapheme),
);
expect(
segmentation.breaks,
fragmentUsingV8LineBreaker(text),
);
});
test('caches segmentation results in LRU fashion', () {
const String text1 = 'hello';
segmentText(text1);
expect(segmentationCache.small.debugItemQueue, hasLength(1));
expect(segmentationCache.small[text1], isNotNull);
const String text2 = 'world';
segmentText(text2);
expect(segmentationCache.small.debugItemQueue, hasLength(2));
expect(segmentationCache.small[text2], isNotNull);
// "world" was segmented last, so it should be first, as in most recently used.
expect(segmentationCache.small.debugItemQueue.first.key, 'world');
expect(segmentationCache.small.debugItemQueue.last.key, 'hello');
});
test('puts segmentation results in the appropriate cache', () {
final String smallText = 'a' * (kSmallParagraphCacheSpec.maxTextLength - 1);
segmentText(smallText);
expect(segmentationCache.small.debugItemQueue, hasLength(1));
expect(segmentationCache.medium.debugItemQueue, hasLength(0));
expect(segmentationCache.large.debugItemQueue, hasLength(0));
expect(segmentationCache.small[smallText], isNotNull);
segmentationCache.clear();
final String mediumText = 'a' * (kMediumParagraphCacheSpec.maxTextLength - 1);
segmentText(mediumText);
expect(segmentationCache.small.debugItemQueue, hasLength(0));
expect(segmentationCache.medium.debugItemQueue, hasLength(1));
expect(segmentationCache.large.debugItemQueue, hasLength(0));
expect(segmentationCache.medium[mediumText], isNotNull);
segmentationCache.clear();
final String largeText = 'a' * (kLargeParagraphCacheSpec.maxTextLength - 1);
segmentText(largeText);
expect(segmentationCache.small.debugItemQueue, hasLength(0));
expect(segmentationCache.medium.debugItemQueue, hasLength(0));
expect(segmentationCache.large.debugItemQueue, hasLength(1));
expect(segmentationCache.large[largeText], isNotNull);
segmentationCache.clear();
// Should not cache extremely large texts.
final String tooLargeText = 'a' * (kLargeParagraphCacheSpec.maxTextLength + 1);
segmentText(tooLargeText);
expect(segmentationCache.small.debugItemQueue, hasLength(0));
expect(segmentationCache.medium.debugItemQueue, hasLength(0));
expect(segmentationCache.large.debugItemQueue, hasLength(0));
segmentationCache.clear();
});
test('has a limit on the number of entries', () {
testCacheCapacity(segmentationCache.small, kSmallParagraphCacheSpec);
testCacheCapacity(segmentationCache.medium, kMediumParagraphCacheSpec);
testCacheCapacity(segmentationCache.large, kLargeParagraphCacheSpec);
});
}, skip: !browserSupportsCanvaskitChromium);
}
/// Checks that [cache] never holds more than `spec.cacheSize` entries.
///
/// Segments `spec.cacheSize` random texts of `spec.maxTextLength` characters,
/// expecting each one to grow the cache by one entry. It then segments ten
/// more texts and expects the entry count to stay put. The cache is cleared
/// before returning.
void testCacheCapacity(
  LruCache<String, SegmentationResult> cache,
  SegmentationCacheSpec spec,
) {
  final int capacity = spec.cacheSize;
  final int textLength = spec.maxTextLength;

  // Phase 1: populate the cache one entry at a time.
  for (int expectedCount = 1; expectedCount <= capacity; expectedCount++) {
    segmentText(_randomString(textLength));

    // Every freshly segmented text should show up as a new cache entry.
    // TODO(mdebbar): This may fail if the random string generator generates
    //                the same string twice.
    expect(cache.debugItemQueue, hasLength(expectedCount));
  }

  // Phase 2: the cache must now be exactly at capacity.
  expect(cache.length, capacity);

  // Phase 3: pushing further texts must not grow the cache.
  for (int extra = 0; extra < 10; extra++) {
    segmentText(_randomString(textLength));
    expect(cache.debugItemQueue, hasLength(capacity));
  }

  // Phase 4: leave the cache empty.
  cache.clear();
}
/// Seed for the next [Random] created by [_randomString]; incremented once per
/// generated character.
int _seed = 0;

/// Builds a string of [length] characters drawn from digits, ASCII letters
/// and the space character.
///
/// Each character comes from a new [Random] seeded with the next [_seed]
/// value, so the results are reproducible.
String _randomString(int length) {
  const String allChars = ' 1234567890'
      'abcdefghijklmnopqrstuvwxyz'
      'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
  final StringBuffer buffer = StringBuffer();
  for (int position = 0; position < length; position++) {
    final int pick = Random(_seed++).nextInt(allChars.length);
    buffer.write(allChars[pick]);
  }
  return buffer.toString();
}

View File

@ -177,7 +177,9 @@ void testMain() {
v8BreakIterator = Object(); // Any non-null value.
browserSupportsImageDecoder = false;
expect(browserSupportsCanvaskitChromium, isFalse);
// TODO(mdebbar): we don't check image codecs for now.
// https://github.com/flutter/flutter/issues/122331
expect(browserSupportsCanvaskitChromium, isTrue);
});
test('Detect browsers that do not support v8BreakIterator', () {

View File

@ -0,0 +1,99 @@
// Copyright 2013 The Flutter Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
import 'package:test/bootstrap/browser.dart';
import 'package:test/test.dart';
import 'package:ui/src/engine/util.dart';
/// The key/value record shape used to spell out the expected contents of an
/// `LruCache<String, int>`'s `debugItemQueue` in the tests below.
typedef TestCacheEntry = ({String key, int value});
/// Entry point: hands [testMain] to the browser test bootstrapper.
void main() {
  internalBootstrapBrowserTest(() {
    return testMain;
  });
}
/// Registers the [LruCache] tests: initial state, insertion order with
/// eviction, promotion of an existing pair, and update of an existing key.
void testMain() {
  // Snapshot of the cache contents, most recently used entry first.
  List<TestCacheEntry> entriesOf(LruCache<String, int> cache) {
    return cache.debugItemQueue.toList();
  }

  // Builds a cache of capacity 3 holding a=1, b=2, c=3 (cached in that order)
  // and checks the resulting order before handing it back.
  LruCache<String, int> buildAbcCache() {
    final LruCache<String, int> cache = LruCache<String, int>(3)
      ..cache('a', 1)
      ..cache('b', 2)
      ..cache('c', 3);
    expect(entriesOf(cache), <TestCacheEntry>[
      (key: 'c', value: 3),
      (key: 'b', value: 2),
      (key: 'a', value: 1),
    ]);
    return cache;
  }

  test('$LruCache starts out empty', () {
    final LruCache<String, int> emptyCache = LruCache<String, int>(10);
    expect(emptyCache.length, 0);
  });

  test('$LruCache adds up to a maximum number of items in most recently used first order', () {
    final LruCache<String, int> lru = LruCache<String, int>(3);

    lru.cache('a', 1);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'a', value: 1),
    ]);
    expect(lru['a'], 1);
    expect(lru['b'], isNull);

    lru.cache('b', 2);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'b', value: 2),
      (key: 'a', value: 1),
    ]);
    expect(lru['a'], 1);
    expect(lru['b'], 2);

    lru.cache('c', 3);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'c', value: 3),
      (key: 'b', value: 2),
      (key: 'a', value: 1),
    ]);

    // The cache is full: each new key now pushes out the oldest one.
    lru.cache('d', 4);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'd', value: 4),
      (key: 'c', value: 3),
      (key: 'b', value: 2),
    ]);

    lru.cache('e', 5);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'e', value: 5),
      (key: 'd', value: 4),
      (key: 'c', value: 3),
    ]);
  });

  test('$LruCache promotes entry to most recently used position', () {
    final LruCache<String, int> lru = buildAbcCache();

    // Re-caching an identical pair only moves it to the front.
    lru.cache('b', 2);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'b', value: 2),
      (key: 'c', value: 3),
      (key: 'a', value: 1),
    ]);
  });

  test('$LruCache updates and promotes entry to most recently used position', () {
    final LruCache<String, int> lru = buildAbcCache();
    expect(lru['b'], 2);

    // Caching a new value for an existing key replaces it and moves it to the
    // front.
    lru.cache('b', 42);
    expect(entriesOf(lru), <TestCacheEntry>[
      (key: 'b', value: 42),
      (key: 'c', value: 3),
      (key: 'a', value: 1),
    ]);
    expect(lru['b'], 42);
  });
}