mirror of
https://github.com/flutter/flutter.git
synced 2026-02-20 02:29:02 +08:00
Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks. This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a into the nyc-dev branch. Bug: 25433289 Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
338 lines
14 KiB
C++
338 lines
14 KiB
C++
/*
|
|
* Copyright (C) 2015 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#include <gtest/gtest.h>
|
|
#include "ICUTestBase.h"
|
|
#include "UnicodeUtils.h"
|
|
#include <minikin/WordBreaker.h>
|
|
#include <unicode/locid.h>
|
|
#include <unicode/uclean.h>
|
|
#include <unicode/udata.h>
|
|
|
|
#define LOG_TAG "Minikin"
|
|
#include <cutils/log.h>
|
|
|
|
#ifndef NELEM
|
|
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
|
|
#endif
|
|
|
|
using namespace android;
|
|
|
|
typedef ICUTestBase WordBreakerTest;
|
|
|
|
TEST_F(WordBreakerTest, basic) {
|
|
uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(6, breaker.next()); // after "hello "
|
|
EXPECT_EQ(0, breaker.wordStart()); // "hello"
|
|
EXPECT_EQ(5, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ(6, breaker.current());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(6, breaker.wordStart()); // "world"
|
|
EXPECT_EQ(11, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ(11, breaker.current());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, softHyphen) {
|
|
uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo "
|
|
EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo"
|
|
EXPECT_EQ(6, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(7, breaker.wordStart()); // "world"
|
|
EXPECT_EQ(12, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, zwjEmojiSequences) {
|
|
uint16_t buf[] = {
|
|
// man + zwj + heart + zwj + man
|
|
0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
|
|
// woman + zwj + heart + zwj + woman
|
|
0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
|
|
// eye + zwj + left speech bubble
|
|
0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
|
|
};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man
|
|
EXPECT_EQ(0, breaker.wordStart());
|
|
EXPECT_EQ(7, breaker.wordEnd());
|
|
EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman
|
|
EXPECT_EQ(7, breaker.wordStart());
|
|
EXPECT_EQ(17, breaker.wordEnd());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(17, breaker.wordStart());
|
|
EXPECT_EQ(22, breaker.wordEnd());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, punct) {
|
|
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
|
|
'!', '!'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(9, breaker.next()); // after "¡¡hello, "
|
|
EXPECT_EQ(2, breaker.wordStart()); // "hello"
|
|
EXPECT_EQ(7, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(9, breaker.wordStart()); // "world"
|
|
EXPECT_EQ(14, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, email) {
|
|
uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
|
|
' ', 'x'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(11, breaker.next()); // after "foo@example"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(16, breaker.next()); // after ".com "
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(16, breaker.wordStart()); // "x"
|
|
EXPECT_EQ(17, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, mailto) {
|
|
uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
|
|
'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(7, breaker.next()); // after "mailto:"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(18, breaker.next()); // after "foo@example"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(23, breaker.next()); // after ".com "
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(23, breaker.wordStart()); // "x"
|
|
EXPECT_EQ(24, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
// The current logic always places a line break after a detected email address or URL
|
|
// and an immediately following non-ASCII character.
|
|
TEST_F(WordBreakerTest, emailNonAscii) {
|
|
uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
|
|
0x4E00};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(11, breaker.next()); // after "foo@example"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(15, breaker.next()); // after ".com"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(15, breaker.wordStart()); // "一"
|
|
EXPECT_EQ(16, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, emailCombining) {
|
|
uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
|
|
0x0303, ' ', 'x'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(11, breaker.next()); // after "foo@example"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(17, breaker.next()); // after ".com̃ "
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(17, breaker.wordStart()); // "x"
|
|
EXPECT_EQ(18, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, lonelyAt) {
|
|
uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(2, breaker.next()); // after "a "
|
|
EXPECT_EQ(0, breaker.wordStart()); // "a"
|
|
EXPECT_EQ(1, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ(4, breaker.next()); // after "@ "
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(4, breaker.wordStart()); // "b"
|
|
EXPECT_EQ(5, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, url) {
|
|
uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
|
|
'.', 'c', 'o', 'm', ' ', 'x'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(5, breaker.next()); // after "http:"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(7, breaker.next()); // after "//"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(14, breaker.next()); // after "example"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(19, breaker.next()); // after ".com "
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_EQ(19, breaker.wordStart()); // "x"
|
|
EXPECT_EQ(20, breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
|
|
TEST_F(WordBreakerTest, urlBreakChars) {
|
|
uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
|
|
'-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(5, breaker.next()); // after "http:"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(7, breaker.next()); // after "//"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(8, breaker.next()); // after "a"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(10, breaker.next()); // after ".b"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(11, breaker.next()); // after "/"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(13, breaker.next()); // after "~c"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(15, breaker.next()); // after ",d"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(17, breaker.next()); // after "-e"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(19, breaker.next()); // after "?f"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(20, breaker.next()); // after "="
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(21, breaker.next()); // after "g"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(22, breaker.next()); // after "&"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(23, breaker.next()); // after "h"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(25, breaker.next()); // after "#i"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(27, breaker.next()); // after "%j"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ(29, breaker.next()); // after "_k"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(1, breaker.breakBadness());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(0, breaker.breakBadness());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
|
|
uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(5, breaker.next()); // after "http:"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(7, breaker.next()); // after "//"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(8, breaker.next()); // after "a"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, urlEndsWithSlash) {
|
|
uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ(5, breaker.next()); // after "http:"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(7, breaker.next()); // after "//"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ(8, breaker.next()); // after "a"
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
}
|
|
|
|
TEST_F(WordBreakerTest, emailStartsWithSlash) {
|
|
uint16_t buf[] = {'/', 'a', '@', 'b'};
|
|
WordBreaker breaker;
|
|
breaker.setLocale(icu::Locale::getEnglish());
|
|
breaker.setText(buf, NELEM(buf));
|
|
EXPECT_EQ(0, breaker.current());
|
|
EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
|
|
EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
|
|
}
|