flutter_flutter/tests/WordBreakerTests.cpp
Raph Levien d3f45892c7 Suppress linebreaks in emoji ZWJ sequences
Due to the way emoji ZWJ sequences are defined, the ICU line breaking
algorithm determines that there are valid line breaks inside the
sequence. This patch suppresses these line breaks.

This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a
into the nyc-dev branch.

Bug: 25433289
Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
2016-02-18 15:00:24 -08:00

338 lines
14 KiB
C++

/*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "ICUTestBase.h"
#include "UnicodeUtils.h"
#include <minikin/WordBreaker.h>
#include <unicode/locid.h>
#include <unicode/uclean.h>
#include <unicode/udata.h>
#define LOG_TAG "Minikin"
#include <cutils/log.h>
// Number of elements in a statically sized array: total byte size divided by the size of one
// element. Only defined here when no earlier header has already provided NELEM.
#ifndef NELEM
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif
using namespace android;
// Fixture name used by every TEST_F below; it is simply ICUTestBase under a test-specific name.
// NOTE(review): presumably ICUTestBase prepares ICU data for the tests - confirm in ICUTestBase.h.
typedef ICUTestBase WordBreakerTest;
// Two plain ASCII words separated by one space: the break is expected after the space, while the
// reported word range covers only the letters.
TEST_F(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break: after "hello "; the word is "hello" = [0, 5).
    EXPECT_EQ(6, wordBreaker.next());
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(5, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
    EXPECT_EQ(6, wordBreaker.current());

    // Second break: end of text; the word is "world" = [6, 11).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(6, wordBreaker.wordStart());
    EXPECT_EQ(11, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
    EXPECT_EQ(11, wordBreaker.current());
}
// A SOFT HYPHEN (U+00AD) inside a word is not expected to produce a break of its own: the whole
// "hel{SOFT HYPHEN}lo" is reported as a single word.
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break: after "hel{SOFT HYPHEN}lo "; the word is "hel{SOFT HYPHEN}lo" = [0, 6).
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(6, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // Second break: end of text; the word is "world" = [7, 12).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(7, wordBreaker.wordStart());
    EXPECT_EQ(12, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// Emoji joined by ZERO WIDTH JOINER (U+200D) must stay together: breaks are expected only at the
// boundaries between whole ZWJ sequences, never inside one.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
        // man + zwj + heart + zwj + man (7 code units)
        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
        // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units)
        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
        // eye + zwj + left speech bubble (5 code units)
        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getEnglish());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());

    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());

    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
}
// Leading and trailing punctuation is expected to be excluded from the reported word range, even
// though it sits on the same side of the break as the word.
TEST_F(WordBreakerTest, punct) {
    // U+00A1 is INVERTED EXCLAMATION MARK.
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',', ' ',
                       'w',    'o',    'r', 'l', 'd', '!', '!'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // First break: after "{U+00A1}{U+00A1}hello, "; the word is "hello" = [2, 7).
    EXPECT_EQ(9, wordBreaker.next());
    EXPECT_EQ(2, wordBreaker.wordStart());
    EXPECT_EQ(7, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // Second break: end of text; the word is "world" = [9, 14), without the trailing "!!".
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(9, wordBreaker.wordStart());
    EXPECT_EQ(14, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// An email address followed by an ordinary word: the break inside the address is expected to
// carry badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST_F(WordBreakerTest, email) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "foo@example": a break inside the address.
    EXPECT_EQ(11, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After ".com ": the address is finished, so badness drops back to 0.
    EXPECT_EQ(16, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is "x" = [16, 17).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(16, wordBreaker.wordStart());
    EXPECT_EQ(17, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// A "mailto:" URL followed by an ordinary word: every break inside the URL is expected to carry
// badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST_F(WordBreakerTest, mailto) {
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "mailto:".
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After ".com ": the URL is finished, so badness drops back to 0.
    EXPECT_EQ(23, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is "x" = [23, 24).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(23, wordBreaker.wordStart());
    EXPECT_EQ(24, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// The current logic always places a line break between a detected email address or URL and an
// immediately following non-ASCII character.
TEST_F(WordBreakerTest, emailNonAscii) {
    // U+4E00 is the CJK ideograph for "one".
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
                       'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "foo@example": a break inside the address.
    EXPECT_EQ(11, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After ".com": directly in front of the non-ASCII character, with badness 0.
    EXPECT_EQ(15, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is the single ideograph U+4E00 = [15, 16).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(15, wordBreaker.wordStart());
    EXPECT_EQ(16, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// An email address whose last letter carries a combining mark (U+0303 COMBINING TILDE): no break
// is expected between the letter and its mark; the next break comes after the following space.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a',    'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "foo@example": a break inside the address.
    EXPECT_EQ(11, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After ".com{U+0303} ": the address is finished, so badness drops back to 0.
    EXPECT_EQ(17, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is "x" = [17, 18).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(17, wordBreaker.wordStart());
    EXPECT_EQ(18, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// A stand-alone '@' surrounded by spaces is not an email address: all breaks are expected to have
// badness 0, and the '@' segment reports an empty word range.
TEST_F(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "a "; the word is "a" = [0, 1).
    EXPECT_EQ(2, wordBreaker.next());
    EXPECT_EQ(0, wordBreaker.wordStart());
    EXPECT_EQ(1, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // After "@ "; no word in this segment.
    EXPECT_EQ(4, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is "b" = [4, 5).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(4, wordBreaker.wordStart());
    EXPECT_EQ(5, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// An http URL followed by an ordinary word: every break inside the URL is expected to carry
// badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST_F(WordBreakerTest, url) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "http:".
    EXPECT_EQ(5, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After "//".
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After "example".
    EXPECT_EQ(14, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(1, wordBreaker.breakBadness());

    // After ".com ": the URL is finished, so badness drops back to 0.
    EXPECT_EQ(19, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());

    // End of text; the word is "x" = [19, 20).
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_EQ(19, wordBreaker.wordStart());
    EXPECT_EQ(20, wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
//
// Every break inside the URL is expected to carry badness 1 and to report an empty word range
// (wordStart >= wordEnd); only the final break at the end of the text has badness 0.
TEST_F(WordBreakerTest, urlBreakChars) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
                       '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
                       'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    // Expected break offsets inside the URL, in the order next() should return them.
    const ssize_t midUrlBreaks[] = {
            5,   // after "http:"
            7,   // after "//"
            8,   // after "a"
            10,  // after ".b"
            11,  // after "/"
            13,  // after "~c"
            15,  // after ",d"
            17,  // after "-e"
            19,  // after "?f"
            20,  // after "="
            21,  // after "g"
            22,  // after "&"
            23,  // after "h"
            25,  // after "#i"
            27,  // after "%j"
            29,  // after "_k"
    };

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    for (size_t i = 0; i < NELEM(midUrlBreaks); i++) {
        EXPECT_EQ(midUrlBreaks[i], wordBreaker.next());
        EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
        EXPECT_EQ(1, wordBreaker.breakBadness());
    }

    // End of text.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
    EXPECT_EQ(0, wordBreaker.breakBadness());
}
// A hyphen directly followed by '/' inside a URL: after the break following "a", the next break
// is expected to be the end of the text, i.e. none around the "-/" pair.
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "http:".
    EXPECT_EQ(5, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // After "//".
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // After "a".
    EXPECT_EQ(8, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // End of text.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
}
// A URL whose last character is '/': after the break following "a", the next break is expected
// to be the end of the text.
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // After "http:".
    EXPECT_EQ(5, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // After "//".
    EXPECT_EQ(7, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // After "a".
    EXPECT_EQ(8, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());

    // End of text.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
}
// An email-like string that begins with '/': the very first break is expected to be the end of
// the text, with an empty word range (wordStart >= wordEnd).
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wordBreaker;
    wordBreaker.setLocale(icu::Locale::getEnglish());
    wordBreaker.setText(text, NELEM(text));
    EXPECT_EQ(0, wordBreaker.current());

    // End of text.
    EXPECT_EQ(textLength, wordBreaker.next());
    EXPECT_TRUE(wordBreaker.wordStart() >= wordBreaker.wordEnd());
}