Add plugin to extract language translations from unicode cldr

https://github.com/unicode-org/cldr
This commit is contained in:
Stypox 2025-09-25 10:02:39 +02:00
parent 8ceea0b1a9
commit 787fe38194
No known key found for this signature in database
GPG Key ID: 4BDF1B40A49FDD23
14 changed files with 525 additions and 2 deletions

View File

@ -1,5 +1,6 @@
import org.gradle.configurationcache.extensions.capitalized
import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
import org.stypox.dicio.unicodeCldrPlugin.UnicodeCldrLanguagesTask
buildscript {
repositories {
@ -7,6 +8,7 @@ buildscript {
}
dependencies {
classpath(libs.dicio.sentences.compiler.plugin)
classpath(libs.dicio.unicode.cldr.plugin)
}
}
@ -20,6 +22,7 @@ plugins {
alias(libs.plugins.com.google.dagger.hilt.android)
alias(libs.plugins.com.google.protobuf)
alias(libs.plugins.dicio.sentences.compiler.plugin)
alias(libs.plugins.dicio.unicode.cldr.plugin)
}
android {
@ -111,6 +114,12 @@ androidComponents {
}
}
// Configure every UnicodeCldrLanguagesTask of this project.
tasks.withType(UnicodeCldrLanguagesTask::class) {
// tell the task which git commit of the https://github.com/unicode-org/cldr repo to use as a
// source of data; the hash comes from the `unicodeCldrGitCommit` entry of the version catalog
unicodeCldrGitCommit = libs.versions.unicodeCldrGitCommit
}
dependencies {
// Desugaring
coreLibraryDesugaring(libs.desugar.jdk.libs)

View File

@ -0,0 +1,30 @@
package org.stypox.dicio.util
import org.dicio.skill.standard.util.nfkdNormalizeWord
import org.stypox.dicio.cldr.CldrLanguages.LocaleAndTranslation
import org.stypox.dicio.util.StringUtils.customStringDistanceCleaned
/**
 * Returns the [LocaleAndTranslation] whose normalized name is closest to [query], or `null` if
 * the list is empty or the closest name is not close enough.
 *
 * [query] is trimmed and NFKD-normalized before being compared against
 * [LocaleAndTranslation.translationNormalized]. When several items are equally close, the one
 * that comes first in the list wins.
 */
fun List<LocaleAndTranslation>.getLocaleByLanguageName(query: String): LocaleAndTranslation? {
    val normalizedQuery = nfkdNormalizeWord(query.trim())
    return this
        // compute each distance only once and carry it along with the item
        .map { item ->
            item to customStringDistanceCleaned(item.translationNormalized, normalizedQuery)
        }
        // minByOrNull (unlike minBy) does not throw on an empty list
        .minByOrNull { (_, distance) -> distance }
        // a match is accepted only if its distance is at most 0
        // NOTE(review): the scale of customStringDistanceCleaned is not visible here; confirm
        // that 0 is the intended cut-off
        ?.takeIf { (_, distance) -> distance <= 0 }
        ?.first
}
/**
 * Looks up the first [LocaleAndTranslation] whose locale equals [code]. If the list contains no
 * entry for [code], a placeholder is returned that uses [code] itself as locale, translation and
 * normalized translation, so the result is never `null`.
 *
 * In [org.stypox.dicio.cldr.CldrLanguages]'s list the more relevant translations for the same
 * language come before the alternative ones, so taking the first match picks the more relevant
 * one. E.g. "Central Kurdish" could alternatively be written as "Kurdish, Central".
 */
fun List<LocaleAndTranslation>.codeToLanguageOrDefault(code: String): LocaleAndTranslation =
    find { candidate -> candidate.locale == code }
        ?: LocaleAndTranslation(code, code, code)

View File

@ -19,6 +19,7 @@ includegitPlugin = "0.1.6"
jacksonDataformatYaml = "2.15.2"
jacksonModuleKotlin = "2.15.2"
java = "21"
jgit = "7.3.0.202506031305-r"
jna = "5.14.0"
jsoup = "1.18.3"
kotest = "5.8.1"
@ -31,6 +32,9 @@ okhttp = "4.12.0"
protobufPlugin = "0.9.4"
protoc = "4.29.3"
unbescape = "1.1.6.RELEASE"
# tells the UnicodeCldrLanguagesTask which git commit of the
# https://github.com/unicode-org/cldr repo to use as a source of data
unicodeCldrGitCommit = "41283df11cce01751c29c400a8f94d1d8687210d"
voskAndroid = "0.3.32"
litert = "1.1.2"
permissionFlow = "2.0.0"
@ -55,6 +59,7 @@ desugar_jdk_libs = { module = "com.android.tools:desugar_jdk_libs", version.ref
dicio-numbers = { module = "git.included.build:dicio-numbers" }
dicio-sentences-compiler = { module = "git.included.build:dicio-sentences-compiler" }
dicio-sentences-compiler-plugin = { module = "org.stypox.dicio.sentencesCompilerPlugin:sentences-compiler-plugin" }
dicio-unicode-cldr-plugin = { module = "org.stypox.dicio.unicodeCldrPlugin:unicode-cldr-plugin" }
exp4j = { module = "net.objecthunter:exp4j", version.ref = "exp4j" }
hilt-android = { module = "com.google.dagger:hilt-android", version.ref = "dagger" }
hilt-android-compiler = { module = "com.google.dagger:hilt-android-compiler", version.ref = "dagger" }
@ -62,6 +67,7 @@ hilt-android-testing = { module = "com.google.dagger:hilt-android-testing", vers
hilt-navigation-compose = { module = "androidx.hilt:hilt-navigation-compose", version.ref = "hilt" }
jackson-dataformat-yaml = { module = "com.fasterxml.jackson.dataformat:jackson-dataformat-yaml", version.ref = "jacksonDataformatYaml" }
jackson-module-kotlin = { module = "com.fasterxml.jackson.module:jackson-module-kotlin", version.ref = "jacksonModuleKotlin" }
jgit = { module = "org.eclipse.jgit:org.eclipse.jgit", version.ref = "jgit" }
jna = { module = "net.java.dev.jna:jna", version.ref = "jna" }
jsoup = { module = "org.jsoup:jsoup", version.ref = "jsoup" }
kotest-property = { module = "io.kotest:kotest-property", version.ref = "kotest" }
@ -99,5 +105,6 @@ com-google-dagger-hilt-android = { id = "com.google.dagger.hilt.android", versio
com-google-protobuf = { id = "com.google.protobuf", version.ref = "protobufPlugin" }
me-champeau-includegit = { id = "me.champeau.includegit", version.ref = "includegitPlugin" }
# this is a local plugin, so "version" here is completely useless
# these are local plugins, so "version" here is completely useless
dicio-sentences-compiler-plugin = { id = "org.stypox.dicio.sentencesCompilerPlugin", version = "1.0" }
dicio-unicode-cldr-plugin = { id = "org.stypox.dicio.unicodeCldrPlugin", version = "1.0" }

View File

@ -9,8 +9,9 @@ import java.util.Properties
rootProject.name = "Dicio"
include(":app")
include(":skill")
// we use includeBuild here since the plugin is a compile-time dependency
// we use includeBuild here since the plugins are compile-time dependencies
includeBuild("sentences-compiler-plugin")
includeBuild("unicode-cldr-plugin")
pluginManagement {
repositories {

4
unicode-cldr-plugin/.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
/build
/out
/.kotlin
/.gradle

View File

@ -0,0 +1,62 @@
/*
* Taken from /e/OS Assistant
*
* Copyright (C) 2024 MURENA SAS
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Build script of the unicode-cldr-plugin Gradle plugin.
// Same group as the `dicio-unicode-cldr-plugin` module coordinates in the version catalog.
group = "org.stypox.dicio.unicodeCldrPlugin"
plugins {
`kotlin-dsl`
}
// Declares the plugin id and the class that implements it.
gradlePlugin {
plugins {
create("unicode-cldr-plugin") {
id = "org.stypox.dicio.unicodeCldrPlugin"
implementationClass = "org.stypox.dicio.unicodeCldrPlugin.UnicodeCldrPlugin"
}
}
}
// Java source/target level is taken from the `java` version of the shared version catalog.
java {
sourceCompatibility = JavaVersion.toVersion(libs.versions.java.get())
targetCompatibility = JavaVersion.toVersion(libs.versions.java.get())
}
// The Kotlin toolchain uses the same `java` version as above.
kotlin {
jvmToolchain {
languageVersion = JavaLanguageVersion.of(libs.versions.java.get())
}
}
dependencies {
// these dependencies are usually compile-time dependencies, but since this is a plugin, we want
// to access the gradle libraries at the runtime of the plugin, which happens at compile-time
// for the app
implementation(libs.android.tools.build.gradle)
implementation(libs.kotlin.gradle.plugin)
// KotlinPoet is used to generate the Kotlin source file
implementation(libs.kotlinpoet)
// JGit is used to clone the CLDR repository.
// Gradle chooses an old incorrect version of JGit, so the following line is useless...
implementation(libs.jgit)
// also depending on sentences compiler for nfkdNormalize
implementation(libs.dicio.sentences.compiler)
}

View File

@ -0,0 +1,35 @@
/*
* Taken from /e/OS Assistant
*
* Copyright (C) 2024 MURENA SAS
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Settings of the standalone unicode-cldr-plugin build: declares where dependencies are resolved
// from and wires in the version catalog shared with the main build.
dependencyResolutionManagement {
repositories {
repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
gradlePluginPortal()
mavenCentral()
google()
}
// need to manually setup the version catalog since the plugin is a separate build (not even a
// sub-build) with its own settings.gradle.kts
versionCatalogs {
create("libs") {
// path is relative to this plugin's directory, i.e. it points at the root build's catalog
from(files("../gradle/libs.versions.toml"))
}
}
}

View File

@ -0,0 +1,57 @@
package org.stypox.dicio.unicodeCldrPlugin
import org.gradle.api.DefaultTask
import org.gradle.api.file.DirectoryProperty
import org.gradle.api.file.RegularFileProperty
import org.gradle.api.provider.Property
import org.gradle.api.tasks.Input
import org.gradle.api.tasks.InputFile
import org.gradle.api.tasks.OutputDirectory
import org.gradle.api.tasks.TaskAction
import org.stypox.dicio.unicodeCldrPlugin.data.ensureGitRepoDownloaded
import org.stypox.dicio.unicodeCldrPlugin.data.parseLanguages
import org.stypox.dicio.unicodeCldrPlugin.gen.generateSkillSentencesKt
import org.stypox.dicio.unicodeCldrPlugin.util.CLDR_CHECKOUT_PATH
import org.stypox.dicio.unicodeCldrPlugin.util.CLDR_LANGUAGES_PATH
import org.stypox.dicio.unicodeCldrPlugin.util.CLDR_REPO
import org.stypox.dicio.unicodeCldrPlugin.util.UnicodeCldrPluginException
import java.io.File
/**
 * Gradle task that makes sure the Unicode CLDR repository is cloned locally, extracts from it the
 * language-name translations for the languages listed in [dicioLanguagesFile], and writes a
 * generated Kotlin source file into [outputDir].
 */
abstract class UnicodeCldrLanguagesTask : DefaultTask() {
    /**
     * Which git commit of the https://github.com/unicode-org/cldr repo to use as a source of data.
     */
    @get:Input
    abstract val unicodeCldrGitCommit: Property<String>

    /**
     * The file from which the languages supported by Dicio are read. Defaults to
     * `src/main/proto/language.proto` inside the project this task belongs to.
     */
    @get:InputFile
    val dicioLanguagesFile: RegularFileProperty = project.objects.fileProperty().apply {
        set(project.file("src/main/proto/language.proto"))
    }

    /**
     * The directory the generated Kotlin file is written to. Defaults to
     * `generated/unicode_cldr_plugin` inside the project's build directory.
     */
    @get:OutputDirectory
    val outputDir: DirectoryProperty = project.objects.directoryProperty().apply {
        set(project.layout.buildDirectory.dir("generated/unicode_cldr_plugin"))
    }

    // The same place where the includegit plugin clones repositories. Resolved here, while the
    // task is being created, because `project` must not be accessed from the task action when
    // the configuration cache is in use.
    private val checkoutsFolder: File = File(project.rootProject.rootDir, CLDR_CHECKOUT_PATH)

    /**
     * Clones the CLDR repository if needed, parses the language translations and generates the
     * Kotlin source file.
     *
     * @throws UnicodeCldrPluginException if the CLDR data cannot be prepared or read
     */
    @Throws(UnicodeCldrPluginException::class)
    @TaskAction
    fun generateResource() {
        ensureGitRepoDownloaded(
            repo = CLDR_REPO,
            commit = unicodeCldrGitCommit.get(),
            directory = checkoutsFolder,
        )

        val data = parseLanguages(
            dicioLanguagesFile = dicioLanguagesFile.get().asFile,
            cldrLanguagesDir = File(checkoutsFolder, CLDR_LANGUAGES_PATH),
        )

        generateSkillSentencesKt(data, outputDir.asFile.get())
    }
}

View File

@ -0,0 +1,21 @@
package org.stypox.dicio.unicodeCldrPlugin
import org.gradle.api.Plugin
import org.gradle.api.Project
import org.jetbrains.kotlin.gradle.dsl.KotlinProjectExtension
/**
 * Gradle plugin that creates the `unicodeCldrLanguages` task and adds the task's output directory
 * to the `main` Kotlin source set of the project it is applied to.
 */
class UnicodeCldrPlugin : Plugin<Project> {
    override fun apply(target: Project) {
        val cldrTask = target.tasks.create(
            "unicodeCldrLanguages",
            UnicodeCldrLanguagesTask::class.java,
        )

        // Adding the output directory to the source set makes the generated Kotlin files get
        // compiled; per the original author's note this also makes any task depending on the
        // source set depend on the generating task, see
        // https://slack-chats.kotlinlang.org/t/486810
        val kotlinExtension = target.extensions.getByType(KotlinProjectExtension::class.java)
        val mainSourceSet = kotlinExtension.sourceSets.getByName("main")
        mainSourceSet.kotlin.srcDir(cldrTask.outputDir)
    }
}

View File

@ -0,0 +1,37 @@
package org.stypox.dicio.unicodeCldrPlugin.data
import org.eclipse.jgit.api.Git
import org.stypox.dicio.unicodeCldrPlugin.util.UnicodeCldrPluginException
import java.io.File
/**
 * Makes sure [directory] contains a clone of [repo] whose `HEAD` is exactly [commit].
 *
 * If [directory] already holds a repository at [commit] nothing is done. In every other case
 * (missing directory, unreadable repository, different `HEAD`) the directory is deleted, the
 * repository is cloned again and [commit] is checked out.
 *
 * @param repo URI of the git repository to clone
 * @param commit full commit hash to check out; it is compared verbatim against the hash of `HEAD`
 * @param directory where the working tree should live
 * @throws UnicodeCldrPluginException if a stale [directory] cannot be deleted before recloning
 */
@Throws(UnicodeCldrPluginException::class)
fun ensureGitRepoDownloaded(repo: String, commit: String, directory: File) {
    if (directory.exists()) {
        val upToDate = try {
            // read HEAD and close the repository *before* possibly deleting its files
            val headCommit = Git.open(directory).use { git ->
                git.repository.resolve("HEAD")?.name
            }
            if (headCommit != commit) {
                println("Commit mismatch for $repo ($headCommit != $commit), deleting and recloning...")
            }
            headCommit == commit
        } catch (_: Exception) {
            println("Cannot open folder $directory, deleting and recloning...")
            false
        }

        if (upToDate) {
            return // no need to clone again
        }
        if (!directory.deleteRecursively()) {
            throw UnicodeCldrPluginException("Could not delete $directory before recloning $repo")
        }
    }

    Git.cloneRepository()
        .setURI(repo)
        .setDirectory(directory)
        .setCloneAllBranches(false)
        // TODO Shallow clone is not supported by this version of jgit. We can't change version
        //  because somehow gradle forces an old version upon us despite all attempts. If you
        //  uncomment the line below you will see Android Studio gives you no error, because it
        //  resolves the correct version, but Gradle doesn't...
        //.setDepth(1)
        .call()
        .use { git ->
            // The clone leaves HEAD at the tip of the remote's default branch: move it to the
            // requested commit, otherwise the data would not come from the pinned commit and the
            // HEAD check above would trigger a full reclone on every run.
            git.checkout().setName(commit).call()
        }
}

View File

@ -0,0 +1,121 @@
package org.stypox.dicio.unicodeCldrPlugin.data
import org.stypox.dicio.unicodeCldrPlugin.util.UnicodeCldrPluginException
import org.w3c.dom.Element
import java.io.File
import javax.xml.parsers.DocumentBuilderFactory
/**
 * Collects, for every language supported by Dicio, the names of languages translated into it.
 *
 * @param dicioLanguagesFile file listing Dicio's languages; every line containing `LANGUAGE_` but
 * not `SYSTEM` contributes the lowercased text between the first `_` and the next space
 * @param cldrLanguagesDir directory containing CLDR's per-locale XML files (e.g. `en.xml`)
 * @return a list sorted by Dicio language code; each item pairs that code with a list of
 * `(language code, translated name)` sorted by language code, then main names before alternative
 * ones, then by name
 * @throws UnicodeCldrPluginException if the files in [cldrLanguagesDir] cannot be listed
 */
fun parseLanguages(
    dicioLanguagesFile: File,
    cldrLanguagesDir: File,
): List<Pair<String, List<Pair<String, String>>>> {
    // make this a sorted set so the computation below is deterministic
    val supportedFromCodes = dicioLanguagesFile
        .readLines()
        .filter { line -> "LANGUAGE_" in line && "SYSTEM" !in line }
        .map { line -> line.split("_", limit = 2)[1].split(" ")[0].lowercase() }
        .toSortedSet()
    println("Languages supported by Dicio: $supportedFromCodes")

    // every language that has a name in en.xml, plus Dicio's own languages; make this a sorted
    // set so the computation below is deterministic
    val supportedToCodes = (
        getLanguageElements(File(cldrLanguagesDir, "en.xml"))
            .map { it.getAttribute("type").lowercase() } +
            supportedFromCodes
    ).toSortedSet()
    println("Languages for which there is an English translation: ${supportedToCodes.size}")

    // locale_code -> { language_code -> set[(is_alternative, language_name_translation)] }
    // every cell is created upfront, so the lookups further down cannot miss
    val data: Map<String, Map<String, MutableSet<Pair<Boolean, String>>>> =
        supportedFromCodes.associateWith {
            supportedToCodes.associateWith { mutableSetOf<Pair<Boolean, String>>() }
        }

    // sorted so that the computation below is deterministic
    // NOTE(review): the order is *descending*, so a regional file (e.g. "en_IN.xml") is visited
    // before its parent ("en.xml") and the parent lookup done for "↑↑↑" below finds nothing yet.
    // The parent's names still reach the regional locale through `fromCodes`. An earlier comment
    // claimed general languages come first; confirm which order is intended.
    val files = cldrLanguagesDir.listFiles()
        ?.filter { it.extension == "xml" }
        ?.sortedDescending()
        ?: throw UnicodeCldrPluginException("Could not list XML files in $cldrLanguagesDir")

    // go through each XML file and collect translations
    for (file in files) {
        val fromCode = file.nameWithoutExtension.lowercase()
        // a file contributes to its own locale and to every more specific Dicio locale
        val fromCodes = supportedFromCodes.filter {
            it == fromCode || it.startsWith("${fromCode}_")
        }
        if (fromCodes.isEmpty()) {
            continue
        }

        for (lang in getLanguageElements(file)) {
            val toCode = lang.getAttribute("type").lowercase()
            if (toCode !in supportedToCodes) {
                continue
            }

            val text = lang.textContent
            val texts: List<Pair<Boolean, String>> = when {
                // "↑↑↑" in a regional file: reuse what was collected for the parent locale so
                // far, demoted to alternative names
                text == "↑↑↑" && "_" in fromCode ->
                    data[fromCode.substringBefore("_")]
                        ?.get(toCode)
                        ?.map { (_, translation) -> Pair(/* isAlternative = */ true, translation) }

                text.isNotBlank() && text != "↑↑↑" ->
                    listOf(
                        Pair(
                            /* isAlternative = */ lang.hasAttribute("alt") || lang.hasAttribute("menu"),
                            text,
                        )
                    )

                else -> null
            } ?: continue

            for (fc in fromCodes) {
                data.getValue(fc).getValue(toCode).addAll(texts)
            }
        }
    }

    // this is the number of collected (language, name) entries, not a size in bytes
    val entryCount = data.values.sumOf { tos -> tos.values.sumOf { names -> names.size } }
    println("Size of language translations matrix: $entryCount entries")

    // ensure everything is sorted so we know that we will deterministically obtain the same result
    // every time for reproducible builds
    return data
        .map { (fromCode, tos) ->
            Pair(
                fromCode,
                tos
                    .flatMap { (toCode, translations) ->
                        translations.map { (isAlternative, translation) ->
                            Triple(toCode, isAlternative, translation)
                        }
                    }
                    // compare by locale, then by whether it's the main translation or just an
                    // alternative, and finally by the translation (so the output is deterministic)
                    .sortedWith(
                        compareBy<Triple<String, Boolean, String>>(
                            { it.first },
                            { it.second },
                            { it.third },
                        )
                    )
                    .map { (code, _, translation) -> Pair(code, translation) },
            )
        }
        .sortedBy { it.first }
}
/**
 * Parses [file] as XML and returns, in document order, every element whose tag is `language`.
 */
fun getLanguageElements(file: File): List<Element> {
    val document = DocumentBuilderFactory.newInstance()
        .newDocumentBuilder()
        .parse(file)
    val languageNodes = document.getElementsByTagName("language")
    // NodeList is not iterable, so walk it by index and keep only the nodes that are elements
    return (0 until languageNodes.length).mapNotNull { index ->
        languageNodes.item(index) as? Element
    }
}

View File

@ -0,0 +1,123 @@
package org.stypox.dicio.unicodeCldrPlugin.gen
import com.squareup.kotlinpoet.ClassName
import com.squareup.kotlinpoet.CodeBlock
import com.squareup.kotlinpoet.FileSpec
import com.squareup.kotlinpoet.FunSpec
import com.squareup.kotlinpoet.KModifier
import com.squareup.kotlinpoet.ParameterizedTypeName.Companion.parameterizedBy
import com.squareup.kotlinpoet.PropertySpec
import com.squareup.kotlinpoet.TypeSpec
import com.squareup.kotlinpoet.asTypeName
import org.dicio.sentences_compiler.util.StringNormalizer.nfkdNormalize
import org.stypox.dicio.unicodeCldrPlugin.util.CLASS_NAME
import org.stypox.dicio.unicodeCldrPlugin.util.FILE_COMMENT
import org.stypox.dicio.unicodeCldrPlugin.util.PACKAGE_NAME
import java.io.File
/**
 * Generates the Kotlin file declaring the [CLASS_NAME] object in package [PACKAGE_NAME] and
 * writes it under [outputDirFile].
 *
 * @param parsedData for each Dicio language code, the `(locale code, translated name)` pairs to
 * embed in the generated object
 * @param outputDirFile root directory the generated file is written into
 */
fun generateSkillSentencesKt(parsedData: List<Pair<String, List<Pair<String, String>>>>, outputDirFile: File) {
    val localeAndTranslationClassName = ClassName.bestGuess( "LocaleAndTranslation")

    // the object holds, in this order: the nested data class, the private data map and the
    // `get` operator
    val cldrLanguagesObject = TypeSpec.objectBuilder(CLASS_NAME)
        .addKdoc("This class contains the names of (basically all) languages in the world (from " +
                "Unicode CLDR) translated in all languages supported by Dicio. You can " +
                "access the translation in a language supported by Dicio by indexing " +
                "this class with `[]`. The resulting array is sorted so that relevant " +
                "translations come before alternative ones for the same locale code.")
        .addType(generateLocaleAndTranslationClass(localeAndTranslationClassName))
        .addProperty(generateLanguageToDataProperty(parsedData, localeAndTranslationClassName))
        .addFunction(generateGetOperator(localeAndTranslationClassName))
        .build()

    val generatedFile = FileSpec.builder(PACKAGE_NAME, CLASS_NAME)
        .addFileComment(FILE_COMMENT)
        .addType(cldrLanguagesObject)
        .build()
    generatedFile.writeTo(outputDirFile)
}
/**
 * Builds the data class named [localeAndTranslationClassName], which has three `String`
 * properties (`locale`, `translation`, `translationNormalized`), each initialized from the
 * primary constructor parameter of the same name.
 */
private fun generateLocaleAndTranslationClass(localeAndTranslationClassName: ClassName): TypeSpec {
    // name and KDoc of every property, in constructor order
    val fields = listOf(
        "locale" to ("The locale code for the language, lowercase and with underscores " +
                "separating variants (e.g. \"en\" or \"zh_hans\")"),
        "translation" to ("The translated name of this language"),
        "translationNormalized" to ("The translated name of this language (NFKD-normalized)"),
    )

    val constructorBuilder = FunSpec.constructorBuilder()
    for ((fieldName, _) in fields) {
        constructorBuilder.addParameter(fieldName, String::class)
    }

    val classBuilder = TypeSpec.classBuilder(localeAndTranslationClassName)
        .addKdoc("A data class holding a locale code along with the corresponding translated " +
                "language name")
        .addModifiers(KModifier.DATA)
        .primaryConstructor(constructorBuilder.build())

    for ((fieldName, fieldKdoc) in fields) {
        classBuilder.addProperty(
            PropertySpec.builder(fieldName, String::class)
                .addKdoc(fieldKdoc)
                // initialized from the constructor parameter with the same name
                .initializer(fieldName)
                .build()
        )
    }

    return classBuilder.build()
}
/**
 * Builds the private `languageToData` property: a map from each language code in [parsedData] to
 * a lazily-built list of [localeAndTranslationClassName] instances. Each instance is constructed
 * from the locale code, the translation and the NFKD-normalized translation.
 */
private fun generateLanguageToDataProperty(
    parsedData: List<Pair<String, List<Pair<String, String>>>>,
    localeAndTranslationClassName: ClassName,
): PropertySpec {
    // Map<String, Lazy<List<LocaleAndTranslation>>>
    val listType = List::class.asTypeName().parameterizedBy(localeAndTranslationClassName)
    val lazyListType = Lazy::class.asTypeName().parameterizedBy(listType)
    val mapType = Map::class.asTypeName()
        .parameterizedBy(String::class.asTypeName(), lazyListType)

    // arguments for the map's format string: (key, list code block) for every language
    val mapArgs = mutableListOf<Any>()
    for ((languageFrom, translations) in parsedData) {
        // arguments for the list's format string: (type, locale, name, normalized name) per item
        val listArgs = mutableListOf<Any>()
        for ((locale, translation) in translations) {
            listArgs.add(localeAndTranslationClassName)
            listArgs.add(locale)
            listArgs.add(translation)
            listArgs.add(nfkdNormalize(translation))
        }

        mapArgs.add(languageFrom)
        mapArgs.add(
            CodeBlock.of(
                "listOf(${"%T(%S, %S, %S),".repeat(translations.size)})",
                *listArgs.toTypedArray(),
            )
        )
    }

    return PropertySpec.builder("languageToData", mapType)
        .addModifiers(KModifier.PRIVATE)
        .initializer(
            "mapOf(${"%S to lazy { %L },".repeat(parsedData.size)})",
            *mapArgs.toTypedArray(),
        )
        .build()
}
/**
 * Builds the `get` operator of the generated object: it takes a `language` string and returns
 * the list stored in `languageToData` for it, or `null` when the map has no such key.
 */
private fun generateGetOperator(localeAndTranslationClassName: ClassName): FunSpec {
    // List<LocaleAndTranslation>?
    val nullableListType = List::class.asTypeName()
        .parameterizedBy(localeAndTranslationClassName)
        .copy(nullable = true)

    val getOperator = FunSpec.builder("get")
    getOperator.addModifiers(KModifier.OPERATOR)
    getOperator.addParameter("language", String::class)
    getOperator.returns(nullableListType)
    getOperator.addCode("return languageToData[language]?.value")
    return getOperator.build()
}

View File

@ -0,0 +1,8 @@
package org.stypox.dicio.unicodeCldrPlugin.util
/** URI of the git repository the CLDR data is cloned from. */
const val CLDR_REPO = "https://github.com/unicode-org/cldr"

/** Where the CLDR repository is cloned, relative to the root project directory. */
const val CLDR_CHECKOUT_PATH = "checkouts/cldr/"

/** Directory containing the per-locale XML files, relative to the CLDR checkout. */
const val CLDR_LANGUAGES_PATH = "common/main/"

/** Package of the generated Kotlin file. */
const val PACKAGE_NAME = "org.stypox.dicio.cldr"

/** Name of the generated object (and of the generated file). */
const val CLASS_NAME = "CldrLanguages"

/** Comment placed at the top of the generated file; names the task that actually produces it. */
const val FILE_COMMENT = "File autogenerated by UnicodeCldrLanguagesTask"

View File

@ -0,0 +1,8 @@
package org.stypox.dicio.unicodeCldrPlugin.util
/**
 * Exception used by the Unicode CLDR plugin to report failures, e.g. when the CLDR XML files
 * cannot be listed. Its constructors forward to the matching [Exception] constructors.
 */
class UnicodeCldrPluginException : Exception {
constructor() : super()
constructor(message: String) : super(message)
constructor(message: String, cause: Throwable) : super(message, cause)
constructor(cause: Throwable) : super(cause)
}