From 67230142bb35c0986016ae8e50ea313f6c8cd7f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20P=C3=B6sel?= Date: Tue, 18 Jul 2023 10:38:08 +0200 Subject: [PATCH] Update Tesseract to 5.3.2 --- .../src/main/cpp/tesseract/src/.mailmap | 3 + .../src/main/cpp/tesseract/src/CMakeLists.txt | 39 +++++-- .../src/main/cpp/tesseract/src/ChangeLog | 17 +++ .../src/main/cpp/tesseract/src/Makefile.am | 5 + .../src/main/cpp/tesseract/src/VERSION | 2 +- .../tesseract/src/cmake/CheckFunctions.cmake | 49 ++++++++ .../src/main/cpp/tesseract/src/configure.ac | 2 +- .../cpp/tesseract/src/snap/snapcraft.yaml | 16 +-- .../cpp/tesseract/src/src/api/baseapi.cpp | 4 +- .../cpp/tesseract/src/src/ccmain/control.cpp | 12 -- .../cpp/tesseract/src/src/ccmain/osdetect.cpp | 4 +- .../cpp/tesseract/src/src/ccmain/paramsd.cpp | 12 +- .../cpp/tesseract/src/src/ccmain/pgedit.cpp | 25 +++-- .../tesseract/src/src/ccstruct/dppoint.cpp | 2 +- .../cpp/tesseract/src/src/ccutil/errcode.cpp | 33 ++---- .../tesseract/src/src/ccutil/unicharset.cpp | 4 +- .../src/src/opencl/openclwrapper.cpp | 11 +- .../main/cpp/tesseract/src/src/tesseract.cpp | 7 ++ .../cpp/tesseract/src/src/textord/edgblob.cpp | 2 +- .../src/src/training/lstmtraining.cpp | 6 +- .../src/src/training/pango/boxchar.cpp | 4 +- .../src/training/unicharset/lstmtester.cpp | 14 ++- .../src/training/unicharset/lstmtrainer.cpp | 105 +++++++++--------- .../src/src/training/unicharset/lstmtrainer.h | 17 +-- .../training/unicharset/validate_myanmar.cpp | 10 +- .../tesseract/src/src/wordrec/findseam.cpp | 4 +- .../cpp/tesseract/src/unittest/lstm_test.h | 2 +- 27 files changed, 250 insertions(+), 161 deletions(-) create mode 100644 tesseract4android/src/main/cpp/tesseract/src/cmake/CheckFunctions.cmake diff --git a/tesseract4android/src/main/cpp/tesseract/src/.mailmap b/tesseract4android/src/main/cpp/tesseract/src/.mailmap index 57dbd36..9a4bbd1 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/.mailmap +++ b/tesseract4android/src/main/cpp/tesseract/src/.mailmap @@ -2,6 +2,9 @@ Amit Dovev Egor Pugin +Jeff Breidenbach +Jeff Breidenbach + Jim O'Regan Jim O'Regan Jim O'Regan diff --git a/tesseract4android/src/main/cpp/tesseract/src/CMakeLists.txt b/tesseract4android/src/main/cpp/tesseract/src/CMakeLists.txt index db4c39d..70dfd0e 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/CMakeLists.txt +++ b/tesseract4android/src/main/cpp/tesseract/src/CMakeLists.txt @@ -96,6 +96,7 @@ option(ENABLE_NATIVE option(BUILD_TRAINING_TOOLS "Build training tools" ON) option(BUILD_TESTS "Build tests" OFF) option(USE_SYSTEM_ICU "Use system ICU" OFF) +option(DISABLE_TIFF "Disable build with libtiff (if available)" OFF) option(DISABLE_ARCHIVE "Disable build with libarchive (if available)" OFF) option(DISABLE_CURL "Disable build with libcurl (if available)" OFF) option(INSTALL_CONFIGS "Install tesseract configs" ON) @@ -323,7 +324,7 @@ if(OPENMP_BUILD) if(NOT OpenMP_FOUND AND CLANG AND WIN32) - # workaroung because find_package(OpenMP) does not work for clang-cl + # workaround because find_package(OpenMP) does not work for clang-cl # https://gitlab.kitware.com/cmake/cmake/issues/19404 check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) find_library(OpenMP_LIBRARY NAMES omp libomp.lib) @@ -371,6 +372,7 @@ add_definitions("-DCMAKE_BUILD") # packages # # ############################################################################## +include(CheckFunctions) if(SW_BUILD) find_package(SW REQUIRED) @@ -397,21 +399,37 @@ else() endif() if(NOT Leptonica_FOUND) message(FATAL_ERROR "Cannot find required library Leptonica. Quitting!") + else() + message(STATUS "Found leptonica version: ${Leptonica_VERSION}") endif(NOT Leptonica_FOUND) include_directories(${Leptonica_INCLUDE_DIRS}) - # Check for optional libraries. - find_package(TIFF) # for tesseract - if(NOT TIFF_FOUND AND PKG_CONFIG_EXECUTABLE) - # try PKG_CONFIG to find libtiff if cmake failed - pkg_check_modules(TIFF libtiff-4) + check_leptonica_tiff_support() + if (NOT LEPT_TIFF_RESULT EQUAL 0) + message(NOTICE "Leptonica was build without TIFF support! Disabling TIFF support...") + set(DISABLE_TIFF ON) + else() + message(STATUS "Leptonica was build with TIFF support.") endif() - if(TIFF_FOUND) - set(HAVE_TIFFIO_H ON) - include_directories(${TIFF_INCLUDE_DIRS}) - endif(TIFF_FOUND) + + # Check for optional libraries. + if(DISABLE_TIFF) + set(HAVE_TIFFIO_H OFF) + message(STATUS "TIFF support disabled.") + else(DISABLE_TIFF) + find_package(TIFF) # for tesseract + if(NOT TIFF_FOUND AND PKG_CONFIG_EXECUTABLE) + # try PKG_CONFIG to find libtiff if cmake failed + pkg_check_modules(TIFF libtiff-4) + endif() + if(TIFF_FOUND) + set(HAVE_TIFFIO_H ON) + include_directories(${TIFF_INCLUDE_DIRS}) + endif(TIFF_FOUND) + endif(DISABLE_TIFF) if(DISABLE_ARCHIVE) set(HAVE_LIBARCHIVE OFF) + message(STATUS "LibArchive support disabled.") else(DISABLE_ARCHIVE) find_package(LibArchive) if(NOT LibArchive_FOUND AND PKG_CONFIG_EXECUTABLE) @@ -425,6 +443,7 @@ else() endif(DISABLE_ARCHIVE) if(DISABLE_CURL) set(HAVE_LIBCURL OFF) + message(STATUS "CURL support disabled.") else(DISABLE_CURL) find_package(CURL) if(NOT CURL_FOUND AND PKG_CONFIG_EXECUTABLE) diff --git a/tesseract4android/src/main/cpp/tesseract/src/ChangeLog b/tesseract4android/src/main/cpp/tesseract/src/ChangeLog index d8d5258..f687401 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/ChangeLog +++ b/tesseract4android/src/main/cpp/tesseract/src/ChangeLog @@ -1,3 +1,20 @@ +2023-07-11 - V5.3.2 +* Updates for snap package building. +* Support for Sgaw and W Pwo Karen languages in the Myanmar validator (#4065). +* Improve format of logging from lstmtraining. +* Use less digits in filenames of checkpoints written by lstmtraining. +* Replace deprecated sprintf. +* Remove unused code in function fix_rep_char. +* Avoid 32 bit overflow in multiplication (fixes 3 CodeQL CI alerts). +* Avoid conversions from std::string to char* to std::string. +* Abort with error message if OSD is requested with LSTM-only model. +* cmake: allow to disable tiff (-DDISABLE_TIFF=ON). +* cmake: provide info about disabled LibArchive and CURL. +* cmake: check if leptonica was build with tiff support. +* Remove old broken GitHub action vcpkg-4.1.1 (fixes issue #4078). +* Create config.yml. +* Fix typos. + 2023-04-01 - V5.3.1 * Bug fixes for some special scenarios: * Fix issue #4010. diff --git a/tesseract4android/src/main/cpp/tesseract/src/Makefile.am b/tesseract4android/src/main/cpp/tesseract/src/Makefile.am index f9c48de..b0e0bae 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/Makefile.am +++ b/tesseract4android/src/main/cpp/tesseract/src/Makefile.am @@ -734,10 +734,15 @@ bin_PROGRAMS = tesseract tesseract_SOURCES = src/tesseract.cpp tesseract_CPPFLAGS = tesseract_CPPFLAGS += -I$(top_srcdir)/src/arch +tesseract_CPPFLAGS += -I$(top_srcdir)/src/ccmain tesseract_CPPFLAGS += -I$(top_srcdir)/src/ccstruct tesseract_CPPFLAGS += -I$(top_srcdir)/src/ccutil +tesseract_CPPFLAGS += -I$(top_srcdir)/src/classify +tesseract_CPPFLAGS += -I$(top_srcdir)/src/cutil tesseract_CPPFLAGS += -I$(top_srcdir)/src/dict +tesseract_CPPFLAGS += -I$(top_srcdir)/src/textord tesseract_CPPFLAGS += -I$(top_srcdir)/src/viewer +tesseract_CPPFLAGS += -I$(top_srcdir)/src/wordrec if OPENCL tesseract_CPPFLAGS += -I$(top_srcdir)/src/opencl endif diff --git a/tesseract4android/src/main/cpp/tesseract/src/VERSION b/tesseract4android/src/main/cpp/tesseract/src/VERSION index c7cb131..84197c8 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/VERSION +++ b/tesseract4android/src/main/cpp/tesseract/src/VERSION @@ -1 +1 @@ -5.3.1 +5.3.2 diff --git a/tesseract4android/src/main/cpp/tesseract/src/cmake/CheckFunctions.cmake b/tesseract4android/src/main/cpp/tesseract/src/cmake/CheckFunctions.cmake new file mode 100644 index 0000000..0c15d8d --- /dev/null +++ b/tesseract4android/src/main/cpp/tesseract/src/cmake/CheckFunctions.cmake @@ -0,0 +1,49 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by +# applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. +# ############################################################################## +# +# macros and functions +# +# ############################################################################## + +# ############################################################################## +# FUNCTION check_leptonica_tiff_support +# ############################################################################## +function(check_leptonica_tiff_support) + # check if leptonica was build with tiff support set result to + # LEPT_TIFF_RESULT + set(TIFF_TEST + "#include \n" + "int main() {\n" + " l_uint8 *data = NULL;\n" + " size_t size = 0;\n" + " PIX* pix = pixCreate(3, 3, 4);\n" + " l_int32 ret_val = pixWriteMemTiff(&data, &size, pix, IFF_TIFF_G3);\n" + " pixDestroy(&pix);\n" + " lept_free(data);\n" + " return ret_val;}\n") + set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE}) + try_run( + LEPT_TIFF_RESULT + LEPT_TIFF_COMPILE + SOURCE_FROM_CONTENT tiff_test.cpp "${TIFF_TEST}" + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${Leptonica_INCLUDE_DIRS}" + LINK_LIBRARIES ${Leptonica_LIBRARIES} + COMPILE_OUTPUT_VARIABLE + COMPILE_OUTPUT) + if(NOT LEPT_TIFF_COMPILE) + message(STATUS "COMPILE_OUTPUT: ${COMPILE_OUTPUT}") + message(STATUS "Leptonica_INCLUDE_DIRS: ${Leptonica_INCLUDE_DIRS}") + message(STATUS "Leptonica_LIBRARIES: ${Leptonica_LIBRARIES}") + message(STATUS "LEPT_TIFF_RESULT: ${LEPT_TIFF_RESULT}") + message(STATUS "LEPT_TIFF_COMPILE: ${LEPT_TIFF_COMPILE}") + message(WARNING "Failed to compile test") + endif() +endfunction(check_leptonica_tiff_support) + +# ############################################################################## diff --git a/tesseract4android/src/main/cpp/tesseract/src/configure.ac b/tesseract4android/src/main/cpp/tesseract/src/configure.ac index efbb2b7..029d37c 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/configure.ac +++ b/tesseract4android/src/main/cpp/tesseract/src/configure.ac @@ -29,7 +29,7 @@ AM_INIT_AUTOMAKE([foreign subdir-objects nostdinc]) # Define date of package, etc. Could be useful in auto-generated # documentation. PACKAGE_YEAR=2023 -PACKAGE_DATE="04/01" +PACKAGE_DATE="07/11" abs_top_srcdir=`AS_DIRNAME([$0])` diff --git a/tesseract4android/src/main/cpp/tesseract/src/snap/snapcraft.yaml b/tesseract4android/src/main/cpp/tesseract/src/snap/snapcraft.yaml index 64585c6..b677055 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/snap/snapcraft.yaml +++ b/tesseract4android/src/main/cpp/tesseract/src/snap/snapcraft.yaml @@ -13,13 +13,13 @@ description: | grade: stable # must be 'stable' to release into candidate/stable channels confinement: strict +base: core22 apps: tesseract: - command: > - env - TESSDATA_PREFIX=$SNAP_USER_COMMON - tesseract + command: usr/local/bin/tesseract + environment: + TESSDATA_PREFIX: $SNAP_USER_COMMON plugs: - home - removable-media @@ -30,9 +30,9 @@ parts: plugin: autotools build-packages: - pkg-config - - libpng12-dev - - libjpeg8-dev - - libtiff5-dev + - libpng-dev + - libjpeg-dev + - libtiff-dev - zlib1g-dev - libicu-dev - libpango1.0-dev @@ -41,7 +41,7 @@ parts: - libgomp1 after: [leptonica] leptonica: - source: https://github.com/DanBloomberg/leptonica/archive/1.74.2.tar.gz + source: https://github.com/DanBloomberg/leptonica/archive/1.83.1.tar.gz plugin: autotools stage-packages: - libjbig0 diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/api/baseapi.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/api/baseapi.cpp index f78894c..11398f5 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/api/baseapi.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/api/baseapi.cpp @@ -412,7 +412,7 @@ int TessBaseAPI::Init(const char *data, int data_size, const char *language, Ocr if (data_size != 0) { mgr.LoadMemBuffer(language, data, data_size); } - if (tesseract_->init_tesseract(datapath.c_str(), output_file_.c_str(), language, oem, configs, + if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs, configs_size, vars_vec, vars_values, set_only_non_debug_params, &mgr) != 0) { return -1; @@ -2176,7 +2176,7 @@ int TessBaseAPI::FindLines() { " but data path is undefined\n"); delete osd_tesseract_; osd_tesseract_ = nullptr; - } else if (osd_tesseract_->init_tesseract(datapath_.c_str(), "", "osd", OEM_TESSERACT_ONLY, + } else if (osd_tesseract_->init_tesseract(datapath_, "", "osd", OEM_TESSERACT_ONLY, nullptr, 0, nullptr, nullptr, false, &mgr) == 0) { osd_tess = osd_tesseract_; osd_tesseract_->set_source_resolution(thresholder_->GetSourceYResolution()); diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/control.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/control.cpp index 6de25d3..d6da06d 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/control.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/control.cpp @@ -1684,18 +1684,6 @@ void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) { } word_res->done = true; - // Measure the mean space. - int gap_count = 0; - WERD *werd = word_res->word; - C_BLOB_IT blob_it(werd->cblob_list()); - C_BLOB *prev_blob = blob_it.data(); - for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) { - C_BLOB *blob = blob_it.data(); - int gap = blob->bounding_box().left(); - gap -= prev_blob->bounding_box().right(); - ++gap_count; - prev_blob = blob; - } // Just correct existing classification. CorrectRepcharChoices(best_choice, word_res); word_res->reject_map.initialise(word.length()); diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/osdetect.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/osdetect.cpp index daee2b4..dcc1aa5 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/osdetect.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/osdetect.cpp @@ -460,7 +460,7 @@ ScriptDetector::ScriptDetector(const std::vector *allowed_scripts, OSResult // adding this blob. void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) { for (int i = 0; i < 4; ++i) { - bool done[kMaxNumberOfScripts] = {false}; + std::vector done(kMaxNumberOfScripts); BLOB_CHOICE_IT choice_it; choice_it.set_to_list(scores + i); @@ -488,7 +488,7 @@ void ScriptDetector::detect_blob(BLOB_CHOICE_LIST *scores) { } } // Script already processed before. - if (done[id]) { + if (done.at(id)) { continue; } done[id] = true; diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/paramsd.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/paramsd.cpp index 60de457..85e596d 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/paramsd.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/paramsd.cpp @@ -32,7 +32,7 @@ # include "svmnode.h" // for SVMenuNode # include "tesseractclass.h" // for Tesseract -# include // for fclose, fopen, fprintf, sprintf, FILE +# include // for fclose, fopen, fprintf, FILE # include // for atoi # include // for strcmp, strcspn, strlen, strncpy # include // for std::locale::classic @@ -319,16 +319,12 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) { // Write all (changed_) parameters to a config file. void ParamsEditor::WriteParams(char *filename, bool changes_only) { FILE *fp; // input file - char msg_str[255]; // if file exists if ((fp = fopen(filename, "rb")) != nullptr) { fclose(fp); - sprintf(msg_str, - "Overwrite file " - "%s" - "? (Y/N)", - filename); - int a = sv_window_->ShowYesNoDialog(msg_str); + std::stringstream msg; + msg << "Overwrite file " << filename << "? (Y/N)"; + int a = sv_window_->ShowYesNoDialog(msg.str().c_str()); if (a == 'n') { return; } // don't write diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/pgedit.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/pgedit.cpp index 9e4902b..dd23985 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/pgedit.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccmain/pgedit.cpp @@ -36,6 +36,9 @@ #include #include +#include // for std::setprecision +#include // for std::locale::classic +#include // for std::stringstream #ifndef GRAPHICS_DISABLED namespace tesseract { @@ -140,32 +143,30 @@ static void show_point(PAGE_RES *page_res, float x, float y) { FCOORD pt(x, y); PAGE_RES_IT pr_it(page_res); - const int kBufsize = 512; - char msg[kBufsize]; - char *msg_ptr = msg; - - msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y); + std::stringstream msg; + msg.imbue(std::locale::classic()); + msg << std::fixed << std::setprecision(3) << "Pt:(" << x << ", " << y << ") "; for (WERD_RES *word = pr_it.word(); word != nullptr; word = pr_it.forward()) { if (pr_it.row() != pr_it.prev_row() && pr_it.row()->row->bounding_box().contains(pt)) { - msg_ptr += sprintf(msg_ptr, "BL(x)=%0.3f ", pr_it.row()->row->base_line(x)); + msg << "BL(x)=" << pr_it.row()->row->base_line(x) << ' '; } if (word->word->bounding_box().contains(pt)) { TBOX box = word->word->bounding_box(); - msg_ptr += sprintf(msg_ptr, "Wd(%d, %d)/(%d, %d) ", box.left(), box.bottom(), box.right(), - box.top()); + msg << "Wd(" << box.left() << ", " << box.bottom() << ")/(" + << box.right() << ", " << box.top() << ") "; C_BLOB_IT cblob_it(word->word->cblob_list()); for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { C_BLOB *cblob = cblob_it.data(); box = cblob->bounding_box(); if (box.contains(pt)) { - msg_ptr += sprintf(msg_ptr, "CBlb(%d, %d)/(%d, %d) ", box.left(), box.bottom(), - box.right(), box.top()); + msg << "CBlb(" << box.left() << ", " << box.bottom() << ")/(" + << box.right() << ", " << box.top() << ") "; } } } } - image_win->AddMessage(msg); + image_win->AddMessage(msg.str().c_str()); } /** @@ -622,7 +623,7 @@ void Tesseract::process_image_event( // action in image win break; default: - sprintf(msg, "Mode %d not yet implemented", mode); + snprintf(msg, sizeof(msg), "Mode %d not yet implemented", mode); image_win->AddMessage(msg); break; } diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccstruct/dppoint.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccstruct/dppoint.cpp index 68f8f94..30e174d 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccstruct/dppoint.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccstruct/dppoint.cpp @@ -76,7 +76,7 @@ int64_t DPPoint::CostWithVariance(const DPPoint *prev) { int delta = this - prev; int32_t n = prev->n_ + 1; int32_t sig_x = prev->sig_x_ + delta; - int64_t sig_xsq = prev->sig_xsq_ + delta * delta; + int64_t sig_xsq = prev->sig_xsq_ + static_cast(delta) * delta; int64_t cost = (sig_xsq - sig_x * sig_x / n) / n; cost += prev->total_cost_; UpdateIfBetter(cost, prev->total_steps_ + 1, prev, n, sig_x, sig_xsq); diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/errcode.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/errcode.cpp index dddc123..e6b05c2 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/errcode.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/errcode.cpp @@ -22,6 +22,8 @@ #include #include #include +#include // for std::cerr +#include // for std::stringstream namespace tesseract { @@ -41,37 +43,26 @@ void ERRCODE::error( // handle error const char *format, ... // special message ) const { va_list args; // variable args - char msg[MAX_MSG]; - char *msgptr = msg; + std::stringstream msg; if (caller != nullptr) { // name of caller - msgptr += sprintf(msgptr, "%s:", caller); + msg << caller << ':'; } // actual message - msgptr += sprintf(msgptr, "Error:%s", message); + msg << "Error:" << message; if (format != nullptr) { - msgptr += sprintf(msgptr, ":"); + char str[MAX_MSG]; va_start(args, format); // variable list -#ifdef _WIN32 - // print remainder - msgptr += _vsnprintf(msgptr, MAX_MSG - 2 - (msgptr - msg), format, args); - msg[MAX_MSG - 2] = '\0'; // ensure termination - strcat(msg, "\n"); -#else - // print remainder - msgptr += vsprintf(msgptr, format, args); - // no specific - msgptr += sprintf(msgptr, "\n"); -#endif + // print remainder + std::vsnprintf(str, sizeof(str), format, args); + // ensure termination + str[sizeof(str) - 1] = '\0'; va_end(args); - } else { - // no specific - msgptr += sprintf(msgptr, "\n"); + msg << ':' << str; } - // %s is needed here so msg is printed correctly! - fprintf(stderr, "%s", msg); + std::cerr << msg.str() << '\n'; switch (action) { case DBG: diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/unicharset.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/unicharset.cpp index 7f06f7c..828c528 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/unicharset.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/ccutil/unicharset.cpp @@ -314,10 +314,10 @@ std::string UNICHARSET::debug_utf8_str(const char *str) { step = UNICHAR::utf8_step(str + i); if (step == 0) { step = 1; - sprintf(hex, "%x", str[i]); + snprintf(hex, sizeof(hex), "%x", str[i]); } else { UNICHAR ch(str + i, step); - sprintf(hex, "%x", ch.first_uni()); + snprintf(hex, sizeof(hex), "%x", ch.first_uni()); } result += hex; result += " "; diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/opencl/openclwrapper.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/opencl/openclwrapper.cpp index 79817f2..15c477c 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/opencl/openclwrapper.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/opencl/openclwrapper.cpp @@ -812,7 +812,8 @@ int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) { cl_int clStatus; int status = 0; FILE *fd = nullptr; - char fileName[256] = {0}, cl_name[128] = {0}; + char fileName[256]; + char cl_name[128]; char deviceName[1024]; clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); @@ -820,7 +821,7 @@ int OpenclDevice::BinaryGenerated(const char *clFileName, FILE **fhandle) { const char *str = strstr(clFileName, ".cl"); memcpy(cl_name, clFileName, str - clFileName); cl_name[str - clFileName] = '\0'; - sprintf(fileName, "%s-%s.bin", cl_name, deviceName); + snprintf(fileName, sizeof(fileName), "%s-%s.bin", cl_name, deviceName); legalizeFileName(fileName); fd = fopen(fileName, "rb"); status = (fd != nullptr) ? 1 : 0; @@ -894,9 +895,9 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clF /* dump out each binary into its own separate file. */ for (i = 0; i < numDevices; i++) { - char fileName[256] = {0}, cl_name[128] = {0}; - if (binarySizes[i] != 0) { + char fileName[256]; + char cl_name[128]; char deviceName[1024]; clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); @@ -905,7 +906,7 @@ int OpenclDevice::GeneratBinFromKernelSource(cl_program program, const char *clF const char *str = strstr(clFileName, ".cl"); memcpy(cl_name, clFileName, str - clFileName); cl_name[str - clFileName] = '\0'; - sprintf(fileName, "%s-%s.bin", cl_name, deviceName); + snprintf(fileName, sizeof(fileName), "%s-%s.bin", cl_name, deviceName); legalizeFileName(fileName); if (!WriteBinaryToFile(fileName, binaries[i], binarySizes[i])) { tprintf("[OD] write binary[%s] failed\n", fileName); diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/tesseract.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/tesseract.cpp index e0697aa..4808155 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/tesseract.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/tesseract.cpp @@ -39,6 +39,7 @@ #endif #include #include "simddetect.h" +#include "tesseractclass.h" // for AnyTessLang #include "tprintf.h" // for tprintf #ifdef _OPENMP @@ -787,6 +788,12 @@ int main(int argc, char **argv) { (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b) || (api.GetBoolVariable("tessedit_train_line_recognizer", &b) && b); + if (api.GetPageSegMode() == tesseract::PSM_OSD_ONLY) { + if (!api.tesseract()->AnyTessLang()) { + fprintf(stderr, "Error, OSD requires a model for the legacy engine\n"); + return EXIT_FAILURE; + } + } #ifdef DISABLED_LEGACY_ENGINE auto cur_psm = api.GetPageSegMode(); auto osd_warning = std::string(""); diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/textord/edgblob.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/textord/edgblob.cpp index ee3e156..781b8e9 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/textord/edgblob.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/textord/edgblob.cpp @@ -174,7 +174,7 @@ int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline if (child_count + grandchild_count > max_count) { // too complex if (edges_debug) { tprintf( - "Disgard outline on child_count=%d + grandchild_count=%d " + "Discard outline on child_count=%d + grandchild_count=%d " "> max_count=%d\n", child_count, grandchild_count, max_count); } diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/lstmtraining.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/training/lstmtraining.cpp index a1068bd..d1cae30 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/lstmtraining.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/lstmtraining.cpp @@ -16,6 +16,7 @@ /////////////////////////////////////////////////////////////////////// #include +#include // for std::locale::classic #if defined(__USE_GNU) # include // for feenableexcept #endif @@ -222,9 +223,10 @@ int main(int argc, char **argv) { iteration = trainer.training_iteration()) { trainer.TrainOnLine(&trainer, false); } - std::string log_str; + std::stringstream log_str; + log_str.imbue(std::locale::classic()); trainer.MaintainCheckpoints(tester_callback, log_str); - tprintf("%s\n", log_str.c_str()); + tprintf("%s\n", log_str.str().c_str()); } while (trainer.best_error_rate() > FLAGS_target_error_rate && (trainer.training_iteration() < max_iterations)); tprintf("Finished! Selected model with minimal training error rate (BCER) = %g\n", diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/pango/boxchar.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/training/pango/boxchar.cpp index ec16d74..d31c5a8 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/pango/boxchar.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/pango/boxchar.cpp @@ -278,8 +278,8 @@ bool BoxChar::MostlyVertical(const std::vector &boxes) { int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) { - total_dx += dx * dx; - total_dy += dy * dy; + total_dx += static_cast(dx) * dx; + total_dy += static_cast(dy) * dy; } } } diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtester.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtester.cpp index bd0f222..052460c 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtester.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtester.cpp @@ -16,6 +16,7 @@ /////////////////////////////////////////////////////////////////////// #include "lstmtester.h" +#include // for std::setprecision #include // for std::thread #include "fileio.h" // for LoadFileLinesToStrings @@ -115,14 +116,15 @@ std::string LSTMTester::RunEvalSync(int iteration, const double *training_errors } char_error *= 100.0 / total_pages_; word_error *= 100.0 / total_pages_; - std::string result; + std::stringstream result; + result.imbue(std::locale::classic()); + result << std::fixed << std::setprecision(3); if (iteration != 0 || training_stage != 0) { - result += "At iteration " + std::to_string(iteration); - result += ", stage " + std::to_string(training_stage) + ", "; + result << "At iteration " << iteration + << ", stage " << training_stage << ", "; } - result += "BCER eval=" + std::to_string(char_error); - result += ", BWER eval=" + std::to_string(word_error); - return result; + result << "BCER eval=" << char_error << ", BWER eval=" << word_error; + return result.str(); } // Helper thread function for RunEvalAsync. diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.cpp index 0ebad4d..6e7b780 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.cpp @@ -23,6 +23,8 @@ #endif #include +#include // for std::setprecision +#include // for std::locale::classic #include #include "lstmtrainer.h" @@ -305,7 +307,7 @@ bool LSTMTrainer::LoadAllTrainingData(const std::vector &filenames, // Writes checkpoints at appropriate times and builds and returns a log message // to indicate progress. Returns false if nothing interesting happened. bool LSTMTrainer::MaintainCheckpoints(const TestCallback &tester, - std::string &log_msg) { + std::stringstream &log_msg) { PrepareLogMsg(log_msg); double error_rate = CharError(); int iteration = learning_iteration(); @@ -330,35 +332,34 @@ bool LSTMTrainer::MaintainCheckpoints(const TestCallback &tester, std::vector rec_model_data; if (error_rate < best_error_rate_) { SaveRecognitionDump(&rec_model_data); - log_msg += " New best BCER = " + std::to_string(error_rate); - log_msg += UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); + log_msg << " New best BCER = " << error_rate; + log_msg << UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); // If sub_trainer_ is not nullptr, either *this beat it to a new best, or it // just overwrote *this. In either case, we have finished with it. sub_trainer_.reset(); stall_iteration_ = learning_iteration() + kMinStallIterations; if (TransitionTrainingStage(kStageTransitionThreshold)) { - log_msg += - " Transitioned to stage " + std::to_string(CurrentTrainingStage()); + log_msg << " Transitioned to stage " << CurrentTrainingStage(); } SaveTrainingDump(NO_BEST_TRAINER, *this, &best_trainer_); if (error_rate < error_rate_of_last_saved_best_ * kBestCheckpointFraction) { std::string best_model_name = DumpFilename(); if (!SaveDataToFile(best_trainer_, best_model_name.c_str())) { - log_msg += " failed to write best model:"; + log_msg << " failed to write best model:"; } else { - log_msg += " wrote best model:"; + log_msg << " wrote best model:"; error_rate_of_last_saved_best_ = best_error_rate_; } - log_msg += best_model_name; + log_msg << best_model_name; } } else if (error_rate > worst_error_rate_) { SaveRecognitionDump(&rec_model_data); - log_msg += " New worst BCER = " + std::to_string(error_rate); - log_msg += UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); + log_msg << " New worst BCER = " << error_rate; + log_msg << UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); if (worst_error_rate_ > best_error_rate_ + kMinDivergenceRate && best_error_rate_ < kMinStartedErrorRate && !best_trainer_.empty()) { // Error rate has ballooned. Go back to the best model. - log_msg += "\nDivergence! "; + log_msg << "\nDivergence! "; // Copy best_trainer_ before reading it, as it will get overwritten. std::vector revert_data(best_trainer_); if (ReadTrainingDump(revert_data, *this)) { @@ -382,34 +383,33 @@ bool LSTMTrainer::MaintainCheckpoints(const TestCallback &tester, std::vector checkpoint; if (!SaveTrainingDump(FULL, *this, &checkpoint) || !SaveDataToFile(checkpoint, checkpoint_name_.c_str())) { - log_msg += " failed to write checkpoint."; + log_msg << " failed to write checkpoint."; } else { - log_msg += " wrote checkpoint."; + log_msg << " wrote checkpoint."; } } - log_msg += "\n"; return result; } // Builds a string containing a progress message with current error rates. -void LSTMTrainer::PrepareLogMsg(std::string &log_msg) const { +void LSTMTrainer::PrepareLogMsg(std::stringstream &log_msg) const { LogIterations("At", log_msg); - log_msg += ", Mean rms=" + std::to_string(error_rates_[ET_RMS]); - log_msg += "%, delta=" + std::to_string(error_rates_[ET_DELTA]); - log_msg += "%, BCER train=" + std::to_string(error_rates_[ET_CHAR_ERROR]); - log_msg += "%, BWER train=" + std::to_string(error_rates_[ET_WORD_RECERR]); - log_msg += "%, skip ratio=" + std::to_string(error_rates_[ET_SKIP_RATIO]); - log_msg += "%, "; + log_msg << std::fixed << std::setprecision(3) + << ", mean rms=" << error_rates_[ET_RMS] + << "%, delta=" << error_rates_[ET_DELTA] + << "%, BCER train=" << error_rates_[ET_CHAR_ERROR] + << "%, BWER train=" << error_rates_[ET_WORD_RECERR] + << "%, skip ratio=" << error_rates_[ET_SKIP_RATIO] << "%,"; } // Appends iteration learning_iteration()/training_iteration()/ // sample_iteration() to the log_msg. void LSTMTrainer::LogIterations(const char *intro_str, - std::string &log_msg) const { - log_msg += intro_str; - log_msg += " iteration " + std::to_string(learning_iteration()); - log_msg += "/" + std::to_string(training_iteration()); - log_msg += "/" + std::to_string(sample_iteration()); + std::stringstream &log_msg) const { + log_msg << intro_str + << " iteration " << learning_iteration() + << "/" << training_iteration() + << "/" << sample_iteration(); } // Returns true and increments the training_stage_ if the error rate has just @@ -602,14 +602,14 @@ bool LSTMTrainer::DeSerialize(const TessdataManager *mgr, TFile *fp) { // De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the // learning rates (by scaling reduction, or layer specific, according to // NF_LAYER_SPECIFIC_LR). -void LSTMTrainer::StartSubtrainer(std::string &log_msg) { +void LSTMTrainer::StartSubtrainer(std::stringstream &log_msg) { sub_trainer_ = std::make_unique(); if (!ReadTrainingDump(best_trainer_, *sub_trainer_)) { - log_msg += " Failed to revert to previous best for trial!"; + log_msg << " Failed to revert to previous best for trial!"; sub_trainer_.reset(); } else { - log_msg += " Trial sub_trainer_ from iteration " + - std::to_string(sub_trainer_->training_iteration()); + log_msg << " Trial sub_trainer_ from iteration " + << sub_trainer_->training_iteration(); // Reduce learning rate so it doesn't diverge this time. sub_trainer_->ReduceLearningRates(this, log_msg); // If it fails again, we will wait twice as long before reverting again. @@ -630,14 +630,13 @@ void LSTMTrainer::StartSubtrainer(std::string &log_msg) { // trainer in *this is replaced with sub_trainer_, and STR_REPLACED is // returned. STR_NONE is returned if the subtrainer wasn't good enough to // receive any training iterations. -SubTrainerResult LSTMTrainer::UpdateSubtrainer(std::string &log_msg) { +SubTrainerResult LSTMTrainer::UpdateSubtrainer(std::stringstream &log_msg) { double training_error = CharError(); double sub_error = sub_trainer_->CharError(); double sub_margin = (training_error - sub_error) / sub_error; if (sub_margin >= kSubTrainerMarginFraction) { - log_msg += " sub_trainer=" + std::to_string(sub_error); - log_msg += " margin=" + std::to_string(100.0 * sub_margin); - log_msg += "\n"; + log_msg << " sub_trainer=" << sub_error + << " margin=" << 100.0 * sub_margin << "\n"; // Catch up to current iteration. int end_iteration = training_iteration(); while (sub_trainer_->training_iteration() < end_iteration && @@ -647,11 +646,12 @@ SubTrainerResult LSTMTrainer::UpdateSubtrainer(std::string &log_msg) { while (sub_trainer_->training_iteration() < target_iteration) { sub_trainer_->TrainOnLine(this, false); } - std::string batch_log = "Sub:"; + std::stringstream batch_log("Sub:"); + batch_log.imbue(std::locale::classic()); sub_trainer_->PrepareLogMsg(batch_log); - batch_log += "\n"; - tprintf("UpdateSubtrainer:%s", batch_log.c_str()); - log_msg += batch_log; + batch_log << "\n"; + tprintf("UpdateSubtrainer:%s", batch_log.str().c_str()); + log_msg << batch_log.str(); sub_error = sub_trainer_->CharError(); sub_margin = (training_error - sub_error) / sub_error; } @@ -661,9 +661,8 @@ SubTrainerResult LSTMTrainer::UpdateSubtrainer(std::string &log_msg) { std::vector updated_trainer; SaveTrainingDump(LIGHT, *sub_trainer_, &updated_trainer); ReadTrainingDump(updated_trainer, *this); - log_msg += " Sub trainer wins at iteration " + - std::to_string(training_iteration()); - log_msg += "\n"; + log_msg << " Sub trainer wins at iteration " + << training_iteration() << "\n"; return STR_REPLACED; } return STR_UPDATED; @@ -674,17 +673,16 @@ SubTrainerResult LSTMTrainer::UpdateSubtrainer(std::string &log_msg) { // Reduces network learning rates, either for everything, or for layers // independently, according to NF_LAYER_SPECIFIC_LR. void LSTMTrainer::ReduceLearningRates(LSTMTrainer *samples_trainer, - std::string &log_msg) { + std::stringstream &log_msg) { if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) { int num_reduced = ReduceLayerLearningRates( kLearningRateDecay, kNumAdjustmentIterations, samples_trainer); - log_msg += - "\nReduced learning rate on layers: " + std::to_string(num_reduced); + log_msg << "\nReduced learning rate on layers: " << num_reduced; } else { ScaleLearningRate(kLearningRateDecay); - log_msg += "\nReduced learning rate to :" + std::to_string(learning_rate_); + log_msg << "\nReduced learning rate to :" << learning_rate_; } - log_msg += "\n"; + log_msg << "\n"; } // Considers reducing the learning rate independently for each layer down by @@ -1053,13 +1051,14 @@ void LSTMTrainer::SaveRecognitionDump(std::vector *data) const { // Returns a suitable filename for a training dump, based on the model_base_, // best_error_rate_, best_iteration_ and training_iteration_. std::string LSTMTrainer::DumpFilename() const { - std::string filename; - filename += model_base_.c_str(); - filename += "_" + std::to_string(best_error_rate_); - filename += "_" + std::to_string(best_iteration_); - filename += "_" + std::to_string(training_iteration_); - filename += ".checkpoint"; - return filename; + std::stringstream filename; + filename.imbue(std::locale::classic()); + filename << model_base_ << std::fixed << std::setprecision(3) + << "_" << best_error_rate_ + << "_" << best_iteration_ + << "_" << training_iteration_ + << ".checkpoint"; + return filename.str(); } // Fills the whole error buffer of the given type with the given value. diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.h b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.h index 026e4b1..6481a59 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.h +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/lstmtrainer.h @@ -25,6 +25,7 @@ #include "rect.h" #include // for std::function +#include // for std::stringstream namespace tesseract { @@ -192,7 +193,7 @@ public: // Keeps track of best and locally worst error rate, using internally computed // values. See MaintainCheckpointsSpecific for more detail. - bool MaintainCheckpoints(const TestCallback &tester, std::string &log_msg); + bool MaintainCheckpoints(const TestCallback &tester, std::stringstream &log_msg); // Keeps track of best and locally worst error_rate (whatever it is) and // launches tests using rec_model, when a new min or max is reached. // Writes checkpoints using train_model at appropriate times and builds and @@ -201,12 +202,12 @@ public: bool MaintainCheckpointsSpecific(int iteration, const std::vector *train_model, const std::vector *rec_model, - TestCallback tester, std::string &log_msg); - // Builds a string containing a progress message with current error rates. - void PrepareLogMsg(std::string &log_msg) const; + TestCallback tester, std::stringstream &log_msg); + // Builds a progress message with current error rates. + void PrepareLogMsg(std::stringstream &log_msg) const; // Appends iteration learning_iteration()/training_iteration()/ // sample_iteration() to the log_msg. - void LogIterations(const char *intro_str, std::string &log_msg) const; + void LogIterations(const char *intro_str, std::stringstream &log_msg) const; // TODO(rays) Add curriculum learning. // Returns true and increments the training_stage_ if the error rate has just @@ -226,7 +227,7 @@ public: // De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the // learning rates (by scaling reduction, or layer specific, according to // NF_LAYER_SPECIFIC_LR). - void StartSubtrainer(std::string &log_msg); + void StartSubtrainer(std::stringstream &log_msg); // While the sub_trainer_ is behind the current training iteration and its // training error is at least kSubTrainerMarginFraction better than the // current training error, trains the sub_trainer_, and returns STR_UPDATED if @@ -235,10 +236,10 @@ public: // trainer in *this is replaced with sub_trainer_, and STR_REPLACED is // returned. STR_NONE is returned if the subtrainer wasn't good enough to // receive any training iterations. - SubTrainerResult UpdateSubtrainer(std::string &log_msg); + SubTrainerResult UpdateSubtrainer(std::stringstream &log_msg); // Reduces network learning rates, either for everything, or for layers // independently, according to NF_LAYER_SPECIFIC_LR. - void ReduceLearningRates(LSTMTrainer *samples_trainer, std::string &log_msg); + void ReduceLearningRates(LSTMTrainer *samples_trainer, std::stringstream &log_msg); // Considers reducing the learning rate independently for each layer down by // factor(<1), or leaving it the same, by double-training the given number of // samples and minimizing the amount of changing of sign of weight updates. diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/validate_myanmar.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/validate_myanmar.cpp index 8e97bcc..49443d6 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/validate_myanmar.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/training/unicharset/validate_myanmar.cpp @@ -140,13 +140,21 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { } // Tone mark extensions. ch = codes_[codes_used_].second; - if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 || + if (ch == 0x102c || ch == 0x1038 || ch == kMyanmarAsat || (0x1062 <= ch && ch <= 0x1064) || (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || ch == 0x108f || ch == 0x109a || ch == 0x109b || (0xaa7b <= ch && ch <= 0xaa7d)) { if (UseMultiCode(1)) { return true; } } + // Sgaw tones 0x1062, 0x1063 must be followed by asat. + // W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal). + ch = codes_[codes_used_].second; + if (ch == 0x103a || ch == 0x1037 || ch == 0x1038) { + if (UseMultiCode(1)) { + return true; + } + } return false; } diff --git a/tesseract4android/src/main/cpp/tesseract/src/src/wordrec/findseam.cpp b/tesseract4android/src/main/cpp/tesseract/src/src/wordrec/findseam.cpp index 74a0578..fdc347a 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/src/wordrec/findseam.cpp +++ b/tesseract4android/src/main/cpp/tesseract/src/src/wordrec/findseam.cpp @@ -103,7 +103,6 @@ void Wordrec::add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *s void Wordrec::choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) { SEAM *seam; - char str[80]; float my_priority; /* Add seam of split */ my_priority = priority; @@ -133,7 +132,8 @@ void Wordrec::choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORI seam->FullPriority(bbox.left(), bbox.right(), chop_overlap_knob, chop_centered_maxwidth, chop_center_knob, chop_width_change_knob); if (chop_debug) { - sprintf(str, "Full my_priority %0.0f, ", my_priority); + char str[80]; + snprintf(str, sizeof(str), "Full my_priority %0.0f, ", my_priority); seam->Print(str); } diff --git a/tesseract4android/src/main/cpp/tesseract/src/unittest/lstm_test.h b/tesseract4android/src/main/cpp/tesseract/src/unittest/lstm_test.h index 0b0ff6f..d1de2eb 100644 --- a/tesseract4android/src/main/cpp/tesseract/src/unittest/lstm_test.h +++ b/tesseract4android/src/main/cpp/tesseract/src/unittest/lstm_test.h @@ -103,7 +103,7 @@ protected: int iteration_limit = iteration + max_iterations; double best_error = 100.0; do { - std::string log_str; + std::stringstream log_str; int target_iteration = iteration + kBatchIterations; // Train a few. double mean_error = 0.0;