Update Tesseract to 5.3.3

This commit is contained in:
Robert Pösel 2023-10-17 13:23:56 +02:00
parent 8ae584f545
commit 96389cb655
38 changed files with 1522 additions and 91 deletions

View File

@ -0,0 +1,77 @@
name: Bug Report
description: File a bug report
body:
- type: markdown
attributes:
value: |
### Attention
Before you submit an issue, please review [the guidelines for this repository](https://github.com/tesseract-ocr/tesseract/blob/main/CONTRIBUTING.md).
Have a question? Need help?
Please use [our forum](https://groups.google.com/g/tesseract-ocr).
Please follow these rules:
* Don't open an issue for a [Tesseract version that was released more than a year ago](https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html).
* Don't open an issue which involves 3rd party tools that use Tesseract as a library. Only report about an issue with the Tesseract command line tool or the C/C++ API.
* Please provide the input image.
* Also provide output files (txt and/or tsv, hocr, pdf). You can make a zip archive that will contain these files, so GitHub will let you upload them.
* Don't attach a screenshot of the command line and output. Instead, copy the text and paste it in your bug report.
Windows versions 7, 8, 8.1 are not supported.
- type: textarea
attributes:
label: Current Behavior
- type: textarea
attributes:
label: Expected Behavior
- type: textarea
attributes:
label: Suggested Fix
- type: textarea
attributes:
label: tesseract -v
description: Version info, compiled libraries, SIMD, OpenMP
placeholder: "Please paste the output of the command: tesseract -v"
- type: dropdown
id: os-linux
attributes:
label: Operating System
description: Choose the OS where the bug occurs
multiple: true
options:
- Windows 11
- Windows 10
- macOS 13 Ventura
- macOS 12 Monterey
- macOS 11 Big Sur
- Ubuntu 22.04 Jammy
- Ubuntu 20.04 Focal
- Debian 12 Bookworm
- Debian 11 Bullseye
- RHEL 9
- RHEL 8
- type: textarea
attributes:
label: Other Operating System
placeholder: Enter the name and version of the OS
- type: textarea
attributes:
label: uname -a
placeholder: "Paste the output of the command: uname -a (if available on your system)."
- type: textarea
attributes:
label: Compiler
placeholder: "Enter compiler name and version (Examples: MSVC 2019 16.11, Clang 13.0.1, GCC 11.2, Xcode 14.1)"
- type: textarea
attributes:
label: CPU
placeholder: "Enter your CPU vendor name and model (Examples: Intel Core i7-11700K, AMD Ryzen 7 5800X, Apple Silicon M1)"
- type: textarea
attributes:
label: Virtualization / Containers
placeholder: "Enter the name and version of the VM / container which you use (Examples: Oracle VM VirtualBox 7.0.4, VMware Workstation 17.0, Hyper-V, Docker 20.10.22)"
- type: textarea
attributes:
label: Other Information
placeholder: Add more details here.

View File

@ -0,0 +1,7 @@
name: Feature Request
description: File a feature request
body:
- type: textarea
attributes:
label: Your Feature Request
description: Please look first at the [open issues labeled as 'feature request'](https://github.com/tesseract-ocr/tesseract/labels/feature%20request).

View File

@ -0,0 +1,208 @@
name: autotools-macos
# autotools build of tesseract and training tools on macos homebrew and macports.
# run command line tests, basicapitest and unittests. '--disable-openmp'
on:
#push:
schedule:
- cron: 0 20 * * *
workflow_dispatch:
jobs:
brew:
runs-on: ${{ matrix.config.os }}
strategy:
fail-fast: false
matrix:
config:
- { name: macos-12-clang-14-autotools, os: macos-12, cxx: clang++ }
#- { name: macos-12-gcc-11-autotools, os: macos-12, cxx: g++-11 }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Get fonts, tessdata and langdata required for unit tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Install dependencies
run: |
brew install autoconf automake
brew install leptonica
brew install cairo pango icu4c
brew install cabextract
brew install libarchive curl
- name: Setup Tesseract
run: |
mkdir -p m4
./autogen.sh
- name: Configure Tesseract
run: |
./configure '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'
- name: Make and Install Tesseract
run: |
make -j 8
sudo make install install
- name: Make and Install Training Tools
run: |
make training -j 8
sudo make install training-install
- name: Make and run Unit Tests (clang)
if: startsWith(matrix.config.cxx, 'clang')
run: |
make check
- name: Make and run Unit Tests (unset LANG needed for g++-8, g++-9, g++-10 on macOS)
if: startsWith(matrix.config.cxx, 'g')
shell: bash
run: |
unset LANG LC_ALL LC_CTYPE
locale
make check
- name: Display Version for tesseract, lstmtraining, text2image
run: |
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
- name: List languages in different test tessdata-dir
run: |
tesseract --list-langs --tessdata-dir ../tessdata
tesseract --list-langs --tessdata-dir ../tessdata_best
tesseract --list-langs --tessdata-dir ../tessdata_fast
- name: Run Tesseract on test images in different languages
run: |
tesseract test/testing/phototest.tif - --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/raaj.tif - -l hin --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/viet.tif - -l vie --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/hebrew.png - -l heb --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6 --tessdata-dir ../tessdata
# Compile test/testing/basicapitest.cpp with the matrix compiler and run it.
# pkg-config is pointed at /usr/local/lib/pkgconfig (presumably where the
# earlier `sudo make install` placed the tesseract .pc file; configure is
# invoked without --prefix).
# The framework name uses its canonical spelling "Accelerate", matching the
# equivalent step in the `ports` job; the lowercase spelling presumably only
# links on a case-insensitive filesystem.
- name: Run Tesseract basicapitest
  run: |
    export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
    cd test
    ${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp $(pkg-config --cflags --libs tesseract lept) -pthread -std=c++11 -framework Accelerate
    ./basicapitest
- name: Display Compiler Version
run: |
${{ matrix.config.cxx }} --version
git log -3 --pretty=format:'%h %ad %s | %an'
if: always()
- name: Display Unit Tests Report
run: |
cat test-suite.log
if: always()
# ============================================================================================
ports:
runs-on: ${{ matrix.config.os }}
strategy:
fail-fast: false
matrix:
config:
- { name: macos-12-clang-14-autotools, os: macos-12, cxx: clang++ }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Get fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Install Macports
run: |
curl -LO https://raw.githubusercontent.com/GiovanniBussi/macports-ci/master/macports-ci; source ./macports-ci install
# --remove-brew does not remove the Homebrew entries in bin,
# so remove them now.
rm -v $(brew --prefix)/bin/*
- name: Install Dependencies
run: |
sudo port install autoconf automake libtool pkgconfig
sudo port install leptonica
sudo port install cairo pango
sudo port install icu +devel
sudo port install cabextract libarchive curl
- name: Setup Tesseract
run: |
mkdir -p m4
./autogen.sh
- name: Configure Tesseract
run: |
./configure '--disable-shared' '--disable-openmp' '--disable-doc' '--with-pic' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'
- name: Make and Install Tesseract
run: |
make -j 8
sudo make install install
- name: Make and Install Training Tools
run: |
make training -j 8
sudo make install training-install
- name: Make and run Unit Tests (clang)
if: startsWith(matrix.config.cxx, 'clang')
run: |
make check
- name: Display Version for tesseract, lstmtraining, text2image
run: |
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
- name: List languages in different test tessdata-dir
run: |
tesseract --list-langs --tessdata-dir ../tessdata
tesseract --list-langs --tessdata-dir ../tessdata_best
tesseract --list-langs --tessdata-dir ../tessdata_fast
- name: Run Tesseract on test images in different languages
run: |
tesseract test/testing/phototest.tif - --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/raaj.tif - -l hin --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/viet.tif - -l vie --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/hebrew.png - -l heb --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6 --tessdata-dir ../tessdata
- name: Run Tesseract basicapitest
run: |
export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
cd test
${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/opt/local/include -L/opt/local/lib $(pkg-config --cflags --libs tesseract lept) -pthread -std=c++11 -framework Accelerate
./basicapitest
- name: Display Compiler Version
run: |
${{ matrix.config.cxx }} --version
git log -3 --pretty=format:'%h %ad %s | %an'
if: always()
- name: Display Unit Tests Report
run: |
cat test-suite.log
if: always()

View File

@ -0,0 +1,83 @@
name: autotools-openmp
# autotools on Ubuntu - run benchmark test. '--enable-openmp' no training tools
on:
#push:
#schedule:
# - cron: 0 20 * * *
workflow_dispatch:
jobs:
linux:
runs-on: ${{ matrix.config.os }}
strategy:
fail-fast: false
matrix:
config:
- { name: 20.04-openmp, os: ubuntu-20.04 }
- { name: 22.04-openmp, os: ubuntu-22.04 }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install autoconf libleptonica-dev -y
sudo apt-get install libpango1.0-dev -y
sudo apt-get install cabextract libarchive-dev -y
sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y
- name: Setup Tesseract
run: |
mkdir -p m4
./autogen.sh
- name: Configure Tesseract
run: |
./configure '--disable-shared' '--enable-openmp' '--disable-doc' 'CXX=g++' 'CXXFLAGS=-g -O2'
grep -i OpenMP config.log
- name: Make and Install Tesseract
run: |
make
sudo make install
- name: Setup for Tesseract benchmark using image from issue 263 fifteen times in a list file
run: |
wget -O i263_speed.jpg https://cloud.githubusercontent.com/assets/9968625/13674495/ac261db4-e6ab-11e5-9b4a-ad91d5b4ff87.jpg
printf 'i263_speed.jpg\n%.0s' {1..15} > benchmarks.list
- name: Run Tesseract using image from issue 263 with tessdata_fast
run: |
lscpu
free
g++ --version
tesseract -v
time tesseract benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1
echo "tessdata_fast"
- name: Run Tesseract using image from issue 263 with tessdata_fast and OpenMP Thread Limit
run: |
for lmt in {1..3}; do
time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1 && echo "OMP_THREAD_LIMIT=" $lmt "tessdata_fast"
done
- name: Run Tesseract using image from issue 263 with tessdata_best and OpenMP Thread Limit
run: |
for lmt in {1..3}; do
time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata_best > /dev/null 2>&1 && echo "OMP_THREAD_LIMIT=" $lmt "tessdata_best"
done
- name: Run Tesseract using image from issue 263 with tessdata and OpenMP Thread Limit
run: |
for lmt in {1..3}; do
time OMP_THREAD_LIMIT=$lmt tesseract benchmarks.list - --tessdata-dir ../tessdata > /dev/null 2>&1 && echo "OMP_THREAD_LIMIT=" $lmt "tessdata"
done

View File

@ -0,0 +1,132 @@
name: autotools
# autotools build of tesseract and training tools on Ubuntu.
# run command line tests, basicapitest and unittests. '--disable-openmp'
on:
#push:
schedule:
- cron: 0 20 * * *
jobs:
linux:
runs-on: ${{ matrix.config.os }}
strategy:
fail-fast: false
matrix:
config:
- { name: ubuntu-22.04-clang-15-autotools, os: ubuntu-22.04, cxx: clang++-15 }
- { name: ubuntu-22.04-clang-14-autotools, os: ubuntu-22.04, cxx: clang++-14 } #installed
- { name: ubuntu-22.04-gcc-12-autotools, os: ubuntu-22.04, cxx: g++-12 } #installed
- { name: ubuntu-22.04-gcc-11-autotools, os: ubuntu-22.04, cxx: g++-11 } #installed
- { name: ubuntu-20.04-gcc-10-autotools, os: ubuntu-20.04, cxx: g++-10 } #installed
- { name: ubuntu-20.04-gcc-9-autotools, os: ubuntu-20.04, cxx: g++-9 } #installed
- { name: ubuntu-20.04-gcc-8-autotools, os: ubuntu-20.04, cxx: g++-8 }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Install Compiler
run: |
sudo apt-get update
sudo apt-get install -y ${{ matrix.config.cxx }}
- name: Install dependencies
run: |
sudo apt-get install autoconf libleptonica-dev -y
sudo apt-get install libpango1.0-dev -y
sudo apt-get install cabextract libarchive-dev -y
sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y
- name: Setup Tesseract
run: |
mkdir -p m4
./autogen.sh
- name: Configure Tesseract
run: |
./configure '--disable-shared' '--disable-openmp' '--disable-doc' 'CXX=${{ matrix.config.cxx }}' 'CXXFLAGS=-g -O2'
- name: Make and Install Tesseract
run: |
make -j 8
sudo make install install
- name: Make and Install Training Tools
run: |
make training -j 8
sudo make install training-install
- name: Make and run Unit Tests
run: |
make check
- name: Display Version for tesseract, lstmtraining, text2image
run: |
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
- name: List languages in different test tessdata-dir
run: |
tesseract --list-langs --tessdata-dir ../tessdata
tesseract --list-langs --tessdata-dir ../tessdata_best
tesseract --list-langs --tessdata-dir ../tessdata_fast
- name: Run Tesseract on test images in different languages
run: |
tesseract test/testing/phototest.tif - --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/raaj.tif - -l hin --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/viet.tif - -l vie --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/hebrew.png - -l heb --oem 1 --tessdata-dir ../tessdata
tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6 --tessdata-dir ../tessdata
- name: Run Tesseract basicapitest
run: |
export "PKG_CONFIG_PATH=/usr/local/lib/pkgconfig"
cd test
${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp -I/usr/local/include -L/usr/local/lib `pkg-config --cflags --libs tesseract lept ` -pthread -std=c++11
./basicapitest
- name: Setup for Tesseract benchmark using image from issue 263 fifteen times in a list file
run: |
wget -O i263_speed.jpg https://cloud.githubusercontent.com/assets/9968625/13674495/ac261db4-e6ab-11e5-9b4a-ad91d5b4ff87.jpg
printf 'i263_speed.jpg\n%.0s' {1..15} > benchmarks.list
lscpu
free
tesseract -v
- name: Run Tesseract using image from issue 263 with tessdata_fast
run: |
time tesseract benchmarks.list - --tessdata-dir ../tessdata_fast > /dev/null 2>&1
echo "tessdata_fast - disable-openmp"
- name: Run Tesseract using image from issue 263 with tessdata_best
run: |
time tesseract benchmarks.list - --tessdata-dir ../tessdata_best > /dev/null 2>&1
echo "tessdata_best - disable-openmp"
# Benchmark run against the models in ../tessdata. The step name previously
# said "tessdata_fast", duplicating the name of an earlier step even though
# this step reads ../tessdata and echoes "tessdata - disable-openmp".
# OCR output is discarded; only the timing printed by `time` is of interest.
- name: Run Tesseract using image from issue 263 with tessdata
  run: |
    time tesseract benchmarks.list - --tessdata-dir ../tessdata > /dev/null 2>&1
    echo "tessdata - disable-openmp"
- name: Display Compiler Version
run: |
${{ matrix.config.cxx }} --version
git log -3 --pretty=format:'%h %ad %s | %an'
if: always()
- name: Display Unit Tests Report
run: |
cat test-suite.log
if: always()

View File

@ -0,0 +1,33 @@
name: CIFuzz
# OSS-Fuzz CI
# See https://google.github.io/oss-fuzz/getting-started/continuous-integration/
on:
pull_request:
branches:
- main
paths:
- '**.cpp'
- '**.h'
jobs:
Fuzzing:
runs-on: ubuntu-latest
steps:
- name: Build Fuzzers
id: build
uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
with:
oss-fuzz-project-name: 'tesseract-ocr'
language: c++
dry-run: false
- name: Run Fuzzers
uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
with:
oss-fuzz-project-name: 'tesseract-ocr'
fuzz-seconds: 600
dry-run: false
- name: Upload Crash
uses: actions/upload-artifact@v3
if: failure() && steps.build.outcome == 'success'
with:
name: artifacts
path: ./out/artifacts

View File

@ -0,0 +1,123 @@
# Based on https://github.com/zdenop/tesserocr/actions/runs/691257659/workflow
# Build Tesseract on Windows using cmake. No Training Tools.
name: cmake-win64
on:
#push:
schedule:
- cron: 0 23 * * *
workflow_dispatch:
env:
ILOC: d:/a/local
jobs:
build:
name: cmake-win64
runs-on: windows-latest
steps:
- uses: ilammy/setup-nasm@v1
- name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )"
uses: actions/checkout@v3
with:
submodules: recursive
- run: git fetch --prune --unshallow --tags
- name: Get the version
id: get_version
run: |
$git_info=$(git describe --tags HEAD)
echo "version=${git_info}" >> $env:GITHUB_OUTPUT
- name: Setup Installation Location
run: |
mkdir ${{env.ILOC}}
- name: Uninstall Perl
run: |
choco uninstall strawberryperl
- name: Build and Install zlib-ng
shell: cmd
run: |
git clone --depth 1 https://github.com/zlib-ng/zlib-ng.git
cd zlib-ng
cmake -Bbuild -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DBUILD_SHARED_LIBS=OFF -DZLIB_COMPAT=ON -DZLIB_ENABLE_TESTS=OFF -DINSTALL_UTILS=OFF
cmake --build build --config Release --target install
cd ..
- name: Build and Install libpng
shell: cmd
run: |
curl -sSL -o lpng1639.zip https://download.sourceforge.net/libpng/lpng1639.zip
unzip.exe -qq lpng1639.zip
cd lpng1639
cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DPNG_TESTS=OFF -DPNG_SHARED=OFF
cmake --build build --config Release --target install
cd ..
- name: Build and Install libjpeg
shell: cmd
run: |
git clone --depth 1 https://github.com/libjpeg-turbo/libjpeg-turbo.git
cd libjpeg-turbo
cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DWITH_TURBOJPEG=OFF -DENABLE_SHARED=OFF
cmake --build build --config Release --target install
cd ..
- name: Build and Install jbigkit
shell: cmd
run: |
git clone --depth 1 https://github.com/zdenop/jbigkit.git
cd jbigkit
cmake -Bbuild -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DBUILD_PROGRAMS=OFF -DBUILD_TOOLS=OFF -DCMAKE_WARN_DEPRECATED=OFF
cmake --build build --config Release --target install
cd ..
- name: Build and Install libtiff
shell: cmd
run: |
git clone -c advice.detachedHead=false -b "v4.0.10" --depth 1 https://gitlab.com/libtiff/libtiff.git
cd libtiff
cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -Dtiff-tools=OFF -Dtiff-tests=OFF -Dtiff-contrib=OFF -Dtiff-docs=OFF
cmake --build build --config Release --target install
cd ..
- name: Build and Install leptonica
shell: cmd
run: |
echo "Building leptonica..."
git clone --depth 1 https://github.com/DanBloomberg/leptonica.git
cd leptonica
cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DSW_BUILD=OFF -DBUILD_PROG=OFF -DBUILD_SHARED_LIBS=ON
cmake --build build --config Release --target install
- name: Remove not needed tools Before building tesseract
shell: cmd
run: >
rm -Rf ${{env.ILOC}}/bin/*.exe
- name: Build and Install tesseract
shell: cmd
run: |
cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=${{env.ILOC}} -DCMAKE_INSTALL_PREFIX=${{env.ILOC}} -DSW_BUILD=OFF -DBUILD_SHARED_LIBS=ON -DENABLE_LTO=ON -DBUILD_TRAINING_TOOLS=OFF -DFAST_FLOAT=ON -DGRAPHICS_DISABLED=ON -DOPENMP_BUILD=OFF
cmake --build build --config Release --target install
- name: Upload Build Results
uses: actions/upload-artifact@v3.1.1
with:
name: tesseract-${{ steps.get_version.outputs.version }}-VS2019_win64
path: ${{env.ILOC}}
retention-days: 5
- name: Display Tesseract Version and Test Command Line Usage
shell: cmd
run: |
curl -L https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata --output ${{env.ILOC}}/share/tessdata/eng.traineddata
curl -L https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata --output ${{env.ILOC}}/share/tessdata/osd.traineddata
echo "Setting TESSDATA_PREFIX..."
set TESSDATA_PREFIX=${{env.ILOC}}/share/tessdata
echo "Setting PATH..."
set PATH=${{env.ILOC}}/bin;%PATH%
echo "Checking installed tesseract version..."
tesseract -v
echo "Checking installed langs"
tesseract --list-langs
echo "Checking OCR process"
tesseract test/testing/phototest.tif -

View File

@ -0,0 +1,156 @@
name: cmake
# cmake build of tesseract and training tools on ubuntu and macOS homebrew using Ninja.
# test command line version of tesseract. run basicapitest.
on:
#push:
schedule:
- cron: 0 21 * * *
jobs:
basictests:
name: ${{ matrix.config.name }}
runs-on: ${{ matrix.config.os }}
strategy:
fail-fast: false
matrix:
config:
- { name: macos-12-clang-14-cmake, os: macos-12, cxx: clang++ } # default
- { name: macos-11-clang-13-cmake, os: macos-11, cxx: clang++ } # default
- { name: macos-11-gcc-12-cmake, os: macos-11, cxx: g++-12 } #installed
- { name: ubuntu-22.04-clang-15-cmake, os: ubuntu-22.04, cxx: clang++-15 } #installed
- { name: ubuntu-22.04-clang-14-cmake, os: ubuntu-22.04, cxx: clang++-14 } #installed
- { name: ubuntu-22.04-gcc-12-cmake, os: ubuntu-22.04, cxx: g++-12 } #installed
- { name: ubuntu-22.04-gcc-11-cmake, os: ubuntu-22.04, cxx: g++-11 } #installed
- { name: ubuntu-20.04-gcc-10-cmake, os: ubuntu-20.04, cxx: g++-10 } #installed
- { name: ubuntu-20.04-gcc-9-cmake, os: ubuntu-20.04, cxx: g++-9 } #installed
- { name: ubuntu-20.04-gcc-8-cmake, os: ubuntu-20.04, cxx: g++-8 }
steps:
- name: Install compilers on Linux
run: |
sudo apt-get update
sudo apt-get install ${{ matrix.config.cxx }} -y
if: runner.os == 'Linux'
- name: Install dependencies on Linux
run: |
sudo apt-get install autoconf libleptonica-dev -y
sudo apt-get install libarchive-dev libcurl4-openssl-dev -y
sudo apt-get install libpango1.0-dev -y
sudo apt-get install cabextract -y
sudo apt-get install ninja-build -y
cmake --version
if: runner.os == 'Linux'
- name: Install dependencies on macOS
run: |
brew install autoconf automake
brew install leptonica
# brew install libarchive
brew install pango
brew install icu4c && brew link icu4c
brew install cabextract
brew install ninja
ninja --version
cmake --version
if: runner.os == 'macOS'
- name: Checkout Source
uses: actions/checkout@v3
with:
submodules: recursive
- name: Configure Tesseract (Linux)
run: |
mkdir build
mkdir inst
cmake \
-S . \
-B build \
-G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DOPENMP_BUILD=OFF \
-DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \
-DCMAKE_INSTALL_PREFIX:PATH=inst
if: runner.os == 'Linux'
# Configure the CMake/Ninja build on macOS runners.
# The Homebrew prefix is resolved once and reused, so the icu4c, libarchive
# and libffi pkg-config directories and the icu4c include/lib flags all point
# into the same installation instead of mixing `$(brew --prefix)` with a
# hard-coded /usr/local. A stray leading "/" in front of the libffi entry of
# PKG_CONFIG_PATH has also been removed.
- name: Configure Tesseract (macOS)
  shell: bash
  run: |
    set -e
    brew_prefix="$(brew --prefix)"
    export PKG_CONFIG_PATH="$brew_prefix/opt/icu4c/lib/pkgconfig:$brew_prefix/opt/libarchive/lib/pkgconfig:$brew_prefix/opt/libffi/lib/pkgconfig:$PKG_CONFIG_PATH"
    export LDFLAGS="-L$brew_prefix/opt/icu4c/lib"
    export CPPFLAGS="-I$brew_prefix/opt/icu4c/include"
    mkdir build
    mkdir inst
    cmake \
      -S . \
      -B build \
      -G Ninja \
      -DCMAKE_BUILD_TYPE=Release \
      -DOPENMP_BUILD=OFF \
      -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} \
      -DCMAKE_INSTALL_PREFIX:PATH=inst
  if: runner.os == 'macOS'
- name: Build Tesseract
run: |
cmake --build build --config Release --target install
- name: Display Tesseract Version
run: |
build/inst/bin/tesseract -v
- name: Display Training Tools Version
run: |
build/inst/bin/lstmtraining -v
build/inst/bin/text2image -v
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: List languages in different tessdata-dir
run: |
build/inst/bin/tesseract --list-langs --tessdata-dir ../tessdata
build/inst/bin/tesseract --list-langs --tessdata-dir ../tessdata_best
build/inst/bin/tesseract --list-langs --tessdata-dir ../tessdata_fast
- name: Run Tesseract on test images in different languages
run: |
build/inst/bin/tesseract test/testing/phototest.tif - --oem 1 --tessdata-dir ../tessdata
build/inst/bin/tesseract test/testing/raaj.tif - -l hin --oem 1 --tessdata-dir ../tessdata
build/inst/bin/tesseract test/testing/viet.tif - -l vie --oem 1 --tessdata-dir ../tessdata
build/inst/bin/tesseract test/testing/hebrew.png - -l heb --oem 1 --tessdata-dir ../tessdata
build/inst/bin/tesseract test/testing/eurotext.tif - -l fra --oem 1 --tessdata-dir ../tessdata_best
build/inst/bin/tesseract test/testing/arabic.tif - -l ara --oem 1 --psm 6 --tessdata-dir ../tessdata
- name: Build and run basicapitest (Linux)
run: |
export "PKG_CONFIG_PATH=$GITHUB_WORKSPACE/build/inst/lib/pkgconfig/:$PKG_CONFIG_PATH"
cd test
${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp "-I$GITHUB_WORKSPACE/build/inst/include" "-L$GITHUB_WORKSPACE/build/inst/lib" $(pkg-config --cflags --libs tesseract lept libarchive libcurl) -pthread -std=c++11
./basicapitest
if: runner.os == 'Linux'
- name: Build and run basicapitest (macOS)
run: |
export "PKG_CONFIG_PATH=$GITHUB_WORKSPACE/build/inst/lib/pkgconfig/:$(brew --prefix)/opt/libarchive/lib/pkgconfig:$(brew --prefix)/Library/Homebrew/os/mac/pkgconfig/11:$PKG_CONFIG_PATH"
cd test
${{ matrix.config.cxx }} -o basicapitest testing/basicapitest.cpp "-I$GITHUB_WORKSPACE/build/inst/include" "-L$GITHUB_WORKSPACE/build/inst/lib" $(pkg-config --cflags --libs tesseract lept libcurl) -pthread -std=c++11
./basicapitest
if: runner.os == 'macOS'
- name: Display Compiler Version
run: |
${{ matrix.config.cxx }} --version
pwd
ls -la
# git log -3 --pretty=format:'%h %ad %s | %an'
if: always()

View File

@ -0,0 +1,85 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
on:
push:
branches: [ main ]
paths:
- '**.cpp'
- '**.h'
- '**/codeql-analysis.yml'
- 'm4/*.m4'
- 'Makefile.am'
- 'autogen.sh'
- 'configure.ac'
pull_request:
# The branches below must be a subset of the branches above
branches: [ main ]
paths:
- '**.cpp'
- '**.h'
- '**/codeql-analysis.yml'
- 'm4/*.m4'
- 'Makefile.am'
- 'autogen.sh'
- 'configure.ac'
schedule:
- cron: '34 23 * * 2'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: [ 'cpp' ]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
# Learn more:
# https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install autoconf libleptonica-dev -y
sudo apt-get install libpango1.0-dev -y
sudo apt-get install cabextract libarchive-dev -y
sudo apt-get install libcurl4-openssl-dev libcurl4 curl -y
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
- name: Build
run: |
./autogen.sh
./configure
make all training
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2

View File

@ -0,0 +1,78 @@
name: msys2
# msys2 build for tesseract -head from main branch.
on:
#push:
schedule:
- cron: 0 17 * * *
jobs:
windows:
runs-on: windows-2019
strategy:
fail-fast: false
matrix:
include:
- msystem: MINGW32
mingw_package_prefix: mingw-w64-i686
- msystem: MINGW64
mingw_package_prefix: mingw-w64-x86_64
defaults:
run:
shell: msys2 {0}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- uses: msys2/setup-msys2@v2
with:
msystem: ${{ matrix.msystem }}
install: autoconf automake automake-wrapper git libtool make
- run: pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-gcc
- run: gcc --version
- name: Install dependencies
run: |
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-cairo
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-curl
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-gcc-libs
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-icu
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-leptonica
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-libarchive
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-pango
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-pkg-config
pacman --noconfirm -S ${{ matrix.mingw_package_prefix }}-zlib
- name: Setup Tesseract
run: |
mkdir -p m4
./autogen.sh
# Run ./configure for the MSYS2 build.
# This workflow's matrix defines only `msystem` and `mingw_package_prefix`,
# so the former `CXX=${{ matrix.config.cxx }}` argument referenced a property
# that does not exist here and handed configure an empty CXX value. The
# compiler is now named explicitly; g++ is presumably provided by the
# `<prefix>-gcc` package installed in an earlier step - confirm if the
# package set changes.
- name: Configure Tesseract
  run: |
    ./configure '--disable-shared' '--disable-openmp' '--disable-doc' 'CXX=g++' 'CXXFLAGS=-g -O2'
- name: Build and install Tesseract
run: |
make
make install
- name: Make and install training tools
run: |
make training
make training-install
- name: Display version
run: |
tesseract -v
text2image -v
lstmtraining -v
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Run Tesseract on phototest.tif and devatest.png
run: |
tesseract test/testing/phototest.tif - --tessdata-dir ../tessdata
tesseract test/testing/devatest.png - -l hin+eng --tessdata-dir ../tessdata

View File

@ -0,0 +1,94 @@
# Workflow "sw": builds Tesseract with the sw build tool on Windows, Ubuntu and
# macOS runners and runs the unit tests on the non-Windows runners.
name: sw
# Triggered by pushes and pull requests that touch the listed source paths or
# this workflow file, and once a day by the schedule below.
on:
push:
paths:
- '**.cpp'
- '**.h'
- '**/sw.yml'
- 'unittest/**.c'
- 'unittest/**.cc'
pull_request:
paths:
- '**.cpp'
- '**.h'
- '**/sw.yml'
- 'unittest/**.c'
- 'unittest/**.cc'
schedule:
# every day at 00:00 (GitHub evaluates cron schedules in UTC)
- cron: 0 0 * * *
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
# Let the remaining matrix jobs finish even if one of them fails.
fail-fast: false
matrix:
os: [windows-2022, ubuntu-22.04, macos-12]
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- uses: egorpugin/sw-action@master
# Windows builds: non-PR events build the static and shared variants for the
# x86 and x64 platforms with the `d,r` configurations (presumably debug and
# release); pull requests use the default `./sw build`.
- name: build
if: github.event_name != 'pull_request' && (matrix.os == 'windows-2022')
run: ./sw -static -shared -platform x86,x64 -config d,r build
- name: build-pr
if: github.event_name == 'pull_request' && (matrix.os == 'windows-2022')
run: ./sw build
# Non-Windows builds additionally pass -Dwith-tests=1 (presumably enabling
# the unit test targets that the test steps below run).
- name: build
if: github.event_name != 'pull_request' && (matrix.os != 'windows-2022')
run: ./sw -static -shared -config d,r build -Dwith-tests=1
- name: build-pr
if: github.event_name == 'pull_request' && (matrix.os != 'windows-2022')
run: ./sw build -Dwith-tests=1
# Test data comes from an external repository; its fonts are copied into
# test/testing with the shell available on each platform.
- name: download test data
run: git clone https://github.com/egorpugin/tessdata tessdata_unittest
- name: copy fonts
if: matrix.os != 'windows-2022'
run: cp tessdata_unittest/fonts/* test/testing/
- name: copy fonts
if: matrix.os == 'windows-2022'
run: Copy-Item -Path "tessdata_unittest\fonts\*" -Destination "test\testing" -Recurse
shell: pwsh
# Tests run only on non-Windows runners and skip the lstm and lstm_recode
# tests; continue-on-error keeps a failing test step from failing the job.
- name: test
if: github.event_name != 'pull_request' && (matrix.os != 'windows-2022')
run: ./sw -static -shared -config "d,r" test -Dwith-tests=1 "-Dskip-tests=lstm,lstm_recode"
continue-on-error: true
- name: test
if: github.event_name == 'pull_request' && (matrix.os != 'windows-2022')
run: ./sw test -Dwith-tests=1 "-Dskip-tests=lstm,lstm_recode"
continue-on-error: true
# Scheduled runs on runners that are neither Windows nor macos-12 (i.e. Ubuntu
# in this matrix) run the tests again without the skip list.
- name: test-nightly
if: matrix.os != 'windows-2022' && matrix.os != 'macos-12' && github.event.schedule=='0 0 * * *'
run: ./sw -static -shared -config "d,r" test -Dwith-tests=1
continue-on-error: true
# windows and macos-12 tests hang here for some reason, investigate
#- name: test
#if: matrix.os == 'windows-2022' || matrix.os == 'macos-12'
#run: ./sw test -Dwith-tests=1 "-Dskip-tests=lstm,lstm_recode"
#continue-on-error: true
# Upload and publish the JUnit-style results even when earlier steps failed
# (always()); skipped on Windows, where no tests are run.
- name: Upload Unit Test Results
if: always() && matrix.os != 'windows-2022'
uses: actions/upload-artifact@v3
with:
name: Test Results (${{ matrix.os }})
path: .sw/test/results.xml
- name: Publish Test Report
if: always() && matrix.os != 'windows-2022'
uses: mikepenz/action-junit-report@v3
with:
check_name: test (${{ matrix.os }})
report_paths: .sw/test/results.xml
github_token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -0,0 +1,78 @@
name: unittest-disablelegacy
# Autotools build on Ubuntu; unit tests run with the legacy engine disabled
# (configure --disable-legacy).
# Currently some unit tests are failing with the legacy engine disabled.
on:
#push:
schedule:
# every day at 10:00 (GitHub evaluates cron schedules in UTC)
- cron: 0 10 * * *
jobs:
linux:
runs-on: ${{ matrix.os }}
strategy:
# Let the other compiler's job finish even if one of them fails.
fail-fast: false
matrix:
compiler: [ g++, clang++ ]
os: [ ubuntu-20.04 ]
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install autoconf libleptonica-dev libpango1.0-dev -y
sudo apt-get install cabextract -y
#sudo apt-get install libc++-7-dev libc++abi-7-dev -y
- name: Setup
run: |
mkdir -p m4
./autogen.sh
- name: Configure
run: |
./configure '--disable-shared' '--disable-legacy' 'CXX=${{ matrix.compiler }}'
# Install targets match the other unittest workflows: `install` once for the
# library and CLI, `training-install` once for the training tools.
- name: Make and Install Tesseract
run: |
make -j 8
sudo make install
- name: Make and Install Training Tools
run: |
make training -j 8
sudo make training-install
# Runs whether or not the previous steps succeeded, so the versions are
# visible in the log of a failed build too.
- name: Display Version
run: |
${{ matrix.compiler }} --version
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
# The cloned test data is moved to the parent directory, which is where the
# --tessdata-dir ../tessdata arguments below expect it.
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Run Tesseract on phototest.tif and devatest.png
run: |
tesseract test/testing/phototest.tif - --tessdata-dir ../tessdata
tesseract test/testing/devatest.png - -l hin+eng --tessdata-dir ../tessdata
- name: Make and run Unit Tests
run: |
make check -j 4
# Always print the test log, also when `make check` failed.
- name: Display Unit Tests Report
run: |
git log -3
${{ matrix.compiler }} --version
cat test-suite.log
if: always()

View File

@ -0,0 +1,78 @@
name: unittest-macos
# autotools build on homebrew. unittests with address sanitizers. with openmp.
# NOTE(review): no OpenMP option is passed to configure below; presumably it is
# auto-detected when available - confirm that OpenMP is really enabled here.
on:
#push:
schedule:
# every day at 00:00 (GitHub evaluates cron schedules in UTC)
- cron: 0 0 * * *
jobs:
sanitizers:
name: ${{ matrix.config.name }}
runs-on: ${{ matrix.config.os }}
strategy:
# Let the other configuration finish even if one of them fails.
fail-fast: false
matrix:
# NOTE(review): on macOS `g++` is typically an alias for Apple clang, so the
# "gcc" configuration may not use GCC at all - confirm on the runner image.
config:
- { name: macos-12-clang-unittest, os: macos-12, cxx: clang++ }
- { name: macos-12-gcc-unittest, os: macos-12, cxx: g++ }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
- name: Install dependencies (macOS Homebrew)
run: |
brew install autoconf automake libarchive
brew install leptonica cairo pango
brew install cabextract
- name: Setup
run: |
mkdir -p m4
./autogen.sh
# Static build with the address and undefined-behaviour sanitizers enabled
# through CXXFLAGS.
- name: Configure (macOS Homebrew)
run: |
./configure '--disable-shared' '--with-pic' \
'CXX=${{ matrix.config.cxx }}' \
'CXXFLAGS=-g -O2 -fsanitize=address,undefined'
- name: Make and Install Tesseract
run: |
make
sudo make install
- name: Make and Install Training Tools
run: |
make training
sudo make training-install
# Runs whether or not the previous steps succeeded.
- name: Display Tesseract and Training Tools Version
run: |
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
# The cloned test data is moved to the parent directory, which is where the
# --tessdata-dir ../tessdata arguments below expect it.
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Run Tesseract on phototest.tif and devatest.png
run: |
tesseract test/testing/phototest.tif - --tessdata-dir ../tessdata
tesseract test/testing/devatest.png - -l hin+eng --tessdata-dir ../tessdata
- name: Make and run Unit Tests
run: |
make check
# Always print the test log, also when `make check` failed.
- name: Display Unit Tests Report and compiler version
run: |
cat test-suite.log
${{ matrix.config.cxx }} --version
git log -3 --pretty=format:'%h %ad %s | %an'
if: always()

View File

@ -0,0 +1,85 @@
name: unittest
# Autotools build on Ubuntu. Unit tests with address sanitizers. With OpenMP.
# ubuntu-20.04-gcc-unittest - CI runs out of disk space.
on:
#push:
schedule:
# every day at 00:00 (GitHub evaluates cron schedules in UTC)
- cron: 0 0 * * *
jobs:
sanitizers:
name: ${{ matrix.config.name }}
runs-on: ${{ matrix.config.os }}
strategy:
# Let the other configuration finish even if one of them fails.
fail-fast: false
matrix:
config:
- { name: ubuntu-20.04-gcc-unittest, os: ubuntu-20.04, cxx: g++ }
- { name: ubuntu-22.04-clang-unittest, os: ubuntu-22.04, cxx: clang++ }
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
# Frees runner disk space before building (see the note at the top of this
# file about running out of disk space).
- name: Remove Homebrew, Android and .NET to provide more disk space
run: |
# https://github.com/actions/virtual-environments/issues/2606#issuecomment-772683150
sudo rm -rf /home/linuxbrew # will release Homebrew
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
- name: Install dependencies (Linux)
run: |
sudo apt-get update
sudo apt-get install autoconf libleptonica-dev libpango1.0-dev -y
sudo apt-get install cabextract -y
- name: Setup
run: |
mkdir -p m4
./autogen.sh
# Static build with the address and undefined-behaviour sanitizers enabled
# through CXXFLAGS.
- name: Configure (Linux)
run: |
./configure '--disable-shared' 'CXX=${{ matrix.config.cxx }}' \
'CXXFLAGS=-g -O2 -fsanitize=address,undefined'
- name: Make and Install Tesseract
run: |
${{ matrix.config.cxx }} --version
make
sudo make install
- name: Make and Install Training Tools
run: |
make training
sudo make training-install
# Runs whether or not the previous steps succeeded.
- name: Display Tesseract and Training Tools Version
run: |
tesseract -v
lstmtraining -v
text2image -v
if: success() || failure()
# The cloned test data is moved to the parent directory, which is where the
# --tessdata-dir ../tessdata arguments below expect it.
- name: Download fonts, tessdata and langdata required for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
cp tessdata_unittest/fonts/* test/testing/
mv tessdata_unittest/* ../
- name: Run Tesseract on phototest.tif and devatest.png
run: |
tesseract test/testing/phototest.tif - --tessdata-dir ../tessdata
tesseract test/testing/devatest.png - -l hin+eng --tessdata-dir ../tessdata
- name: Make and run Unit Tests
run: |
make check
# Always print the test log, also when `make check` failed.
- name: Display Unit Tests Report and Compiler Version
run: |
cat test-suite.log
${{ matrix.config.cxx }} --version
git log -3 --pretty=format:'%h %ad %s | %an'
if: always()

View File

@ -0,0 +1,100 @@
name: vcpkg
# Build and test of Tesseract on Windows using vcpkg and cmake.
# vcpkg with -head does not work. https://github.com/microsoft/vcpkg/issues/16019
# NOTE(review): several steps hard-code D:\a\tesseract\tesseract as the
# checkout location; presumably this only holds when the repository is named
# "tesseract" - confirm, or derive the path from GITHUB_WORKSPACE.
on:
#push:
schedule:
# every day at 23:00 (GitHub evaluates cron schedules in UTC)
- cron: 0 23 * * *
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [windows-2019]
steps:
- name: Checkout Tesseract Source (--head from main branch)
uses: actions/checkout@v3
with:
submodules: recursive
# NOTE(review): environment variables set by vcvars64.bat presumably do not
# carry over to later steps, since each step runs in its own shell process -
# confirm whether this step has any effect.
- name: Visual Studio Setup
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
- name: Install vcpkg
run: |
git clone https://github.com/microsoft/vcpkg
vcpkg/bootstrap-vcpkg.bat
vcpkg/vcpkg integrate install
- name: Build and Install Leptonica and image libraries using vcpkg
run: |
vcpkg/vcpkg install leptonica:x64-windows
# Release build without sw, OpenMP and the training tools, using the vcpkg
# toolchain file so that the Leptonica installed above is found.
- name: Configure and Build Tesseract (--head from main branch) with cmake
run: |
cmake . -B build -DCMAKE_BUILD_TYPE=Release -DSW_BUILD=OFF -DOPENMP_BUILD=OFF -DBUILD_TRAINING_TOOLS=OFF "-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake"
cmake --build build --config Release --target install
- name: Display Tesseract Version
run: |
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe --version
# Writes test/CMakeLists.txt for a small program that links against the
# Tesseract build installed above.
# NOTE(review): the generated file links Tesseract::libtesseract twice and
# declares an imported `libtesseract` target that nothing links against -
# looks redundant, confirm before removing.
- name: Create CMakeLists.txt file for basicapitest
shell: bash
run: |
cd test
cat << "EOF" > CMakeLists.txt
cmake_minimum_required(VERSION 3.19)
project( basicapitest )
find_package( Tesseract REQUIRED )
find_package( Leptonica REQUIRED )
include_directories(${Tesseract_INCLUDE_DIRS})
include_directories(${Leptonica_INCLUDE_DIRS})
add_executable( basicapitest testing/basicapitest.cpp )
target_link_libraries(basicapitest ${Leptonica_LIBRARIES})
target_link_libraries(basicapitest Tesseract::libtesseract)
add_library(libtesseract UNKNOWN IMPORTED)
set_property(TARGET libtesseract PROPERTY IMPORTED_LOCATION D:/a/tesseract/tesseract/build/Release/tesseract50.lib)
target_link_libraries(basicapitest Tesseract::libtesseract)
EOF
cat CMakeLists.txt
- name: Configure basicapitest
run: |
cd test
cmake . "-DCMAKE_TOOLCHAIN_FILE=${env:GITHUB_WORKSPACE}/vcpkg/scripts/buildsystems/vcpkg.cmake"
- name: Build basicapitest
run: |
cd test
cmake --build . --config Release
# The cloned test data is moved to the parent directory, which is where the
# --tessdata-dir ..\tessdata* arguments below expect it.
- name: Download tessdata and image files used for tests
run: |
git clone https://github.com/egorpugin/tessdata tessdata_unittest
mv tessdata_unittest/* ../
- name: Run basicapitest
run: |
cd test
D:\a\tesseract\tesseract\test\Release\basicapitest.exe
- name: Run Tesseract CLI on test images in different languages
run: |
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\phototest.tif - --oem 1 --tessdata-dir ..\tessdata
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\raaj.tif - -l hin --oem 1 --tessdata-dir ..\tessdata
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\viet.tif - -l vie --oem 1 --tessdata-dir ..\tessdata
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\hebrew.png - -l heb --oem 1 --tessdata-dir ..\tessdata
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\eurotext.tif - -l fra --oem 1 --tessdata-dir ..\tessdata_best
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe test\testing\arabic.tif - -l ara --oem 1 --psm 6 --tessdata-dir ..\tessdata
- name: List languages in different test tessdata-dir
run: |
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe --list-langs --tessdata-dir ..\tessdata
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe --list-langs --tessdata-dir ..\tessdata_best
D:\a\tesseract\tesseract\build\bin\Release\tesseract.exe --list-langs --tessdata-dir ..\tessdata_fast

View File

@ -20,11 +20,11 @@
publisher = {ACM},
series = {ACM International Conference Proceeding Series},
title = {Adapting the Tesseract Open Source OCR Engine for Multilingual OCR.},
url = {http://www.google.de/research/pubs/archive/35248.pdf},
url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35248.pdf},
year = 2009,
isbn = {978-1-60558-698-4},
date = {2009-07-25}
doi = {http://doi.acm.org/10/1145/1577802.1577804}
date = {2009-07-25},
doi = {http://doi.acm.org/10.1145/1577802.1577804},
location = {Barcelona, Spain},
}
@ -33,7 +33,7 @@
title = {Combined Orientation and Script Detection using the Tesseract OCR Engine},
booktitle = {MOCR '09: Proceedings of the International Workshop on Multilingual OCR},
editor = {Venu Govindaraju and Premkumar Natarajan and Santanu Chaudhury and Daniel P. Lopresti},
url = {http://www.google.de/research/pubs/archive/35506.pdf}
url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35506.pdf},
year = {2009},
isbn = {978-1-60558-698-4},
pages = {1--7},
@ -47,7 +47,7 @@
author = {Ray Smith},
title = {Hybrid Page Layout Analysis via Tab-Stop Detection},
booktitle = {ICDAR '09: Proceedings of the 2009 10th International Conference on Document Analysis and Recognition},
url = {http://www.google.de/research/pubs/archive/35094.pdf}
url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/35094.pdf},
year = {2009},
isbn = {978-0-7695-3725-2},
pages = {241--245},
@ -60,10 +60,11 @@
author = {Ray Smith},
title = {An Overview of the Tesseract OCR Engine},
booktitle = {ICDAR '07: Proceedings of the Ninth International Conference on Document Analysis and Recognition},
url = {http://www.google.de/research/pubs/archive/33418.pdf}
url = {https://storage.googleapis.com/pub-tools-public-publication-data/pdf/33418.pdf},
year = {2007},
isbn = {0-7695-2822-8},
pages = {629--633},
publisher = {IEEE Computer Society},
address = {Washington, DC, USA},
}

View File

@ -258,7 +258,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm.*")
set(HAVE_AVX512F FALSE)
set(HAVE_FMA FALSE)
set(HAVE_SSE4_1 FALSE)
check_cxx_compiler_flag("-mfpu=neon" HAVE_NEON)
if(HAVE_NEON)
set(NEON_COMPILE_FLAGS "-mfpu=neon")
endif(HAVE_NEON)
else()
@ -272,7 +276,6 @@ else()
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86|x86_64|AMD64|amd64|i386|i686")
if(HAVE_NEON)
set(NEON_COMPILE_FLAGS "-mfpu=neon")
message(STATUS "LTO build is not supported on arm/RBPi.")
set(ENABLE_LTO FALSE) # enabling LTO causes a fatal error on arm/RBPi
endif()
@ -405,10 +408,10 @@ else()
include_directories(${Leptonica_INCLUDE_DIRS})
check_leptonica_tiff_support()
if (NOT LEPT_TIFF_RESULT EQUAL 0)
if ((NOT LEPT_TIFF_RESULT EQUAL 0) AND LEPT_TIFF_COMPILE_SUCCESS)
message(NOTICE "Leptonica was build without TIFF support! Disabling TIFF support...")
set(DISABLE_TIFF ON)
else()
elseif(NOT ${CMAKE_VERSION} VERSION_LESS "3.25")
message(STATUS "Leptonica was build with TIFF support.")
endif()

View File

@ -1,3 +1,12 @@
2023-10-05 - V5.3.3
* Small code fixes and improvements to fix Coverity Scan issues.
* Disable -mfpu=neon for aarch64.
* Fix build without git clone in cloned directory (required for FreeBSD).
* Other build fixes for autotools, cmake and sw.
* Fix regression in layout detection which was introduced in release 5.0.0.
* Fix regression which prevented loading of submodels, introduced in release 5.0.0-rc2.
* Other small improvements for code and documentation.
2023-07-11 - V5.3.2
* Updates for snap package building.
* Support for Sgaw and W Pwo Karen languages in the Myanmar validator (#4065).
@ -264,7 +273,7 @@
* Many other fixes, including the way in which the chopper finds chops and messes with the outline while it does so.
2010-11-29 - V3.01
* Removed old/dead serialise/deserialze methods on *LISTIZED classes.
* Removed old/dead serialise/deserialize methods on *LISTIZED classes.
* Total rewrite of DENORM to better encapsulate operation and make
for potential to extract features from images.
* Thread-safety! Moved all critical global and static variables to members of the appropriate class. Tesseract is now thread-safe (multiple instances can be used in parallel in multiple threads.) with the minor exception that some control parameters are still global and affect all threads.

View File

@ -1 +1 @@
5.3.2
5.3.3

View File

@ -18,31 +18,35 @@ function(check_leptonica_tiff_support)
# Check whether Leptonica was built with TIFF support; the result is stored
# in LEPT_TIFF_RESULT.
set(TIFF_TEST
"#include <leptonica/allheaders.h>\n"
"int main() {\n"
" l_uint8 *data = NULL;\n"
" size_t size = 0;\n"
" PIX* pix = pixCreate(3, 3, 4);\n"
" l_int32 ret_val = pixWriteMemTiff(&data, &size, pix, IFF_TIFF_G3);\n"
" pixDestroy(&pix);\n"
" lept_free(data);\n"
" return ret_val;}\n")
set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
try_run(
LEPT_TIFF_RESULT
LEPT_TIFF_COMPILE
SOURCE_FROM_CONTENT tiff_test.cpp "${TIFF_TEST}"
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${Leptonica_INCLUDE_DIRS}"
LINK_LIBRARIES ${Leptonica_LIBRARIES}
COMPILE_OUTPUT_VARIABLE
COMPILE_OUTPUT)
if(NOT LEPT_TIFF_COMPILE)
message(STATUS "COMPILE_OUTPUT: ${COMPILE_OUTPUT}")
message(STATUS "Leptonica_INCLUDE_DIRS: ${Leptonica_INCLUDE_DIRS}")
message(STATUS "Leptonica_LIBRARIES: ${Leptonica_LIBRARIES}")
message(STATUS "LEPT_TIFF_RESULT: ${LEPT_TIFF_RESULT}")
message(STATUS "LEPT_TIFF_COMPILE: ${LEPT_TIFF_COMPILE}")
message(WARNING "Failed to compile test")
"#include \"leptonica/allheaders.h\"\n"
"int main() {\n"
" l_uint8 *data = NULL;\n"
" size_t size = 0;\n"
" PIX* pix = pixCreate(3, 3, 4);\n"
" l_int32 ret_val = pixWriteMemTiff(&data, &size, pix, IFF_TIFF_G3);\n"
" pixDestroy(&pix);\n"
" lept_free(data);\n"
" return ret_val;}\n")
# The probe uses try_run(... SOURCE_FROM_CONTENT ...), which requires
# CMake >= 3.25. With older versions the probe is skipped and this function
# sets neither LEPT_TIFF_RESULT nor LEPT_TIFF_COMPILE_SUCCESS.
if(${CMAKE_VERSION} VERSION_LESS "3.25")
  message(STATUS "Testing TIFF support in Leptonica is available with CMake >= 3.25 (you have ${CMAKE_VERSION})")
else()
  # Build the test program with the same configuration as the main build.
  set(CMAKE_TRY_COMPILE_CONFIGURATION ${CMAKE_BUILD_TYPE})
  # LEPT_TIFF_RESULT receives the exit code of the test program (the return
  # value of pixWriteMemTiff); LEPT_TIFF_COMPILE_SUCCESS tells whether the
  # test program could be built at all.
  try_run(
    LEPT_TIFF_RESULT
    LEPT_TIFF_COMPILE_SUCCESS
    SOURCE_FROM_CONTENT tiff_test.cpp "${TIFF_TEST}"
    CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${Leptonica_INCLUDE_DIRS}"
    LINK_LIBRARIES ${Leptonica_LIBRARIES}
    COMPILE_OUTPUT_VARIABLE
    COMPILE_OUTPUT)
  if(NOT LEPT_TIFF_COMPILE_SUCCESS)
    # Dump everything needed to diagnose why the probe did not build.
    message(STATUS "COMPILE_OUTPUT: ${COMPILE_OUTPUT}")
    message(STATUS "Leptonica_INCLUDE_DIRS: ${Leptonica_INCLUDE_DIRS}")
    message(STATUS "Leptonica_LIBRARIES: ${Leptonica_LIBRARIES}")
    message(STATUS "LEPT_TIFF_RESULT: ${LEPT_TIFF_RESULT}")
    message(STATUS "LEPT_TIFF_COMPILE_SUCCESS: ${LEPT_TIFF_COMPILE_SUCCESS}")
    message(WARNING "Failed to compile test")
  endif()
endif()
endfunction(check_leptonica_tiff_support)

View File

@ -7,7 +7,7 @@
# ----------------------------------------
AC_PREREQ([2.69])
AC_INIT([tesseract],
[m4_esyscmd_s([git describe --abbrev=4 2>/dev/null || cat VERSION])],
[m4_esyscmd_s([test -d .git && git describe --abbrev=4 2>/dev/null || cat VERSION])],
[https://github.com/tesseract-ocr/tesseract/issues],,
[https://github.com/tesseract-ocr/tesseract/])
@ -29,7 +29,7 @@ AM_INIT_AUTOMAKE([foreign subdir-objects nostdinc])
# Define date of package, etc. Could be useful in auto-generated
# documentation.
PACKAGE_YEAR=2023
PACKAGE_DATE="07/11"
PACKAGE_DATE="10/05"
abs_top_srcdir=`AS_DIRNAME([$0])`

View File

@ -804,7 +804,7 @@ private:
int tessedit_page_number);
}; // class TessBaseAPI.
/** Escape a char string - remove &<>"' with HTML codes. */
/** Escape a char string - replace &<>"' with HTML codes. */
std::string HOcrEscape(const char *text);
} // namespace tesseract

View File

@ -56,7 +56,7 @@ public class ScrollView {
/** Prints all received messages to the console if true. */
static boolean debugViewNetworkTraffic = false;
/** Add a new message to the outgoing queue */
/** Add a new message to the outgoing queue. */
public static void addMessage(SVEvent e) {
if (debugViewNetworkTraffic) {
System.out.println("(S->c) " + e.toString());
@ -191,7 +191,7 @@ public class ScrollView {
}
}
// If str is not null here, then we have a string with a comma in it.
// Append , and the next argument at the next iteration, but check
// Append "," and the next argument at the next iteration, but check
// that str is null after the loop terminates in case it was an
// unterminated string.
} else if (floatPattern.matcher(argStr).matches()) {
@ -390,8 +390,7 @@ public class ScrollView {
"UTF8"));
} catch (IOException e) {
// Something went wrong and we were unable to set up a connection. This is
// pretty
// much a fatal error.
// pretty much a fatal error.
// Note: The server does not get restarted automatically if this happens.
e.printStackTrace();
System.exit(1);

View File

@ -36,7 +36,7 @@ import javax.swing.Timer;
* The ScrollViewEventHandler takes care of any events which might happen on the
* canvas and converts them to an according SVEvent, which is (using the
* processEvent method) then added to a message queue. All events from the
* message queue get sent gradually
* message queue get sent gradually.
*
* @author wanke@google.com
*/
@ -60,7 +60,7 @@ public class SVEventHandler extends PBasicInputEventHandler implements
private int lastXMove = 0;
private int lastYMove = 0;
/** For Drawing a rubber-band rectangle for selection */
/** For Drawing a rubber-band rectangle for selection. */
private int startX = 0;
private int startY = 0;
private float rubberBandTransparency = 0.5f;
@ -274,7 +274,7 @@ public class SVEventHandler extends PBasicInputEventHandler implements
}
}
/** These are all events we do not care about and throw away */
/** These are all events we do not care about and throw away. */
public void keyReleased(KeyEvent e) {
}

View File

@ -27,7 +27,7 @@ import javax.xml.bind.DatatypeConverter;
* @author wanke@google.com
*/
public class SVImageHandler {
/* All methods are static, so we forbid to construct SVImageHandler objects */
/* All methods are static, so constructing SVImageHandler objects is forbidden. */
private SVImageHandler() {
}

View File

@ -56,7 +56,7 @@ public class SVPopupMenu implements ActionListener {
*
* @param parent The menu we add our new entry to (should have been defined
* before). If the parent is "", we will add the entry to the root
* (top-level)
* (top-level).
* @param name The caption of the new entry.
* @param id The Id of the new entry. If it is -1, the entry will be treated
* as a menu.
@ -64,14 +64,14 @@ public class SVPopupMenu implements ActionListener {
public void add(String parent, String name, int id) {
// A duplicate entry - we just throw it away, since it's already in.
if (items.get(name) != null) { return; }
// A new submenu at the top-level
// A new submenu at the top-level.
if (parent.equals("")) {
JMenu jli = new JMenu(name);
SVAbstractMenuItem mli = new SVSubMenuItem(name, jli);
items.put(name, mli);
root.add(jli);
}
// A new sub-submenu
// A new sub-submenu.
else if (id == -1) {
SVAbstractMenuItem jmi = items.get(parent);
JMenu jli = new JMenu(name);
@ -101,7 +101,7 @@ public class SVPopupMenu implements ActionListener {
*
* @param parent The menu we add our new entry to (should have been defined
* before). If the parent is "", we will add the entry to the root
* (top-level)
* (top-level).
* @param name The caption of the new entry.
* @param id The Id of the new entry. If it is -1, the entry will be treated
* as a menu.

View File

@ -103,7 +103,7 @@ static STRING_VAR(document_title, "", "Title of output document (used for hOCR a
static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
#endif
/** Minimum sensible image size to be worth running tesseract. */
/** Minimum sensible image size to be worth running Tesseract. */
const int kMinRectSize = 10;
/** Character returned when Tesseract couldn't recognize as anything. */
const char kTesseractReject = '~';
@ -613,7 +613,7 @@ void TessBaseAPI::SetImage(Pix *pix) {
/**
* Restrict recognition to a sub-rectangle of the image. Call after SetImage.
* Each SetRectangle clears the recogntion results so multiple rectangles
* Each SetRectangle clears the recognition results so multiple rectangles
* can be recognized with the same image.
*/
void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
@ -2374,7 +2374,7 @@ int TessBaseAPI::NumDawgs() const {
return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
}
/** Escape a char string - remove <>&"' with HTML codes. */
/** Escape a char string - replace <>&"' with HTML codes. */
std::string HOcrEscape(const char *text) {
std::string ret;
const char *ptr;

View File

@ -306,9 +306,10 @@ int Tesseract::init_tesseract(const std::string &arg0, const std::string &textba
// Add any languages that this language requires
bool loaded_primary = false;
// Load the rest into sub_langs_.
// A range based for loop does not work here because langs_to_load
// WARNING: A range based for loop does not work here because langs_to_load
// might be changed in the loop when a new submodel is found.
for (auto &lang_to_load : langs_to_load) {
for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
auto &lang_to_load = langs_to_load[lang_index];
if (!IsStrInList(lang_to_load, langs_not_to_load)) {
const char *lang_str = lang_to_load.c_str();
Tesseract *tess_to_init;

View File

@ -42,6 +42,8 @@ DENORM::DENORM() {
DENORM::DENORM(const DENORM &src) {
rotation_ = nullptr;
x_map_ = nullptr;
y_map_ = nullptr;
*this = src;
}

View File

@ -78,7 +78,7 @@ public:
int32_t count_transitions( // count maxima
int32_t threshold); // size threshold
void move(const ICOORD vec); // repostion blob by vector
void move(const ICOORD vec); // reposition blob by vector
void rotate(const FCOORD &rotation); // Rotate by given vector.
// Adds sub-pixel resolution EdgeOffsets for the outlines using greyscale

View File

@ -123,7 +123,7 @@ public:
// Add a callback to be called to delete the elements when the array took
// their ownership.
void set_clear_callback(std::function<void(T)> cb) {
void set_clear_callback(const std::function<void(T)> &cb) {
clear_cb_ = cb;
}
@ -148,8 +148,8 @@ public:
// fread (and swapping)/fwrite.
// Returns false on error or if the callback returns false.
// DEPRECATED. Use [De]Serialize[Classes] instead.
bool write(FILE *f, std::function<bool(FILE *, const T &)> cb) const;
bool read(TFile *f, std::function<bool(TFile *, T *)> cb);
bool write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const;
bool read(TFile *f, const std::function<bool(TFile *, T *)> &cb);
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
// TODO(rays) Change all callers to use TFile and remove deprecated methods.
@ -577,7 +577,7 @@ int GenericVector<T>::push_back(T object) {
double_the_size();
}
index = size_used_++;
data_[index] = object;
data_[index] = std::move(object);
return index;
}
@ -627,7 +627,7 @@ void GenericVector<T>::delete_data_pointers() {
}
template <typename T>
bool GenericVector<T>::write(FILE *f, std::function<bool(FILE *, const T &)> cb) const {
bool GenericVector<T>::write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const {
if (fwrite(&size_reserved_, sizeof(size_reserved_), 1, f) != 1) {
return false;
}
@ -649,7 +649,7 @@ bool GenericVector<T>::write(FILE *f, std::function<bool(FILE *, const T &)> cb)
}
template <typename T>
bool GenericVector<T>::read(TFile *f, std::function<bool(TFile *, T *)> cb) {
bool GenericVector<T>::read(TFile *f, const std::function<bool(TFile *, T *)> &cb) {
int32_t reserved;
if (f->FReadEndian(&reserved, sizeof(reserved), 1) != 1) {
return false;

View File

@ -1000,7 +1000,7 @@ bool UNICHARSET::major_right_to_left() const {
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or nullptr whitelist enables everything (minus any blacklist).
// An empty or nullptr blacklist disables nothing.
// An empty or nullptr blacklist has no effect.
// An empty or nullptr unblacklist has no effect.
void UNICHARSET::set_black_and_whitelist(const char *blacklist,
const char *whitelist,
const char *unblacklist) {

View File

@ -87,7 +87,7 @@ public:
/// Add a callback to be called to delete the elements when the table took
/// their ownership.
void set_clear_callback(std::function<void(T)> cb) {
void set_clear_callback(const std::function<void(T)> &cb) {
table_.set_clear_callback(cb);
}
@ -109,10 +109,10 @@ public:
/// The callback given must be permanent since it will be called more than
/// once. The given callback will be deleted at the end.
/// Returns false on read/write error.
bool write(FILE *f, std::function<bool(FILE *, const T &)> cb) const {
bool write(FILE *f, const std::function<bool(FILE *, const T &)> &cb) const {
return table_.write(f, cb);
}
bool read(tesseract::TFile *f, std::function<bool(tesseract::TFile *, T *)> cb) {
bool read(tesseract::TFile *f, const std::function<bool(tesseract::TFile *, T *)> &cb) {
return table_.read(f, cb);
}

View File

@ -174,8 +174,8 @@ static ds_status initDSProfile(ds_profile **p, const char *version) {
clGetPlatformIDs(0, nullptr, &numPlatforms);
if (numPlatforms > 0) {
platforms.reserve(numPlatforms);
clGetPlatformIDs(numPlatforms, &platforms[0], nullptr);
platforms.resize(numPlatforms);
clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
}
numDevices = 0;
@ -186,12 +186,11 @@ static ds_status initDSProfile(ds_profile **p, const char *version) {
}
if (numDevices > 0) {
devices.reserve(numDevices);
devices.resize(numDevices);
}
profile->numDevices = numDevices + 1; // +1 to numDevices to include the native CPU
profile->devices.reserve(profile->numDevices);
memset(&profile->devices[0], 0, profile->numDevices * sizeof(ds_device));
profile->devices.resize(profile->numDevices);
next = 0;
for (i = 0; i < numPlatforms; i++) {

View File

@ -1609,10 +1609,10 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection(
}
// See if we have a decision yet.
auto image_count = counts[NPT_IMAGE];
auto htext_score = counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] -
(image_count + counts[NPT_WEAK_VTEXT]);
auto vtext_score = counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] -
(image_count + counts[NPT_WEAK_HTEXT]);
int htext_score = counts[NPT_HTEXT] + counts[NPT_WEAK_HTEXT] -
(image_count + counts[NPT_WEAK_VTEXT]);
int vtext_score = counts[NPT_VTEXT] + counts[NPT_WEAK_VTEXT] -
(image_count + counts[NPT_WEAK_HTEXT]);
if (image_count > 0 && image_bias - htext_score >= kSmoothDecisionMargin &&
image_bias - vtext_score >= kSmoothDecisionMargin) {
*best_distance = dists[NPT_IMAGE][0];

View File

@ -250,9 +250,14 @@ static void ConnCompAndRectangularize(Image pix, DebugPixa *pixa_debug, Boxa **b
// If not nullptr, it must be PixDestroyed by the caller.
// If textord_tabfind_show_images, debug images are appended to pixa_debug.
Image ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) {
auto width = pixGetWidth(pix);
auto height = pixGetHeight(pix);
// Not worth looking at small images.
if (pixGetWidth(pix) < kMinImageFindSize || pixGetHeight(pix) < kMinImageFindSize) {
return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
// Leptonica will print an error message and return nullptr if we call
// pixGenHalftoneMask(pixr, nullptr, ...) with width or height < 100
// for the reduced image, so we want to bypass that, too.
if (width / 2 < kMinImageFindSize || height / 2 < kMinImageFindSize) {
return pixCreate(width, height, 1);
}
// Reduce by factor 2.
@ -262,15 +267,6 @@ Image ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) {
}
// Get the halftone mask directly from Leptonica.
//
// Leptonica will print an error message and return nullptr if we call
// pixGenHalftoneMask(pixr, nullptr, ...) with too small image, so we
// want to bypass that.
if (pixGetWidth(pixr) < kMinImageFindSize || pixGetHeight(pixr) < kMinImageFindSize) {
pixr.destroy();
return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
}
// Get the halftone mask.
l_int32 ht_found = 0;
Pixa *pixadb = (textord_tabfind_show_images && pixa_debug != nullptr) ? pixaCreate(0) : nullptr;
Image pixht2 = pixGenerateHalftoneMask(pixr, nullptr, &ht_found, pixadb);
@ -287,7 +283,7 @@ Image ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) {
pixht2.destroy();
}
if (pixht2 == nullptr) {
return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
return pixCreate(width, height, 1);
}
// Expand back up again.
@ -334,7 +330,7 @@ Image ImageFind::FindImages(Image pix, DebugPixa *pixa_debug) {
pixa_debug->AddPix(pixht, "FinalMask");
}
// Make the result image the same size as the input.
Image result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1);
Image result = pixCreate(width, height, 1);
result |= pixht;
pixht.destroy();
return result;

View File

@ -432,7 +432,7 @@ void TableFinder::InsertImagePartition(ColPartition *part) {
// text lines on the page. The assumption is that a table
// will have several lines with similar overlapping whitespace
// whereas text will not have this type of property.
// Note: The code Assumes that blobs are sorted by the left side x!
// Note: The code assumes that blobs are sorted by the left side x!
// This will not work (as well) if the blobs are sorted by center/right.
void TableFinder::SplitAndInsertFragmentedTextPartition(ColPartition *part) {
ASSERT_HOST(part != nullptr);

View File

@ -149,7 +149,7 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
}
// Sgaw tones 0x1062, 0x1063 must be followed by asat.
// W Pwo tones 0x1069, 0x106a, and 0x106b may be followed by dot below or visarga (nasal).
ch = codes_[codes_used_].second;
ch = codes_[codes_used_].second;
if (ch == 0x103a || ch == 0x1037 || ch == 0x1038) {
if (UseMultiCode(1)) {
return true;