tesseract: add a wrapper to setup languages

Tesseract is now decoupled from the tessdata language corpus.

This avoids recompilation when building Tesseract with a custom set
of languages.

Update k2pdfopt to use the new wrapper interface.
This commit is contained in:
Erik Arvstedt 2018-12-18 19:02:13 +01:00
parent 45d2a2dd91
commit aaaed13077
No known key found for this signature in database
GPG Key ID: 33312B944DD97846
6 changed files with 145 additions and 96 deletions

View File

@ -1,11 +1,18 @@
{ callPackage, lowPrio }:
let
tesseract3 = callPackage ./tesseract3.nix {};
tesseract4 = callPackage ./tesseract4.nix {};
base3 = callPackage ./tesseract3.nix {};
base4 = callPackage ./tesseract4.nix {};
languages = callPackage ./languages.nix {};
in
{
tesseract = tesseract3;
tesseract = callPackage ./wrapper.nix {
tesseractBase = base3;
languages = languages.v3;
};
tesseract_4 = lowPrio tesseract4;
tesseract_4 = lowPrio (callPackage ./wrapper.nix {
tesseractBase = base4;
languages = languages.v4;
});
}

View File

@ -0,0 +1,43 @@
{ stdenv, lib, fetchurl, fetchFromGitHub }:
rec {
makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
let
tessdataSrc = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = tessdataRev;
sha256 = tessdata;
};
languageFile = lang: sha256: fetchurl {
url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
inherit sha256;
};
in
{
all = stdenv.mkDerivation {
name = "all";
buildCommand = ''
mkdir $out
cd ${tessdataSrc}
cp *.traineddata $out
'';
outputHashMode = "recursive";
outputHashAlgo = "sha256";
outputHash = all;
};
};
v3 = makeLanguages {
tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
};
v4 = makeLanguages {
tessdataRev = "4.0.0";
tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
};
}

View File

@ -1,37 +1,5 @@
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
# if you want just a specific list of languages, optionally specify a hash
# to make tessdata a fixed output derivation.
, enableLanguagesHash ? (if enableLanguages == null # all languages
then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
else null)
}:
let tessdata = stdenv.mkDerivation ({
name = "tessdata";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
# when updating don't forget to update the default value fo enableLanguagesHash
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
};
buildCommand = ''
cd $src;
for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
install -Dt $out/share/tessdata $src/$lang ;
done;
'';
preferLocalBuild = true;
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
# when a hash is given, we make this a fixed output derivation.
outputHashMode = "recursive";
outputHashAlgo = "sha256";
outputHash = enableLanguagesHash;
}));
in
{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
stdenv.mkDerivation rec {
name = "tesseract-${version}";
@ -51,17 +19,11 @@ stdenv.mkDerivation rec {
LIBLEPT_HEADERSDIR = "${leptonica}/include";
postInstall = ''
for i in ${tessdata}/share/tessdata/*; do
ln -s $i $out/share/tessdata;
done
'';
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [viric];
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
platforms = with stdenv.lib.platforms; linux ++ darwin;
};
}

View File

@ -1,9 +1,5 @@
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
}:
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
stdenv.mkDerivation rec {
name = "tesseract-${version}";
@ -16,46 +12,16 @@ stdenv.mkDerivation rec {
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
};
tessdata = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = version;
sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
};
enableParallelBuilding = true;
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
# Copy the .traineddata files of the languages specified in enableLanguages
# into `$out/share/tessdata' and check afterwards if copying was successful.
postInstall = let
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
findLangArgs = if enableLanguages != null
then "\\( ${mkFindArgs enableLanguages} \\)"
else "-iname '*.traineddata'";
in ''
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
${if enableLanguages != null then ''
expected=${toString (builtins.length enableLanguages)}
'' else ''
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
''}
if [ "$numLangs" -ne "$expected" ]; then
echo "Expected $expected languages, but $numLangs" \
"were copied to \`$out/share/tessdata'" >&2
exit 1
fi
'';
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [viric];
platforms = with stdenv.lib.platforms; linux;
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
platforms = with stdenv.lib.platforms; linux ++ darwin;
};
}

View File

@ -0,0 +1,69 @@
{ stdenv, makeWrapper, tesseractBase, languages
# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
, enableLanguages ? null
# A list of files or a directory containing files
, tessdata ? (
if enableLanguages == null then
languages.all
else stdenv.mkDerivation ({
name = "tessdata";
buildCommand = ''
for lang in ${stdenv.lib.concatMapStringsSep " " (x: x + ".traineddata") enableLanguages}; do
install -Dt $out ${languages.all}/$lang
done
'';
preferLocalBuild = true;
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
# when a hash is given, we make this a fixed output derivation.
outputHashMode = "recursive";
outputHashAlgo = "sha256";
outputHash = enableLanguagesHash;
}))
)
, enableLanguagesHash ? null
}:
let
passthru = { inherit tesseractBase languages tessdata; };
tesseractWithData = tesseractBase.overrideAttrs (_: {
inherit tesseractBase tessdata;
buildInputs = [ makeWrapper ];
buildCommand = ''
makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
# Recursively link include, share
cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
cp -r --no-preserve=mode $tesseractBase/lib $out
# Fixup the store paths in lib so that the tessdata from this derivation is used.
if (( ''${#tesseractBase} != ''${#out} )); then
echo "Can't replace store paths due to differing lengths"
exit 1
fi
find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
if [[ -d "$tessdata" ]]; then
ln -s $tessdata/* $out/share/tessdata
else
for lang in $tessdata; do
ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
done
fi
if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
# This is a bug in Tesseract's internal tessdata discovery mechanism
echo "eng.traineddata must be present in tessdata for Tesseract to work"
exit 1
fi
'';
});
tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
in
tesseract

View File

@ -75,19 +75,21 @@ stdenv.mkDerivation rec {
cp ${src}/leptonica_mod/* src/
'';
});
tesseract_modded = tesseract.overrideAttrs (attrs: {
prePatch = ''
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
cp ${src}/tesseract_mod/dawg.cpp api/
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
cp ${src}/tesseract_mod/openclwrapper.h opencl/
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
cp ${src}/tesseract_mod/tesscapi.cpp api/
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
'';
patches = [ ./tesseract.patch ];
});
tesseract_modded = tesseract.override {
tesseractBase = tesseract.tesseractBase.overrideAttrs (_: {
prePatch = ''
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
cp ${src}/tesseract_mod/dawg.cpp api/
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
cp ${src}/tesseract_mod/openclwrapper.h opencl/
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
cp ${src}/tesseract_mod/tesscapi.cpp api/
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
'';
patches = [ ./tesseract.patch ];
});
};
in
[ zlib libpng ] ++
optional enableGSL gsl ++