2021-01-25 08:26:54 +00:00
|
|
|
{
  lib,
  makeWrapper,
  tesseractBase,
  languages,
  runCommand,
  imagemagick,

  # Language codes to bundle, e.g. [ "eng" "spa" … ].
  # `null` selects every available language.
  enableLanguages ? null,

  # Trained data to install: either a list of files or a directory containing files.
  # By default it is derived from `languages` according to `enableLanguages`.
  tessdata ? (
    if enableLanguages != null then
      map (lang: languages.${lang}) enableLanguages
    else
      languages.all
  ),

  # Obsolete: a non-null value only triggers a deprecation warning.
  enableLanguagesHash ? null,
}:
|
|
|
|
|
|
|
|
let
|
|
|
|
# Variant of `tesseractBase` whose output contains a wrapped `tesseract`
# binary, linked headers/shared files, a patched copy of `lib`, and a
# `share/tessdata` directory populated from `tessdata`.
tesseractWithData = tesseractBase.overrideAttrs (_: {
  # Made available to the build script below as $tesseractBase and $tessdata.
  inherit tesseractBase tessdata;

  nativeBuildInputs = [ makeWrapper ];

  buildCommand = ''
    # Wrap the base binary; TESSDATA_PREFIX is set only as a default value.
    makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata

    # Recursively link include, share
    cp -rs --no-preserve=mode $tesseractBase/{include,share} $out

    # lib is copied (not symlinked) because its files are rewritten below.
    cp -r --no-preserve=mode $tesseractBase/lib $out
    # Fixup the store paths in lib so that the tessdata from this derivation is used.
    # The substitution is only attempted when both paths have the same length
    # (presumably so that patched binary files keep their size).
    if (( ''${#tesseractBase} != ''${#out} )); then
      echo "Can't replace store paths due to differing lengths"
      exit 1
    fi
    find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;

    if [[ -d "$tessdata" ]]; then
      # tessdata is a single directory: link every entry it contains.
      ln -s $tessdata/* $out/share/tessdata
    else
      # tessdata is a whitespace-separated list of store files.
      for lang in $tessdata; do
        # stripHash removes the directory and hash prefix without assuming
        # that the store lives at /nix/store.
        ln -s "$lang" "$out/share/tessdata/$(stripHash "$lang")"
      done
    fi

    if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
      # This is a bug in Tesseract's internal tessdata discovery mechanism
      echo "eng.traineddata must be present in tessdata for Tesseract to work"
      exit 1
    fi
  '';
});
|
|
|
|
|
2022-05-02 10:38:51 +00:00
|
|
|
# Extra attributes merged into the final `tesseract` value so that callers
# can inspect the inputs it was assembled from.
passthru = {
  inherit
    tesseractBase
    languages
    tessdata
    ;
};
|
|
|
|
|
|
|
|
# Only run test when all languages are available
#
# The test renders a short text to an image with ImageMagick, runs the wrapped
# `tesseract` on it and fails unless the recognised text equals the input.
test = lib.optionalAttrs (enableLanguages == null) {
  tests.default =
    runCommand "tesseract-test-ocr"
      {
        # Both programs are executed during the build, so they are native inputs.
        nativeBuildInputs = [
          tesseractWithData
          imagemagick
        ];
      }
      ''
        text="hello nix"
        # Keep the scratch image in the per-build temporary directory instead
        # of a fixed path under /tmp.
        img="$TMPDIR/test-img.png"

        convert -size 400x40 xc:white -font 'DejaVu-Sans' -pointsize 20 \
          -fill black -annotate +5+20 "$text" "$img" 2>/dev/null
        # tesseract writes to stdout when the output base is "-"; form feeds are dropped.
        ocrResult=$(tesseract "$img" - | tr -d "\f")

        # Quote the right-hand side so it is compared literally, not as a glob pattern.
        if [[ "$ocrResult" != "$text" ]]; then
          echo "OCR test failed"
          echo "expected: '$text'"
          echo "actual: '$ocrResult'"
          exit 1
        fi
        touch $out
      '';
};
|
|
|
|
|
|
|
|
# Final package: the plain base build when an empty language list was
# requested, otherwise the data-carrying wrapper; `passthru` and `test`
# attributes are layered on top in that order.
tesseract =
  let
    selected = if enableLanguages != [ ] then tesseractWithData else tesseractBase;
  in
  selected // passthru // test;
|
2018-12-18 18:02:13 +00:00
|
|
|
in
|
2018-12-18 18:02:14 +00:00
|
|
|
# Result of the function: the assembled package, preceded by a deprecation
# warning when the obsolete `enableLanguagesHash` argument was supplied.
if enableLanguagesHash != null then
  lib.warn "Argument `enableLanguagesHash` is obsolete and can be removed." tesseract
else
  tesseract
|