Merge pull request #138463 from dotlambda/ocrmypdf-pythonPackages

ocrmypdf: move to python3Packages
This commit is contained in:
Robert Schütz 2021-09-21 16:44:29 -07:00 committed by GitHub
commit b8c97a0d97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 240 additions and 70 deletions

View File

@@ -5,7 +5,6 @@
, ghostscript
, imagemagick
, jbig2enc
, ocrmypdf
, optipng
, pngquant
, qpdf

View File

@@ -1,34 +1,32 @@
{ fetchFromGitHub
{ lib
, buildPythonPackage
, cffi
, coloredlogs
, fetchFromGitHub
, ghostscript
, img2pdf
, importlib-resources
, jbig2enc
, leptonica
, pdfminer
, pikepdf
, pillow
, pluggy
, pngquant
, python3
, python3Packages
, qpdf
, lib
, pytest-xdist
, pytestCheckHook
, reportlab
, setuptools
, setuptools-scm
, setuptools-scm-git-archive
, stdenv
, tesseract4
, unpaper
, substituteAll
, tesseract4
, tqdm
, unpaper
}:
let
inherit (python3Packages) buildPythonApplication;
runtimeDeps = with python3Packages; [
ghostscript
jbig2enc
leptonica
pngquant
qpdf
tesseract4
unpaper
pillow
];
in
buildPythonApplication rec {
buildPythonPackage rec {
pname = "ocrmypdf";
version = "12.5.0";
@@ -39,51 +37,48 @@ buildPythonApplication rec {
sha256 = "sha256-g80WedX+TGHE9EJ/RSgOc53PM17V3WZslUNaHoqKTo0=";
};
nativeBuildInputs = with python3Packages; [
setuptools
patches = [
(substituteAll {
src = ./paths.patch;
gs = "${lib.getBin ghostscript}/bin/gs";
jbig2 = "${lib.getBin jbig2enc}/bin/jbig2";
liblept = "${lib.getLib leptonica}/lib/liblept${stdenv.hostPlatform.extensions.sharedLibrary}";
pngquant = "${lib.getBin pngquant}/bin/pngquant";
tesseract = "${lib.getBin tesseract4}/bin/tesseract";
unpaper = "${lib.getBin unpaper}/bin/unpaper";
})
];
nativeBuildInputs = [
setuptools-scm-git-archive
setuptools-scm
];
propagatedBuildInputs = with python3Packages; [
propagatedBuildInputs = [
cffi
coloredlogs
img2pdf
importlib-resources
pdfminer
pluggy
pikepdf
pillow
pluggy
reportlab
setuptools
tqdm
];
checkInputs = with python3Packages; [
pypdf2
pytest
pytest-helpers-namespace
checkInputs = [
pytest-xdist
pytest-cov
python-xmp-toolkit
pytestCheckHook
] ++ runtimeDeps;
patches = [
(substituteAll {
src = ./liblept.patch;
liblept = "${lib.getLib leptonica}/lib/liblept${stdenv.hostPlatform.extensions.sharedLibrary}";
})
];
makeWrapperArgs = [ "--prefix PATH : ${lib.makeBinPath [ ghostscript jbig2enc pngquant qpdf tesseract4 unpaper ]}" ];
meta = with lib; {
homepage = "https://github.com/jbarlow83/OCRmyPDF";
description = "Adds an OCR text layer to scanned PDF files, allowing them to be searched";
license = with licenses; [ mpl20 mit ];
platforms = platforms.linux;
maintainers = [ maintainers.kiwi ];
maintainers = with maintainers; [ kiwi dotlambda ];
changelog = "https://github.com/jbarlow83/OCRmyPDF/blob/v${version}/docs/release_notes.rst";
};
}

View File

@@ -0,0 +1,160 @@
diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py
index 5c357f1b..f459763a 100644
--- a/src/ocrmypdf/_exec/ghostscript.py
+++ b/src/ocrmypdf/_exec/ghostscript.py
@@ -25,28 +25,7 @@ from ocrmypdf.subprocess import get_version, run, run_polling_stderr
log = logging.getLogger(__name__)
-missing_gs_error = """
----------------------------------------------------------------------
-This error normally occurs when ocrmypdf find can't Ghostscript.
-Please ensure Ghostscript is installed and its location is added to
-the system PATH environment variable.
-
-For details see:
- https://ocrmypdf.readthedocs.io/en/latest/installation.html
----------------------------------------------------------------------
-"""
-
-_gswin = None
-if os.name == 'nt':
- _gswin = which('gswin64c')
- if not _gswin:
- _gswin = which('gswin32c')
- if not _gswin:
- raise MissingDependencyError(missing_gs_error)
- _gswin = Path(_gswin).stem
-
-GS = _gswin if _gswin else 'gs'
-del _gswin
+GS = '@gs@'
def version():
diff --git a/src/ocrmypdf/_exec/jbig2enc.py b/src/ocrmypdf/_exec/jbig2enc.py
index 2e8a058b..65a09088 100644
--- a/src/ocrmypdf/_exec/jbig2enc.py
+++ b/src/ocrmypdf/_exec/jbig2enc.py
@@ -14,7 +14,7 @@ from ocrmypdf.subprocess import get_version, run
def version():
- return get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*')
+ return get_version('@jbig2@', regex=r'jbig2enc (\d+(\.\d+)*).*')
def available():
@@ -27,7 +27,7 @@ def available():
def convert_group(*, cwd, infiles, out_prefix):
args = [
- 'jbig2',
+ '@jbig2@',
'-b',
out_prefix,
'-s', # symbol mode (lossy)
@@ -46,7 +46,7 @@ def convert_group_mp(args):
def convert_single(*, cwd, infile, outfile):
- args = ['jbig2', '-p', infile]
+ args = ['@jbig2@', '-p', infile]
with open(outfile, 'wb') as fstdout:
proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)
proc.check_returncode()
diff --git a/src/ocrmypdf/_exec/pngquant.py b/src/ocrmypdf/_exec/pngquant.py
index ca8a4542..d0544174 100644
--- a/src/ocrmypdf/_exec/pngquant.py
+++ b/src/ocrmypdf/_exec/pngquant.py
@@ -19,7 +19,7 @@ from ocrmypdf.subprocess import get_version, run
def version():
- return get_version('pngquant', regex=r'(\d+(\.\d+)*).*')
+ return get_version('@pngquant@', regex=r'(\d+(\.\d+)*).*')
def available():
@@ -46,7 +46,7 @@ def input_as_png(input_file: Path):
def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
with input_as_png(input_file) as input_stream:
args = [
- 'pngquant',
+ '@pngquant@',
'--force',
'--skip-if-larger',
'--quality',
diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py
index 33ead41e..5840f7c1 100644
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@@ -78,7 +78,7 @@ class TesseractVersion(StrictVersion):
def version():
- return get_version('tesseract', regex=r'tesseract\s(.+)')
+ return get_version('@tesseract@', regex=r'tesseract\s(.+)')
def has_user_words():
@@ -100,7 +100,7 @@ def get_languages():
msg += output
return msg
- args_tess = ['tesseract', '--list-langs']
+ args_tess = ['@tesseract@', '--list-langs']
try:
proc = run(
args_tess,
@@ -122,7 +122,7 @@ def get_languages():
def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]:
- args = ['tesseract']
+ args = ['@tesseract@']
if langs:
args.extend(['-l', '+'.join(langs)])
if engine_mode is not None:
diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py
index 3c3ae72c..d269966a 100644
--- a/src/ocrmypdf/_exec/unpaper.py
+++ b/src/ocrmypdf/_exec/unpaper.py
@@ -31,7 +31,7 @@ log = logging.getLogger(__name__)
def version() -> str:
- return get_version('unpaper')
+ return get_version('@unpaper@')
def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
@@ -71,7 +71,7 @@ def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
def run(
input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str]
) -> None:
- args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args
+ args_unpaper = ['@unpaper@', '-v', '--dpi', str(round(dpi, 6))] + mode_args
with TemporaryDirectory() as tmpdir:
input_pnm, output_pnm = _setup_unpaper_io(Path(tmpdir), input_file)
diff --git a/src/ocrmypdf/leptonica.py b/src/ocrmypdf/leptonica.py
index e4814f1a..fdaf7ea4 100644
--- a/src/ocrmypdf/leptonica.py
+++ b/src/ocrmypdf/leptonica.py
@@ -33,14 +33,7 @@ from ocrmypdf.lib._leptonica import ffi
logger = logging.getLogger(__name__)
-if os.name == 'nt':
- from ocrmypdf.subprocess._windows import shim_env_path
-
- libname = 'liblept-5'
- os.environ['PATH'] = shim_env_path()
-else:
- libname = 'lept'
-_libpath = find_library(libname)
+_libpath = '@liblept@'
if not _libpath:
raise MissingDependencyError(
"""

View File

@@ -5,13 +5,12 @@
, fetchPypi
, hypothesis
, isPy3k
, jbig2dec
, lxml
, mupdf
, pillow
, psutil
, pybind11
, pytest-cov
, pytest-helpers-namespace
, pytest-timeout
, pytest-xdist
, pytestCheckHook
, python-dateutil
@@ -20,18 +19,27 @@
, setuptools
, setuptools-scm
, setuptools-scm-git-archive
, substituteAll
}:
buildPythonPackage rec {
pname = "pikepdf";
version = "2.16.1";
version = "3.0.0";
disabled = ! isPy3k;
src = fetchPypi {
inherit pname version;
sha256 = "sha256-4k3/avMfHrcy/LXbRniDXR8xJkOZb9zZ2+uKylK8Dd4=";
sha256 = "sha256-PBeTfiMLIq+pdeaRMOid8pEd0eLHu+IAE4aEFU5CiEM=";
};
patches = [
(substituteAll {
src = ./paths.patch;
jbig2dec = "${lib.getBin jbig2dec}/bin/jbig2dec";
mudraw = "${lib.getBin mupdf}/bin/mudraw";
})
];
buildInputs = [
pybind11
qpdf
@@ -45,11 +53,8 @@ buildPythonPackage rec {
checkInputs = [
attrs
hypothesis
pytest-helpers-namespace
pytest-timeout
pytest-xdist
psutil
pytest-cov
pytestCheckHook
python-dateutil
python-xmp-toolkit
@@ -62,17 +67,13 @@ buildPythonPackage rec {
setuptools
];
preBuild = ''
HOME=$TMPDIR
'';
pythonImportsCheck = [ "pikepdf" ];
meta = with lib; {
homepage = "https://github.com/pikepdf/pikepdf";
description = "Read and write PDFs with Python, powered by qpdf";
license = licenses.mpl20;
maintainers = [ maintainers.kiwi ];
maintainers = with maintainers; [ kiwi dotlambda ];
changelog = "https://github.com/pikepdf/pikepdf/blob/${version}/docs/release_notes.rst";
};
}

View File

@@ -0,0 +1,26 @@
diff --git a/src/pikepdf/_methods.py b/src/pikepdf/_methods.py
index 70cdc9e..c3a14d0 100644
--- a/src/pikepdf/_methods.py
+++ b/src/pikepdf/_methods.py
@@ -190,7 +190,7 @@ def _mudraw(buffer, fmt) -> bytes:
tmp_in.flush()
proc = run(
- ['mudraw', '-F', fmt, '-o', '-', tmp_in.name],
+ ['@mudraw@', '-F', fmt, '-o', '-', tmp_in.name],
stdout=PIPE,
stderr=PIPE,
check=True,
diff --git a/src/pikepdf/jbig2.py b/src/pikepdf/jbig2.py
index 80cc910..64f6d31 100644
--- a/src/pikepdf/jbig2.py
+++ b/src/pikepdf/jbig2.py
@@ -25,7 +25,7 @@ def extract_jbig2(
global_path = Path(tmpdir) / "global"
output_path = Path(tmpdir) / "outfile"
- args = ["jbig2dec", "-e", "-o", os.fspath(output_path)]
+ args = ["@jbig2dec@", "-e", "-o", os.fspath(output_path)]
# Get the raw stream, because we can't decode im_obj - that is why we are here
# (Strictly speaking we should remove any non-JBIG2 filters if double encoded)

View File

@@ -1,13 +0,0 @@
diff --git a/src/ocrmypdf/leptonica.py b/src/ocrmypdf/leptonica.py
index 328b063..b993cc9 100644
--- a/src/ocrmypdf/leptonica.py
+++ b/src/ocrmypdf/leptonica.py
@@ -46,7 +46,7 @@ if os.name == 'nt':
os.environ['PATH'] = shim_paths_with_program_files()
else:
libname = 'lept'
-_libpath = find_library(libname)
+_libpath = '@liblept@'
if not _libpath:
raise MissingDependencyError(
"""

View File

@@ -3249,7 +3249,7 @@ with pkgs;
oci-cli = callPackage ../tools/admin/oci-cli { };
ocrmypdf = callPackage ../tools/text/ocrmypdf { };
ocrmypdf = with python3.pkgs; toPythonApplication ocrmypdf;
ocrfeeder = callPackage ../applications/graphics/ocrfeeder { };

View File

@@ -5061,6 +5061,8 @@ in {
oci = callPackage ../development/python-modules/oci { };
ocrmypdf = callPackage ../development/python-modules/ocrmypdf { };
od = callPackage ../development/python-modules/od { };
odfpy = callPackage ../development/python-modules/odfpy { };