1
0
mirror of https://github.com/kmein/niveum synced 2026-03-16 10:11:08 +01:00
Files
niveum/packages/pdf-ocr.nix
Kierán Meinhardt e67d6d7df2 use lib.getExe and lib.getExe' in packaged scripts
Replace all ${pkg}/bin/name patterns with:
- lib.getExe pkg (for main executables: curl, jq, gnused, ffmpeg, etc.)
- lib.getExe' pkg "name" (for specific binaries: coreutils, util-linux, etc.)
2026-02-17 21:35:28 +01:00

31 lines
575 B
Nix

# OCR a PDF file to text using tesseract.
#
# Builds a `pdf-ocr` executable (a dash script) that takes the path to a PDF
# as its first argument. Every page is rendered to a PNG with pdftoppm inside
# a throw-away temporary directory, each PNG is run through tesseract, and the
# recognised text of all pages is written to stdout.
#
# Exit status: 1 with a usage message on stderr if the argument is missing or
# is not a regular file; non-zero if any of the external tools fails.
{
  lib,
  writers,
  poppler_utils,
  tesseract,
  coreutils,
}:
writers.writeDashBin "pdf-ocr" ''
  # Deliberately no -f: the script relies on pathname expansion further down
  # (pdf-ocr*.png, pdf-ocr-*.txt). With noglob set those patterns would stay
  # literal and no page would ever be processed.
  set -eu

  usage() {
    echo "Usage: pdf-ocr FILE.pdf" >&2
    exit 1
  }

  # Check the argument count first so a missing argument produces the usage
  # text instead of an unset-parameter abort caused by set -u.
  [ "$#" -ge 1 ] || usage

  # Resolve to an absolute path now, because the script changes directory
  # below. The -- keeps a file name starting with a dash from being parsed as
  # an option.
  pdf_path="$(${lib.getExe' coreutils "realpath"} -- "$1")" || usage
  [ -f "$pdf_path" ] || usage

  tmpdir="$(${lib.getExe' coreutils "mktemp"} -d)"
  # Quote the directory so a TMPDIR containing whitespace is removed as one
  # path rather than being word-split.
  trap '${lib.getExe' coreutils "rm"} -rf -- "$tmpdir"' EXIT
  cd "$tmpdir"

  # Writes one image per page, named with the prefix pdf-ocr, into tmpdir.
  ${lib.getExe' poppler_utils "pdftoppm"} -png "$pdf_path" pdf-ocr

  for png in pdf-ocr*.png; do
    # An unmatched glob is left as the literal pattern; treat that as
    # "nothing was rendered" instead of handing the pattern to tesseract.
    [ -e "$png" ] || {
      echo "pdf-ocr: no pages rendered from $pdf_path" >&2
      exit 1
    }
    # tesseract appends .txt to the output base, so the text for page image X
    # ends up in X.txt.txt. Its stderr chatter is discarded.
    ${lib.getExe tesseract} "$png" "$png.txt" 2>/dev/null
  done

  # Glob results are sorted, so pages are emitted in file-name order.
  ${lib.getExe' coreutils "cat"} pdf-ocr-*.txt
''