2026-02-17 21:32:10 +01:00
|
|
|
# OCR a PDF file to text using tesseract.
#
# Builds a `pdf-ocr` executable (a dash script) that:
#   1. rasterises every page of the given PDF to PNG with pdftoppm,
#   2. runs tesseract on each page image,
#   3. prints the concatenated recognised text on stdout.
#
# Usage: pdf-ocr FILE.pdf
# Exits with status 1 and a usage message on stderr when FILE.pdf is
# missing or is not a regular file.
{
  lib,
  writers,
  poppler_utils,
  tesseract,
  coreutils,
}:
writers.writeDashBin "pdf-ocr" ''
  # -e: abort on the first failing command, -u: treat unset variables as errors.
  # Deliberately NOT -f (noglob): the page globs below have to expand.
  set -eu

  usage() {
    echo "Usage: pdf-ocr FILE.pdf" >&2
    exit 1
  }

  # Check the argument count before touching $1; with `set -u` an unset $1
  # would abort the script before the usage message could be printed.
  [ "$#" -ge 1 ] || usage

  # Resolve to an absolute path now, because we cd into a temp dir below.
  pdf_path="$(${lib.getExe' coreutils "realpath"} -- "$1")" || usage
  [ -f "$pdf_path" ] || usage

  # Scratch directory for the page images and per-page text, removed on exit.
  tmpdir="$(${lib.getExe' coreutils "mktemp"} -d)"
  cleanup() {
    ${lib.getExe' coreutils "rm"} -rf -- "$tmpdir"
  }
  trap cleanup EXIT

  cd "$tmpdir"

  # Render the PDF pages as PNG files named with the "pdf-ocr" prefix.
  ${lib.getExe' poppler_utils "pdftoppm"} -png "$pdf_path" pdf-ocr

  for png in pdf-ocr*.png; do
    # tesseract's second argument is an output base name; the result files
    # are picked up by the *.txt glob below. Its diagnostics are silenced.
    ${lib.getExe tesseract} "$png" "$png.txt" 2>/dev/null
  done

  # Emit the text of all pages in glob (lexicographic) order.
  ${lib.getExe' coreutils "cat"} pdf-ocr-*.txt
''
|