1
0
mirror of https://github.com/kmein/niveum synced 2026-03-16 18:21:07 +01:00
Files
niveum/.bin/pdf-ocr.sh

24 lines
373 B
Bash
Executable File

#!/usr/bin/env nix-shell
#! nix-shell -i bash -p poppler_utils tesseract4
set -eu
pdf_path="$(realpath "$1")"
[ -f "$pdf_path" ] || {
echo "Usage: $0 FILE.pdf" >&2
exit 1
}
tmpdir="$(mktemp -d)"
trap 'rm -rf $tmpdir' EXIT
cd "$tmpdir"
pdftoppm -png "$pdf_path" pdf-ocr
for png in pdf-ocr*.png; do
tesseract "$png" "$png.txt" 2>/dev/null
done
cat pdf-ocr-*.txt