Added draft

This commit is contained in:
Kevin Veen-Birkenbach 2020-12-10 20:57:35 +01:00
parent 4a736efe71
commit 781f01ed7b
1 changed file with 18 additions and 0 deletions

18
generate.sh Normal file
View File

@ -0,0 +1,18 @@
#!/bin/bash
# Extracts images from the files in ./pdf/ with pdfimages and then runs
# tesseract OCR (languages: deu+eng) on every .ppm image found in ./ppm/,
# printing each recognised text to stdout.
#
# Folder layout, relative to the current working directory:
#   pdf/  input files (read only)
#   ppm/  image files produced by pdfimages (created if missing)
#   txt/  text files produced by tesseract (created if missing)
#
# @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/
# Install the OCR tooling with pacman:
# sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract

# Fail loudly on a misspelled/unset variable instead of expanding it to "".
set -u
# An unmatched glob expands to nothing, so an empty folder simply skips its
# loop instead of handing the literal pattern to pdfimages/tesseract.
shopt -s nullglob

PPM_FOLDER="$PWD/ppm/"
PDF_FOLDER="$PWD/pdf/"
TXT_FOLDER="$PWD/txt/"

# The output folders must exist before the tools can write into them.
mkdir -p -- "$PPM_FOLDER" "$TXT_FOLDER" || {
  echo "Could not create $PPM_FOLDER and/or $TXT_FOLDER" >&2
  exit 1
}

# Step 1: extract the images of every input file.
# The pattern stays "*.*" (any file with an extension), as before; files that
# pdfimages cannot process are reported by the warning below.
for pdf_origin_file in "$PDF_FOLDER"*.*; do
  # Used by pdfimages as the name prefix of the images it writes.
  ppm_output_file="$PPM_FOLDER${pdf_origin_file##*/}"
  echo "Generating $ppm_output_file..."
  pdfimages "$pdf_origin_file" "$ppm_output_file" ||
    echo "pdfimages failed for $pdf_origin_file" >&2
done

# Step 2: OCR every extracted .ppm image.
for ppm_origin_file in "$PPM_FOLDER"*.ppm; do
  # tesseract appends ".txt" to this output base name itself.
  txt_output_file_without_suffix="$TXT_FOLDER${ppm_origin_file##*/}"
  echo "Generating $txt_output_file_without_suffix.txt..."
  if ! tesseract -l deu+eng "$ppm_origin_file" "$txt_output_file_without_suffix"; then
    # No text file to show for this image; carry on with the next one.
    echo "tesseract failed for $ppm_origin_file" >&2
    continue
  fi
  echo "file content:"
  cat "$txt_output_file_without_suffix.txt"
done