Optimized draft

This commit is contained in:
Kevin Veen-Birkenbach 2020-12-10 21:28:43 +01:00
parent 781f01ed7b
commit 1266e2f24a

View File

@@ -1,18 +1,26 @@
#!/bin/bash #!/bin/bash
# @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/ # @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/
# sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract # sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract
PPM_FOLDER="$PWD/ppm/"; if [ -z "$1" ]
PDF_FOLDER="$PWD/pdf/"; then
TXT_FOLDER="$PWD/txt/"; echo "You need to define an working directory" && exit 1;
for pdf_origin_file in $PDF_FOLDER*.*; do fi
ppm_output_file="$PPM_FOLDER$(basename $pdf_origin_file)" TMP_FOLDER="$(mktemp -d)/"
echo "Generating $ppm_output_file..." ORIGIN_FOLDER="$1/origin/";
pdfimages $pdf_origin_file $ppm_output_file OUTPUT_FOLDER="$1/readable/";
for origin_file in "$ORIGIN_FOLDER"*.*; do
if [ "$(head -c 4 "$origin_file")" = "%PDF" ]; then
tmp_file="$TMP_FOLDER$(basename "$origin_file")"
echo "Generating $tmp_file..."
pdfimages "$origin_file" "$tmp_file"
else
cp "$origin_file" "$TMP_FOLDER"
fi
done done
for ppm_origin_file in $PPM_FOLDER*.ppm; do for tesseract_input_file in "$TMP_FOLDER"*.*; do
txt_output_file_without_suffix="$TXT_FOLDER$(basename $ppm_origin_file)"; txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$origin_file")";
echo "Generating $txt_output_file_without_suffix.txt..." echo "Generating $txt_output_file_without_suffix.txt..."
tesseract -l deu+eng "$ppm_origin_file" "$txt_output_file_without_suffix"; tesseract -l deu "$tesseract_input_file" "$txt_output_file_without_suffix";
echo "file content:" echo "file content:"
cat "$txt_output_file_without_suffix.txt" cat "$txt_output_file_without_suffix.txt"
done done