diff --git a/generate.sh b/generate.sh index a6967b6..847ba10 100644 --- a/generate.sh +++ b/generate.sh @@ -1,26 +1,30 @@ #!/bin/bash # @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/ +# @param $1 Working directory +# @param $2 language # sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract -if [ -z "$1" ] +if [ -z "$2" ] then echo "You need to define an working directory" && exit 1; fi -TMP_FOLDER="$(mktemp -d)/" -ORIGIN_FOLDER="$1/origin/"; -OUTPUT_FOLDER="$1/readable/"; +TMP_FOLDER="$(mktemp -d)/" && +ORIGIN_FOLDER="$1/origin/" && +OUTPUT_FOLDER="$1/generated/" && +echo "Cleaning up $OUTPUT_FOLDER..." && +rm -v "$OUTPUT_FOLDER"* || exit 1; for origin_file in "$ORIGIN_FOLDER"*.*; do if [ "$(head -c 4 "$origin_file")" = "%PDF" ]; then tmp_file="$TMP_FOLDER$(basename "$origin_file")" echo "Generating $tmp_file..." pdfimages "$origin_file" "$tmp_file" else - cp "$origin_file" "$TMP_FOLDER" + cp -v "$origin_file" "$TMP_FOLDER" fi done for tesseract_input_file in "$TMP_FOLDER"*.*; do - txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$origin_file")"; + txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$tesseract_input_file")"; echo "Generating $txt_output_file_without_suffix.txt..." - tesseract -l deu "$tesseract_input_file" "$txt_output_file_without_suffix"; + tesseract -l "$2" "$tesseract_input_file" "$txt_output_file_without_suffix"; echo "file content:" cat "$txt_output_file_without_suffix.txt" done