Implemented update option

This commit is contained in:
Kevin Veen-Birkenbach 2020-12-10 22:28:54 +01:00
parent c91929f28e
commit 4b0d6ed9ea

View File

@ -1,30 +1,59 @@
#!/bin/bash
# Extracts the text of every file in "<working directory>/origin/" with
# tesseract and writes the results to "<working directory>/generated/".
#
# Files starting with the magic bytes "%PDF" are first split into images with
# pdfimages; every other file is handed to tesseract unchanged.
#
# @author Kevin Veen-Birkenbach
# @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/
# @param $1 Working directory
# @param $2 language (passed to "tesseract -l")
# @param $3 mode (update,initialize); optional, defaults to initialize
#           initialize: empties generated/ and processes every origin file
#           update:     keeps generated/ and processes only origin files that
#                       have no generated output yet
# sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract

# Unmatched globs expand to nothing, so no loop below ever iterates over a
# literal, unexpanded pattern.
shopt -s nullglob

# Succeeds when OUTPUT_FOLDER already holds at least one entry whose name
# starts with the basename of the origin file $1. Generated names are the
# origin basename plus a suffix (e.g. "bill.pdf-000.ppm.txt", "scan.png.txt").
# NOTE: this is a prefix match, so "a.pdf" also matches output of "a.pdf2".
has_generated_output() {
  local candidate
  for candidate in "$OUTPUT_FOLDER$(basename "$1")"*; do
    [ -e "$candidate" ] && return 0
  done
  return 1
}

# Both the working directory and the language are mandatory. An empty $1 would
# make the cleanup below operate on "/generated/".
if [ -z "$1" ] || [ -z "$2" ]; then
  echo "You need to define an working directory and a language" >&2
  exit 1
fi

# An empty or missing third argument selects the default mode.
case "${3:-initialize}" in
  update | initialize)
    MODE="${3:-initialize}"
    ;;
  *)
    echo "Unknown option: $3" >&2
    exit 1
    ;;
esac

ORIGIN_FOLDER="$1/origin/"
OUTPUT_FOLDER="$1/generated/"

# Scratch directory for the tesseract input files; removed on every exit path.
tmp_dir="$(mktemp -d)" || exit 1
trap 'rm -rf -- "$tmp_dir"' EXIT
TMP_FOLDER="$tmp_dir/"

# tesseract cannot write into a missing folder.
mkdir -p -- "$OUTPUT_FOLDER" || exit 1

if [ "$MODE" = "update" ]; then
  echo "Updating bills..."
else
  # Initialize starts from scratch: drop everything generated by earlier runs.
  stale_outputs=("$OUTPUT_FOLDER"*)
  if [ "${#stale_outputs[@]}" -gt 0 ]; then
    echo "Cleaning up $OUTPUT_FOLDER..."
    rm -v -- "${stale_outputs[@]}" || exit 1
  else
    echo "$OUTPUT_FOLDER is allready cleaned up!"
  fi
fi

# Stage 1: collect the tesseract input files in TMP_FOLDER.
for origin_file in "$ORIGIN_FOLDER"*.*; do
  # Only regular files can be read by head/pdfimages/cp below.
  [ -f "$origin_file" ] || continue

  # In update mode a file that already produced output is left alone.
  if [ "$MODE" = "update" ] && has_generated_output "$origin_file"; then
    echo "Skipped $origin_file..."
    continue
  fi

  if [ "$(head -c 4 "$origin_file")" = "%PDF" ]; then
    # pdfimages uses the second argument as the prefix of the image files.
    tmp_file="$TMP_FOLDER$(basename "$origin_file")"
    echo "Generating $tmp_file..."
    pdfimages "$origin_file" "$tmp_file"
  else
    cp -v -- "$origin_file" "$TMP_FOLDER"
  fi
done

# Stage 2: run tesseract on everything that was collected.
tesseract_inputs=("$TMP_FOLDER"*.*)
if [ "${#tesseract_inputs[@]}" -gt 0 ]; then
  for tesseract_input_file in "${tesseract_inputs[@]}"; do
    # tesseract appends ".txt" to the output base name itself.
    txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$tesseract_input_file")"
    echo "Generating $txt_output_file_without_suffix.txt..."
    tesseract -l "$2" "$tesseract_input_file" "$txt_output_file_without_suffix"
    echo "file content:"
    cat -- "$txt_output_file_without_suffix.txt"
  done
else
  echo "Skipped text generation because $TMP_FOLDER is empty..."
fi