#!/bin/bash
# Runs OCR over the files in "<working directory>/origin/" and writes the
# recognised text to "<working directory>/generated/". Files that start with
# the "%PDF" magic bytes are first split into images with pdfimages; every
# other file is handed to tesseract unchanged.
#
# State of generate.sh after the change "Implemented update option"
# (commit 4b0d6ed, 2020-12-10).
#
# @author Kevin Veen-Birkenbach
# @param $1 Working directory
# @param $2 language (passed to "tesseract -l")
# @param $3 mode (update,initialize); optional, defaults to initialize
#   initialize: remove everything in generated/ and process all origin files
#   update:     keep generated/ and process only origin files that do not
#               have any generated output yet
# sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract

if [ -z "$2" ]; then
  echo "You need to define an working directory and a language"
  exit 1
fi

# Both documented modes are accepted; an omitted or empty $3 means initialize.
case "${3:-initialize}" in
  update | initialize)
    MODE="${3:-initialize}"
    ;;
  *)
    echo "Unknown option: $3"
    exit 1
    ;;
esac

# Unmatched globs must expand to nothing, otherwise the loops below would
# iterate over the literal pattern when a folder is empty.
shopt -s nullglob

tmp_root="$(mktemp -d)" || exit 1
# The trap is installed only after mktemp succeeded, so it can never run
# against an empty path.
trap 'rm -rf -- "${tmp_root:?}"' EXIT

TMP_FOLDER="$tmp_root/"
ORIGIN_FOLDER="$1/origin/"
OUTPUT_FOLDER="$1/generated/"

if [ "$MODE" = "update" ]; then
  echo "Updating bills..."
else
  # Inspect the output folder (not the freshly created temp folder) to
  # decide whether there is anything to clean up.
  stale_outputs=("$OUTPUT_FOLDER"*)
  if [ "${#stale_outputs[@]}" -gt 0 ]; then
    echo "Cleaning up $OUTPUT_FOLDER..."
    rm -v -- "${stale_outputs[@]}" || exit 1
  else
    echo "$OUTPUT_FOLDER is allready cleaned up!"
  fi
fi

# Stage 1: collect the OCR input for every origin file in the temp folder.
for origin_file in "$ORIGIN_FOLDER"*.*; do
  origin_name="${origin_file##*/}"

  if [ "$MODE" = "update" ]; then
    # Generated file names always start with the origin file name, so any
    # match means this file was already processed in an earlier run.
    existing_outputs=("$OUTPUT_FOLDER$origin_name"*)
    if [ "${#existing_outputs[@]}" -gt 0 ]; then
      echo "Skipped $origin_file..."
      continue
    fi
  fi

  if [ "$(head -c 4 "$origin_file")" = "%PDF" ]; then
    # pdfimages uses the second argument as the prefix of the image files.
    tmp_file="$TMP_FOLDER$origin_name"
    echo "Generating $tmp_file..."
    pdfimages "$origin_file" "$tmp_file"
  else
    cp -v -- "$origin_file" "$TMP_FOLDER"
  fi
done

# Stage 2: run tesseract on everything that was staged.
tesseract_inputs=("$TMP_FOLDER"*.*)
if [ "${#tesseract_inputs[@]}" -gt 0 ]; then
  for tesseract_input_file in "${tesseract_inputs[@]}"; do
    # tesseract appends ".txt" to the output base name itself.
    txt_output_file_without_suffix="$OUTPUT_FOLDER${tesseract_input_file##*/}"
    echo "Generating $txt_output_file_without_suffix.txt..."
    tesseract -l "$2" "$tesseract_input_file" "$txt_output_file_without_suffix"
    echo "file content:"
    cat "$txt_output_file_without_suffix.txt"
  done
else
  echo "Skipped text generation because $TMP_FOLDER is empty..."
fi