bill-manager/generate.sh

71 lines
2.1 KiB
Bash
Raw Normal View History

2020-12-10 20:57:35 +01:00
#!/bin/bash
#
# Extracts the text of every bill stored in "<working directory>/origin/"
# and writes it as .txt files to "<working directory>/generated/".
#
# PDFs that already carry a text layer are converted with pdftotext.
# PDFs without usable text are split into images with pdfimages; those
# images, and every non-PDF origin file, are run through tesseract.
#
# Modes:
#   initialize (default) - empty the output folder, then process everything
#   update               - only process origin files without existing output
#
# Strict mode (set -e) is deliberately not enabled: a failing conversion of
# one file must not abort the processing of the remaining files.
#
# @author Kevin Veen-Birkenbach
# @param $1 Working directory
# @param $2 language
# @param $3 mode (update,initialize)

if [ -z "$1" ] || [ -z "$2" ]; then
  echo "You need to define an working directory and a language" >&2
  exit 1
fi

case "${3:-}" in
  "" | initialize)
    MODE="initialize"
    ;;
  update)
    MODE="update"
    ;;
  *)
    echo "Unknown option: $3" >&2
    exit 1
    ;;
esac

# Extracted text with at most this many characters is treated as "no text".
readonly MIN_TEXT_LENGTH=9

ORIGIN_FOLDER="$1/origin/"
OUTPUT_FOLDER="$1/generated/"

if [ ! -d "$ORIGIN_FOLDER" ]; then
  echo "Origin folder $ORIGIN_FOLDER does not exist" >&2
  exit 1
fi
mkdir -p -- "$OUTPUT_FOLDER" || exit 1

TMP_FOLDER="$(mktemp -d)/" || exit 1

# Remove the temporary folder on every exit path, even when it is empty.
cleanup() {
  echo "Cleanup..."
  rm -rfv -- "${TMP_FOLDER:?}"
}
trap cleanup EXIT

# Unmatched globs must expand to nothing instead of the literal pattern,
# otherwise the loops below would process a file literally named "*.*".
shopt -s nullglob

if [ "$MODE" = "update" ]; then
  echo "Updating bills..."
else
  stale_files=("$OUTPUT_FOLDER"*)
  if [ "${#stale_files[@]}" -gt 0 ]; then
    echo "Cleaning up $OUTPUT_FOLDER..."
    rm -v -- "${stale_files[@]}" || exit 1
  else
    echo "$OUTPUT_FOLDER is allready cleaned up!"
  fi
fi

for origin_file in "$ORIGIN_FOLDER"*.*; do
  # Only regular files can be converted or copied.
  [ -f "$origin_file" ] || continue
  file_name="${origin_file##*/}"

  if [ "$MODE" = "update" ]; then
    # Any output whose name starts with the origin file name counts as
    # "already generated" (e.g. bill.pdf.txt or bill.pdf-000.ppm.txt).
    existing_outputs=("$OUTPUT_FOLDER$file_name"*)
    if [ "${#existing_outputs[@]}" -gt 0 ]; then
      echo "Skipped $origin_file..."
      continue
    fi
  fi

  # tr drops NUL bytes so that binary files do not trigger a bash warning.
  if [ "$(head -c 4 -- "$origin_file" | tr -d '\0')" = "%PDF" ]; then
    txt_output_file="$OUTPUT_FOLDER$file_name.txt"
    content=""
    if pdftotext "$origin_file" "$txt_output_file"; then
      content="$(<"$txt_output_file")"
    fi
    if [ "${#content}" -gt "$MIN_TEXT_LENGTH" ]; then
      echo "Text successfully extracted to $txt_output_file:"
      cat -- "$txt_output_file"
    else
      # No usable text layer: hand the embedded images over to tesseract.
      rm -fv -- "$txt_output_file"
      echo "Extract images..."
      pdfimages "$origin_file" "$TMP_FOLDER$file_name"
    fi
  else
    cp -v -- "$origin_file" "$TMP_FOLDER"
  fi
done

tesseract_input_files=("$TMP_FOLDER"*.*)
if [ "${#tesseract_input_files[@]}" -gt 0 ]; then
  for tesseract_input_file in "${tesseract_input_files[@]}"; do
    # tesseract appends ".txt" to the given output base name itself.
    txt_output_file_without_suffix="$OUTPUT_FOLDER${tesseract_input_file##*/}"
    echo "Generating $txt_output_file_without_suffix.txt..."
    if tesseract -l "$2" "$tesseract_input_file" "$txt_output_file_without_suffix"; then
      echo "file content:"
      cat -- "$txt_output_file_without_suffix.txt"
    else
      echo "tesseract failed for $tesseract_input_file" >&2
    fi
  done
else
  echo "Skipped text generation because $TMP_FOLDER is empty..."
fi