From c91929f28e7c532ad35726de9554a6cee837f6b7 Mon Sep 17 00:00:00 2001
From: "Kevin Veen-Birkenbach [aka. Frantz]" <kevin@veen.world>
Date: Thu, 10 Dec 2020 21:48:34 +0100
Subject: [PATCH] Add language argument, clean output folder, fix OCR output naming

---
 generate.sh | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/generate.sh b/generate.sh
index a6967b6..847ba10 100644
--- a/generate.sh
+++ b/generate.sh
@@ -1,26 +1,30 @@
 #!/bin/bash
 # @see https://faceted.wordpress.com/2010/07/11/how-to-extract-text-from-pdf-files-using-poppler-and-gocr-on-ubuntu/
+# @param $1 Working directory; input is read from origin/, text is written to generated/
+# @param $2 Language code passed to tesseract -l (e.g. deu)
 # sudo pacman -Syyu tesseract-data-deu tesseract-data-en tesseract
-if [ -z "$1" ]
+if [ -z "$1" ] || [ -z "$2" ]
   then
 		echo "You need to define an working directory" && exit 1;
 fi
-TMP_FOLDER="$(mktemp -d)/"
-ORIGIN_FOLDER="$1/origin/";
-OUTPUT_FOLDER="$1/readable/";
+TMP_FOLDER="$(mktemp -d)/" &&
+ORIGIN_FOLDER="$1/origin/" &&
+OUTPUT_FOLDER="$1/generated/" &&
+echo "Cleaning up $OUTPUT_FOLDER..." &&
+mkdir -p -- "$OUTPUT_FOLDER" && rm -fv -- "$OUTPUT_FOLDER"* || exit 1; # -f: an empty folder must not abort the run
 for origin_file in "$ORIGIN_FOLDER"*.*; do
 	if [ "$(head -c 4 "$origin_file")" = "%PDF" ]; then
 		tmp_file="$TMP_FOLDER$(basename "$origin_file")"
 		echo "Generating $tmp_file..."
 		pdfimages "$origin_file" "$tmp_file"
 	else
-		cp "$origin_file" "$TMP_FOLDER"
+		cp -v "$origin_file" "$TMP_FOLDER"
 	fi
 done
 for tesseract_input_file in "$TMP_FOLDER"*.*; do
-	txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$origin_file")";
+	txt_output_file_without_suffix="$OUTPUT_FOLDER$(basename "$tesseract_input_file")";
 	echo "Generating $txt_output_file_without_suffix.txt..."
-	tesseract -l deu "$tesseract_input_file" "$txt_output_file_without_suffix";
+	tesseract -l "$2" "$tesseract_input_file" "$txt_output_file_without_suffix";
 	echo "file content:"
 	cat "$txt_output_file_without_suffix.txt"
 done