From 162127f29f2a5a628ecea79d4718d3a51b1bffac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 21 Oct 2020 12:33:25 +0200 Subject: tests/acceptance: Extract tesseract_available() helper in new namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are going to reuse tesseract_available(). Extract it to a new 'tesseract_utils' namespace. Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Thomas Huth Message-Id: <20201021105035.2477784-4-f4bug@amsat.org> Signed-off-by: Philippe Mathieu-Daudé --- tests/acceptance/tesseract_utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tests/acceptance/tesseract_utils.py (limited to 'tests/acceptance/tesseract_utils.py') diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py new file mode 100644 index 0000000000..acd6e8c2fa --- /dev/null +++ b/tests/acceptance/tesseract_utils.py @@ -0,0 +1,28 @@ +# ... +# +# Copyright (c) 2019 Philippe Mathieu-Daudé +# +# This work is licensed under the terms of the GNU GPL, version 2 or +# later. See the COPYING file in the top-level directory. + +import re + +from avocado.utils.path import find_command, CmdNotFoundError + +def tesseract_available(expected_version): + try: + find_command('tesseract') + except CmdNotFoundError: + return False + res = process.run('tesseract --version') + try: + version = res.stdout_text.split()[1] + except IndexError: + version = res.stderr_text.split()[1] + return int(version.split('.')[0]) == expected_version + + match = re.match(r'tesseract\s(\d)', res) + if match is None: + return False + # now this is guaranteed to be a digit + return int(match.groups()[0]) == expected_version -- cgit v1.2.3 From ca8224492854a2930d0cadc76e715bf59582bf66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Wed, 21 Oct 2020 12:35:30 +0200 Subject: tests/acceptance: Introduce tesseract_ocr() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are going to reuse the tesseract OCR code. Create a new tesseract_ocr() helper and use it. Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20201021105035.2477784-5-f4bug@amsat.org> Signed-off-by: Philippe Mathieu-Daudé --- tests/acceptance/tesseract_utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'tests/acceptance/tesseract_utils.py') diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py index acd6e8c2fa..72cd9ab798 100644 --- a/tests/acceptance/tesseract_utils.py +++ b/tests/acceptance/tesseract_utils.py @@ -6,7 +6,9 @@ # later. See the COPYING file in the top-level directory. import re +import logging +from avocado.utils import process from avocado.utils.path import find_command, CmdNotFoundError def tesseract_available(expected_version): @@ -26,3 +28,19 @@ def tesseract_available(expected_version): return False # now this is guaranteed to be a digit return int(match.groups()[0]) == expected_version + + +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3): + console_logger = logging.getLogger('tesseract') + console_logger.debug(image_path) + if tesseract_version == 4: + tesseract_args += ' --oem 1' + proc = process.run("tesseract {} {} stdout".format(tesseract_args, + image_path)) + lines = [] + for line in proc.stdout_text.split('\n'): + sline = line.strip() + if len(sline): + console_logger.debug(sline) + lines += [sline] + return lines -- cgit v1.2.3