Source code for fortrace.utility.image_processing.text_detection

import logging
import os
from difflib import SequenceMatcher
from typing import Literal

import cv2
import imutils.object_detection
import jellyfish
import numpy as np
import pytesseract

from fortrace.fortrace_definitions import FORTRACE_ROOT_DIR
from fortrace.utility.logger_helper import setup_logger
from fortrace.utility.string_filtering import ansi_escape

logger = setup_logger(__name__)


# FIXME: remove this function, as it is too cumbersome
def texts_similar(
    recognized_text: list[str], target_text: list[str], threshold: float = 0.6
) -> bool:
    """Compare two texts line by line using the Jaro-Winkler similarity.

    The first word of ``target_text`` is used to locate the line in
    ``recognized_text`` where the actual text begins (OCR output usually
    starts with some gibberish lines). From that line on, every recognized
    line is compared with the corresponding target line, starting at the
    first target line. The target text may be longer than the recognized
    text; surplus target lines are ignored.

    Args:
        recognized_text: presumably text obtained via OCR, thus possibly
            starting with gibberish lines
        target_text: the text to be compared against
        threshold: similarity threshold; the first word has to score above
            it, each compared line must not score below it

    Returns:
        True: if both texts are somewhat similar
        False: otherwise. This includes the cases where the target has no
            first word, where no recognized line matches that first word,
            and where the recognized text has more lines than the target
            after alignment.
    """
    # Without a first target word there is nothing to align against.
    first_line_words = target_text[0].split() if target_text else []
    if not first_line_words:
        return False
    first_word = first_line_words[0]

    # Index of the first recognized line that starts like the target text.
    start = next(
        (
            i
            for i, item in enumerate(recognized_text)
            if jellyfish.jaro_winkler_similarity(
                item.strip()[: len(first_word)], first_word
            )
            > threshold
        ),
        None,
    )
    if start is None:
        return False

    aligned = recognized_text[start:]
    # Recognized lines without a counterpart in the target cannot be compared.
    if len(aligned) > len(target_text):
        return False

    # recognized_text[start] corresponds to target_text[0], so both sequences
    # are walked in lockstep from their respective beginnings.
    for line, target_line in zip(aligned, target_text):
        recognized = line.strip()
        expected = target_line.strip()
        if jellyfish.jaro_winkler_similarity(recognized, expected) < threshold:
            # Print the offending pair so it shows up in (test) output.
            print(f"{recognized!r} != {expected!r}")
            return False
    return True
def jaro(recognized_text: list[str], target: str, threshold: float = 0.6) -> bool:
    """Check whether any recognized line is Jaro-Winkler-similar to ``target``.

    Each entry of ``recognized_text`` (most likely the lines of an OCR-ed
    screenshot of a VM) is stripped of surrounding whitespace and scored
    against ``target``. Evaluation stops at the first entry whose score
    exceeds ``threshold``.

    Args:
        recognized_text: list of detected strings, e.g., from an OCR-ed
            screenshot of a VM
        target: string that is searched for
        threshold: similarity score that has to be exceeded for a match

    Returns:
        True, if `target` could be found in `recognized_text`. False otherwise.
    """
    return any(
        jellyfish.jaro_winkler_similarity(candidate.strip(), target) > threshold
        for candidate in recognized_text
    )
[docs] def text_line_contains( recognized_text: list[str] | str, substrings: list[str] | str, compare_method: Literal["plain", "ignore_case", "jaro", "sequence"] = "ignore_case", escape_ansi_characters: bool = True, **kwargs, ) -> bool: """Scans the recognized text for provided substrings and immediately returns True for a match. Args: recognized_text: list of strings with recognized text substrings: substrings to scan for compare_method: Literal, describing the supported compare methods escape_ansi_characters: escape ANSI characters in detected text **kwargs: provide additional information for some string matching methods, e.g., a threshold Returns: True, if any substring is in recognized text. """ if isinstance(recognized_text, str): recognized_text = [recognized_text] if isinstance(substrings, str): substrings = [substrings] if escape_ansi_characters: recognized_text = list(map(ansi_escape, recognized_text)) match compare_method: case "plain": for line in recognized_text: if [s for s in substrings if s in line]: return True case "ignore_case": for line in recognized_text: if [s for s in substrings if s.lower() in line.lower()]: return True case "jaro": for line in substrings: if jaro(recognized_text, line, kwargs.get("threshold", 0.85)): return True case "sequence": for line in recognized_text: for target_line in substrings: # TODO: review value if SequenceMatcher(None, line, target_line).ratio() > kwargs.get( "threshold", 0.6 ): return True case _: raise ValueError(f"Compare method {compare_method} is unknown") return False
def _decode_predictions(
    scores: "cv2.typing.MatLike",
    geometry: np.ndarray,
    min_confidence: float = 0.5,
) -> tuple[list[tuple[int, int, int, int]], list]:
    """Helper function to decode predictions from EAST text detector.

    Args:
        scores: score volume of the detector; indexed as ``scores[0, 0, y, x]``,
            i.e. rows and columns are taken from axes 2 and 3
        geometry: geometry volume of the detector; ``geometry[0, 0..3, y, x]``
            hold the four distances used to derive the box size and
            ``geometry[0, 4, y, x]`` the rotation angle
        min_confidence: cells whose score is below this value are ignored

    Returns:
        A tuple ``(rects, confidences)``: the bounding boxes as
        ``(start_x, start_y, end_x, end_y)`` tuples and the score of each box,
        in the same order.
    """
    # Grab the number of rows and columns from the scores volume
    num_rows, num_cols = scores.shape[2:4]
    rects = []  # bounding box rectangles
    confidences = []  # associated confidences

    for y in range(num_rows):
        # Extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates
        scores_data = scores[0, 0, y]
        x_data0 = geometry[0, 0, y]
        x_data1 = geometry[0, 1, y]
        x_data2 = geometry[0, 2, y]
        x_data3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        for x in range(num_cols):
            # If our score does not have sufficient probability, ignore it
            if scores_data[x] < min_confidence:
                continue

            # Compute the offset factor as our resulting feature maps will
            # be 4x smaller than the input image
            offset_x, offset_y = x * 4.0, y * 4.0

            # Rotation angle of the prediction and its sine and cosine
            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # Width and height of the bounding box
            h = x_data0[x] + x_data2[x]
            w = x_data1[x] + x_data3[x]

            # Starting and ending (x, y)-coordinates of the bounding box
            end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
            end_y = int(offset_y - (sin * x_data1[x]) + (cos * x_data2[x]))
            start_x = int(end_x - w)
            start_y = int(end_y - h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(scores_data[x])

    return rects, confidences


def _merge_close_boxes(
    boxes: list[tuple[int, int, int, int]], horizontal_thresh: int, vertical_thresh: int
):
    """Merge bounding boxes whose centroids are close to each other.

    Two boxes are linked when the horizontal distance of their centroids is at
    most ``horizontal_thresh`` and the vertical distance is at most
    ``vertical_thresh``. Linking is transitive: all boxes connected through a
    chain of links end up in the same merged box, independent of the order in
    which the boxes are given.

    Parameters:
        boxes: A list of bounding boxes, where each box is represented by a
            tuple (startX, startY, endX, endY).
        horizontal_thresh: The horizontal threshold distance in pixels to
            determine whether boxes should be merged.
        vertical_thresh: The vertical threshold distance in pixels to
            determine whether boxes should be merged.

    Returns:
        list of merged bounding boxes, ordered by the first box of each group.
        If there are no boxes or only one box, the input is returned unchanged.
    """
    # If there are no boxes or only one box, no merging is needed
    if len(boxes) <= 1:
        return boxes

    count = len(boxes)
    centroids = np.array(
        [[(box[0] + box[2]) / 2, (box[1] + box[3]) / 2] for box in boxes]
    )

    # Disjoint-set forest; the root of a group is always its smallest index.
    parent = list(range(count))

    def find(node: int) -> int:
        while parent[node] != node:
            parent[node] = parent[parent[node]]  # path halving
            node = parent[node]
        return node

    for i in range(count):
        for j in range(i + 1, count):
            horizontal_dist = abs(centroids[i][0] - centroids[j][0])
            vertical_dist = abs(centroids[i][1] - centroids[j][1])
            if horizontal_dist <= horizontal_thresh and vertical_dist <= vertical_thresh:
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    # Union the two groups instead of overwriting a label, so
                    # that an earlier merge of box j is never lost.
                    low, high = sorted((root_i, root_j))
                    parent[high] = low

    # Collect the members of each group; insertion order follows the smallest
    # index of each group because indices are visited in ascending order.
    groups: dict[int, list[int]] = {}
    for index in range(count):
        groups.setdefault(find(index), []).append(index)

    merged_boxes = []
    for members in groups.values():
        # The merged box is the smallest rectangle enclosing all members
        start_x = min(boxes[i][0] for i in members)
        start_y = min(boxes[i][1] for i in members)
        end_x = max(boxes[i][2] for i in members)
        end_y = max(boxes[i][3] for i in members)
        merged_boxes.append((start_x, start_y, end_x, end_y))

    return merged_boxes
def detect_and_recognize_text(
    vm_image: bytes | os.PathLike[str],
    coordinates: tuple[int, int, int, int] | None = None,
    language: str = "eng",
) -> tuple[list[tuple[int, int, int, int]], list[str]]:
    """Advanced method for text detection with OpenCV's EAST detector and eventual OCR with Tesseract

    The image is (optionally) cropped to ``coordinates``, resized to 800x800
    for the EAST detector, and every detected (and merged) text region is
    then OCR-ed on the grayscale version of the cropped image at its original
    resolution.

    Args:
        vm_image: screenshot of the VM, either as encoded image bytes or as a
            path to an image file
        coordinates: OpenCV coordinates (start_x, start_y, end_x, end_y) of
            the area to be processed in this function, e.g., window
            coordinates
        language: the languages that are expected in the picture. Use ISO
            3-letter language codes, separated by '+'
            Example: 'eng+fra'

    Returns:
        tuple containing two lists: coordinates (start_x, start_y, end_x, end_y)
        of text and text itself. The coordinates are relative to the processed
        (i.e. cropped) area.

    Raises:
        ValueError: if the image could not be read or decoded
    """
    if isinstance(vm_image, bytes):
        buffer = np.frombuffer(vm_image, dtype=np.uint8)
        image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    else:
        image = cv2.imread(str(vm_image), cv2.IMREAD_COLOR)
    # OpenCV signals an unreadable image by returning None instead of raising.
    if image is None:
        raise ValueError("Could not read or decode the provided VM image")

    if coordinates:
        image = image[coordinates[1] : coordinates[3], coordinates[0] : coordinates[2]]

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Store original image dimensions
    orig_h, orig_w = image.shape[:2]

    # Set the new width and height and then determine the ratio in change
    # for both the width and height
    new_w, new_h = (800, 800)
    ratio_w = orig_w / float(new_w)
    ratio_h = orig_h / float(new_h)
    image = cv2.resize(image, (new_w, new_h))

    # Use OpenCV's EAST text detector to find text regions
    net = cv2.dnn.readNet(
        os.path.join(
            FORTRACE_ROOT_DIR,
            "src/fortrace/utility/image_processing/"
            "EAST-Detector-for-text-detection-using-OpenCV/"
            "frozen_east_text_detection.pb",
        )
    )
    blob = cv2.dnn.blobFromImage(
        image, 1.0, (new_w, new_h), (123.68, 116.78, 103.94), True, False
    )
    net.setInput(blob)
    (scores, geometry) = net.forward(
        ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
    )

    rects, _ = _decode_predictions(scores, geometry)
    # float32 instead of float16: the suppression computes box areas in the
    # array's dtype, and areas above 65504 overflow to inf in half precision.
    boxes = imutils.object_detection.non_max_suppression(
        np.array(rects, dtype=np.float32)
    )
    boxes = _merge_close_boxes(boxes, new_w // 5, new_h // 100)

    bounding_boxes = []
    ocr_result = []
    for start_x, start_y, end_x, end_y in boxes:
        # Scale the bounding box coordinates based on the image dimensions and ensure positive coordinates
        # since bounding boxes are always a bit too small, make them a bit larger
        start_x = max(0, int(start_x * ratio_w))
        start_y = max(0, int(start_y * ratio_h))
        end_x = min(max(0, int((end_x + 5) * ratio_w)), orig_w)
        end_y = min(max(0, int((end_y + 2) * ratio_h)), orig_h)

        roi = gray[start_y:end_y, start_x:end_x]
        # A box that lies outside the image after clamping yields an empty
        # region; there is nothing to recognize in it.
        if roi.size == 0:
            continue

        # config = "--oem 1 --psm 7"
        # TODO: determine best config options (consider adding this to method head)
        # TODO: add path to list with user words with parameter '--user-words <PATH>'
        text = pytesseract.image_to_string(roi, config="--oem 1", lang=language)
        bounding_boxes.append((start_x, start_y, end_x, end_y))
        ocr_result.append(text)

        # for debugging print out the image with drawn on bounding boxes
        # cv2.rectangle(image, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("OCR text: %s", ocr_result)

    return bounding_boxes, ocr_result