Source code for fortrace.utility.image_processing.text_detection

import logging
import os
from difflib import SequenceMatcher
from typing import Literal

import cv2
import imutils.object_detection
import jellyfish
import numpy as np
import pytesseract

from fortrace.fortrace_definitions import FORTRACE_ROOT_DIR
from fortrace.utility.logger_helper import setup_logger
from fortrace.utility.string_filtering import ansi_escape

logger = setup_logger(__name__)


# FIXME: remove this function, as it is too cumbersome

[docs]
def texts_similar(
    recognized_text: list[str], target_text: list[str], threshold: float = 0.6
) -> bool:
    """Compare two texts line by line using the Jaro-Winkler similarity.

    The first word of ``target_text`` is used as an anchor: the first line of
    ``recognized_text`` whose beginning resembles that word is treated as the
    counterpart of ``target_text[0]``. From there on, both texts are compared
    pairwise, line by line. Only the overlapping part is compared, so either
    text may be longer than the other.

    Args:
        recognized_text: presumably text obtained with perform_ocr, thus
            possibly starting with lines of gibberish that are skipped
        target_text: the text to be compared against
        threshold: similarity (0..1) used both for finding the anchor line and
            for deciding whether two lines match

    Returns:
        True: if an anchor line was found and every compared pair of lines
            reaches the threshold
        False: otherwise, including when the target has no first word or no
            anchor line could be found
    """
    # Without a first word in the target there is nothing to anchor on.
    if not target_text:
        return False
    first_line_words = target_text[0].split()
    if not first_line_words:
        return False
    first_word = first_line_words[0]

    # Index of the first recognized line that starts like the target does.
    start = next(
        (
            i
            for i, item in enumerate(recognized_text)
            if jellyfish.jaro_winkler_similarity(
                item.strip()[: len(first_word)], first_word
            )
            > threshold
        ),
        None,
    )
    if start is None:
        return False

    # recognized_text[start] corresponds to target_text[0], so the target is
    # walked from its beginning rather than from the recognized offset.
    for line, expected in zip(recognized_text[start:], target_text):
        recognized, wanted = line.strip(), expected.strip()
        if jellyfish.jaro_winkler_similarity(recognized, wanted) < threshold:
            # Show the offending pair so it ends up in the test output.
            print(f"{recognized!r} != {wanted!r}")
            return False
    return True




[docs]
def jaro(recognized_text: list[str], target: str, threshold: float = 0.6) -> bool:
    """Check whether any recognized line resembles ``target`` (Jaro-Winkler).

    Every entry of ``recognized_text`` (most likely the OCR-ed screenshot of a
    VM) is stripped of surrounding whitespace and scored against ``target``.
    Evaluation stops at the first entry whose score exceeds ``threshold``.

    Args:
        recognized_text: list of detected strings, e.g., from OCR-ed screenshot of a VM
        target: string that is searched for
        threshold: score that has to be exceeded for a line to count as a match

    Returns:
        True, if `target` could be matched to a line of `recognized_text`. False otherwise.
    """
    return any(
        jellyfish.jaro_winkler_similarity(candidate.strip(), target) > threshold
        for candidate in recognized_text
    )




[docs]
def text_line_contains(
    recognized_text: list[str] | str,
    substrings: list[str] | str,
    compare_method: Literal["plain", "ignore_case", "jaro", "sequence"] = "ignore_case",
    escape_ansi_characters: bool = True,
    **kwargs,
) -> bool:
    """Scans the recognized text for provided substrings and immediately returns True for a match.

    Args:
        recognized_text: list of strings with recognized text
        substrings: substrings to scan for
        compare_method: Literal, describing the supported compare methods
        escape_ansi_characters: escape ANSI characters in detected text
        **kwargs: provide additional information for some string matching methods, e.g., a threshold

    Returns:
        True, if any substring is in recognized text.
    """
    if isinstance(recognized_text, str):
        recognized_text = [recognized_text]

    if isinstance(substrings, str):
        substrings = [substrings]

    if escape_ansi_characters:
        recognized_text = list(map(ansi_escape, recognized_text))

    match compare_method:
        case "plain":
            for line in recognized_text:
                if [s for s in substrings if s in line]:
                    return True
        case "ignore_case":
            for line in recognized_text:
                if [s for s in substrings if s.lower() in line.lower()]:
                    return True
        case "jaro":
            for line in substrings:
                if jaro(recognized_text, line, kwargs.get("threshold", 0.85)):
                    return True
        case "sequence":
            for line in recognized_text:
                for target_line in substrings:
                    # TODO: review value
                    if SequenceMatcher(None, line, target_line).ratio() > kwargs.get(
                        "threshold", 0.6
                    ):
                        return True
        case _:
            raise ValueError(f"Compare method {compare_method} is unknown")
    return False



def _decode_predictions(scores: cv2.typing.MatLike, geometry: np.ndarray):
    """Helper function to decode predictions from EAST text detector.

    Args:
        scores:
        geometry:

    Returns:

    """
    # Grab the number of rows and columns from the scores volume
    (num_rows, num_cols) = scores.shape[2:4]
    rects = []  # bounding box rectangles
    confidences = []  # associated confidences

    for y in range(num_rows):
        # Extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scores_data = scores[0, 0, y]
        x_data0 = geometry[0, 0, y]
        x_data1 = geometry[0, 1, y]
        x_data2 = geometry[0, 2, y]
        x_data3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        for x in range(num_cols):
            # If our score does not have sufficient probability, ignore it
            if scores_data[x] < 0.5:
                continue

            # Compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offset_x, offset_y) = (x * 4.0, y * 4.0)

            # Extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # Use the geometry volume to derive the width and height
            # of the bounding box
            h = x_data0[x] + x_data2[x]
            w = x_data1[x] + x_data3[x]

            # Compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
            end_y = int(offset_y - (sin * x_data1[x]) + (cos * x_data2[x]))
            start_x = int(end_x - w)
            start_y = int(end_y - h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(scores_data[x])

    return rects, confidences


def _merge_close_boxes(
    boxes: list[tuple[int, int, int, int]], horizontal_thresh: int, vertical_thresh: int
):
    """Merge bounding boxes that are close to each other based on separate horizontal and vertical proximity thresholds.

    Parameters:
        boxes: A list of bounding boxes, where each box is represented by a tuple (startX, startY, endX, endY).
        horizontal_thresh: The horizontal threshold distance in pixels to determine whether boxes should be merged.
        vertical_thresh: The vertical threshold distance in pixels to determine whether boxes should be merged.

    Returns:
        list of merged bounding boxes.
    """

    # If there are no boxes or only one box, no merging is needed
    if len(boxes) <= 1:
        return boxes

    # Calculate the centroids of the bounding boxes
    centroids = np.array(
        [[(box[0] + box[2]) / 2, (box[1] + box[3]) / 2] for box in boxes]
    )

    # Initialize labels for each box
    labels = np.arange(len(boxes))

    # Iterate over the boxes and merge those that are close based on the specified thresholds
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            # Calculate horizontal and vertical distances between centroids
            horizontal_dist = abs(centroids[i][0] - centroids[j][0])
            vertical_dist = abs(centroids[i][1] - centroids[j][1])

            # Check if the boxes are close enough to merge
            if (
                horizontal_dist <= horizontal_thresh
                and vertical_dist <= vertical_thresh
            ):
                # Merge the boxes by updating the label
                labels[j] = labels[i]

    merged_boxes = []

    # Merge the boxes based on the labels
    for label in np.unique(labels):
        # Find the indices of all boxes with the current label
        indices = np.where(labels == label)[0]

        # Calculate the minimum and maximum coordinates for the merged box
        start_x = min(boxes[i][0] for i in indices)
        start_y = min(boxes[i][1] for i in indices)
        end_x = max(boxes[i][2] for i in indices)
        end_y = max(boxes[i][3] for i in indices)

        # Add the merged box to the list
        merged_boxes.append((start_x, start_y, end_x, end_y))

    return merged_boxes



[docs]
def detect_and_recognize_text(
    vm_image: bytes | os.PathLike[str],
    coordinates: tuple[int, int, int, int] | None = None,
    language: str = "eng",
) -> tuple[list[tuple[int, int, int, int]], list[str]]:
    """Advanced method for text detection with OpenCV's EAST detector and eventual OCR with Tesseract

    The image is (optionally) cropped, resized to 800x800 for the EAST
    detector, and the detected regions are reduced with non-maximum
    suppression and merged when close to each other. Each remaining region is
    scaled back to the (cropped) original image and passed to Tesseract.

    Args:
        vm_image: screenshot of the VM, either as encoded image bytes or as path to an image file
        coordinates: OpenCV coordinates (start_x, start_y, end_x, end_y) of the area to be processed in this
            function, e.g., window coordinates
        language: the languages that are expected in the picture. Use ISO 3-letter language codes, separated by '+'
            Example: 'eng+fra'

    Returns:
        tuple containing two lists: coordinates (start_x, start_y, end_x, end_y) of text and text itself.
        The coordinates are relative to the cropped area if `coordinates` was given.

    Raises:
        ValueError: if the image cannot be decoded/read or the requested area is empty.
    """
    if isinstance(vm_image, bytes):
        buffer = np.frombuffer(vm_image, dtype=np.uint8)
        image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    else:
        image = cv2.imread(str(vm_image), cv2.IMREAD_COLOR)

    # imdecode/imread signal failure with None instead of raising
    if image is None:
        raise ValueError("Could not decode or read the provided image")

    if coordinates:
        image = image[coordinates[1] : coordinates[3], coordinates[0] : coordinates[2]]

    if image.size == 0:
        raise ValueError("The image area to be processed is empty")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Store original image dimensions
    orig_h, orig_w = image.shape[:2]

    # Set the new width and height and then determine the ratio in change
    # for both the width and height
    new_w, new_h = (800, 800)
    ratio_w = orig_w / float(new_w)
    ratio_h = orig_h / float(new_h)

    image = cv2.resize(image, (new_w, new_h))

    # Use OpenCV's EAST text detector to find text regions
    net = cv2.dnn.readNet(
        os.path.join(
            FORTRACE_ROOT_DIR,
            "src/fortrace/utility/image_processing/"
            "EAST-Detector-for-text-detection-using-OpenCV/"
            "frozen_east_text_detection.pb",
        )
    )
    blob = cv2.dnn.blobFromImage(
        image, 1.0, (new_w, new_h), (123.68, 116.78, 103.94), True, False
    )

    net.setInput(blob)
    (scores, geometry) = net.forward(
        ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
    )

    rects, _ = _decode_predictions(scores, geometry)
    # float32 instead of float16: the suppression multiplies box width and
    # height in the array's dtype, and float16 overflows above 65504
    boxes = imutils.object_detection.non_max_suppression(
        np.array(rects, dtype=np.float32)
    )
    boxes = _merge_close_boxes(boxes, new_w // 5, new_h // 100)
    bounding_boxes = []
    ocr_result = []

    for start_x, start_y, end_x, end_y in boxes:
        # Scale the bounding box coordinates based on the image dimensions and ensure positive coordinates
        # since bounding boxes are always a bit too small, make them a bit larger
        start_x = max(0, int(start_x * ratio_w))
        start_y = max(0, int(start_y * ratio_h))
        end_x = min(max(0, int((end_x + 5) * ratio_w)), orig_w)
        end_y = min(max(0, int((end_y + 2) * ratio_h)), orig_h)

        # Boxes that collapse after clamping contain no pixels to run OCR on
        if end_x <= start_x or end_y <= start_y:
            continue

        roi = gray[start_y:end_y, start_x:end_x]

        # TODO: determine best config options (consider adding this to method head)
        # TODO: add path to list with user words with parameter '--user-words <PATH>'
        text = pytesseract.image_to_string(roi, config="--oem 1", lang=language)

        bounding_boxes.append((start_x, start_y, end_x, end_y))
        ocr_result.append(text)

    logger.debug("OCR text: %s", ocr_result)

    return bounding_boxes, ocr_result