import logging
import os
from difflib import SequenceMatcher
from typing import Literal
import cv2
import imutils.object_detection
import jellyfish
import numpy as np
import pytesseract
from fortrace.fortrace_definitions import FORTRACE_ROOT_DIR
from fortrace.utility.logger_helper import setup_logger
from fortrace.utility.string_filtering import ansi_escape
# Module-level logger, created through the project's ``setup_logger`` helper
# and keyed by this module's ``__name__``.
logger = setup_logger(__name__)
# FIXME: remove this function, as it is too cumbersome
[docs]
def texts_similar(
    recognized_text: list[str], target_text: list[str], threshold: float = 0.6
) -> bool:
    """Compare two multi-line texts line by line using Jaro-Winkler similarity.

    The first word of ``target_text[0]`` is used as an anchor: the first line of
    ``recognized_text`` whose prefix resembles that word marks where the
    meaningful recognized text begins (anything before it is treated as leading
    gibberish). From that line on, recognized line ``k`` is compared against
    target line ``k``. The target text may contain more lines than the
    recognized text; the surplus target lines are ignored.

    Args:
        recognized_text: presumably text obtained via OCR, thus possibly starting
            with gibberish lines.
        target_text: the text to be compared against.
        threshold: similarity threshold used both for locating the anchor line
            and for the line-by-line comparison.

    Returns:
        True if every compared line pair is equal or reaches ``threshold``.
        False if the texts differ, if no anchor line can be found, if
        ``target_text`` is empty or starts with a blank line, or if the
        recognized text has more lines after the anchor than ``target_text``.
    """
    # Without a first target word there is nothing to anchor the alignment on.
    if not target_text:
        return False
    leading_words = target_text[0].split()
    if not leading_words:
        return False
    first_word = leading_words[0]

    # Index of the first recognized line that starts like the target text.
    start = next(
        (
            i
            for i, item in enumerate(recognized_text)
            if jellyfish.jaro_winkler_similarity(
                item.strip()[: len(first_word)], first_word
            )
            > threshold
        ),
        None,
    )
    if start is None:
        return False

    candidate_lines = recognized_text[start:]
    # Recognized lines without a target counterpart cannot be verified.
    if len(candidate_lines) > len(target_text):
        return False

    # zip() pairs the anchor line with target line 0, the next with line 1, ...
    for line, target_line in zip(candidate_lines, target_text):
        found = line.strip()
        expected = target_line.strip()
        if found == expected:
            # Identical lines always count as similar (covers empty lines too).
            continue
        if jellyfish.jaro_winkler_similarity(found, expected) < threshold:
            # Print the offending pair so it shows up in test output.
            print(f"{found!r} != {expected!r}")
            return False
    return True
[docs]
def jaro(recognized_text: list[str], target: str, threshold: float = 0.6) -> bool:
    """Check whether any recognized line resembles ``target``.

    Each entry of ``recognized_text`` is stripped of surrounding whitespace and
    scored against ``target`` with the Jaro-Winkler similarity. Evaluation stops
    at the first entry whose score exceeds ``threshold``.

    Args:
        recognized_text: list of detected strings, e.g., from an OCR-ed
            screenshot of a VM.
        target: string that is searched for.
        threshold: score that has to be exceeded for a line to count as a match.

    Returns:
        True if at least one line scores above ``threshold``, False otherwise
        (including for an empty ``recognized_text``).
    """
    return any(
        jellyfish.jaro_winkler_similarity(candidate.strip(), target) > threshold
        for candidate in recognized_text
    )
[docs]
def text_line_contains(
recognized_text: list[str] | str,
substrings: list[str] | str,
compare_method: Literal["plain", "ignore_case", "jaro", "sequence"] = "ignore_case",
escape_ansi_characters: bool = True,
**kwargs,
) -> bool:
"""Scans the recognized text for provided substrings and immediately returns True for a match.
Args:
recognized_text: list of strings with recognized text
substrings: substrings to scan for
compare_method: Literal, describing the supported compare methods
escape_ansi_characters: escape ANSI characters in detected text
**kwargs: provide additional information for some string matching methods, e.g., a threshold
Returns:
True, if any substring is in recognized text.
"""
if isinstance(recognized_text, str):
recognized_text = [recognized_text]
if isinstance(substrings, str):
substrings = [substrings]
if escape_ansi_characters:
recognized_text = list(map(ansi_escape, recognized_text))
match compare_method:
case "plain":
for line in recognized_text:
if [s for s in substrings if s in line]:
return True
case "ignore_case":
for line in recognized_text:
if [s for s in substrings if s.lower() in line.lower()]:
return True
case "jaro":
for line in substrings:
if jaro(recognized_text, line, kwargs.get("threshold", 0.85)):
return True
case "sequence":
for line in recognized_text:
for target_line in substrings:
# TODO: review value
if SequenceMatcher(None, line, target_line).ratio() > kwargs.get(
"threshold", 0.6
):
return True
case _:
raise ValueError(f"Compare method {compare_method} is unknown")
return False
def _decode_predictions(
    scores: "cv2.typing.MatLike",
    geometry: np.ndarray,
    min_confidence: float = 0.5,
) -> tuple[list[tuple[int, int, int, int]], list[float]]:
    """Helper function to decode predictions from the EAST text detector.

    Args:
        scores: 4-D score volume; ``scores[0, 0, y, x]`` is the text probability
            of feature-map cell ``(x, y)``. Axes 2 and 3 give the number of
            rows and columns.
        geometry: 4-D geometry volume with five channels on axis 1. Channels 0
            and 2 add up to the box height, channels 1 and 3 add up to the box
            width, and channel 4 holds the rotation angle.
        min_confidence: cells whose score is below this value are discarded.

    Returns:
        Tuple ``(rects, confidences)``: ``rects`` is a list of
        ``(start_x, start_y, end_x, end_y)`` integer boxes and ``confidences``
        holds the matching score values as read from ``scores``.
    """
    # The annotation of `scores` is quoted so that it is not evaluated when the
    # function is defined; OpenCV builds without `cv2.typing` still import fine.
    num_rows, num_cols = scores.shape[2:4]
    rects = []  # bounding box rectangles
    confidences = []  # associated confidences

    for y in range(num_rows):
        # Per-row views of the probabilities and the geometry channels
        scores_data = scores[0, 0, y]
        x_data0 = geometry[0, 0, y]
        x_data1 = geometry[0, 1, y]
        x_data2 = geometry[0, 2, y]
        x_data3 = geometry[0, 3, y]
        angles_data = geometry[0, 4, y]

        for x in range(num_cols):
            # Ignore cells without sufficient probability
            if scores_data[x] < min_confidence:
                continue

            # The feature maps are 4x smaller than the input image
            offset_x, offset_y = x * 4.0, y * 4.0

            # Rotation angle of the prediction and its sine / cosine
            angle = angles_data[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # Height and width of the bounding box
            h = x_data0[x] + x_data2[x]
            w = x_data1[x] + x_data3[x]

            # End corner first, then the start corner by subtracting the size
            end_x = int(offset_x + (cos * x_data1[x]) + (sin * x_data2[x]))
            end_y = int(offset_y - (sin * x_data1[x]) + (cos * x_data2[x]))
            start_x = int(end_x - w)
            start_y = int(end_y - h)

            rects.append((start_x, start_y, end_x, end_y))
            confidences.append(scores_data[x])

    return rects, confidences
def _merge_close_boxes(
    boxes: list[tuple[int, int, int, int]], horizontal_thresh: int, vertical_thresh: int
):
    """Merge bounding boxes whose centroids are close to each other.

    Two boxes are linked when the horizontal distance of their centroids is at
    most ``horizontal_thresh`` and the vertical distance is at most
    ``vertical_thresh``. Linking is transitive: if A is linked to B and B to C,
    all three end up in one merged box, independent of the order of ``boxes``.

    Parameters:
        boxes: A list of bounding boxes, where each box is represented by a tuple (startX, startY, endX, endY).
        horizontal_thresh: The horizontal threshold distance in pixels to determine whether boxes should be merged.
        vertical_thresh: The vertical threshold distance in pixels to determine whether boxes should be merged.

    Returns:
        list of merged bounding boxes, each the enclosing rectangle of one
        group. With zero or one input box, ``boxes`` is returned unchanged.
    """
    # If there are no boxes or only one box, no merging is needed
    if len(boxes) <= 1:
        return boxes

    centroids = [((box[0] + box[2]) / 2, (box[1] + box[3]) / 2) for box in boxes]

    # Union-find forest; parent[k] == k marks k as the representative of its group
    parent = list(range(len(boxes)))

    def find(node: int) -> int:
        """Return the representative of ``node``'s group (with path halving)."""
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    for i, (cx_i, cy_i) in enumerate(centroids):
        for j in range(i + 1, len(centroids)):
            cx_j, cy_j = centroids[j]
            if (
                abs(cx_i - cx_j) <= horizontal_thresh
                and abs(cy_i - cy_j) <= vertical_thresh
            ):
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    # Keep the smaller index as representative so the output
                    # order follows the first box of each group.
                    low, high = sorted((root_i, root_j))
                    parent[high] = low

    # Collect the member indices of every group
    groups: dict[int, list[int]] = {}
    for index in range(len(boxes)):
        groups.setdefault(find(index), []).append(index)

    merged_boxes = []
    for root in sorted(groups):
        members = groups[root]
        # Enclosing rectangle of all boxes in the group
        start_x = min(boxes[m][0] for m in members)
        start_y = min(boxes[m][1] for m in members)
        end_x = max(boxes[m][2] for m in members)
        end_y = max(boxes[m][3] for m in members)
        merged_boxes.append((start_x, start_y, end_x, end_y))
    return merged_boxes
[docs]
def detect_and_recognize_text(
    vm_image: bytes | os.PathLike[str],
    coordinates: tuple[int, int, int, int] | None = None,
    language: str = "eng",
) -> tuple[list[tuple[int, int, int, int]], list[str]]:
    """Advanced method for text detection with OpenCV's EAST detector and eventual OCR with Tesseract

    The image is optionally cropped to ``coordinates``, resized to 800x800 for
    the EAST network, and the detected boxes are merged, scaled back to the
    size of the (cropped) image and passed to Tesseract one by one.

    Args:
        vm_image: screenshot of the VM, either as encoded image bytes or as a
            path to an image file
        coordinates: OpenCV coordinates ``(start_x, start_y, end_x, end_y)`` of
            the area to be processed in this function, e.g., window coordinates
        language: the languages that are expected in the picture. Use ISO
            3-letter language codes, separated by '+'
            Example: 'eng+fra'

    Returns:
        tuple containing two lists: coordinates (start_x, start_y, end_x, end_y) of text and text itself.
        The coordinates are relative to the processed area, i.e., to the crop
        when ``coordinates`` is given.

    Raises:
        ValueError: if ``vm_image`` cannot be decoded or read as an image.
    """
    if isinstance(vm_image, bytes):
        buffer = np.frombuffer(vm_image, dtype=np.uint8)
        image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    else:
        image = cv2.imread(str(vm_image), cv2.IMREAD_COLOR)
    # OpenCV signals an unreadable / undecodable image by returning None
    if image is None:
        raise ValueError("vm_image could not be read or decoded as an image")

    if coordinates:
        image = image[coordinates[1] : coordinates[3], coordinates[0] : coordinates[2]]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Store original image dimensions
    orig_h, orig_w = image.shape[:2]

    # Set the new width and height and then determine the ratio in change
    # for both the width and height
    new_w, new_h = (800, 800)
    ratio_w = orig_w / float(new_w)
    ratio_h = orig_h / float(new_h)
    image = cv2.resize(image, (new_w, new_h))

    # Use OpenCV's EAST text detector to find text regions
    net = cv2.dnn.readNet(
        os.path.join(
            FORTRACE_ROOT_DIR,
            "src/fortrace/utility/image_processing/"
            "EAST-Detector-for-text-detection-using-OpenCV/"
            "frozen_east_text_detection.pb",
        )
    )
    blob = cv2.dnn.blobFromImage(
        image, 1.0, (new_w, new_h), (123.68, 116.78, 103.94), True, False
    )
    net.setInput(blob)
    (scores, geometry) = net.forward(
        ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
    )

    rects, _ = _decode_predictions(scores, geometry)
    # float32 instead of float16: half precision cannot represent integers
    # above 2048 exactly and overflows at 65504, which the box-area products
    # computed during non-maximum suppression can exceed.
    boxes = imutils.object_detection.non_max_suppression(
        np.array(rects, dtype=np.float32)
    )
    boxes = _merge_close_boxes(boxes, new_w // 5, new_h // 100)

    bounding_boxes = []
    ocr_result = []
    for start_x, start_y, end_x, end_y in boxes:
        # Scale the bounding box coordinates based on the image dimensions and ensure positive coordinates
        # since bounding boxes are always a bit too small, make them a bit larger
        start_x = max(0, int(start_x * ratio_w))
        start_y = max(0, int(start_y * ratio_h))
        end_x = min(max(0, int((end_x + 5) * ratio_w)), orig_w)
        end_y = min(max(0, int((end_y + 2) * ratio_h)), orig_h)

        # A box that collapses after clamping has no pixels left to OCR
        if end_x <= start_x or end_y <= start_y:
            continue
        roi = gray[start_y:end_y, start_x:end_x]

        # config = "--oem 1 --psm 7"
        # TODO: determine best config options (consider adding this to method head)
        # TODO: add path to list with user words with parameter '--user-words <PATH>'
        text = pytesseract.image_to_string(roi, config="--oem 1", lang=language)
        bounding_boxes.append((start_x, start_y, end_x, end_y))
        ocr_result.append(text)
        # for debugging print out the image with drawn on bounding boxes
        # cv2.rectangle(image, (start_x, start_y), (end_x, end_y), (0, 255, 0), 2)

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("OCR text: %s", ocr_result)
    return bounding_boxes, ocr_result