Source code for stringalign.error_classification.confusable_error

from typing import Literal

from stringalign.align import Kept, align_strings
from stringalign.normalize import StringNormalizer
from stringalign.tokenize import Tokenizer


def count_confusable_errors(
    reference: str,
    predicted: str,
    tokenizer: Tokenizer,
    consider_confusables: Literal["confusables", "intentional"] | dict[str, str],
) -> int:
    """Count the alignment errors that vanish once confusable characters are resolved.

    The two strings are aligned as given. Every alignment operation that is not
    a ``Kept`` is then generalized, and confusables are resolved on both its
    reference and predicted side. An operation is counted when the two sides
    are equal after resolution, i.e. when the edit exists solely because a
    character was swapped for a confusable one (e.g. I and 1).

    Parameters:
    -----------
    reference
        The reference text.
    predicted
        The predicted text.
    tokenizer
        Tokenizer passed on to :func:`align_strings`.
    consider_confusables
        Which confusable list to use, see
        :func:`stringalign.normalize.StringNormalizer` or :ref:`confusables`
        for more information.

    Returns:
    --------
    int
        The number of confusable errors.
    """
    # Only confusable resolution is applied; Unicode normalization is disabled.
    resolve = StringNormalizer(normalization=None, resolve_confusables=consider_confusables)
    operations, _ = align_strings(reference, predicted, tokenizer=tokenizer)

    def _vanishes_when_resolved(operation) -> bool:
        # Generalize first so the operation exposes both a reference and a
        # predicted side that can be compared after resolution.
        edit = operation.generalize()
        resolved_reference = resolve(edit.reference)
        resolved_prediction = resolve(edit.predicted)
        return resolved_reference == resolved_prediction

    # Kept operations are not errors, so they never contribute to the count.
    return sum(
        _vanishes_when_resolved(operation)
        for operation in operations
        if not isinstance(operation, Kept)
    )