# Source code for stringalign.error_classification.confusable_error

from typing import Literal

from stringalign.align import Kept, align_strings
from stringalign.normalize import StringNormalizer
from stringalign.tokenize import Tokenizer



# [docs]  (presumably a link label left over from the rendered documentation page)
def count_confusable_errors(
    reference: str,
    predicted: str,
    tokenizer: Tokenizer,
    consider_confusables: Literal["confusables", "intentional"] | dict[str, str],
) -> int:
    """Count the edits that exist only because a character was swapped for a confusable one (e.g. I and 1).

    The two strings are aligned as given. Every alignment operation that is not a ``Kept`` is then generalized, and
    its reference and predicted sides are passed through a confusable-resolving normalizer. An operation whose two
    sides become equal after that resolution is counted as a confusable error, i.e. an edit that would disappear if
    the confusables were resolved before aligning.

    Parameters:
    -----------
    reference
        The reference text.
    predicted
        The predicted text.
    tokenizer: Tokenizer
        Tokenizer passed on to the alignment.
    consider_confusables
        Which confusable list to use, see :func:`stringalign.normalize.StringNormalizer` or :ref:`confusables` for more information.

    Returns:
    --------
    int
        The number of confusable errors.
    """
    # Only confusables are resolved here; no other normalization is requested.
    resolve = StringNormalizer(normalization=None, resolve_confusables=consider_confusables)
    operations, _ = align_strings(reference, predicted, tokenizer=tokenizer)

    # Lazily drop the kept operations and bring the remaining edits to their generalized form.
    edits = (op.generalize() for op in operations if not isinstance(op, Kept))

    # Each comparison yields a bool; summing them counts the edits that vanish once confusables are resolved.
    return sum(resolve(edit.reference) == resolve(edit.predicted) for edit in edits)