Source code for stringalign.error_classification.case_error

import stringalign
from stringalign.align import levenshtein_distance



[docs]
def count_case_errors(reference: str, predicted: str) -> int:
    """Return how many edits between two strings are attributable to casing alone.

    The edit distance is computed twice: once on the strings as given, and once
    using a grapheme cluster tokenizer whose post-tokenization normalizer is
    case-insensitive. The difference between the two distances is the number of
    edits that are avoided when casing is ignored.

    Parameters:
    -----------
    reference
        The reference text.
    predicted
        The predicted text.

    Returns:
    --------
    int
        The number of case errors.
    """
    # Baseline: edit distance with the default tokenization, casing intact.
    case_sensitive_edits = levenshtein_distance(reference, predicted)

    # Same comparison, but each token is normalized case-insensitively first.
    ignore_case = stringalign.normalize.StringNormalizer(case_insensitive=True)
    caseless_tokenizer = stringalign.tokenize.GraphemeClusterTokenizer(
        post_tokenization_normalizer=ignore_case
    )
    case_insensitive_edits = levenshtein_distance(reference, predicted, tokenizer=caseless_tokenizer)

    # Whatever vanished when casing was ignored was a pure case error.
    return case_sensitive_edits - case_insensitive_edits