Source code for csvsmith.utils.distance

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum, auto



[docs]
class Relation(Enum):
    EXACT_MATCH = auto()
    CASE_INSENSITIVE_MATCH = auto()
    WHITESPACE_TRIMMED_MATCH = auto()
    NORMALIZED_SPACE_MATCH = auto()
    NO_STRUCTURAL_MATCH = auto()




[docs]
@dataclass
class Result:
    classification: Relation
    damerau_levenshtein_distance: int
    jaro_winkler_score: float
    similarity_percentage: float


[docs]
    def get_relation_string(self) -> str:
        if self.classification is Relation.EXACT_MATCH:
            return "Identical"
        if self.classification is Relation.CASE_INSENSITIVE_MATCH:
            return "Case-Insensitive Match"
        if self.classification is Relation.WHITESPACE_TRIMMED_MATCH:
            return "Similar (Trimmed)"
        if self.classification is Relation.NORMALIZED_SPACE_MATCH:
            return "Synonymous (No Spaces)"
        return "Different"





[docs]
class StringDistance:
    """
    Provides functionality for calculating string distances and relationships between
    two strings based on various algorithms.

    This class includes methods for analyzing string similarities and relationships,
    including exact matches, case-insensitive comparisons, and whitespace normalization.
    It also implements Damerau-Levenshtein and Jaro-Winkler distance calculations.

    :return classification: Possible relationship classification between two strings.
    :return damerau_levenshtein_distance: Integer distance calculated using the Damerau-Levenshtein algorithm.
    :return jaro_winkler_score: A float score indicating similarity using the Jaro-Winkler algorithm.
    :return similarity_percentage: A percentage similarity score between two strings.
    """

[docs]
    @staticmethod
    def analyze(a: str, b: str, ignore_case: bool = False) -> Result:
        sa = str(a)
        sb = str(b)

        relation = StringDistance.classify(sa, sb)

        if ignore_case:
            sa = sa.lower()
            sb = sb.lower()

        d_dist = StringDistance.calculate_damerau_levenshtein(sa, sb)
        jw_score = StringDistance.calculate_jaro_winkler(sa, sb)

        max_len = max(len(sa), len(sb))
        similarity = 100.0 if max_len == 0 else (1.0 - d_dist / max_len) * 100.0

        return Result(
            classification=relation,
            damerau_levenshtein_distance=d_dist,
            jaro_winkler_score=jw_score,
            similarity_percentage=similarity,
        )



[docs]
    @staticmethod
    def trim(s: str) -> str:
        return s.strip()



[docs]
    @staticmethod
    def strip_all(s: str) -> str:
        """
        Remove all whitespace characters from a string.

        Uses split/join logic so any whitespace character acts as a separator,
        including spaces, tabs, and newlines.
        """
        return "".join(s.split())



[docs]
    @staticmethod
    def equals_ignore_case(a: str, b: str) -> bool:
        return a.lower() == b.lower()



[docs]
    @staticmethod
    def classify(a: str, b: str) -> Relation:
        if a == b:
            return Relation.EXACT_MATCH
        if StringDistance.equals_ignore_case(a, b):
            return Relation.CASE_INSENSITIVE_MATCH
        if StringDistance.trim(a) == StringDistance.trim(b):
            return Relation.WHITESPACE_TRIMMED_MATCH
        if StringDistance.strip_all(a) == StringDistance.strip_all(b):
            return Relation.NORMALIZED_SPACE_MATCH
        return Relation.NO_STRUCTURAL_MATCH



[docs]
    @staticmethod
    def calculate_damerau_levenshtein(s1: str, s2: str) -> int:
        """
        Restricted Damerau-Levenshtein distance:
        insertion, deletion, substitution, adjacent transposition.
        """
        m = len(s1)
        n = len(s2)

        d = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(m + 1):
            d[i][0] = i
        for j in range(n + 1):
            d[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                cost = 0 if s1[i - 1] == s2[j - 1] else 1

                d[i][j] = min(
                    d[i - 1][j] + 1,        # deletion
                    d[i][j - 1] + 1,        # insertion
                    d[i - 1][j - 1] + cost, # substitution
                )

                if (
                    i > 1
                    and j > 1
                    and s1[i - 1] == s2[j - 2]
                    and s1[i - 2] == s2[j - 1]
                ):
                    d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)

        return d[m][n]



[docs]
    @staticmethod
    def calculate_jaro_winkler(s1: str, s2: str) -> float:
        len1 = len(s1)
        len2 = len(s2)

        if len1 == 0 and len2 == 0:
            return 1.0
        if len1 == 0 or len2 == 0:
            return 0.0

        match_distance = max(len1, len2) // 2 - 1
        if match_distance < 0:
            match_distance = 0

        s1_matches = [False] * len1
        s2_matches = [False] * len2

        matches = 0
        for i in range(len1):
            start = max(0, i - match_distance)
            end = min(i + match_distance + 1, len2)

            for j in range(start, end):
                if not s2_matches[j] and s1[i] == s2[j]:
                    s1_matches[i] = True
                    s2_matches[j] = True
                    matches += 1
                    break

        if matches == 0:
            return 0.0

        transpositions = 0
        k = 0
        for i in range(len1):
            if s1_matches[i]:
                while not s2_matches[k]:
                    k += 1
                if s1[i] != s2[k]:
                    transpositions += 1
                k += 1

        jaro = (
            matches / len1
            + matches / len2
            + (matches - transpositions / 2.0) / matches
        ) / 3.0

        p = 0.1
        max_l = 4
        l = 0
        while l < min(len1, len2, max_l) and s1[l] == s2[l]:
            l += 1

        return jaro + (l * p * (1.0 - jaro))





[docs]
def analyze_pair(a: str, b: str, ignore_case: bool = False) -> Result:
    return StringDistance.analyze(a, b, ignore_case)