Source code for csvsmith.utils.distance

from __future__ import annotations

from dataclasses import dataclass
from enum import Enum, auto


[docs] class Relation(Enum): EXACT_MATCH = auto() CASE_INSENSITIVE_MATCH = auto() WHITESPACE_TRIMMED_MATCH = auto() NORMALIZED_SPACE_MATCH = auto() NO_STRUCTURAL_MATCH = auto()
[docs] @dataclass class Result: classification: Relation damerau_levenshtein_distance: int jaro_winkler_score: float similarity_percentage: float
[docs] def get_relation_string(self) -> str: if self.classification is Relation.EXACT_MATCH: return "Identical" if self.classification is Relation.CASE_INSENSITIVE_MATCH: return "Case-Insensitive Match" if self.classification is Relation.WHITESPACE_TRIMMED_MATCH: return "Similar (Trimmed)" if self.classification is Relation.NORMALIZED_SPACE_MATCH: return "Synonymous (No Spaces)" return "Different"
[docs] class StringDistance: """ Provides functionality for calculating string distances and relationships between two strings based on various algorithms. This class includes methods for analyzing string similarities and relationships, including exact matches, case-insensitive comparisons, and whitespace normalization. It also implements Damerau-Levenshtein and Jaro-Winkler distance calculations. :return classification: Possible relationship classification between two strings. :return damerau_levenshtein_distance: Integer distance calculated using the Damerau-Levenshtein algorithm. :return jaro_winkler_score: A float score indicating similarity using the Jaro-Winkler algorithm. :return similarity_percentage: A percentage similarity score between two strings. """
[docs] @staticmethod def analyze(a: str, b: str, ignore_case: bool = False) -> Result: sa = str(a) sb = str(b) relation = StringDistance.classify(sa, sb) if ignore_case: sa = sa.lower() sb = sb.lower() d_dist = StringDistance.calculate_damerau_levenshtein(sa, sb) jw_score = StringDistance.calculate_jaro_winkler(sa, sb) max_len = max(len(sa), len(sb)) similarity = 100.0 if max_len == 0 else (1.0 - d_dist / max_len) * 100.0 return Result( classification=relation, damerau_levenshtein_distance=d_dist, jaro_winkler_score=jw_score, similarity_percentage=similarity, )
[docs] @staticmethod def trim(s: str) -> str: return s.strip()
[docs] @staticmethod def strip_all(s: str) -> str: """ Remove all whitespace characters from a string. Uses split/join logic so any whitespace character acts as a separator, including spaces, tabs, and newlines. """ return "".join(s.split())
[docs] @staticmethod def equals_ignore_case(a: str, b: str) -> bool: return a.lower() == b.lower()
[docs] @staticmethod def classify(a: str, b: str) -> Relation: if a == b: return Relation.EXACT_MATCH if StringDistance.equals_ignore_case(a, b): return Relation.CASE_INSENSITIVE_MATCH if StringDistance.trim(a) == StringDistance.trim(b): return Relation.WHITESPACE_TRIMMED_MATCH if StringDistance.strip_all(a) == StringDistance.strip_all(b): return Relation.NORMALIZED_SPACE_MATCH return Relation.NO_STRUCTURAL_MATCH
[docs] @staticmethod def calculate_damerau_levenshtein(s1: str, s2: str) -> int: """ Restricted Damerau-Levenshtein distance: insertion, deletion, substitution, adjacent transposition. """ m = len(s1) n = len(s2) d = [[0] * (n + 1) for _ in range(m + 1)] for i in range(m + 1): d[i][0] = i for j in range(n + 1): d[0][j] = j for i in range(1, m + 1): for j in range(1, n + 1): cost = 0 if s1[i - 1] == s2[j - 1] else 1 d[i][j] = min( d[i - 1][j] + 1, # deletion d[i][j - 1] + 1, # insertion d[i - 1][j - 1] + cost, # substitution ) if ( i > 1 and j > 1 and s1[i - 1] == s2[j - 2] and s1[i - 2] == s2[j - 1] ): d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost) return d[m][n]
[docs] @staticmethod def calculate_jaro_winkler(s1: str, s2: str) -> float: len1 = len(s1) len2 = len(s2) if len1 == 0 and len2 == 0: return 1.0 if len1 == 0 or len2 == 0: return 0.0 match_distance = max(len1, len2) // 2 - 1 if match_distance < 0: match_distance = 0 s1_matches = [False] * len1 s2_matches = [False] * len2 matches = 0 for i in range(len1): start = max(0, i - match_distance) end = min(i + match_distance + 1, len2) for j in range(start, end): if not s2_matches[j] and s1[i] == s2[j]: s1_matches[i] = True s2_matches[j] = True matches += 1 break if matches == 0: return 0.0 transpositions = 0 k = 0 for i in range(len1): if s1_matches[i]: while not s2_matches[k]: k += 1 if s1[i] != s2[k]: transpositions += 1 k += 1 jaro = ( matches / len1 + matches / len2 + (matches - transpositions / 2.0) / matches ) / 3.0 p = 0.1 max_l = 4 l = 0 while l < min(len1, len2, max_l) and s1[l] == s2[l]: l += 1 return jaro + (l * p * (1.0 - jaro))
[docs] def analyze_pair(a: str, b: str, ignore_case: bool = False) -> Result: return StringDistance.analyze(a, b, ignore_case)