Source code for csvsmith.utils.clean_numeric

import math
import re
from typing import Any

# --- Literal characters and messages ---------------------------------------

# U+00A0, the non-breaking space.
NON_BREAKING_SPACE = "\xa0"

# Template for the ValueError raised on unconvertible input; the offending
# value is interpolated with its repr.
INVALID_NUMBER_MESSAGE = "Could not convert {value!r} to a valid number."

# --- Compiled patterns ------------------------------------------------------

# One grouping character: a plain space, an underscore, or a non-breaking space.
SEPARATOR_PATTERN = re.compile(r"[ _\xa0]")

# A bare number: optional leading minus, then either digits only, or optional
# digits followed by "." and at least one digit. No exponent, no leading "+",
# and no trailing "." are accepted.
NUMBER_PATTERN = re.compile(r"^-?(?:\d+|\d*\.\d+)$")

# A single currency symbol ($, €, £, ¥ or ₹) at the very start of the text.
CURRENCY_PREFIX_PATTERN = re.compile(r"^[\$€£¥₹]")



[docs]
def strip_currency_prefix(value: Any) -> Any:
    """
    Drop one leading currency symbol from a value, if it has one.

    The value is converted with ``str()`` and trimmed of surrounding whitespace
    before the check. When the trimmed text begins with a recognized currency
    symbol, the remainder (trimmed again) is returned as a string. Otherwise the
    original ``value`` object is returned untouched, whatever its type.
    """
    candidate = str(value).strip()

    # Nothing to strip: hand back the caller's own object, not the trimmed text.
    if not candidate or not CURRENCY_PREFIX_PATTERN.match(candidate):
        return value

    # The pattern matches exactly one character, so slicing off index 0 removes it.
    remainder = candidate[1:]
    return remainder.strip()



def _normalize_numeric_text(value: Any, *, sep: str, decimal: str) -> str:
    """
    Bring a value's text form into a canonical numeric layout.

    Steps, in order:

    1. ``str(value)`` with surrounding whitespace trimmed.
    2. Text wrapped in parentheses is rewritten as a leading-minus value.
    3. Every occurrence of ``sep`` is removed (skipped when ``sep`` is empty).
    4. Every occurrence of ``decimal`` becomes "." (skipped when it already is ".").

    No validation happens here; the result may still not be a number.
    """
    text = str(value).strip()

    # Accounting-style negatives: "(123)" -> "-123".
    wrapped_in_parens = text.startswith("(") and text.endswith(")")
    if wrapped_in_parens:
        text = "-" + text[1:-1]

    ungrouped = text.replace(sep, "") if sep else text

    return ungrouped if decimal == "." else ungrouped.replace(decimal, ".")


def _has_valid_grouping(numeric_text: str, *, decimal: str) -> bool:
    """
    Report whether grouping characters in a numeric string are placed sensibly.

    The text is rejected when it is empty, when it has more than one ``decimal``
    mark, when nothing surrounds the decimal mark, or when the integer or
    fraction part starts with, ends with, or contains a doubled grouping
    character (underscore, space, or non-breaking space). Text that survives
    those checks must, once all grouping characters are removed, be a plain
    number according to ``NUMBER_PATTERN``.
    """
    if not numeric_text:
        return False

    # A single leading minus is not part of either the integer or fraction part.
    body = numeric_text.removeprefix("-")

    if body.count(decimal) > 1:
        return False

    whole, _, fraction = body.partition(decimal)
    if not (whole or fraction):
        return False

    group_marks = ("_", " ", NON_BREAKING_SPACE)
    for segment in filter(None, (whole, fraction)):
        # Grouping characters may only sit between digits, never at an edge.
        if segment.startswith(group_marks) or segment.endswith(group_marks):
            return False
        # The same grouping character twice in a row is never valid.
        if any(mark * 2 in segment for mark in group_marks):
            return False

    digits_only = SEPARATOR_PATTERN.sub("", numeric_text)
    return NUMBER_PATTERN.fullmatch(digits_only) is not None


def _strip_group_separators(numeric_text: str) -> str:
    """
    Return the text with every character matched by ``SEPARATOR_PATTERN`` removed.
    """
    return re.sub(SEPARATOR_PATTERN, "", numeric_text)


def _invalid_number_error(value: Any) -> ValueError:
    """
    Build (but do not raise) the ValueError describing an unconvertible value.

    The message comes from ``INVALID_NUMBER_MESSAGE`` with ``value`` filled in.
    """
    message = INVALID_NUMBER_MESSAGE.format(value=value)
    return ValueError(message)



[docs]
def clean_numeric(
        value: Any, *, sep: str = ",", decimal: str = ".", relaxed: bool = False
) -> float | Any:
    """
    Cleans and converts a given input to a float by normalizing its numeric representation.
    """
    if value is None:
        return 0.0

    normalized_text = _normalize_numeric_text(value, sep=sep, decimal=decimal)

    if not _has_valid_grouping(normalized_text, decimal=decimal):
        if relaxed:
            return value
        raise _invalid_number_error(value)

    candidate_text = _strip_group_separators(normalized_text)

    try:
        return float(candidate_text)
    except ValueError as exc:
        if relaxed:
            return value
        raise _invalid_number_error(value) from exc




[docs]
def clean_currency_numeric(
        value: Any, *, sep: str = ",", decimal: str = ".", relaxed: bool = False
) -> float | Any:
    """
    Cleans and converts a currency-prefixed numeric string to a float.
    """
    if value is None:
        return 0.0

    return clean_numeric(
        strip_currency_prefix(value),
        sep=sep,
        decimal=decimal,
        relaxed=relaxed,
    )