# hadith-api/app/utils/arabic.py

"""
Arabic text normalization utilities.

Used both in the extraction pipeline (post-processing) and in the API
(query parameter normalization) to ensure consistent matching against
the deduplicated Neo4j graph.
"""
import re
import unicodedata


# ── Diacritical marks to strip ──────────────────────────────────────────────
# Each entry is one code-point range (or single code point) that goes inside
# the regex character class below. Kept as raw strings so the regex engine,
# not the Python parser, interprets the \uXXXX escapes.
_DIACRITIC_RANGES = (
    r"\u0610-\u061A",   # Small signs
    r"\u064B-\u065F",   # Tashkeel (harakat)
    r"\u0670",          # Superscript Alef
    r"\u06D6-\u06DC",   # Small Quranic marks
    r"\u06DF-\u06E4",   # Small signs continued
    r"\u06E7-\u06E8",   # Small signs continued
    r"\u06EA-\u06ED",   # Small signs continued
)

# Compiled character class matching any single mark listed above.
ARABIC_DIACRITICS = re.compile("[" + "".join(_DIACRITIC_RANGES) + "]")

# ── Character normalization map ─────────────────────────────────────────────
# Maps each source character to its normalized replacement; an empty string
# means the character is deleted. No replacement value is itself a key, so
# the result does not depend on the order in which entries are applied.
CHAR_MAP = {
    "أ": "ا", "إ": "ا", "آ": "ا", "ٱ": "ا",  # Alef variants → bare Alef
    "ؤ": "و",                                      # Waw with Hamza → Waw
    "ئ": "ي",                                      # Ya with Hamza → Ya
    "ى": "ي",                                      # Alef Maksura → Ya
    "ة": "ه",                                      # Ta Marbuta → Ha
    "\u0670": "",                                  # Dagger Alef → remove
    # Tatweel is a purely typographic stretching character; leaving it in
    # makes elongated spellings of a word fail to match the plain spelling.
    "\u0640": "",                                  # Tatweel (kashida) → remove
}

# ── Honorific patterns to strip from names ──────────────────────────────────
# Regex fragments, one per honorific phrase. They are written without
# diacritics and allow optional whitespace between the words of a phrase.
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Pronoun suffixes are listed longest-first: regex alternation is ordered,
    # so trying the one-letter "م" before the two-letter "ما" would match the
    # dual form only up to "م" and leave a stray Alef behind in the name.
    # The final alternative covers the feminine plural form.
    r"رضي\s*الله\s*عنه(ما|ا|م|ن)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
]
# One combined pattern; surrounding whitespace is part of the match so a
# caller can substitute a single space for the whole honorific.
HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)


def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
    without_marks = re.sub(ARABIC_DIACRITICS, "", text)
    return without_marks


def normalize_chars(text: str) -> str:
    """
    Apply every CHAR_MAP substitution to *text* and return the result.

    Entries are applied one after another in the map's iteration order;
    an entry whose replacement is the empty string deletes the character.
    """
    result = text
    for variant in CHAR_MAP:
        result = result.replace(variant, CHAR_MAP[variant])
    return result


def strip_honorifics(text: str) -> str:
    """
    Replace each HONORIFIC_PATTERN match in *text* with a single space.

    The pattern also consumes whitespace around the honorific, so the
    neighbouring words end up one space apart. Leading and trailing
    whitespace is trimmed from the result.
    """
    spaced = re.sub(HONORIFIC_PATTERN, " ", text)
    return spaced.strip()


# Invisible format characters that carry no content for text matching.
# They are deleted outright (not turned into spaces) so that a control
# character sitting inside a word does not split it in two.
_INVISIBLE_CHARS = re.compile(
    r"[\u061C"           # Arabic Letter Mark
    r"\u200B-\u200F"     # ZWSP, ZWNJ, ZWJ, LRM, RLM
    r"\u202A-\u202E"     # Bidi embeddings and overrides
    r"\u2066-\u2069"     # Bidi isolates (LRI, RLI, FSI, PDI)
    r"\uFEFF"            # BOM / zero-width no-break space
    r"]"
)
_WHITESPACE_RUN = re.compile(r"\s+")


def collapse_whitespace(text: str) -> str:
    """
    Delete invisible format characters, then collapse whitespace.

    Zero-width and bidirectional control characters are removed first.
    Every remaining run of whitespace becomes a single space, and leading
    and trailing whitespace is trimmed.
    """
    visible = _INVISIBLE_CHARS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", visible).strip()


def normalize_arabic(text: str) -> str:
    """
    Run the general normalization pipeline used for Arabic text matching.

    Order of operations: diacritics removed → characters normalized →
    whitespace collapsed. Honorifics are left untouched; normalize_name
    is the variant that strips them. Falsy input yields "".
    """
    if text:
        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
    return ""


def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Steps: strip diacritics → strip honorifics → normalize chars →
    collapse whitespace. Falsy input yields "".

    The order matters in both directions:
    - Diacritics go first because the honorific patterns are written
      without vocalization, so a vocalized honorific would not match them.
    - Honorifics go before character normalization because some patterns
      contain Alef Maksura, which normalize_chars rewrites to Ya.
    """
    if not text:
        return ""
    text = strip_diacritics(text)
    text = strip_honorifics(text)
    text = normalize_chars(text)
    text = collapse_whitespace(text)
    return text


def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Lighter than name normalization: honorifics are not stripped. The
    steps are exactly those of normalize_arabic (diacritics removed,
    characters normalized, whitespace collapsed; falsy input yields ""),
    so this delegates to it rather than repeating the pipeline.
    """
    return normalize_arabic(text)