feat: Add Arabic text normalization utilities for consistent matching

2026-03-02 22:00:19 +01:00 · 2026-03-02 22:00:19 +01:00 · f2bab9cadd
parent 649b6a40a3
commit f2bab9cadd
1 changed files with 110 additions and 0 deletions
--- a/app/utils/arabic.py
+++ b/app/utils/arabic.py
@ -0,0 +1,110 @@
 """
 Arabic text normalization utilities.
 Used both in the extraction pipeline (post-processing) and in the API
 (query parameter normalization) to ensure consistent matching against
 the deduplicated Neo4j graph.
 """
 import re
 import unicodedata
 # ── Diacritical marks to strip ──────────────────────────────────────────────
 # Each entry is one code point or code-point range; the entries are joined into
 # a single regex character class below.
 _DIACRITIC_RANGES = (
    r"\u0610-\u061A",  # Small signs
    r"\u064B-\u065F",  # Tashkeel (harakat)
    r"\u0670",         # Superscript Alef
    r"\u06D6-\u06DC",  # Small Quranic marks
    r"\u06DF-\u06E4",  # Small signs continued
    r"\u06E7-\u06E8",  # Small signs continued
    r"\u06EA-\u06ED",  # Small signs continued
 )
 # Matches exactly one character from any of the ranges listed above.
 ARABIC_DIACRITICS = re.compile("[" + "".join(_DIACRITIC_RANGES) + "]")
 # ── Character normalization map ─────────────────────────────────────────────
 # Keys are the variant characters to replace; values are their replacements
 # (an empty string means the character is dropped).
 CHAR_MAP = {
    # Alef variants → bare Alef
    **dict.fromkeys(("أ", "إ", "آ", "ٱ"), "ا"),
    # Waw with Hamza → Waw
    "ؤ": "و",
    # Ya with Hamza → Ya
    "ئ": "ي",
    # Alef Maksura → Ya
    "ى": "ي",
    # Ta Marbuta → Ha
    "ة": "ه",
    # Dagger Alef → remove
    "ٰ": "",
 }
 # ── Honorific patterns to strip from names ──────────────────────────────────
 # Regex fragments, one per honorific phrase or ligature. They are written
 # without diacritics, so they only match text whose tashkeel has already been
 # removed.
 HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Regex alternation is leftmost-first, so the two-letter suffix must be
    # listed before the one-letter ones; otherwise "عنهما" is consumed only up
    # to "عنهم" and a stray "ا" is left behind in the name.
    r"رضي\s*الله\s*عنه(ما|ا|م)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
 ]
 # One alternation over all honorifics, swallowing the whitespace on either side
 # so callers can substitute a single space for the whole match.
 # NOTE(review): there are no word-boundary guards, so a fragment can also match
 # inside a longer word — confirm this is acceptable for the name data.
 HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)
 def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
    without_marks = re.sub(ARABIC_DIACRITICS, "", text)
    return without_marks
 def normalize_chars(text: str) -> str:
    """Replace every CHAR_MAP key found in *text* with its mapped value.

    The substitutions are applied one key at a time, in the dictionary's
    insertion order.
    """
    normalized = text
    for variant in CHAR_MAP:
        replacement = CHAR_MAP[variant]
        if variant in normalized:
            normalized = normalized.replace(variant, replacement)
    return normalized
 def strip_honorifics(text: str) -> str:
    """Replace each HONORIFIC_PATTERN match with one space, then trim the ends."""
    without_honorifics = re.sub(HONORIFIC_PATTERN, " ", text)
    return without_honorifics.strip()
 def collapse_whitespace(text: str) -> str:
    """Delete invisible formatting characters and collapse whitespace runs.

    Zero-width and bidirectional control characters are removed outright (they
    are NOT turned into spaces), so two strings that render identically also
    compare equal. Afterwards every run of whitespace becomes a single space
    and leading/trailing whitespace is trimmed.

    Removed characters:
      U+061C         Arabic Letter Mark
      U+200B-U+200F  ZWSP, ZWNJ, ZWJ, LRM, RLM
      U+202A-U+202E  bidi embeddings / overrides
      U+2060         Word Joiner
      U+2066-U+2069  bidi isolates
      U+FEFF         BOM / zero-width no-break space
    """
    text = re.sub(
        r"[\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]", "", text
    )
    return re.sub(r"\s+", " ", text).strip()
 def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text so it can be compared for matching.

    Applies, in order: diacritic removal, character normalization and
    whitespace collapsing. Honorific phrases are left untouched; use
    normalize_name when those should be removed as well.
    Falsy input (empty string, None) yields an empty string.
    """
    return (
        collapse_whitespace(normalize_chars(strip_diacritics(text)))
        if text
        else ""
    )
 def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Pipeline: strip diacritics → strip honorifics → normalize characters →
    collapse whitespace. Falsy input (empty string, None) yields "".

    The order matters in both directions:
      * Diacritics are removed first because the honorific patterns are written
        without tashkeel; a vocalised honorific would otherwise never match
        and would stay in the name.
      * Honorifics are removed before character normalization because that
        step rewrites letters the patterns rely on (e.g. Alef Maksura → Ya in
        "صلى" and "تعالى").
    """
    if not text:
        return ""
    text = strip_diacritics(text)
    text = strip_honorifics(text)
    text = normalize_chars(text)
    text = collapse_whitespace(text)
    return text
 def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter for matching.

    Runs diacritic removal, character normalization and whitespace
    collapsing, in that order. Unlike normalize_name, honorific phrases are
    kept. Falsy input (empty string, None) yields an empty string.
    """
    if not text:
        return ""
    normalized = text
    for step in (strip_diacritics, normalize_chars, collapse_whitespace):
        normalized = step(normalized)
    return normalized