feat: Add Arabic text normalization utilities for consistent matching

2026-03-02 22:00:19 +01:00 · 2026-03-02 22:00:19 +01:00 · f2bab9cadd
parent 649b6a40a3
commit f2bab9cadd
1 changed files with 110 additions and 0 deletions
--- a/app/utils/arabic.py
+++ b/app/utils/arabic.py
@ -0,0 +1,110 @@
+"""
+Arabic text normalization utilities.
+
+Used both in the extraction pipeline (post-processing) and in the API
+(query parameter normalization) to ensure consistent matching against
+the deduplicated Neo4j graph.
+"""
+import re
+import unicodedata
+
+
+# ── Diacritical marks to strip ──────────────────────────────────────────────
+# Character-class fragments, one per Unicode range, joined into a single
+# bracket expression below so each range keeps its own label.
+_DIACRITIC_RANGES = (
+    r"\u0610-\u061A",   # Small signs
+    r"\u064B-\u065F",   # Tashkeel (harakat)
+    r"\u0670",          # Superscript Alef
+    r"\u06D6-\u06DC",   # Small Quranic marks
+    r"\u06DF-\u06E4",   # Small signs continued
+    r"\u06E7-\u06E8",   # Small signs continued
+    r"\u06EA-\u06ED",   # Small signs continued
+)
+ARABIC_DIACRITICS = re.compile("[" + "".join(_DIACRITIC_RANGES) + "]")
+
+# ── Character normalization map ─────────────────────────────────────────────
+CHAR_MAP = {
+    # Alef variants → bare Alef
+    "أ": "ا",
+    "إ": "ا",
+    "آ": "ا",
+    "ٱ": "ا",
+    # Hamza carriers → the bare carrier letter
+    "ؤ": "و",  # Waw with Hamza → Waw
+    "ئ": "ي",  # Ya with Hamza → Ya
+    # Word-final letter variants
+    "ى": "ي",  # Alef Maksura → Ya
+    "ة": "ه",  # Ta Marbuta → Ha
+    # Dagger Alef → remove
+    "ٰ": "",
+}
+
+# ── Honorific patterns to strip from names ──────────────────────────────────
+# Each entry is a regex fragment written without diacritics; `\s*` tolerates
+# missing or repeated whitespace between the words of a phrase.
+HONORIFICS = [
+    r"صلى\s*الله\s*عليه\s*وسلم",
+    r"عليه\s*السلام",
+    # Longest suffix first: regex alternation is ordered, so trying "م" before
+    # "ما" would stop at "عنهم" inside "عنهما" and leave a stray "ا" behind.
+    r"رضي\s*الله\s*عنه(ما|ا|م)?",
+    r"رحمه\s*الله",
+    r"تعالى",
+    r"عز\s*وجل",
+    r"ﷺ",
+    r"﷽",
+]
+# Whitespace on both sides of a phrase is consumed together with it, so the
+# caller can substitute a single space without creating double spaces.
+HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)
+
+
+def strip_diacritics(text: str) -> str:
+    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
+    return re.sub(ARABIC_DIACRITICS, "", text)
+
+
+def normalize_chars(text: str) -> str:
+    """Replace each character listed in CHAR_MAP with its mapped form.
+
+    Covers the Alef variants, Hamza carriers, Alef Maksura, Ta Marbuta and
+    the Dagger Alef (which is dropped entirely).
+    """
+    # Every CHAR_MAP key is a single character and no replacement value is
+    # itself a key, so one translate pass equals the chained replacements.
+    return text.translate(str.maketrans(CHAR_MAP))
+
+
+def strip_honorifics(text: str) -> str:
+    """Delete honorific phrases matched by HONORIFIC_PATTERN from *text*.
+
+    Each match (with its surrounding whitespace) becomes a single space;
+    leading and trailing whitespace is trimmed from the result.
+    """
+    without_phrases = HONORIFIC_PATTERN.sub(" ", text)
+    return without_phrases.strip()
+
+
+# Invisible format characters that are deleted outright (not turned into a
+# space): they carry no visible content but make equal-looking strings differ.
+_INVISIBLE_FORMAT_CHARS = re.compile(
+    r"[\u200B-\u200F"   # ZWSP, ZWNJ, ZWJ, LRM, RLM
+    r"\u202A-\u202E"    # Bidi embeddings / overrides
+    r"\u2060"           # Word joiner
+    r"\u2066-\u2069"    # Bidi isolates (LRI, RLI, FSI, PDI)
+    r"\uFEFF"           # BOM / zero-width no-break space
+    r"]"
+)
+_WHITESPACE_RUN = re.compile(r"\s+")
+
+
+def collapse_whitespace(text: str) -> str:
+    """Remove invisible format characters and squeeze whitespace runs.
+
+    Zero-width characters (ZWSP, ZWNJ, ZWJ, word joiner, BOM) and
+    bidirectional control characters (marks, embeddings, overrides,
+    isolates) are deleted. Every run of whitespace is then replaced by a
+    single space, and leading/trailing whitespace is trimmed.
+    """
+    visible = _INVISIBLE_FORMAT_CHARS.sub("", text)
+    return _WHITESPACE_RUN.sub(" ", visible).strip()
+
+
+def normalize_arabic(text: str) -> str:
+    """
+    Run the general-purpose normalization pipeline used for matching.
+
+    Applies, in order: strip_diacritics, normalize_chars,
+    collapse_whitespace. Honorific phrases are left in place; use
+    normalize_name when they must be removed. Falsy input yields "".
+    """
+    if text:
+        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
+    return ""
+
+
+def normalize_name(text: str) -> str:
+    """
+    Normalize an Arabic name for matching against the graph.
+
+    Applies, in order: strip_diacritics, strip_honorifics, normalize_chars,
+    collapse_whitespace. Falsy input yields "".
+    """
+    if not text:
+        return ""
+    # Diacritics go first: the honorific patterns are written without
+    # tashkeel, so a vocalized honorific would otherwise never match.
+    text = strip_diacritics(text)
+    # Honorifics must be removed before character normalization, because
+    # patterns such as "صلى" and "تعالى" contain "ى", which normalize_chars
+    # rewrites to "ي".
+    text = strip_honorifics(text)
+    text = normalize_chars(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def normalize_query(text: str) -> str:
+    """
+    Normalize a search query parameter.
+
+    Runs the same steps as normalize_arabic (diacritics removed, characters
+    normalized, whitespace collapsed) and, unlike normalize_name, keeps
+    honorific phrases. Falsy input yields "".
+    """
+    # The query pipeline is step-for-step the general one, so delegate
+    # instead of repeating it.
+    return normalize_arabic(text)