""" Arabic text normalization utilities. Used both in the extraction pipeline (post-processing) and in the API (query parameter normalization) to ensure consistent matching against the deduplicated Neo4j graph. """ import re import unicodedata # ── Diacritical marks to strip ────────────────────────────────────────────── ARABIC_DIACRITICS = re.compile( r"[\u0610-\u061A" # Small signs r"\u064B-\u065F" # Tashkeel (harakat) r"\u0670" # Superscript Alef r"\u06D6-\u06DC" # Small Quranic marks r"\u06DF-\u06E4" # Small signs continued r"\u06E7-\u06E8" # Small signs continued r"\u06EA-\u06ED" # Small signs continued r"]" ) # ── Character normalization map ───────────────────────────────────────────── CHAR_MAP = { "أ": "ا", "إ": "ا", "آ": "ا", "ٱ": "ا", # Alef variants → bare Alef "ؤ": "و", # Waw with Hamza → Waw "ئ": "ي", # Ya with Hamza → Ya "ى": "ي", # Alef Maksura → Ya "ة": "ه", # Ta Marbuta → Ha "ٰ": "", # Dagger Alef → remove } # ── Honorific patterns to strip from names ────────────────────────────────── HONORIFICS = [ r"صلى\s*الله\s*عليه\s*وسلم", r"عليه\s*السلام", r"رضي\s*الله\s*عنه(ا|م|ما)?", r"رحمه\s*الله", r"تعالى", r"عز\s*وجل", r"ﷺ", r"﷽", ] HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE) def strip_diacritics(text: str) -> str: """Remove all Arabic diacritical marks (tashkeel/harakat).""" return ARABIC_DIACRITICS.sub("", text) def normalize_chars(text: str) -> str: """Normalize Alef variants, Ta Marbuta, Alef Maksura, etc.""" for src, dst in CHAR_MAP.items(): text = text.replace(src, dst) return text def strip_honorifics(text: str) -> str: """Remove common Arabic honorific phrases from names.""" return HONORIFIC_PATTERN.sub(" ", text).strip() def collapse_whitespace(text: str) -> str: """Collapse multiple spaces / ZWNJ / ZWJ into single space.""" text = re.sub(r"[\u200B-\u200F\u202A-\u202E\uFEFF]", "", text) return re.sub(r"\s+", " ", text).strip() def normalize_arabic(text: str) -> str: """ Full normalization pipeline for Arabic text matching. Steps: strip diacritics → normalize chars → collapse whitespace. Does NOT strip honorifics (use normalize_name for that). """ if not text: return "" text = strip_diacritics(text) text = normalize_chars(text) text = collapse_whitespace(text) return text def normalize_name(text: str) -> str: """ Normalize an Arabic name for matching against the graph. Strips diacritics, honorifics, normalizes characters. """ if not text: return "" text = strip_honorifics(text) text = strip_diacritics(text) text = normalize_chars(text) text = collapse_whitespace(text) return text def normalize_query(text: str) -> str: """ Normalize a search query parameter. Lighter than name normalization — preserves more structure but still ensures matching against normalized graph data. """ if not text: return "" text = strip_diacritics(text) text = normalize_chars(text) text = collapse_whitespace(text) return text