"""
Arabic text normalization utilities.

Used both in the extraction pipeline (post-processing) and in the API
(query parameter normalization) to ensure consistent matching against
the deduplicated Neo4j graph.
"""
import re
import unicodedata


# ── Diacritical marks to strip ──────────────────────────────────────────────
ARABIC_DIACRITICS = re.compile(
    r"[\u0610-\u061A"   # Small signs
    r"\u064B-\u065F"    # Tashkeel (harakat)
    r"\u0670"           # Superscript Alef
    r"\u06D6-\u06DC"    # Small Quranic marks
    r"\u06DF-\u06E4"    # Small signs continued
    r"\u06E7-\u06E8"    # Small signs continued
    r"\u06EA-\u06ED"    # Small signs continued
    r"]"
)

# ── Character normalization map ─────────────────────────────────────────────
CHAR_MAP = {
    "أ": "ا", "إ": "ا", "آ": "ا", "ٱ": "ا",  # Alef variants → bare Alef
    "ؤ": "و",                                # Waw with Hamza → Waw
    "ئ": "ي",                                # Ya with Hamza → Ya
    "ى": "ي",                                # Alef Maksura → Ya
    "ة": "ه",                                # Ta Marbuta → Ha
    "ٰ": "",                                  # Dagger Alef → remove
}

# Translation table built once from CHAR_MAP so normalize_chars() is a single
# pass over the text. No replacement target is itself a key of CHAR_MAP, so
# this gives the same result as applying the replacements one after another.
_CHAR_TABLE = str.maketrans(CHAR_MAP)

# ── Honorific patterns to strip from names ──────────────────────────────────
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Regex alternation is ordered: the two-letter suffix "ما" must be tried
    # before "م", otherwise "عنهما" matches only "عنهم" and a stray "ا" is
    # left behind in the name. "ن" covers the feminine plural "عنهن".
    r"رضي\s*الله\s*عنه(ما|ا|م|ن)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
]
HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)

# ── Whitespace / invisible characters ───────────────────────────────────────
# Zero-width characters and directional marks (U+200B–U+200F), bidi
# embedding/override controls (U+202A–U+202E) and the BOM (U+FEFF).
_INVISIBLE_CHARS = re.compile(r"[\u200B-\u200F\u202A-\u202E\uFEFF]")
_WHITESPACE_RUN = re.compile(r"\s+")


def strip_diacritics(text: str) -> str:
    """Remove all Arabic diacritical marks (tashkeel/harakat)."""
    return ARABIC_DIACRITICS.sub("", text)


def normalize_chars(text: str) -> str:
    """Normalize Alef variants, Ta Marbuta, Alef Maksura, etc. per CHAR_MAP."""
    return text.translate(_CHAR_TABLE)


def strip_honorifics(text: str) -> str:
    """
    Remove common Arabic honorific phrases from names.

    Each match (together with its surrounding whitespace) is replaced by a
    single space, and the result is stripped. The patterns are written
    without diacritics, so vocalised input should be passed through
    strip_diacritics() first.
    """
    return HONORIFIC_PATTERN.sub(" ", text).strip()


def collapse_whitespace(text: str) -> str:
    """
    Delete invisible formatting characters and collapse whitespace runs.

    Zero-width characters (ZWSP / ZWNJ / ZWJ), directional marks, bidi
    embedding controls and the BOM are removed outright (not turned into
    spaces); every run of whitespace then becomes a single space and the
    result is stripped.
    """
    text = _INVISIBLE_CHARS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", text).strip()


def normalize_arabic(text: str) -> str:
    """
    Full normalization pipeline for Arabic text matching.
    Steps: strip diacritics → normalize chars → collapse whitespace.
    Does NOT strip honorifics (use normalize_name for that).
    Falsy input (empty string, None) yields "".
    """
    if not text:
        return ""
    text = strip_diacritics(text)
    text = normalize_chars(text)
    text = collapse_whitespace(text)
    return text


def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.
    Steps: strip diacritics → strip honorifics → normalize chars →
    collapse whitespace. Falsy input (empty string, None) yields "".
    """
    if not text:
        return ""
    # Diacritics go first: the honorific patterns are unvocalised, so a
    # vocalised honorific would otherwise never match and stay in the name.
    text = strip_diacritics(text)
    # Honorifics go before normalize_chars, which rewrites ى → ي and would
    # break the "صلى" / "تعالى" patterns.
    text = strip_honorifics(text)
    text = normalize_chars(text)
    text = collapse_whitespace(text)
    return text


def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.
    Lighter than name normalization (honorifics are kept): applies exactly
    the normalize_arabic pipeline so queries match normalized graph data.
    """
    return normalize_arabic(text)