feat: Add Arabic text normalization utilities for consistent matching

This commit is contained in:
salah 2026-03-02 22:00:19 +01:00
parent 649b6a40a3
commit f2bab9cadd
1 changed files with 110 additions and 0 deletions

110
app/utils/arabic.py Normal file
View File

@ -0,0 +1,110 @@
"""
Arabic text normalization utilities.
Used both in the extraction pipeline (post-processing) and in the API
(query parameter normalization) to ensure consistent matching against
the deduplicated Neo4j graph.
"""
import re
import unicodedata
# ── Diacritical marks to strip ──────────────────────────────────────────────
# One character class built from the ranges below; every code point it
# matches is deleted by strip_diacritics().
ARABIC_DIACRITICS = re.compile(
    "[{}]".format(
        "".join(
            (
                r"\u0610-\u061A",  # Small signs
                r"\u064B-\u065F",  # Tashkeel (harakat)
                r"\u0670",  # Superscript Alef
                r"\u06D6-\u06DC",  # Small Quranic marks
                r"\u06DF-\u06E4",  # Small signs continued
                r"\u06E7-\u06E8",  # Small signs continued
                r"\u06EA-\u06ED",  # Small signs continued
            )
        )
    )
)
# ── Character normalization map ─────────────────────────────────────────────
# Single-character substitutions; an empty replacement deletes the character.
CHAR_MAP = {
    # Alef variants → bare Alef
    **dict.fromkeys(("أ", "إ", "آ", "ٱ"), "ا"),
    "ؤ": "و",  # Waw with Hamza → Waw
    "ئ": "ي",  # Ya with Hamza → Ya
    "ى": "ي",  # Alef Maksura → Ya
    "ة": "ه",  # Ta Marbuta → Ha
    "ٰ": "",  # Dagger Alef → remove
}
# ── Honorific patterns to strip from names ──────────────────────────────────
# Regex fragments, one per honorific phrase; \s* between words tolerates
# missing or repeated spaces. Entries must be non-empty: an empty alternative
# lets the combined pattern match at every position, so sub(" ", ...) would
# insert a space between every character of the input.
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Longest suffix first: alternation is tried left to right, so testing
    # "م" before "ما" would consume only "عنهم" of "عنهما" and leave a stray "ا".
    r"رضي\s*الله\s*عنه(ما|م|ا)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    # NOTE(review): these two entries were empty strings; presumably they held
    # the single-glyph honorific ligatures below — confirm against the author's
    # original file.
    r"\uFDFA",  # ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
    r"\uFDFB",  # ARABIC LIGATURE JALLAJALALOUHOU
]
# Surrounding whitespace is consumed with the honorific so that a single
# replacement space is enough. Empty fragments are skipped defensively.
HONORIFIC_PATTERN = re.compile(
    r"\s*(" + "|".join(fragment for fragment in HONORIFICS if fragment) + r")\s*",
    re.UNICODE,
)
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS removed."""
    return re.sub(ARABIC_DIACRITICS, "", text)
def normalize_chars(text: str) -> str:
    """Apply the CHAR_MAP substitutions (Alef variants, Ta Marbuta, Alef Maksura, etc.)."""
    # Every CHAR_MAP key is a single character and no replacement is itself a
    # key, so one translate() pass equals replacing key by key.
    return text.translate(str.maketrans(CHAR_MAP))
def strip_honorifics(text: str) -> str:
    """Replace each HONORIFIC_PATTERN match with one space, then trim both ends."""
    without_honorifics = re.sub(HONORIFIC_PATTERN, " ", text)
    return without_honorifics.strip()
# Invisible format characters that are deleted outright (not turned into a
# space): Arabic letter mark (U+061C), zero-width space / ZWNJ / ZWJ / LRM /
# RLM (U+200B–U+200F), bidi embeddings and overrides (U+202A–U+202E), word
# joiner (U+2060), bidi isolates (U+2066–U+2069) and the BOM (U+FEFF).
_INVISIBLE_CHARS = re.compile(
    r"[\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]"
)
_WHITESPACE_RUN = re.compile(r"\s+")


def collapse_whitespace(text: str) -> str:
    """
    Delete invisible format characters and collapse whitespace.

    Zero-width and bidi control characters are removed without leaving a
    space; every run of whitespace (spaces, tabs, newlines) then becomes a
    single space, and leading/trailing whitespace is trimmed.
    """
    visible = _INVISIBLE_CHARS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", visible).strip()
def normalize_arabic(text: str) -> str:
    """
    Full normalization pipeline for Arabic text matching.

    Steps: strip diacritics → normalize chars → collapse whitespace.
    Does NOT strip honorifics (use normalize_name for that).
    Falsy input (empty string, None) yields "".
    """
    if text:
        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
    return ""
def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Steps: strip diacritics → strip honorifics → normalize chars →
    collapse whitespace. Falsy input (empty string, None) yields "".
    """
    if not text:
        return ""
    # Diacritics go first: the honorific patterns are written without
    # tashkeel, so a vocalized honorific only matches once its marks are gone.
    text = strip_diacritics(text)
    # Honorifics go before normalize_chars: the patterns use the original
    # spellings (e.g. Alef Maksura), which normalize_chars would rewrite.
    text = strip_honorifics(text)
    text = normalize_chars(text)
    return collapse_whitespace(text)
def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Lighter than name normalization — honorifics are left in place — but
    still ensures matching against normalized graph data: it runs the same
    diacritic, character and whitespace steps as normalize_arabic.
    Falsy input (empty string, None) yields "".
    """
    return normalize_arabic(text)