111 lines
3.7 KiB
Python
111 lines
3.7 KiB
Python
"""
|
||
Arabic text normalization utilities.

Used both in the extraction pipeline (post-processing) and in the API
(query parameter normalization) to ensure consistent matching against
the deduplicated Neo4j graph.
|
||
"""
|
||
import re
|
||
import unicodedata
|
||
|
||
|
||
# ── Diacritical marks to strip ──────────────────────────────────────────────
|
||
# Character-class fragments, one per code-point range that counts as a
# strippable mark. Kept as separate entries so every range stays labelled.
_DIACRITIC_RANGES = (
    r"\u0610-\u061A",  # Small signs
    r"\u064B-\u065F",  # Tashkeel (harakat)
    r"\u0670",  # Superscript Alef
    r"\u06D6-\u06DC",  # Small Quranic marks
    r"\u06DF-\u06E4",  # Small signs continued
    r"\u06E7-\u06E8",  # Small signs continued
    r"\u06EA-\u06ED",  # Small signs continued
)

# Matches a single mark from any of the ranges above; substituting "" for
# every match removes the marks from a string.
ARABIC_DIACRITICS = re.compile("[" + "".join(_DIACRITIC_RANGES) + "]")
|
||
|
||
# ── Character normalization map ─────────────────────────────────────────────
|
||
# Letter variants folded onto one canonical letter so that spelling
# differences do not prevent a match. Values are the replacement text; an
# empty string means the character is dropped.
CHAR_MAP = {
    # Alef variants → bare Alef
    **dict.fromkeys(("أ", "إ", "آ", "ٱ"), "ا"),
    "ؤ": "و",  # Waw with Hamza → Waw
    "ئ": "ي",  # Ya with Hamza → Ya
    "ى": "ي",  # Alef Maksura → Ya
    "ة": "ه",  # Ta Marbuta → Ha
    "ٰ": "",  # Dagger Alef → remove
}
|
||
|
||
# ── Honorific patterns to strip from names ──────────────────────────────────
|
||
# Regex fragments, one per honorific phrase. ``\s*`` between the words lets a
# phrase match whether its words are separated by no, one, or several spaces.
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # The dual suffix "ما" has to be listed before "م": alternation takes the
    # first branch that matches, so with the order (ا|م|ما) only the "م" of
    # "عنهما" was consumed and a stray "ا" was left behind in the name.
    r"رضي\s*الله\s*عنه(ما|ا|م)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
]

# Any one honorific plus the whitespace directly around it, so the phrase and
# its surrounding gap are consumed together when the match is replaced.
HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)
|
||
|
||
|
||
def strip_diacritics(text: str) -> str:
    """Return ``text`` with every ``ARABIC_DIACRITICS`` match deleted."""
    without_marks = re.sub(ARABIC_DIACRITICS, "", text)
    return without_marks
|
||
|
||
|
||
def normalize_chars(text: str) -> str:
    """Apply every ``CHAR_MAP`` substitution to ``text`` and return the result.

    Entries are applied one after another in the mapping's own order; each
    step replaces all occurrences of its key with the mapped value.
    """
    normalized = text
    for variant in CHAR_MAP:
        normalized = normalized.replace(variant, CHAR_MAP[variant])
    return normalized
|
||
|
||
|
||
def strip_honorifics(text: str) -> str:
    """Replace each ``HONORIFIC_PATTERN`` match with one space, then trim both ends."""
    without_honorifics = re.sub(HONORIFIC_PATTERN, " ", text)
    return without_honorifics.strip()
|
||
|
||
|
||
# Invisible formatting characters that are deleted outright rather than
# turned into spaces: U+200B-U+200F (includes ZWNJ and ZWJ), U+202A-U+202E,
# and U+FEFF.
_INVISIBLE_CHARS = re.compile(r"[\u200B-\u200F\u202A-\u202E\uFEFF]")
_WHITESPACE_RUN = re.compile(r"\s+")


def collapse_whitespace(text: str) -> str:
    """Delete invisible format characters and squeeze runs of whitespace.

    The invisible characters are removed first, so whitespace on either side
    of one merges into a single run. Every run of whitespace then becomes one
    space, and leading/trailing whitespace is trimmed.
    """
    visible = _INVISIBLE_CHARS.sub("", text)
    squeezed = _WHITESPACE_RUN.sub(" ", visible)
    return squeezed.strip()
|
||
|
||
|
||
def normalize_arabic(text: str) -> str:
    """
    Run the general normalization pipeline for Arabic text matching.

    Order of operations: diacritics are stripped, characters are normalized,
    then whitespace is collapsed. Honorifics are left in place (use
    normalize_name when they must be removed). Falsy input yields "".
    """
    if text:
        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
    return ""
|
||
|
||
|
||
def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Pipeline: strip diacritics → strip honorifics → normalize characters →
    collapse whitespace.

    Args:
        text: Raw name. Any falsy value (empty string, None) is accepted.

    Returns:
        The normalized name, or "" when ``text`` is falsy.
    """
    if not text:
        return ""
    # Diacritics go first: the honorific patterns are written without
    # tashkeel, so a vocalized honorific would otherwise never match and
    # would survive in the name.
    text = strip_diacritics(text)
    # Honorifics must be removed before normalize_chars, because the patterns
    # contain letters (e.g. Alef Maksura) that normalize_chars rewrites.
    text = strip_honorifics(text)
    text = normalize_chars(text)
    return collapse_whitespace(text)
|
||
|
||
|
||
def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Lighter than name normalization: honorifics are kept. The steps are
    exactly those of normalize_arabic (strip diacritics → normalize chars →
    collapse whitespace), so this delegates to it rather than repeating the
    pipeline; the two can then never drift apart.

    Args:
        text: Raw query string. Any falsy value (empty string, None) is
            accepted.

    Returns:
        The normalized query, or "" when ``text`` is falsy.
    """
    return normalize_arabic(text)
|