feat: Add Arabic text normalization utilities for consistent matching
This commit is contained in:
parent
649b6a40a3
commit
f2bab9cadd
|
|
@ -0,0 +1,110 @@
|
||||||
|
"""
|
||||||
|
Arabic text normalization utilities.
|
||||||
|
|
||||||
|
Used both in the extraction pipeline (post-processing) and in the API
|
||||||
|
(query parameter normalization) to ensure consistent matching against
|
||||||
|
the deduplicated Neo4j graph.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
|
# ── Diacritical marks to strip ──────────────────────────────────────────────
|
||||||
|
# Single character class matching every combining mark that normalization
# deletes. The ranges are listed one per line and joined into one regex set.
ARABIC_DIACRITICS = re.compile(
    "["
    + "".join(
        (
            r"\u0610-\u061A",  # Small signs
            r"\u064B-\u065F",  # Tashkeel (harakat)
            r"\u0670",         # Superscript Alef
            r"\u06D6-\u06DC",  # Small Quranic marks
            r"\u06DF-\u06E4",  # Small signs continued
            r"\u06E7-\u06E8",  # Small signs continued
            r"\u06EA-\u06ED",  # Small signs continued
        )
    )
    + "]"
)
|
||||||
|
|
||||||
|
# ── Character normalization map ─────────────────────────────────────────────
|
||||||
|
# Maps each orthographic variant to the canonical form used for matching.
# An empty replacement deletes the character outright.
CHAR_MAP = {
    "أ": "ا", "إ": "ا", "آ": "ا", "ٱ": "ا",  # Alef variants → bare Alef
    "ؤ": "و",        # Waw with Hamza → Waw
    "ئ": "ي",        # Ya with Hamza → Ya
    "ى": "ي",        # Alef Maksura → Ya
    "ة": "ه",        # Ta Marbuta → Ha
    "\u0670": "",    # Dagger Alef → remove
    # Tatweel (kashida) only stretches a word visually; leaving it in makes
    # an elongated spelling compare unequal to the plain one.
    "\u0640": "",    # Tatweel → remove
}
|
||||||
|
|
||||||
|
# ── Honorific patterns to strip from names ──────────────────────────────────
|
||||||
|
# Regex fragments for honorific phrases, written without diacritics.
# `\s*` between words tolerates missing or repeated spaces.
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Suffix alternatives are ordered longest-first: regex alternation takes
    # the first branch that matches, so a shorter branch listed before the
    # dual form would consume only part of it and leave a stray letter.
    r"رضي\s*الله\s*عنه(?:ما|ا|م)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
]
# One alternation over all fragments; surrounding whitespace is consumed so
# the caller can substitute a single space for the whole match.
HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
    without_marks = re.sub(ARABIC_DIACRITICS, "", text)
    return without_marks
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_chars(text: str) -> str:
    """Apply every CHAR_MAP substitution to *text* and return the result.

    Substitutions run one after another in the map's iteration order.
    """
    normalized = text
    for variant, canonical in CHAR_MAP.items():
        # Skipping absent variants changes nothing: replacing a substring
        # that does not occur returns an equal string.
        if variant in normalized:
            normalized = normalized.replace(variant, canonical)
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def strip_honorifics(text: str) -> str:
    """Replace each HONORIFIC_PATTERN match with one space and trim the ends."""
    without_honorifics = re.sub(HONORIFIC_PATTERN, " ", text)
    return without_honorifics.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def collapse_whitespace(text: str) -> str:
    """Delete invisible format characters, then collapse whitespace runs.

    Zero-width and bidirectional control characters are removed outright
    (they are not turned into spaces). Every remaining run of whitespace
    becomes a single space, and leading/trailing whitespace is trimmed.

    Characters deleted:
        U+061C           Arabic Letter Mark
        U+200B-U+200F    ZWSP, ZWNJ, ZWJ, LRM, RLM
        U+202A-U+202E    bidi embeddings / overrides
        U+2060           Word Joiner
        U+2066-U+2069    bidi isolates
        U+FEFF           BOM / zero-width no-break space
    """
    # These code points are invisible, so two strings that look identical
    # can still compare unequal unless they are stripped before matching.
    text = re.sub(
        r"[\u061C\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]", "", text
    )
    return re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_arabic(text: str) -> str:
    """
    Run the general-purpose normalization pipeline for Arabic text matching.

    Order: strip_diacritics, then normalize_chars, then collapse_whitespace.
    Honorifics are left in place (normalize_name removes them).
    Any falsy input yields the empty string.
    """
    if text:
        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Order: strip diacritics, strip honorifics, normalize characters,
    collapse whitespace. Any falsy input yields the empty string.
    """
    if not text:
        return ""
    # Diacritics go first: the honorific patterns are written without
    # tashkeel, so a fully vocalized honorific would otherwise never match
    # and would stay in the normalized name.
    text = strip_diacritics(text)
    # Honorifics must still be removed before character normalization,
    # because the patterns contain letters (e.g. Alef Maksura) that
    # normalize_chars rewrites.
    text = strip_honorifics(text)
    text = normalize_chars(text)
    return collapse_whitespace(text)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Applies the same steps as normalize_arabic (strip diacritics, normalize
    characters, collapse whitespace). Honorifics are not removed, so this is
    lighter than normalize_name. Any falsy input yields the empty string.
    """
    # The pipeline is step-for-step the one normalize_arabic runs, so
    # delegate instead of repeating it.
    return normalize_arabic(text)
|
||||||
Loading…
Reference in New Issue