feat: Add Arabic text normalization utilities for consistent matching
This commit is contained in:
parent
649b6a40a3
commit
f2bab9cadd
|
|
@ -0,0 +1,110 @@
|
|||
"""
|
||||
Arabic text normalization utilities.
|
||||
|
||||
Used both in the extraction pipeline (post-processing) and in the API
|
||||
(query parameter normalization) to ensure consistent matching against
|
||||
the deduplicated Neo4j graph.
|
||||
"""
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
|
||||
# ── Diacritical marks to strip ──────────────────────────────────────────────
|
||||
# Single character class matching every combining mark that is deleted before
# text is compared. Each range sits on its own line so it can be checked
# against the Unicode Arabic block chart; the pieces are joined into one
# bracket expression.
ARABIC_DIACRITICS = re.compile(
    "["
    + "".join(
        (
            r"\u0610-\u061A",  # small signs
            r"\u064B-\u065F",  # tashkeel (harakat)
            r"\u0670",         # superscript Alef
            r"\u06D6-\u06DC",  # small Quranic marks
            r"\u06DF-\u06E4",  # small signs, continued
            r"\u06E7-\u06E8",  # small signs, continued
            r"\u06EA-\u06ED",  # small signs, continued
        )
    )
    + "]"
)
|
||||
|
||||
# ── Character normalization map ─────────────────────────────────────────────
|
||||
# Letter-folding table: each key is a variant character and each value is the
# text that replaces it (an empty string deletes the character).
CHAR_MAP = {
    # Every Alef variant listed here folds to bare Alef.
    **dict.fromkeys(("أ", "إ", "آ", "ٱ"), "ا"),
    # Hamza carried on Waw → plain Waw.
    "ؤ": "و",
    # Hamza carried on Ya → plain Ya.
    "ئ": "ي",
    # Alef Maksura → Ya.
    "ى": "ي",
    # Ta Marbuta → Ha.
    "ة": "ه",
    # Dagger (superscript) Alef, U+0670, is dropped entirely; written as an
    # escape because the bare combining mark is hard to see in an editor.
    "\u0670": "",
}
|
||||
|
||||
# ── Honorific patterns to strip from names ──────────────────────────────────
|
||||
# Regex fragments for honorific phrases that follow names. The fragments are
# written without tashkeel, and whitespace between words is optional (\s*).
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # The optional pronoun suffix lists its longest form first. Alternation is
    # tried left to right, so with "م" ahead of "ما" the phrase "...عنهما" was
    # matched only up to "عنهم" and a stray "ا" was left behind in the name.
    r"رضي\s*الله\s*عنه(ما|ا|م)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    r"ﷺ",
    r"﷽",
]
# One alternation over all fragments. Surrounding whitespace is consumed with
# the phrase so callers can substitute a single space for the whole match.
HONORIFIC_PATTERN = re.compile(r"\s*(" + "|".join(HONORIFICS) + r")\s*", re.UNICODE)
|
||||
|
||||
|
||||
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
    stripped = re.sub(ARABIC_DIACRITICS, "", text)
    return stripped
|
||||
|
||||
|
||||
def normalize_chars(text: str) -> str:
    """Fold letter variants in *text* according to CHAR_MAP.

    Each character that appears as a key in CHAR_MAP is replaced by the
    mapped value (or removed when the value is empty); all other characters
    pass through unchanged.
    """
    # A single translate pass gives the same result as replacing key by key
    # because every CHAR_MAP key is one code point and no replacement value is
    # itself a key.
    folding_table = str.maketrans(CHAR_MAP)
    return text.translate(folding_table)
|
||||
|
||||
|
||||
def strip_honorifics(text: str) -> str:
    """Replace each HONORIFIC_PATTERN match in *text* with a space, then trim.

    Only the ends of the result are trimmed here; interior runs of spaces
    left by the substitution are not collapsed.
    """
    without_honorifics = re.sub(HONORIFIC_PATTERN, " ", text)
    return without_honorifics.strip()
|
||||
|
||||
|
||||
def collapse_whitespace(text: str) -> str:
    """Delete invisible format characters and squeeze whitespace.

    Characters in U+200B-U+200F, U+202A-U+202E and U+FEFF are removed
    outright (they are not turned into spaces). Every remaining run of
    whitespace then becomes a single space, and leading/trailing whitespace
    is dropped.
    """
    visible = re.sub(r"[\u200B-\u200F\u202A-\u202E\uFEFF]", "", text)
    # split() with no argument cuts on whitespace runs and discards empty
    # edge pieces, so re-joining with one space collapses and trims at once.
    return " ".join(visible.split())
|
||||
|
||||
|
||||
def normalize_arabic(text: str) -> str:
    """
    Normalize Arabic text for matching.

    Applies, in order: strip_diacritics, normalize_chars,
    collapse_whitespace. Honorifics are left in place; normalize_name is the
    variant that removes them. A falsy *text* yields the empty string.
    """
    if not text:
        return ""
    return collapse_whitespace(normalize_chars(strip_diacritics(text)))
|
||||
|
||||
|
||||
def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Applies, in order: strip_diacritics, strip_honorifics, normalize_chars,
    collapse_whitespace. A falsy *text* yields the empty string.
    """
    if not text:
        return ""
    # Diacritics are removed first: the honorific patterns are spelled without
    # tashkeel, so a vocalized honorific would otherwise never match and would
    # stay in the name.
    text = strip_diacritics(text)
    # Honorifics must still be removed before normalize_chars, because that
    # step rewrites letters the patterns spell literally (e.g. Alef Maksura
    # becomes Ya).
    text = strip_honorifics(text)
    text = normalize_chars(text)
    return collapse_whitespace(text)
|
||||
|
||||
|
||||
def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Runs the same steps as normalize_arabic (diacritics removed, characters
    folded, whitespace collapsed) and, unlike normalize_name, leaves
    honorifics in place. A falsy *text* yields the empty string.
    """
    # The step sequence is identical to normalize_arabic, which also maps
    # falsy input to "", so delegate rather than repeat it.
    return normalize_arabic(text)
|
||||
Loading…
Reference in New Issue