hadith-api/app/utils/arabic.py

111 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Arabic text normalization utilities.
Used both in the extraction pipeline (post-processing) and in the API
(query parameter normalization) to ensure consistent matching against
the deduplicated Neo4j graph.
"""
import re
import unicodedata
# ── Diacritical marks to strip ──────────────────────────────────────────────
# Compiled character class matching every combining mark that is removed
# before comparison. The class is assembled from its individual code-point
# ranges so each range can carry its own label.
ARABIC_DIACRITICS = re.compile(
    "".join(
        (
            "[",
            r"\u0610-\u061A",  # small signs placed above/below letters
            r"\u064B-\u065F",  # tashkeel (harakat)
            r"\u0670",  # superscript (dagger) alef
            r"\u06D6-\u06DC",  # small Quranic marks
            r"\u06DF-\u06E4",  # small signs, continued
            r"\u06E7-\u06E8",  # small signs, continued
            r"\u06EA-\u06ED",  # small signs, continued
            "]",
        )
    )
)
# ── Character normalization map ─────────────────────────────────────────────
# Maps each orthographic variant to the canonical letter used for matching.
# Insertion order is kept: Alef variants first, then the single-letter rules.
CHAR_MAP = {
    # Alef with hamza above / hamza below / madda / wasla → bare Alef
    **dict.fromkeys(("أ", "إ", "آ", "ٱ"), "ا"),
    # Waw with Hamza → Waw
    "ؤ": "و",
    # Ya with Hamza → Ya
    "ئ": "ي",
    # Alef Maksura → Ya
    "ى": "ي",
    # Ta Marbuta → Ha
    "ة": "ه",
    # Dagger Alef → removed (written as an escape: a bare combining mark
    # is invisible in most editors)
    "\u0670": "",
}
# ── Honorific patterns to strip from names ──────────────────────────────────
# Regex fragments for honorific phrases that follow names in hadith text.
# Words inside a phrase are separated by ``\s*`` so the phrase matches with
# or without spaces between them.
HONORIFICS = [
    r"صلى\s*الله\s*عليه\s*وسلم",
    r"عليه\s*السلام",
    # Longest suffix first: regex alternation is ordered, so listing "م"
    # before "ما" would match only "عنهم" in "عنهما" and leave a stray "ا".
    r"رضي\s*الله\s*عنه(ما|ا|م)?",
    r"رحمه\s*الله",
    r"تعالى",
    r"عز\s*وجل",
    # Single-code-point honorific ligatures, written as escapes so they
    # cannot be silently dropped by editors or viewers. An empty string
    # here would make the whole alternation match at every position.
    "\uFDFA",  # ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM
    "\uFDFB",  # ARABIC LIGATURE JALLAJALALOUHOU
]
# Matches any honorific together with the whitespace around it. Empty
# fragments are skipped defensively: an empty alternative matches the empty
# string, which would make ``sub(" ", ...)`` insert a space between every
# character of the input.
HONORIFIC_PATTERN = re.compile(
    r"\s*(" + "|".join(fragment for fragment in HONORIFICS if fragment) + r")\s*",
    re.UNICODE,
)
def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ARABIC_DIACRITICS deleted."""
    return re.sub(ARABIC_DIACRITICS, "", text)
def normalize_chars(text: str) -> str:
    """Rewrite letter variants to their canonical form as defined by CHAR_MAP.

    Each CHAR_MAP key is replaced by its mapped value, one key at a time in
    the map's iteration order.
    """
    normalized = text
    for variant in CHAR_MAP:
        normalized = normalized.replace(variant, CHAR_MAP[variant])
    return normalized
def strip_honorifics(text: str) -> str:
    """Replace each HONORIFIC_PATTERN match with one space, then trim the ends."""
    without_honorifics = HONORIFIC_PATTERN.sub(" ", text)
    return without_honorifics.strip()
# Invisible formatting characters that are deleted outright (not turned into
# a space): they have no visible width, so leaving them in makes two strings
# that look identical compare unequal.
_INVISIBLE_CHARS = re.compile(
    r"["
    r"\u061C"  # Arabic letter mark
    r"\u200B-\u200F"  # zero-width space, ZWNJ, ZWJ, LRM, RLM
    r"\u202A-\u202E"  # bidi embeddings and overrides
    r"\u2060"  # word joiner
    r"\u2066-\u2069"  # bidi isolates
    r"\uFEFF"  # byte-order mark / zero-width no-break space
    r"]"
)
_WHITESPACE_RUN = re.compile(r"\s+")


def collapse_whitespace(text: str) -> str:
    """Delete invisible formatting characters and squeeze whitespace.

    Zero-width and bidi control characters are removed entirely; every run
    of whitespace is then replaced by a single space, and leading/trailing
    whitespace is trimmed.
    """
    visible = _INVISIBLE_CHARS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", visible).strip()
def normalize_arabic(text: str) -> str:
    """
    Run the general-purpose normalization used for Arabic text matching.

    Applies, in order: strip_diacritics, normalize_chars,
    collapse_whitespace. Honorifics are left in place (normalize_name
    handles those). Any falsy input yields the empty string.
    """
    if text:
        return collapse_whitespace(normalize_chars(strip_diacritics(text)))
    return ""
def normalize_name(text: str) -> str:
    """
    Normalize an Arabic name for matching against the graph.

    Steps: strip diacritics → strip honorifics → normalize chars →
    collapse whitespace. Any falsy input yields the empty string.
    """
    if not text:
        return ""
    # Diacritics go first: the honorific patterns are written without
    # tashkeel, so a vocalised honorific would otherwise never match.
    text = strip_diacritics(text)
    # Honorifics must be removed before normalize_chars: the patterns are
    # spelled with Alef Maksura (e.g. "صلى", "تعالى"), which normalize_chars
    # rewrites to Ya.
    text = strip_honorifics(text)
    text = normalize_chars(text)
    text = collapse_whitespace(text)
    return text
def normalize_query(text: str) -> str:
    """
    Normalize a search query parameter.

    Runs strip_diacritics, normalize_chars and collapse_whitespace in that
    order; honorifics are not stripped, so this is lighter than name
    normalization. Any falsy input yields the empty string.
    """
    if not text:
        return ""
    normalized = text
    for step in (strip_diacritics, normalize_chars, collapse_whitespace):
        normalized = step(normalized)
    return normalized