hadith-api/app/routers/search.py

197 lines
7.1 KiB
Python

"""
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
"""
import asyncio
import logging
from typing import Optional

from fastapi import APIRouter, Query, HTTPException

from app.services.database import db
from app.config import get_settings
from app.utils.arabic import normalize_query
from app.models.schemas import (
    SemanticSearchResult, FullTextSearchResult, CombinedSearchResult,
    HadithSummary,
)
# Router for every endpoint in this module; routes are mounted under the
# "/search" prefix and carry the "Search" tag.
router = APIRouter(prefix="/search", tags=["Search"])
async def _get_embedding(text: str) -> list[float]:
    """Get embedding vector from TEI (BGE-M3).

    Posts ``{"inputs": text}`` to ``{settings.tei_url}/embed`` through the
    shared HTTP client and returns a single embedding vector.

    Two response shapes are accepted:
      * a list of vectors (``[[...], ...]``) — the first vector is returned;
      * a flat list of numbers (``[...]``) — returned as-is.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a non-empty list of numbers.

    Raises:
        HTTPException: 502 when TEI answers with a non-200 status, when the
            body is not valid JSON, or when the payload matches neither of
            the accepted shapes.
    """
    settings = get_settings()
    response = await db.http_client.post(
        f"{settings.tei_url}/embed",
        json={"inputs": text},
    )
    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    try:
        embeddings = response.json()
    except ValueError as exc:
        # JSON decode errors derive from ValueError; report them as an upstream
        # failure (502) like every other TEI problem instead of a bare 500.
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    if isinstance(embeddings, list) and embeddings:
        first = embeddings[0]
        # Batch shape: one vector per input, we sent exactly one input.
        if isinstance(first, list) and first:
            return first
        # Flat shape: the payload itself is the vector. Only accept it when it
        # actually holds numbers, so garbage is not forwarded to the vector store.
        if isinstance(first, (int, float)):
            return embeddings
    raise HTTPException(status_code=502, detail="Unexpected TEI response format")
# ── Semantic search ─────────────────────────────────────────────────────────
@router.get("/semantic", response_model=list[SemanticSearchResult])
async def semantic_search(
    q: str = Query(..., min_length=2, description="Search query (any language — Arabic, English, etc.)"),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Semantic search — find hadiths by meaning, not just keywords.
    Supports cross-language queries (English query → Arabic results).
    Uses BGE-M3 embeddings + Qdrant vector search.

    Args:
        q: Free-text query; it is embedded via ``_get_embedding``.
        collection: Optional exact-match filter on the ``collection`` payload field.
        limit: Maximum number of hits to return (1-50).

    Returns:
        A list of ``SemanticSearchResult`` in the order Qdrant returned them.
        ``arabic_text`` is truncated to 300 characters and the score is
        rounded to 4 decimals.

    Raises:
        HTTPException: 503 when Qdrant is unavailable; 502 (from
            ``_get_embedding``) when the embedding service fails.
    """
    if not db.qdrant_available():
        raise HTTPException(status_code=503, detail="Qdrant unavailable")

    settings = get_settings()
    query_vector = await _get_embedding(q)

    query_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        query_filter = Filter(
            must=[FieldCondition(key="collection", match=MatchValue(value=collection))]
        )

    # The Qdrant client call is synchronous (its result is consumed without
    # awaiting), so run it in a worker thread to avoid blocking the event loop.
    results = await asyncio.to_thread(
        db.qdrant.search,
        collection_name=settings.qdrant_collection,
        query_vector=query_vector,
        limit=limit,
        query_filter=query_filter,
        with_payload=True,
    )

    output = []
    for hit in results:
        payload = hit.payload or {}
        output.append(SemanticSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the payload; fall back to the point id.
                id=str(payload.get("id", hit.id)),
                collection=payload.get("collection"),
                hadith_number=payload.get("hadith_number"),
                grade=payload.get("grade"),
                arabic_text=(payload.get("arabic_text") or "")[:300],
            ),
            score=round(hit.score, 4),
            # `or ""` also covers a payload whose "collection" key is present
            # but null, which a plain .get() default would let through as None.
            collection=payload.get("collection") or "",
        ))
    return output
# ── Full-text Arabic search ─────────────────────────────────────────────────
@router.get("/fulltext", response_model=list[FullTextSearchResult])
async def fulltext_search(
    q: str = Query(..., min_length=2, description="Arabic text search query"),
    collection: Optional[str] = Query(None, description="Filter by collection"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Full-text Arabic search using Elasticsearch.
    Supports Arabic morphological analysis (root-based matching).

    Args:
        q: Query text, matched against ``arabic_text`` (boosted x3),
            ``english_text`` and ``urdu_text`` with the ``arabic`` analyzer.
        collection: Optional ``match`` clause on the ``collection`` field.
        limit: Maximum number of hits to return (1-50).

    Returns:
        A list of ``FullTextSearchResult`` in Elasticsearch's ranking order.
        ``arabic_text`` is truncated to 300 characters, the score is rounded
        to 4 decimals, and ``highlights`` holds up to 3 fragments of
        ``arabic_text``.

    Raises:
        HTTPException: 503 when Elasticsearch is unavailable.
    """
    if not db.es_available():
        raise HTTPException(status_code=503, detail="Elasticsearch unavailable")

    settings = get_settings()

    # NOTE(review): `normalize_query` is imported at the top of this file but is
    # not applied to `q` here — confirm whether the query should be normalized
    # before it is sent to Elasticsearch.
    must = [{"multi_match": {
        "query": q,
        "fields": ["arabic_text^3", "english_text", "urdu_text"],
        "type": "best_fields",
        "analyzer": "arabic",
    }}]
    if collection:
        must.append({"match": {"collection": collection}})

    body = {
        "query": {"bool": {"must": must}},
        "highlight": {
            "fields": {"arabic_text": {"fragment_size": 200, "number_of_fragments": 3}},
        },
        "size": limit,
    }

    # The Elasticsearch client call is synchronous (its result is consumed
    # without awaiting), so run it in a worker thread to keep the event loop free.
    resp = await asyncio.to_thread(db.es.search, index=settings.es_index, body=body)
    hits = resp.get("hits", {}).get("hits", [])

    output = []
    for hit in hits:
        src = hit["_source"]
        highlights = hit.get("highlight", {}).get("arabic_text", [])
        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the document; fall back to the ES _id.
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection"),
                hadith_number=src.get("hadith_number"),
                grade=src.get("grade"),
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            score=round(hit["_score"], 4),
            highlights=highlights,
        ))
    return output
# ── Combined search (semantic + fulltext) ───────────────────────────────────
@router.get("/combined", response_model=list[CombinedSearchResult])
async def combined_search(
    q: str = Query(..., min_length=2, description="Search query"),
    collection: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50),
    semantic_weight: float = Query(0.6, ge=0, le=1, description="Weight for semantic score (0-1)"),
):
    """Combined semantic + full-text search. Results merged and ranked by weighted score.

    Each backend is queried only if it reports itself available, and each is
    best-effort: a failure in one is logged and skipped so the other can still
    contribute. If neither yields results, an empty list is returned.

    Scoring:
        * semantic contribution  = semantic score * ``semantic_weight``
        * full-text contribution = min(score / 20, 1) * (1 - ``semantic_weight``)
        * a hadith found by both backends gets the sum and ``source="both"``.

    Args:
        q: Query text, passed unchanged to both backends.
        collection: Optional collection filter, passed to both backends.
        limit: Per-backend fetch size and the maximum number of merged results.
        semantic_weight: Weight of the semantic score in the range 0-1.

    Returns:
        Up to ``limit`` ``CombinedSearchResult`` items sorted by
        ``combined_score`` in descending order.
    """
    logger = logging.getLogger(__name__)
    results_map: dict[str, CombinedSearchResult] = {}

    # Semantic
    if db.qdrant_available():
        try:
            sem_results = await semantic_search(q=q, collection=collection, limit=limit)
            for sr in sem_results:
                hid = sr.hadith.id
                results_map[hid] = CombinedSearchResult(
                    hadith=sr.hadith,
                    semantic_score=sr.score,
                    combined_score=sr.score * semantic_weight,
                    source="semantic",
                )
        except Exception:
            # Deliberate best-effort: keep serving full-text results, but leave
            # a trace instead of hiding the failure.
            logger.warning("Combined search: semantic backend failed", exc_info=True)

    # Full-text
    if db.es_available():
        ft_weight = 1.0 - semantic_weight
        # Raw full-text scores are divided by this value and clipped to 1.0
        # before weighting. NOTE(review): 20.0 looks like a heuristic ceiling
        # for Elasticsearch scores — confirm it suits the index.
        ft_score_scale = 20.0
        try:
            ft_results = await fulltext_search(q=q, collection=collection, limit=limit)
            for fr in ft_results:
                hid = fr.hadith.id
                norm_score = min(fr.score / ft_score_scale, 1.0)
                if hid in results_map:
                    # Already found semantically: add the full-text share.
                    existing = results_map[hid]
                    existing.fulltext_score = norm_score
                    existing.combined_score += norm_score * ft_weight
                    existing.source = "both"
                else:
                    results_map[hid] = CombinedSearchResult(
                        hadith=fr.hadith,
                        fulltext_score=norm_score,
                        combined_score=norm_score * ft_weight,
                        source="fulltext",
                    )
        except Exception:
            # Deliberate best-effort: keep whatever the semantic pass produced.
            logger.warning("Combined search: full-text backend failed", exc_info=True)

    results = sorted(results_map.values(), key=lambda x: x.combined_score, reverse=True)
    return results[:limit]