""" Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch). """ from fastapi import APIRouter, Query, HTTPException from typing import Optional from app.services.database import db from app.config import get_settings from app.utils.arabic import normalize_query from app.models.schemas import ( SemanticSearchResult, FullTextSearchResult, CombinedSearchResult, HadithSummary, ) router = APIRouter(prefix="/search", tags=["Search"]) async def _get_embedding(text: str) -> list[float]: """Get embedding vector from TEI (BGE-M3).""" settings = get_settings() response = await db.http_client.post( f"{settings.tei_url}/embed", json={"inputs": text}, ) if response.status_code != 200: raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}") embeddings = response.json() if isinstance(embeddings, list) and len(embeddings) > 0: if isinstance(embeddings[0], list): return embeddings[0] return embeddings raise HTTPException(status_code=502, detail="Unexpected TEI response format") # ── Semantic search ───────────────────────────────────────────────────────── @router.get("/semantic", response_model=list[SemanticSearchResult]) async def semantic_search( q: str = Query(..., min_length=2, description="Search query (any language — Arabic, English, etc.)"), collection: Optional[str] = Query(None, description="Filter by collection name"), limit: int = Query(10, ge=1, le=50), ): """ Semantic search — find hadiths by meaning, not just keywords. Supports cross-language queries (English query → Arabic results). Uses BGE-M3 embeddings + Qdrant vector search. """ if not db.qdrant_available(): raise HTTPException(status_code=503, detail="Qdrant unavailable") settings = get_settings() query_vector = await _get_embedding(q) query_filter = None if collection: from qdrant_client.models import Filter, FieldCondition, MatchValue query_filter = Filter( must=[FieldCondition(key="collection", match=MatchValue(value=collection))] ) results = db.qdrant.search( collection_name=settings.qdrant_collection, query_vector=query_vector, limit=limit, query_filter=query_filter, with_payload=True, ) output = [] for hit in results: payload = hit.payload or {} output.append(SemanticSearchResult( hadith=HadithSummary( id=str(payload.get("id", hit.id)), collection=payload.get("collection"), hadith_number=payload.get("hadith_number"), grade=payload.get("grade"), arabic_text=(payload.get("arabic_text") or "")[:300], ), score=round(hit.score, 4), collection=payload.get("collection", ""), )) return output # ── Full-text Arabic search ───────────────────────────────────────────────── @router.get("/fulltext", response_model=list[FullTextSearchResult]) async def fulltext_search( q: str = Query(..., min_length=2, description="Arabic text search query"), collection: Optional[str] = Query(None, description="Filter by collection"), limit: int = Query(10, ge=1, le=50), ): """ Full-text Arabic search using Elasticsearch. Supports Arabic morphological analysis (root-based matching). """ if not db.es_available(): raise HTTPException(status_code=503, detail="Elasticsearch unavailable") settings = get_settings() must = [{"multi_match": { "query": q, "fields": ["arabic_text^3", "english_text", "urdu_text"], "type": "best_fields", "analyzer": "arabic", }}] if collection: must.append({"match": {"collection": collection}}) body = { "query": {"bool": {"must": must}}, "highlight": { "fields": {"arabic_text": {"fragment_size": 200, "number_of_fragments": 3}}, }, "size": limit, } resp = db.es.search(index=settings.es_index, body=body) hits = resp.get("hits", {}).get("hits", []) output = [] for hit in hits: src = hit["_source"] highlights = hit.get("highlight", {}).get("arabic_text", []) output.append(FullTextSearchResult( hadith=HadithSummary( id=str(src.get("id", hit["_id"])), collection=src.get("collection"), hadith_number=src.get("hadith_number"), grade=src.get("grade"), arabic_text=(src.get("arabic_text") or "")[:300], ), score=round(hit["_score"], 4), highlights=highlights, )) return output # ── Combined search (semantic + fulltext) ─────────────────────────────────── @router.get("/combined", response_model=list[CombinedSearchResult]) async def combined_search( q: str = Query(..., min_length=2, description="Search query"), collection: Optional[str] = Query(None), limit: int = Query(10, ge=1, le=50), semantic_weight: float = Query(0.6, ge=0, le=1, description="Weight for semantic score (0-1)"), ): """Combined semantic + full-text search. Results merged and ranked by weighted score.""" results_map: dict[str, CombinedSearchResult] = {} # Semantic if db.qdrant_available(): try: sem_results = await semantic_search(q=q, collection=collection, limit=limit) for sr in sem_results: hid = sr.hadith.id results_map[hid] = CombinedSearchResult( hadith=sr.hadith, semantic_score=sr.score, combined_score=sr.score * semantic_weight, source="semantic", ) except Exception: pass # Full-text if db.es_available(): try: ft_results = await fulltext_search(q=q, collection=collection, limit=limit) ft_weight = 1.0 - semantic_weight for fr in ft_results: hid = fr.hadith.id norm_score = min(fr.score / 20.0, 1.0) if hid in results_map: existing = results_map[hid] existing.fulltext_score = norm_score existing.combined_score += norm_score * ft_weight existing.source = "both" else: results_map[hid] = CombinedSearchResult( hadith=fr.hadith, fulltext_score=norm_score, combined_score=norm_score * ft_weight, source="fulltext", ) except Exception: pass results = sorted(results_map.values(), key=lambda x: x.combined_score, reverse=True) return results[:limit]