""" Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch). """ from fastapi import APIRouter, Query, HTTPException from typing import Optional from app.services.database import db from app.config import get_settings from app.models.schemas import SemanticSearchResult, FullTextSearchResult, HadithSummary router = APIRouter(prefix="/search", tags=["Search"]) async def get_embedding(text: str) -> list[float]: """Get embedding vector from TEI (BGE-M3).""" settings = get_settings() response = await db.http_client.post( f"{settings.tei_url}/embed", json={"inputs": text}, ) if response.status_code != 200: raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}") embeddings = response.json() # TEI returns list of embeddings; we sent one input if isinstance(embeddings, list) and len(embeddings) > 0: if isinstance(embeddings[0], list): return embeddings[0] return embeddings raise HTTPException(status_code=502, detail="Unexpected TEI response format") @router.get("/semantic", response_model=list[SemanticSearchResult], summary="Semantic search (find by meaning)", description="Search hadiths by meaning using BGE-M3 multilingual embeddings + Qdrant. " "Supports cross-language queries: search in English and find Arabic hadiths, or vice versa. " "Example: `what did the prophet say about fasting` → finds Arabic hadiths about صيام") async def semantic_search( q: str = Query( ..., min_length=2, description="Search query in any language. The embedding model handles Arabic, English, and Urdu.", examples=["what is the reward of prayer", "أحاديث عن الصيام", "حكم الربا"], ), collection: Optional[str] = Query( None, description="Filter by collection name. Example: Sahih Bukhari", ), limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"), ): """ Semantic search — find hadiths by meaning, not just keywords. Supports Arabic, English, and cross-language queries. Uses BGE-M3 embeddings + Qdrant vector search. """ settings = get_settings() # Get query embedding from TEI query_vector = await get_embedding(q) # Build Qdrant filter if collection specified query_filter = None if collection: from qdrant_client.models import Filter, FieldCondition, MatchValue query_filter = Filter( must=[FieldCondition(key="collection", match=MatchValue(value=collection))] ) # Search Qdrant results = db.qdrant.search( collection_name=settings.qdrant_collection, query_vector=query_vector, limit=limit, query_filter=query_filter, with_payload=True, ) output = [] for hit in results: payload = hit.payload or {} output.append(SemanticSearchResult( hadith=HadithSummary( id=str(payload.get("id", hit.id)), collection=payload.get("collection", ""), hadith_number=payload.get("hadith_number", 0), grade=payload.get("grade"), arabic_text=(payload.get("arabic_text") or "")[:300], ), score=round(hit.score, 4), collection=payload.get("collection", ""), )) return output @router.get("/fulltext", response_model=list[FullTextSearchResult], summary="Full-text Arabic search", description="Keyword search using Elasticsearch with Arabic morphological analysis (stemming, root extraction). " "Returns highlighted text fragments showing where matches occurred. " "Handles both vocalized (الصَّلاة) and unvocalized (الصلاة) Arabic.") async def fulltext_search( q: str = Query( ..., min_length=2, description="Arabic text search query. Examples: الصلاة (prayer), النكاح (marriage), الجهاد (jihad)", examples=["الصلاة", "صيام رمضان", "بيع وشراء"], ), collection: Optional[str] = Query( None, description="Filter by collection. Example: Sahih Muslim", ), limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"), ): """ Full-text Arabic search using Elasticsearch. Supports Arabic morphological analysis. """ settings = get_settings() # Build ES query must = [ { "multi_match": { "query": q, "fields": ["arabic_text^3", "arabic_normalized^2", "matn", "sanad"], "type": "best_fields", "analyzer": "arabic", } } ] if collection: must.append({"match": {"collection_name": collection}}) body = { "query": {"bool": {"must": must}}, "highlight": { "fields": { "arabic_text": {"fragment_size": 200, "number_of_fragments": 2}, "matn": {"fragment_size": 200, "number_of_fragments": 1}, } }, "size": limit, } try: response = db.es.search(index=settings.es_index, body=body) except Exception as e: # ES index might not exist yet raise HTTPException(status_code=503, detail=f"Elasticsearch error: {str(e)}") output = [] for hit in response["hits"]["hits"]: src = hit["_source"] highlights = [] if "highlight" in hit: for field_highlights in hit["highlight"].values(): highlights.extend(field_highlights) output.append(FullTextSearchResult( hadith=HadithSummary( id=str(src.get("id", hit["_id"])), collection=src.get("collection_name", ""), hadith_number=src.get("hadith_number", 0), grade=src.get("grade"), arabic_text=(src.get("arabic_text") or "")[:300], ), score=round(hit["_score"], 4), highlights=highlights, )) return output @router.get("/combined", response_model=dict, summary="Combined search (semantic + full-text)", description="Runs both semantic and full-text search in parallel and returns merged results. " "Best for the mobile app search bar — gives both meaning-based and keyword-based results. " "Returns `{semantic: [...], fulltext: [...], query: '...'}`") async def combined_search( q: str = Query( ..., min_length=2, description="Search query. Works with Arabic keywords or natural language in any language.", examples=["الصلاة في وقتها", "hadith about charity"], ), collection: Optional[str] = Query(None, description="Filter by collection name"), limit: int = Query(10, ge=1, le=20, description="Results per search type (max 20)"), ): """ Combined search — runs both semantic and full-text in parallel, returns merged results. Best for the mobile app search bar. """ import asyncio semantic_task = semantic_search(q=q, collection=collection, limit=limit) # Full-text only makes sense for Arabic queries fulltext_task = fulltext_search(q=q, collection=collection, limit=limit) semantic_results, fulltext_results = await asyncio.gather( semantic_task, fulltext_task, return_exceptions=True, ) return { "semantic": semantic_results if not isinstance(semantic_results, Exception) else [], "fulltext": fulltext_results if not isinstance(fulltext_results, Exception) else [], "query": q, }