# hadith-api/app/routers/search.py

"""
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
"""
from fastapi import APIRouter, Query, HTTPException
from typing import Optional

from app.services.database import db
from app.config import get_settings
from app.models.schemas import SemanticSearchResult, FullTextSearchResult, HadithSummary

router = APIRouter(prefix="/search", tags=["Search"])


async def get_embedding(text: str) -> list[float]:
    """Get the embedding vector for ``text`` from TEI (BGE-M3).

    Posts ``{"inputs": text}`` to ``{settings.tei_url}/embed`` through the
    shared ``db.http_client`` and returns a single embedding.

    Args:
        text: The string to embed.

    Returns:
        The embedding. If the response is a list of lists, the first inner
        list is returned; if it is a flat list, it is returned as-is.

    Raises:
        HTTPException: 502 when the request cannot be completed, when TEI
            answers with a non-200 status, when the body is not valid JSON,
            or when the decoded payload is not a non-empty list.
    """
    settings = get_settings()
    try:
        response = await db.http_client.post(
            f"{settings.tei_url}/embed",
            json={"inputs": text},
        )
    except Exception as exc:
        # Transport-level failure (connection refused, timeout, ...). Report it
        # as a bad-gateway error like every other TEI failure instead of
        # letting it surface as an opaque 500.
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {exc}") from exc

    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    try:
        embeddings = response.json()
    except ValueError as exc:
        # JSON decode errors are ValueError subclasses; a non-JSON body is
        # just another unexpected upstream response.
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    # TEI returns a list of embeddings; we sent one input, so take the first.
    if isinstance(embeddings, list) and embeddings:
        first = embeddings[0]
        return first if isinstance(first, list) else embeddings
    raise HTTPException(status_code=502, detail="Unexpected TEI response format")


@router.get("/semantic", response_model=list[SemanticSearchResult],
             summary="Semantic search (find by meaning)",
             description="Search hadiths by meaning using BGE-M3 multilingual embeddings + Qdrant. "
                         "Supports cross-language queries: search in English and find Arabic hadiths, or vice versa. "
                         "Example: `what did the prophet say about fasting` → finds Arabic hadiths about صيام")
async def semantic_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query in any language. The embedding model handles Arabic, English, and Urdu.",
        examples=["what is the reward of prayer", "أحاديث عن الصيام", "حكم الربا"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection name. Example: Sahih Bukhari",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Semantic search — find hadiths by meaning, not just keywords.

    Embeds the query with ``get_embedding`` and runs a vector search against
    the Qdrant collection named by ``settings.qdrant_collection``.

    Args:
        q: Free-text query.
        collection: Optional exact-match filter on the ``collection`` payload key.
        limit: Maximum number of hits to return.

    Returns:
        A list of ``SemanticSearchResult``, one per hit, with the Arabic text
        truncated to 300 characters and the score rounded to 4 decimals.

    Raises:
        HTTPException: 502 from ``get_embedding`` when embedding fails;
            503 when the Qdrant search itself fails.
    """
    import asyncio

    settings = get_settings()

    # Get query embedding from TEI
    query_vector = await get_embedding(q)

    # Build Qdrant filter if collection specified
    query_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        query_filter = Filter(
            must=[FieldCondition(key="collection", match=MatchValue(value=collection))]
        )

    # db.qdrant.search is a blocking call (it is not awaited), so run it in a
    # worker thread to keep the event loop free for other requests.
    # NOTE(review): assumes the shared Qdrant client may be used from worker
    # threads — confirm against the client in use.
    try:
        results = await asyncio.to_thread(
            db.qdrant.search,
            collection_name=settings.qdrant_collection,
            query_vector=query_vector,
            limit=limit,
            query_filter=query_filter,
            with_payload=True,
        )
    except Exception as exc:
        # Mirror fulltext_search: a failing backend is reported as 503.
        raise HTTPException(status_code=503, detail=f"Qdrant error: {exc}") from exc

    output = []
    for hit in results:
        # Points stored without a payload come back with payload=None.
        payload = hit.payload or {}
        collection_name = payload.get("collection", "")
        output.append(SemanticSearchResult(
            hadith=HadithSummary(
                id=str(payload.get("id", hit.id)),
                collection=collection_name,
                hadith_number=payload.get("hadith_number", 0),
                grade=payload.get("grade"),
                arabic_text=(payload.get("arabic_text") or "")[:300],
            ),
            score=round(hit.score, 4),
            collection=collection_name,
        ))

    return output


@router.get("/fulltext", response_model=list[FullTextSearchResult],
             summary="Full-text Arabic search",
             description="Keyword search using Elasticsearch with Arabic morphological analysis (stemming, root extraction). "
                         "Returns highlighted text fragments showing where matches occurred. "
                         "Handles both vocalized (الصَّلاة) and unvocalized (الصلاة) Arabic.")
async def fulltext_search(
    q: str = Query(
        ..., min_length=2,
        description="Arabic text search query. Examples: الصلاة (prayer), النكاح (marriage), الجهاد (jihad)",
        examples=["الصلاة", "صيام رمضان", "بيع وشراء"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection. Example: Sahih Muslim",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Full-text Arabic search using Elasticsearch.

    Runs a ``multi_match`` query (``arabic`` analyzer) over the text fields of
    the index named by ``settings.es_index`` and requests highlight fragments
    for ``arabic_text`` and ``matn``.

    Args:
        q: Search text.
        collection: Optional restriction on the ``collection_name`` field.
        limit: Maximum number of hits to return.

    Returns:
        A list of ``FullTextSearchResult``, one per hit, with the Arabic text
        truncated to 300 characters, the score rounded to 4 decimals, and all
        highlight fragments flattened into a single list.

    Raises:
        HTTPException: 503 when the Elasticsearch request fails.
    """
    import asyncio

    settings = get_settings()

    # Build ES query
    must = [
        {
            "multi_match": {
                "query": q,
                "fields": ["arabic_text^3", "arabic_normalized^2", "matn", "sanad"],
                "type": "best_fields",
                "analyzer": "arabic",
            }
        }
    ]

    if collection:
        # Require every term of the collection name. With the default OR
        # operator an analyzed field would let "Sahih Muslim" match any
        # collection containing "Sahih".
        must.append({
            "match": {"collection_name": {"query": collection, "operator": "and"}}
        })

    body = {
        "query": {"bool": {"must": must}},
        "highlight": {
            "fields": {
                "arabic_text": {"fragment_size": 200, "number_of_fragments": 2},
                "matn": {"fragment_size": 200, "number_of_fragments": 1},
            }
        },
        "size": limit,
    }

    # db.es.search is a blocking call (it is not awaited), so run it in a
    # worker thread to keep the event loop free for other requests.
    try:
        response = await asyncio.to_thread(
            db.es.search, index=settings.es_index, body=body
        )
    except Exception as e:
        # ES index might not exist yet
        raise HTTPException(status_code=503, detail=f"Elasticsearch error: {str(e)}") from e

    output = []
    for hit in response["hits"]["hits"]:
        src = hit["_source"]
        # Flatten the per-field fragment lists into one list.
        highlights = [
            fragment
            for fragments in hit.get("highlight", {}).values()
            for fragment in fragments
        ]

        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection_name", ""),
                hadith_number=src.get("hadith_number", 0),
                grade=src.get("grade"),
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            score=round(hit["_score"], 4),
            highlights=highlights,
        ))

    return output


@router.get("/combined", response_model=dict,
             summary="Combined search (semantic + full-text)",
             description="Runs both semantic and full-text search in parallel and returns merged results. "
                         "Best for the mobile app search bar — gives both meaning-based and keyword-based results. "
                         "Returns `{semantic: [...], fulltext: [...], query: '...'}`")
async def combined_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query. Works with Arabic keywords or natural language in any language.",
        examples=["الصلاة في وقتها", "hadith about charity"],
    ),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=20, description="Results per search type (max 20)"),
):
    """
    Combined search — schedules the semantic and full-text searches together
    with ``asyncio.gather`` and returns both result sets.

    This endpoint is best-effort: if one search fails, its entry in the
    response is an empty list and the failure is logged, so the other
    search's results are still returned.

    Args:
        q: Search text, passed unchanged to both searches.
        collection: Optional collection filter, passed to both searches.
        limit: Maximum number of hits per search type.

    Returns:
        A dict with keys ``semantic``, ``fulltext`` and ``query``.
    """
    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    # NOTE(review): full-text search targets Arabic fields, yet it is run for
    # every query regardless of language.
    semantic_results, fulltext_results = await asyncio.gather(
        semantic_search(q=q, collection=collection, limit=limit),
        fulltext_search(q=q, collection=collection, limit=limit),
        return_exceptions=True,
    )

    merged = {}
    for label, outcome in (("semantic", semantic_results), ("fulltext", fulltext_results)):
        # gather(return_exceptions=True) can also hand back BaseException
        # instances such as CancelledError, so test against BaseException to
        # keep exception objects out of the response.
        if isinstance(outcome, BaseException):
            logger.warning("Combined search: %s search failed: %r", label, outcome)
            outcome = []
        merged[label] = outcome

    merged["query"] = q
    return merged