# hadith-api/app/routers/search.py

"""
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
"""
import logging
from typing import Optional

from fastapi import APIRouter, Query, HTTPException

from app.services.database import db
from app.config import get_settings
from app.utils.arabic import normalize_query
from app.models.schemas import (
    SemanticSearchResult, FullTextSearchResult, CombinedSearchResult,
    HadithSummary,
)

router = APIRouter(prefix="/search", tags=["Search"])


async def _get_embedding(text: str) -> list[float]:
    """Get the embedding vector for *text* from TEI (BGE-M3).

    Posts ``{"inputs": text}`` to ``{settings.tei_url}/embed`` through the
    shared ``db.http_client`` and unwraps the JSON reply, which may be either
    a list of vectors (the first one is returned) or a single flat vector.

    Args:
        text: The text to embed.

    Returns:
        The embedding as a non-empty list.

    Raises:
        HTTPException: 502 when TEI answers with a non-200 status, when the
            body cannot be decoded as JSON, or when the decoded payload is
            not a non-empty vector.
    """
    settings = get_settings()
    response = await db.http_client.post(
        f"{settings.tei_url}/embed",
        json={"inputs": text},
    )
    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    try:
        embeddings = response.json()
    except ValueError as exc:
        # JSON decode errors are ValueError subclasses. A 200 with an
        # undecodable body is still an upstream failure, so report it as 502
        # like every other TEI problem instead of leaking a 500.
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    if isinstance(embeddings, list) and embeddings:
        first = embeddings[0]
        if not isinstance(first, list):
            # Flat response: the payload itself is the vector.
            return embeddings
        if first:
            # Batched response: one vector per input; we sent a single input.
            return first
        # An empty inner vector falls through to the error below rather than
        # being handed to the vector store.
    raise HTTPException(status_code=502, detail="Unexpected TEI response format")


# ── Semantic search ─────────────────────────────────────────────────────────

@router.get("/semantic", response_model=list[SemanticSearchResult])
async def semantic_search(
    q: str = Query(..., min_length=2, description="Search query (any language — Arabic, English, etc.)"),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Semantic search — rank hadiths by closeness in meaning to the query
    rather than by keyword overlap.

    The query is embedded with BGE-M3 (through TEI) and matched against the
    Qdrant vector index, which allows cross-language queries (for example an
    English query returning Arabic results). Passing `collection` restricts
    the results to that collection. Each result carries a similarity score
    rounded to four decimals and at most the first 300 characters of the
    Arabic text.

    Responds with 503 when Qdrant is unavailable.
    """
    if not db.qdrant_available():
        raise HTTPException(status_code=503, detail="Qdrant unavailable")

    settings = get_settings()
    vector = await _get_embedding(q)

    # A payload filter is only built when the caller narrowed the search.
    collection_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        condition = FieldCondition(key="collection", match=MatchValue(value=collection))
        collection_filter = Filter(must=[condition])

    # NOTE(review): this call is not awaited; if db.qdrant is a synchronous
    # client it blocks the event loop for the duration of the search — confirm.
    hits = db.qdrant.search(
        collection_name=settings.qdrant_collection,
        query_vector=vector,
        limit=limit,
        query_filter=collection_filter,
        with_payload=True,
    )

    def _as_result(hit) -> SemanticSearchResult:
        """Convert one Qdrant hit into the API response model."""
        fields = hit.payload or {}
        summary = HadithSummary(
            # Fall back to the point id when the payload carries no "id".
            id=str(fields.get("id", hit.id)),
            collection=fields.get("collection"),
            hadith_number=fields.get("hadith_number"),
            grade=fields.get("grade"),
            arabic_text=(fields.get("arabic_text") or "")[:300],
        )
        return SemanticSearchResult(
            hadith=summary,
            score=round(hit.score, 4),
            collection=fields.get("collection", ""),
        )

    return [_as_result(hit) for hit in hits]


# ── Full-text Arabic search ─────────────────────────────────────────────────

@router.get("/fulltext", response_model=list[FullTextSearchResult])
async def fulltext_search(
    q: str = Query(..., min_length=2, description="Arabic text search query"),
    collection: Optional[str] = Query(None, description="Filter by collection"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Full-text Arabic search using Elasticsearch.

    The query is analysed with the `arabic` analyzer and matched against the
    Arabic text (boosted ×3) as well as the English and Urdu text. Passing
    `collection` restricts the results to that collection without affecting
    the relevance score. Each result carries its score rounded to four
    decimals, up to three highlighted fragments of the Arabic text, and at
    most the first 300 characters of the Arabic text.

    Responds with 503 when Elasticsearch is unavailable.
    """
    if not db.es_available():
        raise HTTPException(status_code=503, detail="Elasticsearch unavailable")

    settings = get_settings()

    # NOTE(review): `normalize_query` is imported by this module but is not
    # applied to `q` here — confirm whether the query should be normalised
    # before it is sent to Elasticsearch.
    text_query = {"multi_match": {
        "query": q,
        "fields": ["arabic_text^3", "english_text", "urdu_text"],
        "type": "best_fields",
        "analyzer": "arabic",
    }}

    bool_query: dict = {"must": [text_query]}
    if collection:
        # The collection restriction is a yes/no condition, so it belongs in
        # filter context: it selects the same documents as before but no
        # longer adds to _score, keeping scores comparable with and without
        # the restriction.
        bool_query["filter"] = [{"match": {"collection": collection}}]

    body = {
        "query": {"bool": bool_query},
        "highlight": {
            "fields": {"arabic_text": {"fragment_size": 200, "number_of_fragments": 3}},
        },
        "size": limit,
    }

    resp = db.es.search(index=settings.es_index, body=body)
    hits = resp.get("hits", {}).get("hits", [])

    output = []
    for hit in hits:
        src = hit["_source"]
        highlights = hit.get("highlight", {}).get("arabic_text", [])
        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                # Fall back to the document id when the source has no "id".
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection"),
                hadith_number=src.get("hadith_number"),
                grade=src.get("grade"),
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            # Treat a missing or null _score as 0.0 instead of failing in round().
            score=round(hit.get("_score") or 0.0, 4),
            highlights=highlights,
        ))

    return output


# ── Combined search (semantic + fulltext) ───────────────────────────────────

@router.get("/combined", response_model=list[CombinedSearchResult])
async def combined_search(
    q: str = Query(..., min_length=2, description="Search query"),
    collection: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50),
    semantic_weight: float = Query(0.6, ge=0, le=1, description="Weight for semantic score (0-1)"),
):
    """
    Combined semantic + full-text search. Results are merged by hadith id and
    ranked by weighted score.

    The semantic score is weighted by `semantic_weight` and the normalised
    full-text score by `1 - semantic_weight`; a hadith found by both searches
    gets the sum of the two contributions. Each search is best-effort: a
    backend that is unavailable or fails is skipped (the failure is logged),
    so the response may come from only one backend, or be empty.
    """
    logger = logging.getLogger(__name__)

    # Full-text scores are divided by this fixed cap and clamped to 1.0
    # before weighting. The value is a hard-coded heuristic.
    fulltext_score_cap = 20.0

    results_map: dict[str, CombinedSearchResult] = {}

    # Semantic
    if db.qdrant_available():
        try:
            sem_results = await semantic_search(q=q, collection=collection, limit=limit)
            for sr in sem_results:
                hid = sr.hadith.id
                results_map[hid] = CombinedSearchResult(
                    hadith=sr.hadith,
                    semantic_score=sr.score,
                    combined_score=sr.score * semantic_weight,
                    source="semantic",
                )
        except Exception:
            # Deliberately best-effort: keep serving whatever the other
            # backend returns, but leave a trace instead of failing silently.
            logger.warning(
                "Combined search: semantic search failed; continuing without it",
                exc_info=True,
            )

    # Full-text
    if db.es_available():
        try:
            ft_results = await fulltext_search(q=q, collection=collection, limit=limit)
            ft_weight = 1.0 - semantic_weight
            for fr in ft_results:
                hid = fr.hadith.id
                norm_score = min(fr.score / fulltext_score_cap, 1.0)
                existing = results_map.get(hid)
                if existing is not None:
                    # Found by both searches: add the full-text contribution
                    # to the semantic one already recorded.
                    existing.fulltext_score = norm_score
                    existing.combined_score += norm_score * ft_weight
                    existing.source = "both"
                else:
                    results_map[hid] = CombinedSearchResult(
                        hadith=fr.hadith,
                        fulltext_score=norm_score,
                        combined_score=norm_score * ft_weight,
                        source="fulltext",
                    )
        except Exception:
            # Same best-effort policy as the semantic search above.
            logger.warning(
                "Combined search: full-text search failed; continuing without it",
                exc_info=True,
            )

    results = sorted(results_map.values(), key=lambda x: x.combined_score, reverse=True)
    return results[:limit]