# hadith-api/app/routers/search.py
# 208 lines, 7.7 KiB, Python

"""
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
"""
from fastapi import APIRouter, Query, HTTPException
from typing import Optional
from app.services.database import db
from app.config import get_settings
from app.models.schemas import SemanticSearchResult, FullTextSearchResult, HadithSummary
# Router for the search endpoints; routes registered on it share the "/search" prefix
# and the "Search" tag.
router = APIRouter(prefix="/search", tags=["Search"])
async def get_embedding(text: str) -> list[float]:
    """Get embedding vector from TEI (BGE-M3).

    Posts ``text`` as the single input to ``{settings.tei_url}/embed`` and
    returns one embedding vector.

    Args:
        text: The text to embed.

    Returns:
        The first embedding when the response is a list of vectors, or the
        response itself when it is already a flat (non-nested) list.

    Raises:
        HTTPException: 502 when TEI answers with a non-200 status, when the
            body is not valid JSON, or when the decoded body is not a
            non-empty list.
    """
    settings = get_settings()
    response = await db.http_client.post(
        f"{settings.tei_url}/embed",
        json={"inputs": text},
    )
    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    # A 200 with a non-JSON body (e.g. an HTML error page from a proxy) must be
    # reported as a bad upstream response, not leak out as an unhandled 500.
    try:
        embeddings = response.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    # TEI returns list of embeddings; we sent one input
    if isinstance(embeddings, list) and embeddings:
        first = embeddings[0]
        # Nested list -> take the single vector; flat list -> it already is the vector.
        return first if isinstance(first, list) else embeddings

    raise HTTPException(status_code=502, detail="Unexpected TEI response format")
@router.get("/semantic", response_model=list[SemanticSearchResult],
            summary="Semantic search (find by meaning)",
            description="Search hadiths by meaning using BGE-M3 multilingual embeddings + Qdrant. "
                        "Supports cross-language queries: search in English and find Arabic hadiths, or vice versa. "
                        "Example: `what did the prophet say about fasting` → finds Arabic hadiths about صيام")
async def semantic_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query in any language. The embedding model handles Arabic, English, and Urdu.",
        examples=["what is the reward of prayer", "أحاديث عن الصيام", "حكم الربا"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection name. Example: Sahih Bukhari",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Semantic search — find hadiths by meaning, not just keywords.
    Supports Arabic, English, and cross-language queries.
    Uses BGE-M3 embeddings + Qdrant vector search.

    Args:
        q: Query text; it is embedded via ``get_embedding`` before searching.
        collection: Optional exact-match filter on the ``collection`` payload field.
        limit: Maximum number of hits requested from Qdrant.

    Returns:
        A list of ``SemanticSearchResult``, one per Qdrant hit, with the score
        rounded to 4 decimals and the Arabic text cut to 300 characters.

    Raises:
        HTTPException: 502 from ``get_embedding`` when embedding fails;
            503 when the Qdrant search call raises.
    """
    import asyncio

    settings = get_settings()

    # Get query embedding from TEI
    query_vector = await get_embedding(q)

    # Build Qdrant filter if collection specified
    query_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        query_filter = Filter(
            must=[FieldCondition(key="collection", match=MatchValue(value=collection))]
        )

    # db.qdrant.search is a synchronous call (it is used without await), so it
    # is run in a worker thread to avoid stalling the event loop while Qdrant
    # answers. Failures are reported as 503, mirroring the full-text endpoint.
    try:
        results = await asyncio.to_thread(
            db.qdrant.search,
            collection_name=settings.qdrant_collection,
            query_vector=query_vector,
            limit=limit,
            query_filter=query_filter,
            with_payload=True,
        )
    except Exception as exc:
        raise HTTPException(status_code=503, detail=f"Qdrant error: {exc}") from exc

    output = []
    for hit in results:
        # A hit may carry no payload; fall back to defaults in that case.
        payload = hit.payload or {}
        collection_name = payload.get("collection", "")
        output.append(SemanticSearchResult(
            hadith=HadithSummary(
                id=str(payload.get("id", hit.id)),
                collection=collection_name,
                hadith_number=payload.get("hadith_number", 0),
                grade=payload.get("grade"),
                # Only a 300-character preview of the text is returned.
                arabic_text=(payload.get("arabic_text") or "")[:300],
            ),
            score=round(hit.score, 4),
            collection=collection_name,
        ))
    return output
@router.get("/fulltext", response_model=list[FullTextSearchResult],
            summary="Full-text Arabic search",
            description="Keyword search using Elasticsearch with Arabic morphological analysis (stemming, root extraction). "
                        "Returns highlighted text fragments showing where matches occurred. "
                        "Handles both vocalized (الصَّلاة) and unvocalized (الصلاة) Arabic.")
async def fulltext_search(
    q: str = Query(
        ..., min_length=2,
        description="Arabic text search query. Examples: الصلاة (prayer), النكاح (marriage), الجهاد (jihad)",
        examples=["الصلاة", "صيام رمضان", "بيع وشراء"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection. Example: Sahih Muslim",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Full-text Arabic search using Elasticsearch.
    Supports Arabic morphological analysis.

    Args:
        q: Query text, matched against ``arabic_text``, ``arabic_normalized``,
            ``matn`` and ``sanad`` with the ``arabic`` analyzer.
        collection: Optional ``match`` clause on the ``collection_name`` field.
        limit: Maximum number of hits requested (the ES ``size``).

    Returns:
        A list of ``FullTextSearchResult``, one per hit, with the score rounded
        to 4 decimals, the Arabic text cut to 300 characters, and all
        highlight fragments flattened into a single list.

    Raises:
        HTTPException: 503 when the Elasticsearch search call raises.
    """
    import asyncio

    settings = get_settings()

    # Build ES query
    must = [
        {
            "multi_match": {
                "query": q,
                # ^N boosts: matches in arabic_text weigh most, then arabic_normalized.
                "fields": ["arabic_text^3", "arabic_normalized^2", "matn", "sanad"],
                "type": "best_fields",
                "analyzer": "arabic",
            }
        }
    ]
    if collection:
        must.append({"match": {"collection_name": collection}})

    body = {
        "query": {"bool": {"must": must}},
        "highlight": {
            "fields": {
                "arabic_text": {"fragment_size": 200, "number_of_fragments": 2},
                "matn": {"fragment_size": 200, "number_of_fragments": 1},
            }
        },
        "size": limit,
    }

    # db.es.search is a synchronous call (it is used without await), so it is
    # run in a worker thread to keep the event loop responsive.
    try:
        response = await asyncio.to_thread(db.es.search, index=settings.es_index, body=body)
    except Exception as e:
        # ES index might not exist yet
        raise HTTPException(status_code=503, detail=f"Elasticsearch error: {str(e)}") from e

    output = []
    for hit in response["hits"]["hits"]:
        src = hit["_source"]
        # Flatten the per-field fragment lists; hits without highlights yield [].
        highlights = [
            fragment
            for field_fragments in hit.get("highlight", {}).values()
            for fragment in field_fragments
        ]
        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection_name", ""),
                hadith_number=src.get("hadith_number", 0),
                grade=src.get("grade"),
                # Only a 300-character preview of the text is returned.
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            score=round(hit["_score"], 4),
            highlights=highlights,
        ))
    return output
@router.get("/combined", response_model=dict,
            summary="Combined search (semantic + full-text)",
            description="Runs both semantic and full-text search in parallel and returns merged results. "
                        "Best for the mobile app search bar — gives both meaning-based and keyword-based results. "
                        "Returns `{semantic: [...], fulltext: [...], query: '...'}`")
async def combined_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query. Works with Arabic keywords or natural language in any language.",
        examples=["الصلاة في وقتها", "hadith about charity"],
    ),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=20, description="Results per search type (max 20)"),
):
    """
    Combined search — awaits the semantic and full-text searches together
    and returns both result lists. Best for the mobile app search bar.

    Args:
        q: Query text, passed unchanged to both searches.
        collection: Optional collection filter, passed to both searches.
        limit: Maximum number of results requested from each search.

    Returns:
        A dict with keys ``semantic``, ``fulltext`` and ``query``. A search
        that fails contributes an empty list instead of failing the request.
    """
    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    # Both searches run for every query; full-text is NOT restricted to Arabic
    # input here, it simply tends to return nothing for non-Arabic text.
    semantic_results, fulltext_results = await asyncio.gather(
        semantic_search(q=q, collection=collection, limit=limit),
        fulltext_search(q=q, collection=collection, limit=limit),
        return_exceptions=True,
    )

    def _results_or_empty(label, outcome):
        """Return the result list, or [] (after logging) when the search failed."""
        # BaseException rather than Exception: gather() can also hand back
        # asyncio.CancelledError, which must not be returned as a payload.
        if isinstance(outcome, BaseException):
            logger.warning("Combined search: %s search failed: %r", label, outcome)
            return []
        return outcome

    return {
        "semantic": _results_or_empty("semantic", semantic_results),
        "fulltext": _results_or_empty("fulltext", fulltext_results),
        "query": q,
    }