208 lines
7.7 KiB
Python
208 lines
7.7 KiB
Python
"""
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
"""
|
|
from fastapi import APIRouter, Query, HTTPException
|
|
from typing import Optional
|
|
|
|
from app.services.database import db
|
|
from app.config import get_settings
|
|
from app.models.schemas import SemanticSearchResult, FullTextSearchResult, HadithSummary
|
|
|
|
# Router for every endpoint in this module; routes are mounted under the
# "/search" prefix and tagged "Search".
router = APIRouter(prefix="/search", tags=["Search"])
|
|
|
|
|
|
async def get_embedding(text: str) -> list[float]:
    """Get the embedding vector for *text* from TEI (BGE-M3).

    Posts ``{"inputs": text}`` to ``{settings.tei_url}/embed`` through the
    shared ``db.http_client``.

    Args:
        text: The string to embed.

    Returns:
        The embedding as a list. When TEI answers with a list of vectors
        (one per input) the first vector is returned; when it answers with a
        single flat list, that list is returned unchanged.

    Raises:
        HTTPException: 502 when TEI answers with a non-200 status, with a
            body that cannot be decoded as JSON, or with a payload that is
            not a non-empty list.
    """
    settings = get_settings()
    response = await db.http_client.post(
        f"{settings.tei_url}/embed",
        json={"inputs": text},
    )
    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    try:
        embeddings = response.json()
    except ValueError as exc:
        # A 200 with a non-JSON body is still an upstream failure; report it
        # as 502 like the other TEI errors instead of an unhandled 500.
        # NOTE(review): assumes the client's JSON decode error derives from
        # ValueError (true for json.JSONDecodeError) — confirm for db.http_client.
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    if not isinstance(embeddings, list) or not embeddings:
        raise HTTPException(status_code=502, detail="Unexpected TEI response format")

    # TEI returns a list of embeddings; we sent one input, so take the first.
    # A flat list (no nesting) is treated as the vector itself.
    first = embeddings[0]
    return first if isinstance(first, list) else embeddings
|
|
|
|
|
|
@router.get("/semantic", response_model=list[SemanticSearchResult],
            summary="Semantic search (find by meaning)",
            description="Search hadiths by meaning using BGE-M3 multilingual embeddings + Qdrant. "
                        "Supports cross-language queries: search in English and find Arabic hadiths, or vice versa. "
                        "Example: `what did the prophet say about fasting` → finds Arabic hadiths about صيام")
async def semantic_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query in any language. The embedding model handles Arabic, English, and Urdu.",
        examples=["what is the reward of prayer", "أحاديث عن الصيام", "حكم الربا"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection name. Example: Sahih Bukhari",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Semantic search — find hadiths by meaning, not just keywords.

    The query is embedded with ``get_embedding`` and the resulting vector is
    searched in the Qdrant collection named by ``settings.qdrant_collection``.

    Args:
        q: Free-text query (at least 2 characters).
        collection: When given, only points whose ``collection`` payload field
            equals this value are considered.
        limit: Maximum number of hits to return (1-50).

    Returns:
        A list of ``SemanticSearchResult``, one per Qdrant hit, each carrying
        a ``HadithSummary`` (Arabic text truncated to 300 characters), the
        score rounded to 4 decimals, and the collection name.

    Raises:
        HTTPException: Propagated from ``get_embedding`` when embedding fails.
    """
    import asyncio

    settings = get_settings()

    # Get query embedding from TEI
    query_vector = await get_embedding(q)

    # Build Qdrant filter if collection specified
    query_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue
        query_filter = Filter(
            must=[FieldCondition(key="collection", match=MatchValue(value=collection))]
        )

    # The Qdrant call is synchronous (it is not awaited), so run it in a
    # worker thread; calling it inline would block the event loop for the
    # whole round-trip and stall every other request.
    results = await asyncio.to_thread(
        db.qdrant.search,
        collection_name=settings.qdrant_collection,
        query_vector=query_vector,
        limit=limit,
        query_filter=query_filter,
        with_payload=True,
    )

    output = []
    for hit in results:
        # A hit may come back without payload; fall back to defaults below.
        payload = hit.payload or {}
        output.append(SemanticSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the payload, else the Qdrant point id.
                id=str(payload.get("id", hit.id)),
                collection=payload.get("collection", ""),
                hadith_number=payload.get("hadith_number", 0),
                grade=payload.get("grade"),
                arabic_text=(payload.get("arabic_text") or "")[:300],
            ),
            score=round(hit.score, 4),
            collection=payload.get("collection", ""),
        ))

    return output
|
|
|
|
|
|
@router.get("/fulltext", response_model=list[FullTextSearchResult],
            summary="Full-text Arabic search",
            description="Keyword search using Elasticsearch with Arabic morphological analysis (stemming, root extraction). "
                        "Returns highlighted text fragments showing where matches occurred. "
                        "Handles both vocalized (الصَّلاة) and unvocalized (الصلاة) Arabic.")
async def fulltext_search(
    q: str = Query(
        ..., min_length=2,
        description="Arabic text search query. Examples: الصلاة (prayer), النكاح (marriage), الجهاد (jihad)",
        examples=["الصلاة", "صيام رمضان", "بيع وشراء"],
    ),
    collection: Optional[str] = Query(
        None,
        description="Filter by collection. Example: Sahih Muslim",
    ),
    limit: int = Query(10, ge=1, le=50, description="Number of results (max 50)"),
):
    """
    Full-text Arabic search using Elasticsearch.

    Runs a ``multi_match`` query (``arabic`` analyzer) over the
    ``arabic_text``, ``arabic_normalized``, ``matn`` and ``sanad`` fields of
    the index named by ``settings.es_index`` and requests highlights for
    ``arabic_text`` and ``matn``.

    Args:
        q: Query text (at least 2 characters).
        collection: When given, an extra ``match`` clause on
            ``collection_name`` is added to the query.
        limit: Maximum number of hits to return (1-50).

    Returns:
        A list of ``FullTextSearchResult``, one per hit, each carrying a
        ``HadithSummary`` (Arabic text truncated to 300 characters), the score
        rounded to 4 decimals, and all highlight fragments flattened into one
        list.

    Raises:
        HTTPException: 503 when the Elasticsearch call raises.
    """
    import asyncio

    settings = get_settings()

    # Build ES query
    must = [
        {
            "multi_match": {
                "query": q,
                "fields": ["arabic_text^3", "arabic_normalized^2", "matn", "sanad"],
                "type": "best_fields",
                "analyzer": "arabic",
            }
        }
    ]

    if collection:
        must.append({"match": {"collection_name": collection}})

    body = {
        "query": {"bool": {"must": must}},
        "highlight": {
            "fields": {
                "arabic_text": {"fragment_size": 200, "number_of_fragments": 2},
                "matn": {"fragment_size": 200, "number_of_fragments": 1},
            }
        },
        "size": limit,
    }

    try:
        # The ES call is synchronous (it is not awaited), so run it in a
        # worker thread to keep the event loop free during the round-trip.
        response = await asyncio.to_thread(
            db.es.search, index=settings.es_index, body=body
        )
    except Exception as e:
        # ES index might not exist yet; keep the original error chained so
        # the root cause shows up in logs and tracebacks.
        raise HTTPException(status_code=503, detail=f"Elasticsearch error: {str(e)}") from e

    output = []
    for hit in response["hits"]["hits"]:
        src = hit["_source"]

        # Flatten the per-field highlight fragments into a single list.
        highlights = []
        if "highlight" in hit:
            for field_highlights in hit["highlight"].values():
                highlights.extend(field_highlights)

        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the document, else the ES document id.
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection_name", ""),
                hadith_number=src.get("hadith_number", 0),
                grade=src.get("grade"),
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            score=round(hit["_score"], 4),
            highlights=highlights,
        ))

    return output
|
|
|
|
|
|
@router.get("/combined", response_model=dict,
            summary="Combined search (semantic + full-text)",
            description="Runs both semantic and full-text search in parallel and returns merged results. "
                        "Best for the mobile app search bar — gives both meaning-based and keyword-based results. "
                        "Returns `{semantic: [...], fulltext: [...], query: '...'}`")
async def combined_search(
    q: str = Query(
        ..., min_length=2,
        description="Search query. Works with Arabic keywords or natural language in any language.",
        examples=["الصلاة في وقتها", "hadith about charity"],
    ),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=20, description="Results per search type (max 20)"),
):
    """
    Combined search — runs semantic and full-text search together via
    ``asyncio.gather`` and returns both result lists.

    This endpoint is best-effort: if one search fails, its list is returned
    empty and the failure is logged, so the other search's results are still
    delivered.

    Args:
        q: Query text (at least 2 characters).
        collection: Optional collection name passed to both searches.
        limit: Maximum results per search type (1-20).

    Returns:
        A dict with keys ``semantic``, ``fulltext`` (result lists, possibly
        empty) and ``query`` (the original query string).
    """
    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    # Both searches always run, whatever the language of the query.
    # return_exceptions=True keeps one failing backend from discarding the
    # other backend's results.
    semantic_results, fulltext_results = await asyncio.gather(
        semantic_search(q=q, collection=collection, limit=limit),
        fulltext_search(q=q, collection=collection, limit=limit),
        return_exceptions=True,
    )

    def _results_or_empty(label, outcome):
        # gather() can hand back any BaseException (e.g. CancelledError), not
        # only Exception subclasses; none of them may leak into the response.
        if isinstance(outcome, BaseException):
            logger.warning("Combined search: %s search failed: %r", label, outcome)
            return []
        return outcome

    return {
        "semantic": _results_or_empty("semantic", semantic_results),
        "fulltext": _results_or_empty("fulltext", fulltext_results),
        "query": q,
    }
|