197 lines
7.1 KiB
Python
197 lines
7.1 KiB
Python
"""
|
|
Search endpoints — semantic search (Qdrant + TEI) and full-text Arabic (Elasticsearch).
|
|
"""
|
|
from fastapi import APIRouter, Query, HTTPException
|
|
from typing import Optional
|
|
|
|
from app.services.database import db
|
|
from app.config import get_settings
|
|
from app.utils.arabic import normalize_query
|
|
from app.models.schemas import (
|
|
SemanticSearchResult, FullTextSearchResult, CombinedSearchResult,
|
|
HadithSummary,
|
|
)
|
|
|
|
# Router for the search endpoints in this module; every route registered on it
# is mounted under the "/search" prefix and tagged "Search".
router = APIRouter(prefix="/search", tags=["Search"])
|
|
|
|
|
|
async def _get_embedding(text: str) -> list[float]:
    """Get embedding vector from TEI (BGE-M3).

    Posts ``{"inputs": text}`` to ``<tei_url>/embed`` through the shared
    HTTP client and returns a single embedding.

    Args:
        text: The text to embed.

    Returns:
        The embedding. If the response is a list whose first element is
        itself a list (batch-shaped), that first element is returned;
        otherwise the top-level list is returned unchanged.

    Raises:
        HTTPException: 502 when TEI answers with a non-200 status, when the
            body is not valid JSON, or when the decoded payload is not a
            non-empty list.
    """
    settings = get_settings()
    response = await db.http_client.post(
        f"{settings.tei_url}/embed",
        json={"inputs": text},
    )
    if response.status_code != 200:
        raise HTTPException(status_code=502, detail=f"TEI embedding failed: {response.text}")

    try:
        embeddings = response.json()
    except ValueError as exc:
        # A 200 with an undecodable body is still an upstream failure; report
        # it as 502 like every other TEI problem instead of an opaque 500.
        raise HTTPException(status_code=502, detail="Unexpected TEI response format") from exc

    if isinstance(embeddings, list) and embeddings:
        first = embeddings[0]
        # Batch-shaped response ([[...]]) -> unwrap; flat list -> return as is.
        return first if isinstance(first, list) else embeddings
    raise HTTPException(status_code=502, detail="Unexpected TEI response format")
|
|
|
|
|
|
# ── Semantic search ─────────────────────────────────────────────────────────
|
|
|
|
@router.get("/semantic", response_model=list[SemanticSearchResult])
async def semantic_search(
    q: str = Query(..., min_length=2, description="Search query (any language — Arabic, English, etc.)"),
    collection: Optional[str] = Query(None, description="Filter by collection name"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Semantic search — find hadiths by meaning, not just keywords.
    Supports cross-language queries (English query → Arabic results).
    Uses BGE-M3 embeddings + Qdrant vector search.

    Args:
        q: Query text; it is embedded via ``_get_embedding`` before searching.
        collection: When given, only points whose ``collection`` payload
            field matches this value are searched.
        limit: Maximum number of hits requested from Qdrant.

    Returns:
        One ``SemanticSearchResult`` per hit, in the order Qdrant returned
        them. ``arabic_text`` is cut to its first 300 characters and the
        score is rounded to 4 decimals.

    Raises:
        HTTPException: 503 when Qdrant is reported unavailable; 502 is
            propagated from ``_get_embedding`` when embedding fails.
    """
    import asyncio  # function-scope: only needed to offload the blocking search call

    if not db.qdrant_available():
        raise HTTPException(status_code=503, detail="Qdrant unavailable")

    settings = get_settings()
    query_vector = await _get_embedding(q)

    query_filter = None
    if collection:
        from qdrant_client.models import Filter, FieldCondition, MatchValue

        query_filter = Filter(
            must=[FieldCondition(key="collection", match=MatchValue(value=collection))]
        )

    # db.qdrant.search is a synchronous call; running it directly would stall
    # the event loop for the whole round-trip, so hand it to a worker thread.
    # NOTE(review): assumes the client may be used from a worker thread — confirm.
    results = await asyncio.to_thread(
        db.qdrant.search,
        collection_name=settings.qdrant_collection,
        query_vector=query_vector,
        limit=limit,
        query_filter=query_filter,
        with_payload=True,
    )

    output = []
    for hit in results:
        payload = hit.payload or {}
        output.append(SemanticSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the payload; fall back to the point id.
                id=str(payload.get("id", hit.id)),
                collection=payload.get("collection"),
                hadith_number=payload.get("hadith_number"),
                grade=payload.get("grade"),
                arabic_text=(payload.get("arabic_text") or "")[:300],
            ),
            score=round(hit.score, 4),
            collection=payload.get("collection", ""),
        ))

    return output
|
|
|
|
|
|
# ── Full-text Arabic search ─────────────────────────────────────────────────
|
|
|
|
@router.get("/fulltext", response_model=list[FullTextSearchResult])
async def fulltext_search(
    q: str = Query(..., min_length=2, description="Arabic text search query"),
    collection: Optional[str] = Query(None, description="Filter by collection"),
    limit: int = Query(10, ge=1, le=50),
):
    """
    Full-text Arabic search using Elasticsearch.

    Runs a ``multi_match`` (``best_fields``) query over ``arabic_text``
    (boosted x3), ``english_text`` and ``urdu_text`` with the ``arabic``
    analyzer, and requests up to three 200-character highlight fragments
    from ``arabic_text``.

    Args:
        q: Query text.
        collection: When given, an extra ``match`` clause on ``collection``
            is added to the ``must`` list.
        limit: Maximum number of hits requested (``size``).

    Returns:
        One ``FullTextSearchResult`` per hit, in Elasticsearch order.
        ``arabic_text`` is cut to its first 300 characters and the score is
        rounded to 4 decimals.

    Raises:
        HTTPException: 503 when Elasticsearch is reported unavailable.
    """
    import asyncio  # function-scope: only needed to offload the blocking search call

    if not db.es_available():
        raise HTTPException(status_code=503, detail="Elasticsearch unavailable")

    settings = get_settings()

    must = [{"multi_match": {
        "query": q,
        "fields": ["arabic_text^3", "english_text", "urdu_text"],
        "type": "best_fields",
        "analyzer": "arabic",
    }}]

    if collection:
        must.append({"match": {"collection": collection}})

    body = {
        "query": {"bool": {"must": must}},
        "highlight": {
            "fields": {"arabic_text": {"fragment_size": 200, "number_of_fragments": 3}},
        },
        "size": limit,
    }

    # db.es.search is a synchronous call; run it in a worker thread so the
    # event loop keeps serving other requests while Elasticsearch responds.
    # NOTE(review): assumes the client may be used from a worker thread — confirm.
    resp = await asyncio.to_thread(db.es.search, index=settings.es_index, body=body)
    hits = resp.get("hits", {}).get("hits", [])

    output = []
    for hit in hits:
        src = hit["_source"]
        # Hits without a highlight section simply get an empty list.
        highlights = hit.get("highlight", {}).get("arabic_text", [])
        output.append(FullTextSearchResult(
            hadith=HadithSummary(
                # Prefer the id stored in the document; fall back to the ES _id.
                id=str(src.get("id", hit["_id"])),
                collection=src.get("collection"),
                hadith_number=src.get("hadith_number"),
                grade=src.get("grade"),
                arabic_text=(src.get("arabic_text") or "")[:300],
            ),
            score=round(hit["_score"], 4),
            highlights=highlights,
        ))

    return output
|
|
|
|
|
|
# ── Combined search (semantic + fulltext) ───────────────────────────────────
|
|
|
|
@router.get("/combined", response_model=list[CombinedSearchResult])
async def combined_search(
    q: str = Query(..., min_length=2, description="Search query"),
    collection: Optional[str] = Query(None),
    limit: int = Query(10, ge=1, le=50),
    semantic_weight: float = Query(0.6, ge=0, le=1, description="Weight for semantic score (0-1)"),
):
    """Combined semantic + full-text search. Results merged and ranked by weighted score.

    Each available backend is queried through ``semantic_search`` /
    ``fulltext_search`` with the same ``q``, ``collection`` and ``limit``.
    Semantic scores are weighted by ``semantic_weight``; full-text scores are
    divided by 20.0, capped at 1.0, and weighted by ``1 - semantic_weight``.
    A hadith found by both backends gets the sum of both contributions and
    ``source="both"``.

    The endpoint is best-effort: a backend that is unavailable or raises is
    skipped (the failure is logged), so the result may come from one backend
    only, or be empty.

    Args:
        q: Query text.
        collection: Optional collection filter forwarded to both backends.
        limit: Per-backend hit limit and maximum size of the merged result.
        semantic_weight: Weight of the semantic score in the combined score.

    Returns:
        Up to ``limit`` ``CombinedSearchResult`` items sorted by
        ``combined_score`` in descending order.
    """
    import logging  # function-scope: keeps this block self-contained

    logger = logging.getLogger(__name__)

    # NOTE(review): 20.0 is the divisor used to squash raw Elasticsearch scores
    # into [0, 1]; its origin is not documented here — confirm it suits the index.
    fulltext_score_scale = 20.0

    results_map: dict[str, CombinedSearchResult] = {}

    # Semantic
    if db.qdrant_available():
        try:
            sem_results = await semantic_search(q=q, collection=collection, limit=limit)
            for sr in sem_results:
                hid = sr.hadith.id
                results_map[hid] = CombinedSearchResult(
                    hadith=sr.hadith,
                    semantic_score=sr.score,
                    combined_score=sr.score * semantic_weight,
                    source="semantic",
                )
        except Exception:
            # Best-effort: keep serving full-text results, but leave a trace
            # instead of hiding the failure.
            logger.warning("Semantic leg of combined search failed", exc_info=True)

    # Full-text
    if db.es_available():
        try:
            ft_results = await fulltext_search(q=q, collection=collection, limit=limit)
            ft_weight = 1.0 - semantic_weight
            for fr in ft_results:
                hid = fr.hadith.id
                norm_score = min(fr.score / fulltext_score_scale, 1.0)
                if hid in results_map:
                    # Already found semantically: add the full-text contribution.
                    existing = results_map[hid]
                    existing.fulltext_score = norm_score
                    existing.combined_score += norm_score * ft_weight
                    existing.source = "both"
                else:
                    results_map[hid] = CombinedSearchResult(
                        hadith=fr.hadith,
                        fulltext_score=norm_score,
                        combined_score=norm_score * ft_weight,
                        source="fulltext",
                    )
        except Exception:
            # Best-effort: keep whatever the semantic leg produced.
            logger.warning("Full-text leg of combined search failed", exc_info=True)

    results = sorted(results_map.values(), key=lambda x: x.combined_score, reverse=True)
    return results[:limit]
|