update project.
update README.md: add a full recap that can be used as context for a prompt in Claude

parent bc2e9656a1
commit 2bbd4b571b
@@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.12 (hadith-ingestion)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.12 (hadith-ingestion)" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (hadith-ingestion)" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/hadith-ingestion.iml" filepath="$PROJECT_DIR$/.idea/hadith-ingestion.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>
File diff suppressed because it is too large
@@ -0,0 +1,275 @@
# 🚀 HadithAPI.com Deployment - Quick Start

## What You Got

Three comprehensive guides:
1. **PHASE_2_IMPLEMENTATION_GUIDE.md** - Original guide with PostgreSQL schema
2. **HADITHAPI_INTEGRATION_GUIDE.md** - Complete HadithAPI.com implementation
3. **This summary** - Quick deployment steps

## 📦 Complete Package Structure

The HadithAPI guide includes everything you need:

### Production-Ready Code
✅ **hadithapi_client.py** - Full API client with pagination and rate limiting
✅ **main_hadithapi.py** - Complete ingestion service
✅ **settings.py** - Configuration with your API key
✅ **Dockerfile** - Container image
✅ **Argo Workflows** - Kubernetes automation
✅ **Test scripts** - Validation and troubleshooting

### Key Features
- ✅ Automatic pagination handling
- ✅ Rate limiting (30 req/min)
- ✅ Error handling and retries
- ✅ Progress tracking
- ✅ Structured logging
- ✅ Multi-language support (Arabic, English, Urdu)
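For orientation, here is a minimal sketch of how pagination and rate limiting typically fit together in a client like `hadithapi_client.py`. The endpoint path, parameter names, and response shape below are assumptions for illustration only; the actual client is in HADITHAPI_INTEGRATION_GUIDE.md Section 2.1.

```python
import time
import requests

API_BASE = "https://hadithapi.com/api"   # assumed base URL
RATE_LIMIT_PER_MIN = 30                  # matches the conservative default above

def fetch_all_hadiths(book_slug: str, api_key: str):
    """Yield hadiths page by page, sleeping so we stay under the rate limit."""
    min_interval = 60.0 / RATE_LIMIT_PER_MIN
    page = 1
    while True:
        start = time.monotonic()
        resp = requests.get(
            f"{API_BASE}/hadiths",  # hypothetical endpoint
            params={"book": book_slug, "page": page, "apiKey": api_key},
            timeout=30,
        )
        resp.raise_for_status()
        items = resp.json().get("hadiths", {}).get("data", [])  # assumed response shape
        if not items:
            break
        yield from items
        page += 1
        # Sleep off the remainder of this request's rate-limit window
        elapsed = time.monotonic() - start
        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)
```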
## 🎯 5-Minute Quick Start

### 1. Database Setup (2 min)
```bash
# Use schema from PHASE_2_IMPLEMENTATION_GUIDE.md Section 1
kubectl -n db exec -it postgres-0 -- psql -U app -d gitea

# Copy all SQL from Section 1.2 through 1.6
# This creates hadith_db with complete schema
```

### 2. Create Project Structure (1 min)
```bash
mkdir -p hadith-ingestion/{config,src/{api_clients,processors,database,utils},argo/workflows}
cd hadith-ingestion/

# Copy code from HADITHAPI_INTEGRATION_GUIDE.md:
# - Section 2.1 → src/api_clients/hadithapi_client.py
# - Section 4.1 → src/main_hadithapi.py
# - Section 5.1 → config/settings.py
# - Section 6.1 → Dockerfile
# - Section 6.4 → argo/workflows/ingest-hadithapi.yaml

# Also copy from PHASE_2_IMPLEMENTATION_GUIDE.md:
# - Section 3.4 → src/api_clients/base_client.py
# - Section 3.6 → src/processors/text_cleaner.py
# - Section 3.7 → src/database/repository.py
```

### 3. Build & Deploy (2 min)
```bash
# Build image
docker build -t hadith-ingestion:latest .

# Create secrets
kubectl -n argo create secret generic hadith-db-secret \
  --from-literal=password='YOUR_PASSWORD'

kubectl -n argo create secret generic hadithapi-secret \
  --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK'

# Test with 10 hadiths
argo submit -n argo argo/workflows/ingest-hadithapi.yaml \
  --parameter book-slug=sahih-bukhari \
  --parameter limit=10 \
  --watch
```

## 📊 Expected Results

### Available Collections

| Book | Hadiths | Time |
|------|---------|------|
| Sahih Bukhari | ~7,500 | 2-3h |
| Sahih Muslim | ~7,000 | 2-3h |
| Sunan Abu Dawood | ~5,000 | 1-2h |
| Jami` at-Tirmidhi | ~4,000 | 1-2h |
| Sunan an-Nasa'i | ~5,700 | 2h |
| Sunan Ibn Majah | ~4,300 | 1-2h |
| **TOTAL** | **~33,500** | **10-15h** |

## 🔧 Key Differences from Sunnah.com

| Feature | HadithAPI.com | Sunnah.com |
|---------|---------------|------------|
| **API Key** | ✅ Public (provided) | ❌ Requires PR |
| **Rate Limit** | Unknown (using 30/min) | 100/min |
| **Coverage** | 6 major books | 10+ books |
| **Languages** | Arabic, English, Urdu | Arabic, English |
| **Cost** | ✅ Free | Free |
| **Stability** | Good | Excellent |
## 📝 Complete File Checklist

Create these files from the guides:

```
hadith-ingestion/
├── Dockerfile                      ✓ Section 6.1
├── requirements.txt                ✓ Phase 2 Section 3.2
├── .env                            ✓ Section 5.2
├── build-hadithapi-ingestion.sh    ✓ Section 6.2
├── create-secrets.sh               ✓ Section 6.3
├── test-hadithapi-local.sh         ✓ Section 7.1
├── test-hadithapi-k8s.sh           ✓ Section 7.2
├── run-full-ingestion.sh           ✓ Section 7.3
├── config/
│   ├── __init__.py                 (empty file)
│   └── settings.py                 ✓ Section 5.1
├── src/
│   ├── __init__.py                 (empty file)
│   ├── main_hadithapi.py           ✓ Section 4.1
│   ├── api_clients/
│   │   ├── __init__.py             (empty file)
│   │   ├── base_client.py          ✓ Phase 2 Sec 3.4
│   │   └── hadithapi_client.py     ✓ Section 2.1
│   ├── processors/
│   │   ├── __init__.py             (empty file)
│   │   └── text_cleaner.py         ✓ Phase 2 Sec 3.6
│   ├── database/
│   │   ├── __init__.py             (empty file)
│   │   ├── connection.py           (optional)
│   │   └── repository.py           ✓ Phase 2 Sec 3.7
│   └── utils/
│       ├── __init__.py             (empty file)
│       └── logger.py               (optional)
└── argo/
    └── workflows/
        └── ingest-hadithapi.yaml   ✓ Section 6.4
```

## 🎬 Step-by-Step Execution

### Day 1: Setup & Test (2-3 hours)
```bash
# 1. Create database schema
# 2. Set up project structure
# 3. Build Docker image
# 4. Create secrets
# 5. Run test with 10 hadiths
# 6. Verify data
```

### Day 2: Ingest Major Collections (10-15 hours)
```bash
# Ingest all 6 major collections sequentially
./run-full-ingestion.sh

# Or manually one by one:
argo submit ... --parameter book-slug=sahih-bukhari
argo submit ... --parameter book-slug=sahih-muslim
# etc...
```

### Day 3: Validation & Next Steps
```bash
# 1. Verify data quality
# 2. Check statistics
# 3. Proceed to Phase 3 (ML model development)
```

## ✅ Verification Checklist

After ingestion completes:

```bash
# 1. Check total hadiths
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT COUNT(*) FROM hadiths;
"
# Expected: ~33,500

# 2. Check per collection
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT
    c.name_english,
    COUNT(h.id) as count
FROM collections c
LEFT JOIN hadiths h ON c.id = h.collection_id
WHERE c.abbreviation IN ('bukhari', 'muslim', 'abudawud', 'tirmidhi', 'nasai', 'ibnmajah')
GROUP BY c.name_english;
"

# 3. Check for errors
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT * FROM ingestion_jobs
WHERE status = 'failed'
ORDER BY created_at DESC;
"
```
## 🐛 Common Issues & Solutions

### Issue: Rate Limiting
```
Error: 429 Too Many Requests
Solution: The rate limit is already set to a conservative 30/min.
If you still hit limits, edit settings.py:
API_RATE_LIMIT = 20
```

### Issue: Connection Timeout
```
Error: Connection timeout to database
Solution:
1. Check PostgreSQL is running
2. Verify credentials in secrets
3. Test connection manually
```

### Issue: Missing Chapters
```
Warning: chapters_fetch_failed
Solution: The script automatically falls back to fetching all hadiths.
This is expected and not critical.
```

## 📚 Documentation References

All details are in the comprehensive guides:

1. **PHASE_2_IMPLEMENTATION_GUIDE.md**
   - PostgreSQL schema (Section 1)
   - Base utilities (Section 3)
   - Database repository (Section 3.7)

2. **HADITHAPI_INTEGRATION_GUIDE.md**
   - API client (Section 2)
   - Main ingestion service (Section 4)
   - Deployment (Section 6)
   - Testing (Section 7)

## 🎯 Next Phase

After Phase 2 completion:
→ **Phase 3: ML Model Development**
- Annotate sample hadiths (Label Studio)
- Train NER model
- Train relation extraction model
- Fine-tune LLM with LoRA

## 💡 Pro Tips

1. **Start Small**: Test with `--limit 10` first
2. **Monitor Progress**: Use `argo logs -n argo <workflow> -f`
3. **Check Logs**: Structured JSON logs make debugging easy
4. **Backup Data**: Back up before major operations
5. **Rate Limiting**: Be conservative to avoid blocks

## 🎉 Success Criteria

Phase 2 is complete when:
- ✅ Database schema created
- ✅ 33,500+ hadiths ingested
- ✅ All 6 collections present
- ✅ No critical errors
- ✅ Data validated
- ✅ Ready for embedding generation

---

**Estimated Total Time: 1-2 days**
**Difficulty: Intermediate**
**Prerequisites: Phase 1 completed (all core services running)**

Ready to start? Begin with Section 1 of PHASE_2_IMPLEMENTATION_GUIDE.md!
@@ -1,4 +1,4 @@
-find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do
+find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" | while read file; do
 echo "=== $file ===" >> combined.txt
 cat "$file" >> combined.txt
 echo "" >> combined.txt
File diff suppressed because it is too large
@@ -0,0 +1,33 @@
# ============================================================================
# Step 7: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================

# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=hadith_ingest

# Label Studio Configuration
export LABEL_STUDIO_URL=https://label.betelgeusebytes.io
export LABEL_STUDIO_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk

# To get the API key:
# 1. Log in to Label Studio
# 2. Go to Account & Settings (user icon, top right)
# 3. Open the Access Token section
# 4. Copy the token

# Qdrant Configuration (for active learning)
export QDRANT_HOST=https://vector.betelgeusebytes.io
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings

# For external access:
# export QDRANT_HOST=qdrant.betelgeusebytes.io
@@ -0,0 +1,401 @@
# Hadith Named Entity Recognition (NER) Annotation Guidelines

## Table of Contents
1. [Introduction](#introduction)
2. [Entity Types](#entity-types)
3. [Annotation Rules](#annotation-rules)
4. [Arabic-Specific Guidelines](#arabic-specific-guidelines)
5. [Relation Types](#relation-types)
6. [Examples](#examples)
7. [Edge Cases](#edge-cases)
8. [Quality Standards](#quality-standards)

---

## Introduction

This document provides guidelines for annotating Islamic hadith texts with named entities and relations. The goal is to create high-quality training data for machine learning models that will automatically extract information from hadith literature.

### Purpose
- Extract narrator names from hadith chains (isnad)
- Identify places, dates, and other important entities
- Map relationships between narrators for knowledge graph construction

### Annotator Requirements
- Basic understanding of Arabic (for Arabic text annotation)
- Familiarity with Islamic terminology
- Understanding of hadith structure (isnad + matn)

---

## Entity Types

### 1. PERSON (شخص)
**Definition:** Full names of individuals, including prophets, companions, scholars, and narrators.

**Include:**
- Full names: محمد بن عبد الله (Muhammad ibn Abdullah)
- Prophet references: النبي صلى الله عليه وسلم (when used as a name reference)
- Companion names: عمر بن الخطاب (Umar ibn al-Khattab)

**Exclude:**
- Generic references like "a man" (رجل) or "someone" (أحد)
- Pronouns

**Color:** 🔴 Red (#FF6B6B)
**Hotkey:** P

### 2. KUNYA (كنية)
**Definition:** Honorific names starting with "Abu" (أبو - father of) or "Umm" (أم - mother of).

**Examples:**
- أبو هريرة (Abu Hurairah)
- أبو بكر (Abu Bakr)
- أم سلمة (Umm Salamah)
- أبو عبد الله (Abu Abdullah)

**Note:** A kunya may appear alone or with a full name. Tag only the kunya portion.

**Color:** 🔵 Teal (#4ECDC4)
**Hotkey:** K

### 3. NISBA (نسبة)
**Definition:** Attributive names indicating tribe, place of origin, profession, or lineage.

**Examples:**
- البخاري (al-Bukhari - from Bukhara)
- القرشي (al-Qurashi - from the Quraysh tribe)
- المدني (al-Madani - from Medina)
- الأنصاري (al-Ansari - of the Ansar)
- الحنبلي (al-Hanbali - Hanbali school)

**Note:** Often appears at the end of a name. Tag the nisba separately from PERSON.

**Color:** 🔷 Blue (#45B7D1)
**Hotkey:** N

### 4. PLACE (مكان)
**Definition:** Geographic locations including cities, regions, mosques, and landmarks.

**Examples:**
- مكة (Mecca)
- المدينة (Medina)
- المسجد الحرام (The Sacred Mosque)
- بيت المقدس (Jerusalem)
- الشام (Syria/Levant)
- خيبر (Khaybar)

**Include:**
- Cities, towns, villages
- Mosques and religious sites
- Regions and countries
- Mountains, valleys, wells

**Color:** 🟢 Green (#96CEB4)
**Hotkey:** L

### 5. DATE (تاريخ)
**Definition:** Temporal references including years, months, days, and events used as time markers.

**Examples:**
- سنة مئتين (year 200 AH)
- في رمضان (in Ramadan)
- يوم الجمعة (on Friday)
- بعد الهجرة (after the Hijra)
- غزوة بدر (Battle of Badr - as a time reference)
- يوم عرفة (Day of Arafah)

**Color:** 🟡 Yellow (#FFEAA7)
**Hotkey:** D

### 6. TRIBE (قبيلة)
**Definition:** Names of Arab tribes and clans.

**Examples:**
- قريش (Quraysh)
- بني هاشم (Banu Hashim)
- الأنصار (the Ansar)
- المهاجرين (the Muhajirun)
- خزاعة (Khuza'a)

**Note:** When part of a nisba (القرشي), tag as NISBA. When standalone, tag as TRIBE.

**Color:** 🟣 Purple (#DDA0DD)
**Hotkey:** T

### 7. TITLE (لقب)
**Definition:** Titles and honorifics that aren't kunyas.

**Examples:**
- أمير المؤمنين (Commander of the Faithful)
- رسول الله (Messenger of Allah)
- خليفة (Caliph)
- إمام (Imam)
- شيخ (Sheikh)

**Color:** 🟩 Mint (#98D8C8)
**Hotkey:** I

---

## Annotation Rules

### General Rules

1. **Tag the longest valid span**
   - ✅ `[محمد بن عبد الله بن عبد المطلب]PERSON`
   - ❌ `[محمد]PERSON [بن عبد الله]PERSON`

2. **Don't overlap tags**
   - If a name contains a nisba, tag separately:
   - `[محمد بن إسماعيل]PERSON [البخاري]NISBA`

3. **Include particles when part of the name**
   - Include "ال" (al-) when part of a name: `[البخاري]NISBA`
   - Include "بن/ابن" (ibn/bin) within PERSON tags

4. **Exclude common words**
   - Don't tag: عن (from), أن (that), قال (said), حدثنا (narrated)

5. **Be consistent**
   - Same entity = same tag type throughout the document

### Boundary Rules

1. **Start boundary:** First character of the entity
2. **End boundary:** Last character of the entity (including diacritics)
3. **Don't include:**
   - Leading/trailing spaces
   - Punctuation marks
   - Pronouns or suffixes
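The rules above translate naturally into automated checks. Below is a minimal, hypothetical Python sketch of a validator enforcing the non-overlap and boundary rules on character-offset spans; the `(start, end, label)` tuple format is an assumption for illustration, not Label Studio's exact export schema.

```python
from typing import List, Tuple

Span = Tuple[int, int, str]  # (start, end, label), end exclusive

def validate_spans(text: str, spans: List[Span]) -> List[str]:
    """Return a list of rule violations for a set of entity spans."""
    problems = []
    ordered = sorted(spans)
    for i, (start, end, label) in enumerate(ordered):
        surface = text[start:end]
        # Boundary rule: no leading/trailing spaces inside a span
        if surface != surface.strip():
            problems.append(f"{label} span {surface!r} has leading/trailing whitespace")
        # General rule 2: spans must not overlap
        if i > 0 and start < ordered[i - 1][1]:
            problems.append(f"{label} span at offset {start} overlaps the previous span")
    return problems

# Example: deliberately overlapping spans are flagged (offsets are illustrative)
print(validate_spans("حدثنا محمد بن إسماعيل البخاري",
                     [(6, 22, "PERSON"), (20, 29, "NISBA")]))
```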
---

## Arabic-Specific Guidelines

### Handling Diacritics (تشكيل)
- Include diacritics within the tag span
- Don't let diacritics affect boundary decisions

### Common Patterns

| Pattern | Example | Tags |
|---------|---------|------|
| Name + bin/ibn + Name | محمد بن إسماعيل | [محمد بن إسماعيل]PERSON |
| Name + Kunya | أبو هريرة عبد الرحمن | [أبو هريرة]KUNYA [عبد الرحمن]PERSON |
| Name + Nisba | البخاري محمد | [البخاري]NISBA [محمد]PERSON |
| Full chain | حدثنا محمد بن إسماعيل البخاري | حدثنا [محمد بن إسماعيل]PERSON [البخاري]NISBA |

### Prophet References
- "النبي صلى الله عليه وسلم" → Tag "النبي" as TITLE
- "رسول الله صلى الله عليه وسلم" → Tag "رسول الله" as TITLE
- "محمد صلى الله عليه وسلم" → Tag "محمد" as PERSON

---

## Relation Types

### 1. NARRATED_FROM (روى عن)
**Definition:** A narrator received/heard the hadith from another narrator.

**Direction:** Source → Target (A NARRATED_FROM B = A heard from B)

**Indicators:**
- عن (from)
- حدثنا (narrated to us)
- أخبرنا (informed us)
- سمعت (I heard)

### 2. TEACHER_OF (أستاذ)
**Definition:** A scholar taught or trained another scholar.

**Direction:** Teacher → Student

### 3. STUDENT_OF (تلميذ)
**Definition:** Inverse of TEACHER_OF.

**Direction:** Student → Teacher

### 4. CONTEMPORARY_OF (معاصر)
**Definition:** Two people lived in the same era and potentially knew each other.

**Direction:** Bidirectional

### 5. RELATED_TO (قريب)
**Definition:** Family relationship (father, son, brother, etc.)

**Direction:** Bidirectional

**Indicators:**
- بن/ابن (son of)
- أخ/أخو (brother of)
- زوج/زوجة (spouse of)

### 6. LIVED_IN (سكن)
**Definition:** Person resided in or was associated with a place.

**Direction:** Person → Place

### 7. BORN_IN (ولد في)
**Definition:** Person's birthplace.

**Direction:** Person → Place

### 8. DIED_IN (توفي في)
**Definition:** Place of death.

**Direction:** Person → Place
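To make the direction conventions concrete, here is a small illustrative sketch; the `(subject, relation, object)` tuple representation is an assumption for this example, not the project's storage format. It shows how directed relations can be canonicalized so that TEACHER_OF and STUDENT_OF do not produce duplicate inverse facts.

```python
# Hypothetical (subject, relation, object) triples following the directions above
INVERSE = {"STUDENT_OF": "TEACHER_OF"}
SYMMETRIC = {"CONTEMPORARY_OF", "RELATED_TO"}

def normalize(subject: str, relation: str, obj: str):
    """Canonicalize a relation triple so equivalent facts compare equal."""
    if relation in SYMMETRIC:
        # Bidirectional relations: order endpoints deterministically
        subject, obj = sorted([subject, obj])
    elif relation in INVERSE:
        # Store STUDENT_OF as TEACHER_OF to avoid duplicate inverse facts
        subject, relation, obj = obj, INVERSE[relation], subject
    return (subject, relation, obj)

# "Malik NARRATED_FROM Nafi" stays as-is; STUDENT_OF flips to TEACHER_OF
print(normalize("مالك", "NARRATED_FROM", "نافع"))
print(normalize("مالك", "STUDENT_OF", "نافع"))  # -> ("نافع", "TEACHER_OF", "مالك")
```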
---

## Examples

### Example 1: Simple Narrator Chain

**Arabic Text:**
```
حدثنا محمد بن إسماعيل البخاري عن أبي هريرة رضي الله عنه قال: قال رسول الله صلى الله عليه وسلم...
```

**Annotations:**
- `[محمد بن إسماعيل]PERSON`
- `[البخاري]NISBA`
- `[أبي هريرة]KUNYA`
- `[رسول الله]TITLE`

**Relations:**
- محمد بن إسماعيل NARRATED_FROM أبي هريرة
- أبي هريرة NARRATED_FROM رسول الله

### Example 2: Place Reference

**Arabic Text:**
```
كان النبي صلى الله عليه وسلم في المسجد الحرام بمكة
```

**Annotations:**
- `[النبي]TITLE`
- `[المسجد الحرام]PLACE`
- `[مكة]PLACE`

### Example 3: Complex Chain

**Arabic Text:**
```
حدثنا عبد الله بن يوسف التنيسي أخبرنا مالك عن نافع عن عبد الله بن عمر رضي الله عنهما
```

**Annotations:**
- `[عبد الله بن يوسف]PERSON`
- `[التنيسي]NISBA`
- `[مالك]PERSON`
- `[نافع]PERSON`
- `[عبد الله بن عمر]PERSON`

**Relations:**
- عبد الله بن يوسف NARRATED_FROM مالك
- مالك NARRATED_FROM نافع
- نافع NARRATED_FROM عبد الله بن عمر

### Example 4: English Text

**English Text:**
```
Narrated Abu Hurairah: The Prophet (ﷺ) said, "Whoever believes in Allah and the Last Day..."
```

**Annotations:**
- `[Abu Hurairah]KUNYA`
- `[The Prophet]TITLE`

---

## Edge Cases

### 1. Ambiguous References
- "رجل من الأنصار" (a man from the Ansar) → Tag only `[الأنصار]TRIBE`
- "بعض أصحاب النبي" (some companions) → Don't tag

### 2. Partial Names
- If only a first name is given and identity is clear → Tag as PERSON
- If unclear → Don't tag

### 3. Titles vs Names
- "الإمام البخاري" → `[الإمام]TITLE` `[البخاري]NISBA`
- If Imam is part of the known name → Consider context

### 4. Multiple Nisbas
```
محمد بن إسماعيل البخاري الجعفي
```
→ `[محمد بن إسماعيل]PERSON` `[البخاري]NISBA` `[الجعفي]NISBA`

### 5. Kunya Used as Name
- Some people are known primarily by kunya
- Abu Bakr → `[Abu Bakr]KUNYA` (not PERSON, as it's a kunya form)

---

## Quality Standards

### Accuracy Requirements
- **Entity Detection:** >95% of entities should be identified
- **Entity Classification:** >90% should have the correct type
- **Boundary Precision:** >95% should have exact boundaries

### Consistency Checks
- Same entity tagged consistently across the document
- Related entities (person + nisba) both tagged
- No orphaned relations (both endpoints must be entities)

### Review Process
1. **Self-review:** Annotator reviews own work
2. **Peer review:** Second annotator reviews 20% of tasks
3. **Expert review:** Arabic/Islamic scholar reviews edge cases

### Inter-Annotator Agreement
- Target Cohen's Kappa > 0.8 for entity types
- Target > 0.75 for relations
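Cohen's kappa can be checked directly from paired annotator labels. A minimal sketch using scikit-learn, assuming the two annotators' token-level entity labels have already been aligned to the same tokens (the label arrays below are toy data):

```python
from sklearn.metrics import cohen_kappa_score

# Token-level entity labels from two annotators on the same tokens (toy data)
annotator_a = ["PERSON", "PERSON", "O", "NISBA", "O", "KUNYA"]
annotator_b = ["PERSON", "PERSON", "O", "NISBA", "O", "PERSON"]

kappa = cohen_kappa_score(annotator_a, annotator_b)
print(f"Cohen's kappa: {kappa:.2f}")  # flag for review if below the 0.8 target
```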
---

## Keyboard Shortcuts

| Action | Key |
|--------|-----|
| PERSON | P |
| KUNYA | K |
| NISBA | N |
| PLACE | L |
| DATE | D |
| TRIBE | T |
| TITLE | I |
| Submit | Ctrl+Enter |
| Skip | Ctrl+→ |
| Undo | Ctrl+Z |

---

## Common Mistakes to Avoid

1. ❌ Tagging common words (عن، أن، قال)
2. ❌ Missing nisbas at the end of names
3. ❌ Overlapping entity spans
4. ❌ Inconsistent tagging of the same entity
5. ❌ Tagging pronouns
6. ❌ Missing entities in long chains
7. ❌ Incorrect relation directions

---

## Contact & Support

For questions or edge cases:
- Create a discussion in Label Studio
- Tag the project lead for complex cases
- Document unusual cases for guideline updates

---

*Version 1.0 - Last Updated: 2025*
@@ -0,0 +1,240 @@
# Islamic Hadith Scholar AI - Phase 3 Implementation Summary

## Project Context Prompt

Use this prompt to continue the project or onboard a new AI assistant:

---

## 🎯 PROJECT OVERVIEW

I'm building an **Islamic Hadith Scholar AI System** - a production-grade AI platform for analyzing Islamic hadith literature. The system processes approximately **40,000 hadiths** from 8 major collections (Sahih Bukhari, Sahih Muslim, Abu Dawood, Tirmidhi, Ibn Majah, Nasa'i, Ahmad, Silsila Sahiha) in Arabic, English, and Urdu.

### Infrastructure
- **Kubernetes cluster** on Hetzner Cloud (2-node, 32 cores, 128GB RAM)
- **Domain:** betelgeusebytes.io
- **18+ deployed services** across 8 namespaces

---

## 🏗️ ARCHITECTURE & SERVICES

### Data Layer
| Service | Endpoint | Purpose |
|---------|----------|---------|
| PostgreSQL 18 | pg.betelgeusebytes.io:5432 | Main database (hadith_db) |
| Neo4j 5.20 | neo4j.betelgeusebytes.io | Graph database (Phase 4) |
| Redis 7 | redis.db.svc.cluster.local | Caching |
| Elasticsearch 8.14 | elasticsearch.elastic.svc.cluster.local | Search & logs |

### ML & Vector Services
| Service | Endpoint | Purpose |
|---------|----------|---------|
| TEI | tei.ml.svc.cluster.local:80 | BGE-M3 embeddings (1024-dim) |
| vLLM | vllm.ml.svc.cluster.local:8000 | Qwen2.5-7B inference |
| Qdrant | qdrant.vector.svc.cluster.local:6333 | Vector search |
| MLflow | mlflow.betelgeusebytes.io | Experiment tracking |
| Label Studio | label.betelgeusebytes.io | Annotation |
| JupyterLab | notebook.betelgeusebytes.io | Experimentation |
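As a quick sanity check against the TEI service: text-embeddings-inference exposes an `/embed` endpoint that takes `{"inputs": ...}` and returns one vector per input, which for BGE-M3 should be 1024-dimensional. A minimal sketch using the in-cluster endpoint from the table above:

```python
import httpx

TEI_URL = "http://tei.ml.svc.cluster.local:80"  # in-cluster endpoint from the table

resp = httpx.post(f"{TEI_URL}/embed",
                  json={"inputs": ["حدثنا محمد بن إسماعيل"]},
                  timeout=30)
resp.raise_for_status()
vectors = resp.json()           # list of embeddings, one per input
assert len(vectors[0]) == 1024  # BGE-M3 embedding dimension
```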
### Orchestration & Monitoring
- **Argo Workflows** (namespace: ml)
- **Prometheus + Grafana** for monitoring
- **Fluent Bit + OpenTelemetry** for logging

---

## 💾 DATABASE SCHEMA

### Main Tables
```sql
-- hadiths (~40k rows)
id, collection_id, book_id, hadith_number
arabic_text, arabic_normalized (auto-generated)
english_text, urdu_text, grade
embedding_generated = TRUE (all done)
entities_extracted = FALSE (pending NER)
relations_extracted = FALSE (pending RE)
source_metadata (JSONB)
created_at, updated_at

-- collections (8 rows)
id, name_english, name_arabic, total_hadiths

-- books
id, collection_id, name_english, name_arabic, book_number

-- narrators_metadata
id, name_arabic, name_english, kunya, nisba, birth_year, death_year

-- annotations
id, hadith_id, annotation_type, annotation_data (JSONB)
```

---

## ✅ COMPLETED PHASES

### Phase 1-2: Infrastructure & Data Ingestion (100%)
- ✅ All 18 services deployed and operational
- ✅ ~40,000 hadiths ingested from hadithapi.com
- ✅ Multi-language support (Arabic, English, Urdu)
- ✅ PostgreSQL schema with proper indices

### Phase 3: ML Pipeline

#### Steps 1-5: Embeddings (100%)
- ✅ TEI deployed with the BGE-M3 model
- ✅ vLLM deployed with Qwen2.5-7B-Instruct
- ✅ Embedding generator script created
- ✅ Argo workflow for batch processing
- ✅ **All ~40k hadiths embedded in Qdrant**

#### Step 6: Verify Embeddings & Semantic Search (100%)
**Delivered Files:**
- `verify_embeddings.py` - Validates all hadiths have embeddings
- `semantic_search.py` - Benchmark suite (target: <500ms)
- `search_api.py` - FastAPI service with endpoints
- `verification_queries.sql` - SQL verification queries
- `k8s-search-api.yaml` - Kubernetes deployment
- `step6_verification.ipynb` - Interactive Jupyter notebook

**API Endpoints:**
- `POST /search` - Semantic search
- `GET /search?q=` - Simple search
- `GET /hadith/{id}` - Get by ID
- `GET /similar/{id}` - Find similar
- `GET /health`, `GET /stats`

**Performance Target:** <500ms per query (achieved)
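As an illustration of the endpoints above, a search request might look like the sketch below. The service URL and the request body fields (`query`, `limit`) are assumptions for this example; the actual schema is defined in `search_api.py`.

```python
import httpx

# Hypothetical service URL and payload; see search_api.py for the real schema
resp = httpx.post(
    "http://search-api.ml.svc.cluster.local/search",
    json={"query": "fasting in Ramadan", "limit": 5},
    timeout=10,
)
resp.raise_for_status()
for hit in resp.json().get("results", []):  # assumed response field
    print(hit)
```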
#### Step 7: Annotation Setup (100%)
**Delivered Files:**
- `annotation_setup.py` - Main setup script
- `label_studio_client.py` - API client
- `active_learning.py` - Smart sampling strategies
- `export_queries.sql` - SQL export queries
- `ANNOTATION_GUIDELINES.md` - Comprehensive guidelines

**Entity Types for NER:**
- PERSON, KUNYA, NISBA, PLACE, DATE, TRIBE, TITLE

**Relation Types:**
- NARRATED_FROM, TEACHER_OF, STUDENT_OF, CONTEMPORARY_OF
- RELATED_TO, LIVED_IN, BORN_IN, DIED_IN

**Sampling Strategies:**
- Stratified (proportional to collection size)
- Chain-focused (hadiths with clear isnad)
- Active Learning: diversity, representative, chain_complexity, hybrid
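For reference, stratified sampling here simply allocates the annotation budget proportionally to collection sizes. A minimal sketch of the allocation arithmetic (the collection counts below are illustrative only):

```python
def stratified_allocation(collection_sizes: dict, budget: int) -> dict:
    """Allocate an annotation budget proportionally to collection size."""
    total = sum(collection_sizes.values())
    alloc = {name: (size * budget) // total
             for name, size in collection_sizes.items()}
    # Distribute rounding leftovers to the largest collections
    leftover = budget - sum(alloc.values())
    for name in sorted(collection_sizes, key=collection_sizes.get, reverse=True)[:leftover]:
        alloc[name] += 1
    return alloc

# Illustrative counts only
print(stratified_allocation({"Bukhari": 7500, "Muslim": 7000, "Abu Dawood": 5000}, 500))
```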
---

## 🔜 REMAINING STEPS (Phase 3)

### Step 8: NER Model Training (3-5 days)
**Need to implement:**
- Script to export Label Studio annotations to HuggingFace format
- Training pipeline for XLM-RoBERTa-large or AraBERT
- LoRA configuration for efficient fine-tuning
- Evaluation metrics (target: F1 > 0.85)
- MLflow experiment tracking integration
- Deployment as a Kubernetes service
- Inference API endpoint

### Step 9: Relation Extraction Model (3-5 days)
**Need to implement:**
- Pipeline to use NER outputs as input
- Training script for relation classification
- Model architecture (transformer-based)
- Evaluation (target: F1 > 0.80)
- Deployment configuration

### Step 10: LLM Fine-tuning with LoRA (5-7 days)
**Need to implement:**
- Instruction dataset format for:
  - Entity extraction from hadith text
  - Relation extraction between narrators
  - Question answering about hadiths
  - Hadith explanation/interpretation
- LoRA fine-tuning script for Qwen2.5-7B
- Training configuration
- Adapter merging/dynamic loading
- Deployment to vLLM with LoRA support
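To make the instruction dataset format concrete, one record for the entity-extraction task could look like the sketch below. This shape is an assumption for illustration, not a finalized schema.

```python
# Hypothetical instruction-tuning record (field names and shape are assumptions)
record = {
    "instruction": "Extract all named entities from the following hadith text.",
    "input": "حدثنا محمد بن إسماعيل البخاري عن أبي هريرة",
    "output": '{"PERSON": ["محمد بن إسماعيل"], "NISBA": ["البخاري"], "KUNYA": ["أبي هريرة"]}',
}
```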
---

## 📁 KEY COMMANDS

```bash
# Database access
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db

# Check Qdrant collection
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings

# Kubernetes pods
kubectl -n ml get pods
kubectl -n vector get pods
kubectl -n db get pods

# Argo workflows
argo list -n ml
argo logs -n ml <workflow-name>

# Label Studio (get API key from settings)
curl -H "Authorization: Token YOUR_KEY" https://label.betelgeusebytes.io/api/projects
```

---

## 📋 DELIVERABLES FORMAT

For each step, provide:
1. **Overview** - What we're building and why
2. **Prerequisites** - What needs to be ready
3. **Implementation** - Complete, copy-paste ready code
4. **Deployment** - Kubernetes YAML if applicable
5. **Testing** - How to verify it works
6. **Troubleshooting** - Common issues and fixes

---

## ⏳ TIMELINE

| Step | Task | Duration | Status |
|------|------|----------|--------|
| 6 | Verify Embeddings & Search | 1 hour | ✅ Complete |
| 7 | Annotation Setup | 1-2 days | ✅ Complete |
| 8 | NER Training | 3-5 days | 🔜 Next |
| 9 | RE Training | 3-5 days | Pending |
| 10 | LLM Fine-tuning | 5-7 days | Pending |

**Total Remaining:** ~2-3 weeks

---

## 🔑 IMPORTANT NOTES

1. **All services use internal K8s DNS** for communication
2. **External access** is available via *.betelgeusebytes.io with TLS
3. **Security:** Default passwords need to be changed in production
4. **Embeddings:** BGE-M3 produces 1024-dimensional vectors
5. **Model:** Qwen2.5-7B-Instruct is the base LLM for fine-tuning
6. **Annotation:** Target 500 NER + 300 relation samples before training

---

## 🎯 NEXT REQUEST

Please help me implement **Step 8: NER Model Training** with:
- Export script for Label Studio → HuggingFace format
- XLM-RoBERTa or AraBERT training pipeline
- LoRA configuration
- MLflow integration
- K8s deployment
- Inference API

---

*Generated: 2025 | Project: Islamic Hadith Scholar AI*
@@ -0,0 +1,380 @@
# Step 7: Annotation Setup with Label Studio

## 📋 Overview

This step sets up the annotation infrastructure for training NER and Relation Extraction models. We export hadiths for annotation, configure Label Studio projects, and implement active learning to speed up the annotation process.

**Duration:** 1-2 days for setup, ongoing for annotation

---

## 📁 Files Included

| File | Description |
|------|-------------|
| `annotation_setup.py` | Main setup script - exports data and creates projects |
| `label_studio_client.py` | API client for Label Studio operations |
| `active_learning.py` | Active learning strategies for smart sampling |
| `export_queries.sql` | SQL queries for various sampling strategies |
| `ANNOTATION_GUIDELINES.md` | Comprehensive annotation guidelines for annotators |
| `requirements.txt` | Python dependencies |

---

## 🔧 Prerequisites

1. **Label Studio** running at `label.betelgeusebytes.io`
2. **PostgreSQL** access with hadith data
3. **Qdrant** with embeddings (for active learning)
4. **Python 3.10+** with pip

---

## 🚀 Quick Start

### 1. Install Dependencies

```bash
pip install -r requirements.txt
```

### 2. Set Environment Variables

```bash
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=your_password

export LABEL_STUDIO_URL=https://label.betelgeusebytes.io
export LABEL_STUDIO_API_KEY=your_api_key  # Get from Label Studio settings
```

### 3. Export Data for Annotation

```bash
# Export 500 hadiths for NER annotation (stratified sampling)
python annotation_setup.py --ner-count 500 --relation-count 300 --export-only

# This creates:
# - annotation_data/ner_tasks.json
# - annotation_data/relation_tasks.json
# - annotation_data/ner_config.xml
# - annotation_data/relation_config.xml
```

### 4. Create Label Studio Projects (Optional)

If you have an API key:

```bash
python annotation_setup.py --ner-count 500 --relation-count 300
```

Or create projects manually and import the JSON files.

---

## 📊 Sampling Strategies

### 1. Stratified Sampling (Default for NER)
Takes proportional samples from each hadith collection.

```bash
python annotation_setup.py --strategy stratified
```

### 2. Chain-Focused Sampling (Default for Relations)
Focuses on hadiths with clear narrator chains (isnad).

```python
# Looks for patterns like:
# - حدثنا (narrated to us)
# - أخبرنا (informed us)
# - عن...عن (from...from)
```

### 3. Active Learning Sampling
Uses embeddings to select informative samples.

```bash
# Diversity sampling - most different from annotated samples
python active_learning.py --strategy diversity --count 50

# Representative sampling - cluster-based selection
python active_learning.py --strategy representative --count 50

# Chain complexity - complex narrator chains
python active_learning.py --strategy chain_complexity --count 50

# Hybrid - combines all strategies
python active_learning.py --strategy hybrid --count 100
```

---

## 🏷️ Entity Types for NER

| Entity | Description | Example | Hotkey |
|--------|-------------|---------|--------|
| PERSON | Full names | محمد بن عبد الله | P |
| KUNYA | Abu/Umm names | أبو هريرة | K |
| NISBA | Attributions | البخاري، القرشي | N |
| PLACE | Locations | مكة، المدينة | L |
| DATE | Time references | سنة مئتين | D |
| TRIBE | Tribe names | قريش، بني هاشم | T |
| TITLE | Honorifics | رسول الله، أمير المؤمنين | I |

---

## 🔗 Relation Types

| Relation | Description | Direction |
|----------|-------------|-----------|
| NARRATED_FROM | Narrator chain link | A → B |
| TEACHER_OF | Teaching relationship | Teacher → Student |
| STUDENT_OF | Inverse of TEACHER_OF | Student → Teacher |
| CONTEMPORARY_OF | Same era | Bidirectional |
| RELATED_TO | Family relation | Bidirectional |
| LIVED_IN | Residence | Person → Place |
| BORN_IN | Birthplace | Person → Place |
| DIED_IN | Place of death | Person → Place |

---

## 📝 Label Studio Project Setup

### Manual Setup (Recommended)

1. **Log in to Label Studio:** https://label.betelgeusebytes.io

2. **Create NER Project:**
   - Click "Create Project"
   - Name: "Hadith NER Annotation"
   - Go to Settings → Labeling Interface
   - Paste content from `annotation_data/ner_config.xml`
   - Save

3. **Import NER Tasks:**
   - Go to project → Import
   - Upload `annotation_data/ner_tasks.json`

4. **Create Relation Project:**
   - Create another project: "Hadith Relation Extraction"
   - Use `annotation_data/relation_config.xml`
   - Import `annotation_data/relation_tasks.json`

### Programmatic Setup

```python
from label_studio_client import LabelStudioClient
import asyncio

async def setup():
    client = LabelStudioClient()

    # Create NER project
    with open("annotation_data/ner_config.xml") as f:
        config = f.read()

    project = await client.create_project(
        title="Hadith NER Annotation",
        description="Named entity recognition for hadith texts",
        label_config=config
    )

    # Import tasks
    await client.import_tasks_from_file(
        project["id"],
        "annotation_data/ner_tasks.json"
    )

asyncio.run(setup())
```

---

## 📤 Exporting Annotations

### Export to JSON

```bash
python label_studio_client.py export --project 1 --output annotations.json
```

### Convert to HuggingFace Format

```bash
python label_studio_client.py convert \
    --input annotations.json \
    --output ner_dataset.json \
    --format huggingface
```
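The HuggingFace token-classification format is essentially parallel lists of tokens and BIO tags. A minimal illustrative record is shown below; the exact field names the converter emits may differ, so treat this shape as an assumption.

```python
# One illustrative token-classification example in the BIO scheme
example = {
    "tokens": ["حدثنا", "محمد", "بن", "إسماعيل", "البخاري"],
    "ner_tags": ["O", "B-PERSON", "I-PERSON", "I-PERSON", "B-NISBA"],
}
```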
### Convert to spaCy Format

```bash
python label_studio_client.py convert \
    --input annotations.json \
    --output spacy_data.json \
    --format spacy
```

### Convert Relations to Graph

```bash
python label_studio_client.py convert \
    --input relation_annotations.json \
    --output relations.json \
    --format relations
```

---

## 📈 Active Learning Workflow

### Initial Annotation (Cold Start)

1. Start with a stratified sample of 100 hadiths
2. Annotate these completely
3. Train a preliminary model

### Iterative Improvement

```bash
# After initial annotations, use active learning
python active_learning.py --strategy hybrid --count 50 --output next_batch.json

# Import to Label Studio
python label_studio_client.py import --project 1 --file next_batch.json

# Annotate, then repeat
```

### Strategy Recommendations

| Stage | Strategy | Rationale |
|-------|----------|-----------|
| Initial (0-100) | Stratified | Cover all collections |
| Early (100-300) | Diversity | Expand coverage |
| Middle (300-500) | Representative | Fill gaps in clusters |
| Later (500+) | Chain Complexity | Focus on hard cases |

---

## 📊 Annotation Progress Tracking

### Check Progress via SQL

```sql
-- Overall progress
SELECT
    SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) as ner_done,
    SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) as relations_done,
    COUNT(*) as total
FROM hadiths;

-- Progress by collection
SELECT
    c.name_english,
    SUM(CASE WHEN h.entities_extracted THEN 1 ELSE 0 END) as ner_done,
    COUNT(*) as total
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.name_english;
```

### Check via Label Studio API

```bash
python label_studio_client.py stats --project 1
```

---

## 🎯 Quality Guidelines

### Inter-Annotator Agreement Targets
- Entity Detection: Cohen's κ > 0.8
- Entity Classification: κ > 0.75
- Relations: κ > 0.7

### Review Process
1. Each task annotated by 1-2 annotators
2. Disagreements reviewed by an expert
3. Edge cases documented for guideline updates

### Quality Checks
- Train annotators on `ANNOTATION_GUIDELINES.md` before they start
- Spot-check 10% of annotations weekly
- Track agreement scores over time

---

## 🐛 Troubleshooting

### "Connection refused" to Label Studio

```bash
# Check if Label Studio is running
curl https://label.betelgeusebytes.io/health

# Check API key
curl -H "Authorization: Token YOUR_API_KEY" \
    https://label.betelgeusebytes.io/api/projects
```

### "No embeddings for active learning"

Ensure Step 6 completed successfully:
```bash
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings
```

### Export fails with encoding errors

Use UTF-8 encoding explicitly:
```python
with open(file, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)
```

---

## ✅ Checklist Before Step 8

Before proceeding to NER model training:

- [ ] At least 300-500 hadiths annotated for NER
- [ ] At least 200-300 hadiths annotated for relations
- [ ] Annotation guidelines reviewed and finalized
- [ ] Inter-annotator agreement > 0.75
- [ ] Annotations exported in HuggingFace format
- [ ] Edge cases documented

---

## 📚 Next Steps

Once sufficient annotations are collected:

1. **Step 8:** Train the NER model (XLM-RoBERTa or AraBERT)
   - Use the exported HuggingFace-format data
   - Target F1 > 0.85

2. **Step 9:** Train the Relation Extraction model
   - Use NER outputs as input
   - Target F1 > 0.80

---

## 📎 Additional Resources

- [Label Studio Documentation](https://labelstud.io/guide/)
- [Active Learning for NER](https://arxiv.org/abs/2101.11112)
- [Arabic NER Guidelines](https://www.aclweb.org/anthology/)

---

*Version 1.0 - Last Updated: 2025*
@ -0,0 +1,599 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 7: Active Learning Strategy for Hadith Annotation
|
||||||
|
=======================================================
|
||||||
|
Implements active learning to speed up annotation by selecting
|
||||||
|
the most informative samples for labeling.
|
||||||
|
|
||||||
|
Author: Hadith Scholar AI Project
|
||||||
|
Date: 2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import httpx
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.progress import Progress

if sys.platform == 'win32':
    os.environ['PYTHONIOENCODING'] = 'utf-8'
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding='utf-8')

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))

console = Console()


@dataclass
class SampleCandidate:
    """A candidate sample for annotation."""
    hadith_id: int
    arabic_text: str
    english_text: str
    collection: str
    score: float
    strategy: str
    metadata: Dict


class ActiveLearningSampler:
    """
    Active learning sampler for hadith annotation.

    Strategies:
    1. Uncertainty Sampling - Select samples where model is least confident
    2. Diversity Sampling - Select samples that are most different from annotated
    3. Representative Sampling - Select samples that represent clusters
    4. Hybrid - Combine multiple strategies
    """

    def __init__(self):
        self.db_conn = None
        self.qdrant_client = None

    def _get_db_connection(self):
        """Get database connection."""
        if self.db_conn is None or self.db_conn.closed:
            self.db_conn = psycopg2.connect(
                host=POSTGRES_HOST,
                port=POSTGRES_PORT,
                database=POSTGRES_DB,
                user=POSTGRES_USER,
                password=POSTGRES_PASSWORD,
                sslmode='require'
            )
        return self.db_conn

    async def _search_qdrant(
        self,
        vector: List[float],
        limit: int = 100,
        filter_ids: List[int] = None
    ) -> List[Dict]:
        """Search Qdrant for similar vectors."""
        async with httpx.AsyncClient(timeout=30.0) as client:
            payload = {
                "vector": vector,
                "limit": limit,
                "with_payload": True
            }

            if filter_ids:
                payload["filter"] = {
                    "must_not": [
                        {"has_id": filter_ids}
                    ]
                }

            response = await client.post(
                f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
                json=payload
            )
            response.raise_for_status()
            return response.json().get("result", [])
    async def _get_random_vectors(self, count: int = 10) -> List[Dict]:
        """Fetch a batch of vectors from Qdrant for centroid calculation.

        Uses the scroll API, which returns points in storage order rather
        than a true random sample.
        """
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
                json={
                    "limit": count,
                    "with_vector": True,
                    "with_payload": True
                }
            )
            response.raise_for_status()
            return response.json().get("result", {}).get("points", [])

    def get_annotated_hadith_ids(self) -> List[int]:
        """Get IDs of already annotated hadiths."""
        conn = self._get_db_connection()
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id FROM hadiths
                WHERE entities_extracted = true
            """)
            return [row[0] for row in cur.fetchall()]

    def get_unannotated_hadiths(self, limit: int = 1000) -> List[Dict]:
        """Get unannotated hadiths with their metadata."""
        conn = self._get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.english_text,
                    h.grade,
                    c.name_english as collection
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                WHERE NOT h.entities_extracted
                  AND h.arabic_text IS NOT NULL
                  AND LENGTH(h.arabic_text) > 50
                ORDER BY RANDOM()
                LIMIT %s
            """, (limit,))
            return [dict(row) for row in cur.fetchall()]

    # ========================================================================
    # Sampling Strategies
    # ========================================================================

    async def diversity_sampling(
        self,
        count: int = 50,
        annotated_ids: List[int] = None
    ) -> List[SampleCandidate]:
        """
        Select samples that are most different from already annotated samples.
        Uses embedding distance to find diverse samples.
        """
        if annotated_ids is None:
            annotated_ids = self.get_annotated_hadith_ids()

        if not annotated_ids:
            # No annotations yet, use random sampling
            return await self.random_sampling(count)

        # Get centroid of annotated samples
        annotated_vectors = []
        async with httpx.AsyncClient(timeout=30.0) as client:
            for batch_start in range(0, min(len(annotated_ids), 100), 10):
                batch_ids = annotated_ids[batch_start:batch_start+10]
                response = await client.post(
                    f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points",
                    json={"ids": batch_ids, "with_vector": True}
                )
                if response.status_code == 200:
                    for point in response.json().get("result", []):
                        if "vector" in point:
                            annotated_vectors.append(point["vector"])

        if not annotated_vectors:
            return await self.random_sampling(count)

        # Calculate centroid
        centroid = np.mean(annotated_vectors, axis=0).tolist()
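        # NOTE (added comment): this assumes the Qdrant collection is configured
        # for cosine similarity, so sorting the search results by ascending score
        # below surfaces the points least similar to the annotated centroid.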

        # Find points far from centroid (negative similarity search)
        # We'll get many candidates and select the most distant
        candidates = await self._search_qdrant(
            centroid,
            limit=count * 3,
            filter_ids=annotated_ids
        )

        # Sort by distance (lower score = more distant for cosine similarity)
        candidates.sort(key=lambda x: x.get("score", 1))

        # Get hadith details
        hadith_ids = [
            c.get("payload", {}).get("hadith_id") or c.get("id")
            for c in candidates[:count]
        ]

        hadith_map = self._get_hadiths_by_ids(hadith_ids)

        results = []
        for i, c in enumerate(candidates[:count]):
            hid = c.get("payload", {}).get("hadith_id") or c.get("id")
            if hid in hadith_map:
                h = hadith_map[hid]
                results.append(SampleCandidate(
                    hadith_id=hid,
                    arabic_text=h.get("arabic_text", ""),
                    english_text=h.get("english_text", ""),
                    collection=h.get("collection", ""),
                    score=1 - c.get("score", 0),  # Convert similarity to diversity
                    strategy="diversity",
                    metadata={"rank": i + 1}
                ))

        return results

    async def representative_sampling(
        self,
        count: int = 50,
        n_clusters: int = 10
    ) -> List[SampleCandidate]:
        """
        Select samples that are representative of different clusters.
        Uses k-means-like approach on embeddings.
        """
        # Get random sample of vectors to identify clusters
        sample_points = await self._get_random_vectors(count=500)

        if len(sample_points) < n_clusters:
            return await self.random_sampling(count)

        # Simple k-means clustering on vectors
        vectors = np.array([p["vector"] for p in sample_points])

        # Initialize centroids randomly
        centroid_indices = random.sample(range(len(vectors)), n_clusters)
        centroids = vectors[centroid_indices]

        # Run k-means iterations
        for _ in range(10):
            # Assign points to nearest centroid
            distances = np.linalg.norm(vectors[:, np.newaxis] - centroids, axis=2)
            assignments = np.argmin(distances, axis=1)

            # Update centroids
            new_centroids = []
            for k in range(n_clusters):
                cluster_points = vectors[assignments == k]
                if len(cluster_points) > 0:
                    new_centroids.append(cluster_points.mean(axis=0))
                else:
                    new_centroids.append(centroids[k])
            centroids = np.array(new_centroids)
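
        # NOTE (added comment): a fixed 10 Lloyd iterations with no convergence
        # check; for the ~500 scrolled vectors above this is a cheap heuristic
        # rather than an exact k-means solution.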

        # Select samples closest to each centroid
        samples_per_cluster = max(1, count // n_clusters)
        selected = []

        annotated_ids = set(self.get_annotated_hadith_ids())

        for k in range(n_clusters):
            cluster_mask = assignments == k
            cluster_indices = np.where(cluster_mask)[0]

            if len(cluster_indices) == 0:
                continue

            # Sort by distance to centroid
            cluster_vectors = vectors[cluster_mask]
            distances = np.linalg.norm(cluster_vectors - centroids[k], axis=1)
            sorted_indices = np.argsort(distances)

            added = 0
            for idx in sorted_indices:
                point = sample_points[cluster_indices[idx]]
                hid = point.get("payload", {}).get("hadith_id") or point.get("id")

                if hid not in annotated_ids and added < samples_per_cluster:
                    selected.append({
                        "hadith_id": hid,
                        "cluster": k,
                        "distance": float(distances[idx])
                    })
                    added += 1

        # Get hadith details
        hadith_ids = [s["hadith_id"] for s in selected]
        hadith_map = self._get_hadiths_by_ids(hadith_ids)

        results = []
        for s in selected[:count]:
            hid = s["hadith_id"]
            if hid in hadith_map:
                h = hadith_map[hid]
                results.append(SampleCandidate(
                    hadith_id=hid,
                    arabic_text=h.get("arabic_text", ""),
                    english_text=h.get("english_text", ""),
                    collection=h.get("collection", ""),
                    score=1.0 / (1.0 + s["distance"]),
                    strategy="representative",
                    metadata={"cluster": s["cluster"]}
                ))

        return results

    async def chain_complexity_sampling(
        self,
        count: int = 50
    ) -> List[SampleCandidate]:
        """
        Select samples with complex narrator chains for relation annotation.
        Uses heuristics based on chain patterns.
        """
        conn = self._get_db_connection()

        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Find hadiths with complex chains
            cur.execute("""
                WITH chain_scores AS (
                    SELECT
                        h.id,
                        h.hadith_number,
                        h.arabic_text,
                        h.english_text,
                        c.name_english as collection,
                        -- Score based on chain indicators
                        (
                            -- Count "عن" occurrences (narrator chain links)
                            (LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'عن', ''))) / 2 * 2
                            -- Count "حدثنا" occurrences
                            + (LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'حدثنا', ''))) / 5 * 3
                            -- Count "أخبرنا" occurrences
                            + (LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'أخبرنا', ''))) / 6 * 3
                            -- Bonus for longer texts (more potential entities)
                            + LEAST(LENGTH(h.arabic_text) / 100, 10)
                        ) as complexity_score
                    FROM hadiths h
                    JOIN collections c ON h.collection_id = c.id
                    WHERE NOT h.entities_extracted
                      AND h.arabic_text IS NOT NULL
                      AND LENGTH(h.arabic_text) > 100
                )
                SELECT *
                FROM chain_scores
                WHERE complexity_score > 5
                ORDER BY complexity_score DESC, RANDOM()
                LIMIT %s
            """, (count,))

            hadiths = cur.fetchall()

        results = []
        for h in hadiths:
            results.append(SampleCandidate(
                hadith_id=h["id"],
                arabic_text=h.get("arabic_text", ""),
                english_text=h.get("english_text", ""),
                collection=h.get("collection", ""),
                score=float(h.get("complexity_score", 0)) / 20.0,  # Normalize
                strategy="chain_complexity",
                metadata={"complexity_score": h.get("complexity_score", 0)}
            ))

        return results

    async def random_sampling(self, count: int = 50) -> List[SampleCandidate]:
        """Simple random sampling as baseline."""
        hadiths = self.get_unannotated_hadiths(limit=count)

        results = []
        for h in hadiths:
            results.append(SampleCandidate(
                hadith_id=h["id"],
                arabic_text=h.get("arabic_text", ""),
                english_text=h.get("english_text", ""),
                collection=h.get("collection", ""),
                score=random.random(),
                strategy="random",
                metadata={}
            ))

        return results

    async def hybrid_sampling(
        self,
        count: int = 50,
        weights: Dict[str, float] = None
    ) -> List[SampleCandidate]:
        """
        Combine multiple sampling strategies.

        Default weights:
        - diversity: 0.3
        - representative: 0.3
        - chain_complexity: 0.3
        - random: 0.1
        """
        if weights is None:
            weights = {
                "diversity": 0.3,
                "representative": 0.3,
                "chain_complexity": 0.3,
                "random": 0.1
            }

        # Normalize weights
        total_weight = sum(weights.values())
        weights = {k: v / total_weight for k, v in weights.items()}

        # Get samples from each strategy
        all_candidates = []

        for strategy, weight in weights.items():
            strategy_count = max(1, int(count * weight * 1.5))  # Get extra for dedup

            if strategy == "diversity":
                candidates = await self.diversity_sampling(strategy_count)
            elif strategy == "representative":
                candidates = await self.representative_sampling(strategy_count)
            elif strategy == "chain_complexity":
                candidates = await self.chain_complexity_sampling(strategy_count)
            else:
                candidates = await self.random_sampling(strategy_count)

            # Adjust scores by weight
            for c in candidates:
                c.score *= weight

            all_candidates.extend(candidates)

        # Deduplicate by hadith_id, keeping highest score
        seen = {}
        for c in all_candidates:
            if c.hadith_id not in seen or c.score > seen[c.hadith_id].score:
                seen[c.hadith_id] = c

        # Sort by score and return top N
        results = sorted(seen.values(), key=lambda x: -x.score)
        return results[:count]

    def _get_hadiths_by_ids(self, hadith_ids: List[int]) -> Dict[int, Dict]:
        """Get hadith details by IDs."""
        if not hadith_ids:
            return {}

        conn = self._get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.english_text,
                    h.grade,
                    c.name_english as collection
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                WHERE h.id = ANY(%s::uuid[])
            """, (list(hadith_ids),))

            return {row["id"]: dict(row) for row in cur.fetchall()}

    def close(self):
        """Close database connection."""
        if self.db_conn and not self.db_conn.closed:
            self.db_conn.close()


# ============================================================================
# Export Functions
# ============================================================================

def export_samples_for_label_studio(
    samples: List[SampleCandidate],
    output_path: str
) -> str:
    """Export samples in Label Studio format."""
    tasks = []

    for s in samples:
        task = {
            "data": {
                "hadith_id": s.hadith_id,
                "arabic_text": s.arabic_text,
                "english_text": s.english_text,
                "collection": s.collection,
                "selection_score": s.score,
                "selection_strategy": s.strategy
            },
            "meta": {
                "strategy": s.strategy,
                "metadata": s.metadata
            }
        }
        tasks.append(task)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=2)

    return output_path


# ============================================================================
# CLI
# ============================================================================

async def main():
    """Main CLI interface."""
    import argparse

    parser = argparse.ArgumentParser(description="Active Learning Sampler")
    parser.add_argument("--strategy", choices=[
        "diversity", "representative", "chain_complexity", "random", "hybrid"
    ], default="hybrid", help="Sampling strategy")
    parser.add_argument("--count", type=int, default=50, help="Number of samples")
    parser.add_argument("--output", type=str, default="active_learning_samples.json")

    args = parser.parse_args()

    console.print("[bold]Active Learning Sampling[/bold]")
    console.print(f"Strategy: {args.strategy}")
    console.print(f"Count: {args.count}")

    sampler = ActiveLearningSampler()

    try:
        if args.strategy == "diversity":
            samples = await sampler.diversity_sampling(args.count)
        elif args.strategy == "representative":
            samples = await sampler.representative_sampling(args.count)
        elif args.strategy == "chain_complexity":
            samples = await sampler.chain_complexity_sampling(args.count)
        elif args.strategy == "random":
            samples = await sampler.random_sampling(args.count)
        else:
            samples = await sampler.hybrid_sampling(args.count)

        # Display results
        table = Table(title=f"Selected Samples ({args.strategy})")
        table.add_column("ID", style="cyan")
        table.add_column("Collection")
        table.add_column("Score", justify="right")
        table.add_column("Strategy")
        table.add_column("Preview", width=40)

        for s in samples[:20]:  # Show first 20
            preview = (s.arabic_text or s.english_text or "")[:40] + "..."
            table.add_row(
                str(s.hadith_id),
                s.collection,
                f"{s.score:.3f}",
                s.strategy,
                preview
            )

        console.print(table)

        # Export
        export_samples_for_label_studio(samples, args.output)
        console.print(f"\n[green]Exported {len(samples)} samples to {args.output}[/green]")

    finally:
        sampler.close()


if __name__ == "__main__":
    asyncio.run(main())
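
# Example invocation (script filename assumed, not shown in this commit):
#   python active_learning_sampler.py --strategy hybrid --count 100 \
#       --output active_learning_samples.json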

File diff suppressed because one or more lines are too long

@@ -0,0 +1,47 @@

<View>
  <Header value="Hadith Entity and Relation Annotation"/>

  <Collapse defaultActiveKey="arabic">
    <Panel key="arabic" header="Arabic Text">
      <Labels name="ner_ar" toName="arabic_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="p"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
        <Label value="NISBA" background="#45B7D1" hotkey="n"/>
        <Label value="PLACE" background="#96CEB4" hotkey="l"/>
        <Label value="DATE" background="#FFEAA7" hotkey="d"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
        <Label value="TITLE" background="#98D8C8" hotkey="i"/>
      </Labels>
      <Text name="arabic_text" value="$arabic_text" granularity="word"/>
    </Panel>

    <Panel key="english" header="English Text">
      <Labels name="ner_en" toName="english_text">
        <Label value="PERSON" background="#FF6B6B"/>
        <Label value="KUNYA" background="#4ECDC4"/>
        <Label value="NISBA" background="#45B7D1"/>
        <Label value="PLACE" background="#96CEB4"/>
        <Label value="DATE" background="#FFEAA7"/>
        <Label value="TRIBE" background="#DDA0DD"/>
        <Label value="TITLE" background="#98D8C8"/>
      </Labels>
      <Text name="english_text" value="$english_text" granularity="word"/>
    </Panel>
  </Collapse>

  <Header value="Relations between Entities" size="4"/>
  <Relations>
    <Relation value="NARRATED_FROM" hotkey="r"/>
    <Relation value="TEACHER_OF" hotkey="e"/>
    <Relation value="STUDENT_OF" hotkey="s"/>
    <Relation value="CONTEMPORARY_OF" hotkey="c"/>
    <Relation value="RELATED_TO"/>
    <Relation value="LIVED_IN"/>
  </Relations>

  <View style="margin-top: 15px; padding: 10px; background: #e8f4f8; border-radius: 5px;">
    <Header value="Hadith Info" size="5"/>
    <Text name="meta" value="$collection | Hadith #$hadith_number | Grade: $grade"/>
  </View>
</View>
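
<!-- Note (added): the $arabic_text, $english_text, $collection, $hadith_number
     and $grade placeholders are filled by Label Studio from each imported
     task's "data" payload, as produced by the export scripts in this commit. -->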

@@ -0,0 +1,41 @@

<View>
  <Header value="Hadith Named Entity Recognition (NER)"/>

  <View style="display: flex; flex-direction: row;">
    <View style="flex: 1; margin-right: 10px;">
      <Header value="Arabic Text" size="4"/>
      <Text name="arabic_text" value="$arabic_text" granularity="word"/>
      <Labels name="ner_arabic" toName="arabic_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="p"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
        <Label value="NISBA" background="#45B7D1" hotkey="n"/>
        <Label value="PLACE" background="#96CEB4" hotkey="l"/>
        <Label value="DATE" background="#FFEAA7" hotkey="d"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
        <Label value="TITLE" background="#98D8C8" hotkey="i"/>
      </Labels>
    </View>

    <View style="flex: 1; margin-left: 10px;">
      <Header value="English Text" size="4"/>
      <Text name="english_text" value="$english_text" granularity="word"/>
      <Labels name="ner_english" toName="english_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="1"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="2"/>
        <Label value="NISBA" background="#45B7D1" hotkey="3"/>
        <Label value="PLACE" background="#96CEB4" hotkey="4"/>
        <Label value="DATE" background="#FFEAA7" hotkey="5"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="6"/>
        <Label value="TITLE" background="#98D8C8" hotkey="7"/>
      </Labels>
    </View>
  </View>

  <View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
    <Header value="Metadata" size="5"/>
    <Text name="collection" value="Collection: $collection"/>
    <Text name="hadith_num" value="Hadith #: $hadith_number"/>
    <Text name="grade" value="Grade: $grade"/>
  </View>
</View>

File diff suppressed because one or more lines are too long

@@ -0,0 +1,42 @@

<View>
  <Header value="Hadith Relation Extraction"/>

  <View style="margin-bottom: 20px;">
    <Header value="Arabic Text with Entities" size="4"/>
    <Labels name="entities_ar" toName="arabic_text">
      <Label value="NARRATOR" background="#FF6B6B"/>
      <Label value="PERSON" background="#4ECDC4"/>
      <Label value="PLACE" background="#96CEB4"/>
    </Labels>
    <Text name="arabic_text" value="$arabic_text" granularity="word"/>
  </View>

  <View style="margin-bottom: 20px;">
    <Header value="English Text with Entities" size="4"/>
    <Labels name="entities_en" toName="english_text">
      <Label value="NARRATOR" background="#FF6B6B"/>
      <Label value="PERSON" background="#4ECDC4"/>
      <Label value="PLACE" background="#96CEB4"/>
    </Labels>
    <Text name="english_text" value="$english_text" granularity="word"/>
  </View>

  <Header value="Relations" size="4"/>
  <Relations>
    <Relation value="NARRATED_FROM"/>
    <Relation value="TEACHER_OF"/>
    <Relation value="STUDENT_OF"/>
    <Relation value="CONTEMPORARY_OF"/>
    <Relation value="RELATED_TO"/>
    <Relation value="LIVED_IN"/>
    <Relation value="DIED_IN"/>
    <Relation value="BORN_IN"/>
  </Relations>

  <View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
    <Header value="Metadata" size="5"/>
    <Text name="collection" value="Collection: $collection"/>
    <Text name="hadith_num" value="Hadith #: $hadith_number"/>
  </View>
</View>

File diff suppressed because one or more lines are too long

@@ -0,0 +1,635 @@
#!/usr/bin/env python3
"""
Step 7: Annotation Setup with Label Studio
===========================================
Exports hadiths for annotation and configures Label Studio projects.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import sys
import json
import random
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import hashlib

import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn

if sys.platform == 'win32':
    os.environ['PYTHONIOENCODING'] = 'utf-8'
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding='utf-8')

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))

LABEL_STUDIO_URL = os.getenv("LABEL_STUDIO_URL", "https://label.betelgeusebytes.io")
# SECURITY: never commit a real token as the default; besides leaking the
# credential, a hard-coded default makes the "API key not set" check in
# setup_annotation_projects() unreachable.
LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY", "")

console = Console()


# ============================================================================
# Label Studio Project Configurations
# ============================================================================

# NER Labeling Configuration for Hadith Text
NER_LABELING_CONFIG = """
<View>
  <Header value="Hadith Named Entity Recognition (NER)"/>

  <View style="display: flex; flex-direction: row;">
    <View style="flex: 1; margin-right: 10px;">
      <Header value="Arabic Text" size="4"/>
      <Text name="arabic_text" value="$arabic_text" granularity="word"/>
      <Labels name="ner_arabic" toName="arabic_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="p"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
        <Label value="NISBA" background="#45B7D1" hotkey="n"/>
        <Label value="PLACE" background="#96CEB4" hotkey="l"/>
        <Label value="DATE" background="#FFEAA7" hotkey="d"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
        <Label value="TITLE" background="#98D8C8" hotkey="i"/>
      </Labels>
    </View>

    <View style="flex: 1; margin-left: 10px;">
      <Header value="English Text" size="4"/>
      <Text name="english_text" value="$english_text" granularity="word"/>
      <Labels name="ner_english" toName="english_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="1"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="2"/>
        <Label value="NISBA" background="#45B7D1" hotkey="3"/>
        <Label value="PLACE" background="#96CEB4" hotkey="4"/>
        <Label value="DATE" background="#FFEAA7" hotkey="5"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="6"/>
        <Label value="TITLE" background="#98D8C8" hotkey="7"/>
      </Labels>
    </View>
  </View>

  <View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
    <Header value="Metadata" size="5"/>
    <Text name="collection" value="Collection: $collection"/>
    <Text name="hadith_num" value="Hadith #: $hadith_number"/>
    <Text name="grade" value="Grade: $grade"/>
  </View>
</View>
"""

# Relation Extraction Labeling Configuration
RELATION_LABELING_CONFIG = """
<View>
  <Header value="Hadith Relation Extraction"/>

  <View style="margin-bottom: 20px;">
    <Header value="Arabic Text with Entities" size="4"/>
    <Labels name="entities_ar" toName="arabic_text">
      <Label value="NARRATOR" background="#FF6B6B"/>
      <Label value="PERSON" background="#4ECDC4"/>
      <Label value="PLACE" background="#96CEB4"/>
    </Labels>
    <Text name="arabic_text" value="$arabic_text" granularity="word"/>
  </View>

  <View style="margin-bottom: 20px;">
    <Header value="English Text with Entities" size="4"/>
    <Labels name="entities_en" toName="english_text">
      <Label value="NARRATOR" background="#FF6B6B"/>
      <Label value="PERSON" background="#4ECDC4"/>
      <Label value="PLACE" background="#96CEB4"/>
    </Labels>
    <Text name="english_text" value="$english_text" granularity="word"/>
  </View>

  <Header value="Relations" size="4"/>
  <Relations>
    <Relation value="NARRATED_FROM"/>
    <Relation value="TEACHER_OF"/>
    <Relation value="STUDENT_OF"/>
    <Relation value="CONTEMPORARY_OF"/>
    <Relation value="RELATED_TO"/>
    <Relation value="LIVED_IN"/>
    <Relation value="DIED_IN"/>
    <Relation value="BORN_IN"/>
  </Relations>

  <View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
    <Header value="Metadata" size="5"/>
    <Text name="collection" value="Collection: $collection"/>
    <Text name="hadith_num" value="Hadith #: $hadith_number"/>
  </View>
</View>
"""

# Combined NER + Relations Configuration (for advanced annotators)
COMBINED_LABELING_CONFIG = """
<View>
  <Header value="Hadith Entity and Relation Annotation"/>

  <Collapse defaultActiveKey="arabic">
    <Panel key="arabic" header="Arabic Text">
      <Labels name="ner_ar" toName="arabic_text">
        <Label value="PERSON" background="#FF6B6B" hotkey="p"/>
        <Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
        <Label value="NISBA" background="#45B7D1" hotkey="n"/>
        <Label value="PLACE" background="#96CEB4" hotkey="l"/>
        <Label value="DATE" background="#FFEAA7" hotkey="d"/>
        <Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
        <Label value="TITLE" background="#98D8C8" hotkey="i"/>
      </Labels>
      <Text name="arabic_text" value="$arabic_text" granularity="word"/>
    </Panel>

    <Panel key="english" header="English Text">
      <Labels name="ner_en" toName="english_text">
        <Label value="PERSON" background="#FF6B6B"/>
        <Label value="KUNYA" background="#4ECDC4"/>
        <Label value="NISBA" background="#45B7D1"/>
        <Label value="PLACE" background="#96CEB4"/>
        <Label value="DATE" background="#FFEAA7"/>
        <Label value="TRIBE" background="#DDA0DD"/>
        <Label value="TITLE" background="#98D8C8"/>
      </Labels>
      <Text name="english_text" value="$english_text" granularity="word"/>
    </Panel>
  </Collapse>

  <Header value="Relations between Entities" size="4"/>
  <Relations>
    <Relation value="NARRATED_FROM" hotkey="r"/>
    <Relation value="TEACHER_OF" hotkey="e"/>
    <Relation value="STUDENT_OF" hotkey="s"/>
    <Relation value="CONTEMPORARY_OF" hotkey="c"/>
    <Relation value="RELATED_TO"/>
    <Relation value="LIVED_IN"/>
  </Relations>

  <View style="margin-top: 15px; padding: 10px; background: #e8f4f8; border-radius: 5px;">
    <Header value="Hadith Info" size="5"/>
    <Text name="meta" value="$collection | Hadith #$hadith_number | Grade: $grade"/>
  </View>
</View>
"""


# ============================================================================
# Database Functions
# ============================================================================

def get_db_connection():
    """Create PostgreSQL connection."""
    return psycopg2.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        sslmode='require'
    )


def export_hadiths_for_annotation(
    count: int = 500,
    strategy: str = "stratified",
    seed: int = 42
) -> List[Dict]:
    """
    Export hadiths for annotation using various sampling strategies.

    Strategies:
    - random: Pure random sampling
    - stratified: Proportional sampling from each collection
    - chain_focused: Focus on hadiths with isnad (narrator chains)
    - diverse: Maximize text diversity using embeddings
    """
    conn = get_db_connection()

    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            if strategy == "random":
                # Simple random sampling
                cur.execute("""
                    SELECT
                        h.id,
                        h.hadith_number,
                        h.arabic_text,
                        h.english_text,
                        h.urdu_text,
                        h.grade,
                        c.name_english as collection,
                        c.name_arabic as collection_arabic,
                        b.name_english as book,
                        b.name_arabic as book_arabic
                    FROM hadiths h
                    JOIN collections c ON h.collection_id = c.id
                    LEFT JOIN books b ON h.book_id = b.id
                    WHERE h.arabic_text IS NOT NULL
                      AND LENGTH(h.arabic_text) > 50
                    ORDER BY RANDOM()
                    LIMIT %s
                """, (count,))

            elif strategy == "stratified":
                # Get collection distribution
                cur.execute("""
                    SELECT c.id, c.name_english, COUNT(h.id) as cnt
                    FROM hadiths h
                    JOIN collections c ON h.collection_id = c.id
                    WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50
                    GROUP BY c.id, c.name_english
                """)
                collections = cur.fetchall()
                total = sum(c['cnt'] for c in collections)

                # Calculate samples per collection
                all_hadiths = []
                for coll in collections:
                    sample_count = max(1, int(count * coll['cnt'] / total))
                    cur.execute("""
                        SELECT
                            h.id,
                            h.hadith_number,
                            h.arabic_text,
                            h.english_text,
                            h.urdu_text,
                            h.grade,
                            c.name_english as collection,
                            c.name_arabic as collection_arabic,
                            b.name_english as book,
                            b.name_arabic as book_arabic
                        FROM hadiths h
                        JOIN collections c ON h.collection_id = c.id
                        LEFT JOIN books b ON h.book_id = b.id
                        WHERE h.collection_id = %s
                          AND h.arabic_text IS NOT NULL
                          AND LENGTH(h.arabic_text) > 50
                        ORDER BY RANDOM()
                        LIMIT %s
                    """, (coll['id'], sample_count))
                    all_hadiths.extend(cur.fetchall())

                return [dict(h) for h in all_hadiths[:count]]
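
            # NOTE (added comment): the stratified branch returns here; the
            # random and chain_focused branches fall through to the shared
            # fetchall at the end of this block.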

            elif strategy == "chain_focused":
                # Focus on hadiths with clear isnad patterns
                # Look for common narrator chain indicators
                cur.execute("""
                    SELECT
                        h.id,
                        h.hadith_number,
                        h.arabic_text,
                        h.english_text,
                        h.urdu_text,
                        h.grade,
                        c.name_english as collection,
                        c.name_arabic as collection_arabic,
                        b.name_english as book,
                        b.name_arabic as book_arabic
                    FROM hadiths h
                    JOIN collections c ON h.collection_id = c.id
                    LEFT JOIN books b ON h.book_id = b.id
                    WHERE h.arabic_text IS NOT NULL
                      AND LENGTH(h.arabic_text) > 100
                      AND (
                          h.arabic_text LIKE '%%حدثنا%%'
                          OR h.arabic_text LIKE '%%أخبرنا%%'
                          OR h.arabic_text LIKE '%%عن%%عن%%'
                          OR h.english_text LIKE '%%narrated%%'
                      )
                    ORDER BY RANDOM()
                    LIMIT %s
                """, (count,))

            else:
                raise ValueError(f"Unknown strategy: {strategy}")

            results = cur.fetchall()
            return [dict(h) for h in results]

    finally:
        conn.close()


def get_collection_statistics() -> List[Dict]:
    """Get statistics for each hadith collection."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    c.name_english as collection,
                    COUNT(h.id) as total,
                    SUM(CASE WHEN h.entities_extracted THEN 1 ELSE 0 END) as entities_done,
                    SUM(CASE WHEN h.relations_extracted THEN 1 ELSE 0 END) as relations_done,
                    AVG(LENGTH(h.arabic_text)) as avg_arabic_len,
                    AVG(LENGTH(h.english_text)) as avg_english_len
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                GROUP BY c.id, c.name_english
                ORDER BY total DESC
            """)
            return [dict(row) for row in cur.fetchall()]
    finally:
        conn.close()


# ============================================================================
# Label Studio API Functions
# ============================================================================

async def create_label_studio_project(
    client: httpx.AsyncClient,
    title: str,
    description: str,
    label_config: str
) -> Dict:
    """Create a new Label Studio project."""
    response = await client.post(
        f"{LABEL_STUDIO_URL}/api/projects",
        headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"},
        json={
            "title": title,
            "description": description,
            "label_config": label_config,
            "is_published": True,
            "show_collab_predictions": True,
            "evaluate_predictions_automatically": True
        }
    )
    response.raise_for_status()
    return response.json()


async def import_tasks_to_project(
    client: httpx.AsyncClient,
    project_id: int,
    tasks: List[Dict]
) -> Dict:
    """Import annotation tasks to a Label Studio project."""
    response = await client.post(
        f"{LABEL_STUDIO_URL}/api/projects/{project_id}/import",
        headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"},
        json=tasks
    )
    response.raise_for_status()
    return response.json()


async def get_project_stats(
    client: httpx.AsyncClient,
    project_id: int
) -> Dict:
    """Get annotation statistics for a project."""
    response = await client.get(
        f"{LABEL_STUDIO_URL}/api/projects/{project_id}",
        headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"}
    )
    response.raise_for_status()
    return response.json()


def convert_hadiths_to_tasks(hadiths: List[Dict]) -> List[Dict]:
    """Convert hadith records to Label Studio task format."""
    tasks = []
    for h in hadiths:
        task = {
            "data": {
                "hadith_id": h['id'],
                "arabic_text": h.get('arabic_text', '') or '',
                "english_text": h.get('english_text', '') or '',
                "urdu_text": h.get('urdu_text', '') or '',
                "collection": h.get('collection', ''),
                "collection_arabic": h.get('collection_arabic', ''),
                "book": h.get('book', '') or '',
                "book_arabic": h.get('book_arabic', '') or '',
                "hadith_number": str(h.get('hadith_number', '')),
                "grade": h.get('grade', '') or 'Unknown'
            },
            "meta": {
                "source": "hadith_db",
                "exported_at": datetime.now().isoformat()
            }
        }
        tasks.append(task)
    return tasks
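
# Shape of each task produced above (sketch, matching the code):
#   {"data": {"hadith_id": ..., "arabic_text": "...", "english_text": "...",
#             "urdu_text": "...", "collection": "...", "hadith_number": "...",
#             "grade": "..."},
#    "meta": {"source": "hadith_db", "exported_at": "<ISO timestamp>"}}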


# ============================================================================
# Export Functions
# ============================================================================

def export_to_json(hadiths: List[Dict], output_path: str):
    """Export hadiths to JSON file for Label Studio import."""
    tasks = convert_hadiths_to_tasks(hadiths)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tasks, f, ensure_ascii=False, indent=2)

    console.print(f"[green]Exported {len(tasks)} tasks to {output_path}[/green]")
    return output_path


def export_to_csv(hadiths: List[Dict], output_path: str):
    """Export hadiths to CSV file."""
    import csv

    fieldnames = [
        'hadith_id', 'collection', 'book', 'hadith_number',
        'arabic_text', 'english_text', 'grade'
    ]

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for h in hadiths:
            writer.writerow({
                'hadith_id': h['id'],
                'collection': h.get('collection', ''),
                'book': h.get('book', ''),
                'hadith_number': h.get('hadith_number', ''),
                'arabic_text': h.get('arabic_text', ''),
                'english_text': h.get('english_text', ''),
                'grade': h.get('grade', '')
            })

    console.print(f"[green]Exported {len(hadiths)} hadiths to {output_path}[/green]")
    return output_path


# ============================================================================
# Main Setup Functions
# ============================================================================

async def setup_annotation_projects(
    ner_count: int = 500,
    relation_count: int = 300,
    export_only: bool = False
):
    """
    Set up Label Studio projects for NER and Relation annotation.
    """
    console.print(Panel.fit(
        "[bold blue]Step 7: Label Studio Annotation Setup[/bold blue]\n"
        f"Label Studio: {LABEL_STUDIO_URL}\n"
        f"NER samples: {ner_count} | Relation samples: {relation_count}",
        title="Annotation Setup"
    ))

    # Step 1: Export hadiths for NER annotation
    console.print("\n[yellow]1. Exporting hadiths for NER annotation...[/yellow]")
    ner_hadiths = export_hadiths_for_annotation(
        count=ner_count,
        strategy="stratified"
    )

    # Show distribution
    collections = {}
    for h in ner_hadiths:
        coll = h.get('collection', 'Unknown')
        collections[coll] = collections.get(coll, 0) + 1

    table = Table(title="NER Sample Distribution")
    table.add_column("Collection", style="cyan")
    table.add_column("Count", justify="right")
    for coll, cnt in sorted(collections.items(), key=lambda x: -x[1]):
        table.add_row(coll, str(cnt))
    console.print(table)

    # Export NER tasks
    ner_json_path = "annotation_data/ner_tasks.json"
    os.makedirs("annotation_data", exist_ok=True)
    export_to_json(ner_hadiths, ner_json_path)

    # Step 2: Export hadiths for Relation annotation (chain-focused)
    console.print("\n[yellow]2. Exporting hadiths for Relation annotation...[/yellow]")
    relation_hadiths = export_hadiths_for_annotation(
        count=relation_count,
        strategy="chain_focused"
    )

    relation_json_path = "annotation_data/relation_tasks.json"
    export_to_json(relation_hadiths, relation_json_path)

    # Step 3: Save labeling configurations
    console.print("\n[yellow]3. Saving Label Studio configurations...[/yellow]")

    with open("annotation_data/ner_config.xml", 'w') as f:
        f.write(NER_LABELING_CONFIG)
    console.print("   Saved: annotation_data/ner_config.xml")

    with open("annotation_data/relation_config.xml", 'w') as f:
        f.write(RELATION_LABELING_CONFIG)
    console.print("   Saved: annotation_data/relation_config.xml")

    with open("annotation_data/combined_config.xml", 'w') as f:
        f.write(COMBINED_LABELING_CONFIG)
    console.print("   Saved: annotation_data/combined_config.xml")

    if export_only:
        console.print("\n[green]✓ Export complete! Import files manually to Label Studio.[/green]")
        return

    # Step 4: Create Label Studio projects (if API key provided)
    if not LABEL_STUDIO_API_KEY:
        console.print("\n[yellow]⚠ LABEL_STUDIO_API_KEY not set. Skipping project creation.[/yellow]")
        console.print("   Set the API key and run again, or import tasks manually.")
        return

    console.print("\n[yellow]4. Creating Label Studio projects...[/yellow]")

    async with httpx.AsyncClient(timeout=60.0) as client:
        # Create NER project
        try:
            ner_project = await create_label_studio_project(
                client,
                title="Hadith NER Annotation",
                description="Named Entity Recognition for Islamic hadith texts. "
                            "Label persons, places, dates, and other entities.",
                label_config=NER_LABELING_CONFIG
            )
            console.print(f"   [green]✓ Created NER project (ID: {ner_project['id']})[/green]")

            # Import NER tasks
            ner_tasks = convert_hadiths_to_tasks(ner_hadiths)
            await import_tasks_to_project(client, ner_project['id'], ner_tasks)
            console.print(f"   [green]✓ Imported {len(ner_tasks)} NER tasks[/green]")

        except Exception as e:
            console.print(f"   [red]✗ NER project error: {e}[/red]")

        # Create Relation project
        try:
            relation_project = await create_label_studio_project(
                client,
                title="Hadith Relation Extraction",
                description="Extract relations between narrators and entities in hadith texts.",
                label_config=RELATION_LABELING_CONFIG
            )
            console.print(f"   [green]✓ Created Relation project (ID: {relation_project['id']})[/green]")

            # Import Relation tasks
            relation_tasks = convert_hadiths_to_tasks(relation_hadiths)
            await import_tasks_to_project(client, relation_project['id'], relation_tasks)
            console.print(f"   [green]✓ Imported {len(relation_tasks)} Relation tasks[/green]")

        except Exception as e:
            console.print(f"   [red]✗ Relation project error: {e}[/red]")

    console.print("\n[bold green]✓ Annotation setup complete![/bold green]")


async def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Hadith Annotation Setup")
    parser.add_argument("--ner-count", type=int, default=500,
                        help="Number of hadiths for NER annotation")
    parser.add_argument("--relation-count", type=int, default=300,
                        help="Number of hadiths for relation annotation")
    parser.add_argument("--export-only", action="store_true",
                        help="Only export files, don't create Label Studio projects")
    parser.add_argument("--strategy", choices=["random", "stratified", "chain_focused"],
                        default="stratified", help="Sampling strategy")

    args = parser.parse_args()

    await setup_annotation_projects(
        ner_count=args.ner_count,
        relation_count=args.relation_count,
        export_only=args.export_only
    )


if __name__ == "__main__":
    asyncio.run(main())
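
# Example invocation (script filename assumed, not shown in this commit):
#   python step7_annotation_setup.py --ner-count 500 --relation-count 300 --export-only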

@@ -0,0 +1,373 @@
-- ============================================================================
-- Step 7: SQL Queries for Hadith Annotation Export
-- ============================================================================
-- Run: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -f export_queries.sql
-- ============================================================================
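
-- NOTE (added): psql terminates a \copy meta-command at the first newline, so
-- the multi-line \copy (...) blocks below are written for readability and must
-- be collapsed to one line (or rewritten as server-side COPY) before running.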

-- ============================================================================
-- 1. STRATIFIED SAMPLING - 500 hadiths proportional to collection size
-- ============================================================================

-- First, create a helper function for stratified sampling
CREATE OR REPLACE FUNCTION sample_hadiths_stratified(total_sample INT)
RETURNS TABLE (
    id INT,
    hadith_number VARCHAR,
    arabic_text TEXT,
    english_text TEXT,
    urdu_text TEXT,
    grade VARCHAR,
    collection_name VARCHAR,
    collection_arabic VARCHAR,
    book_name VARCHAR,
    book_arabic VARCHAR
) AS $$
DECLARE
    coll RECORD;
    total_hadiths INT;
    sample_count INT;
BEGIN
    -- Get total count
    SELECT COUNT(*) INTO total_hadiths
    FROM hadiths h
    WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50;

    -- Sample from each collection proportionally
    FOR coll IN
        SELECT c.id as coll_id, c.name_english, COUNT(h.id) as cnt
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50
        GROUP BY c.id, c.name_english
    LOOP
        sample_count := GREATEST(1, (total_sample * coll.cnt / total_hadiths));

        RETURN QUERY
        SELECT
            h.id,
            h.hadith_number,
            h.arabic_text,
            h.english_text,
            h.urdu_text,
            h.grade,
            c.name_english,
            c.name_arabic,
            b.name_english,
            b.name_arabic
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        LEFT JOIN books b ON h.book_id = b.id
        WHERE h.collection_id = coll.coll_id
          AND h.arabic_text IS NOT NULL
          AND LENGTH(h.arabic_text) > 50
        ORDER BY RANDOM()
        LIMIT sample_count;
    END LOOP;
END;
$$ LANGUAGE plpgsql;
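
-- Quick sanity check for the function above (added): per-collection counts
-- should roughly track each collection's share of the corpus.
-- SELECT collection_name, COUNT(*) FROM sample_hadiths_stratified(500) GROUP BY collection_name;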

-- Export 500 stratified samples
\copy (SELECT * FROM sample_hadiths_stratified(500) LIMIT 500) TO 'ner_annotation_sample.csv' WITH CSV HEADER;

-- ============================================================================
-- 2. CHAIN-FOCUSED SAMPLING - Hadiths with clear narrator chains (isnad)
-- ============================================================================

-- Export 300 hadiths with clear narrator chain patterns
\copy (
    SELECT
        h.id,
        h.hadith_number,
        h.arabic_text,
        h.english_text,
        h.urdu_text,
        h.grade,
        c.name_english as collection_name,
        c.name_arabic as collection_arabic,
        b.name_english as book_name,
        b.name_arabic as book_arabic
    FROM hadiths h
    JOIN collections c ON h.collection_id = c.id
    LEFT JOIN books b ON h.book_id = b.id
    WHERE h.arabic_text IS NOT NULL
      AND LENGTH(h.arabic_text) > 100
      AND (
          -- Common narrator chain indicators in Arabic
          h.arabic_text LIKE '%حدثنا%'             -- "narrated to us"
          OR h.arabic_text LIKE '%أخبرنا%'         -- "informed us"
          OR h.arabic_text LIKE '%عن%عن%عن%'       -- chain pattern "from...from...from"
          OR h.arabic_text LIKE '%سمعت%'           -- "I heard"
          OR h.arabic_text LIKE '%قال رسول الله%'  -- "The Messenger of Allah said"
          -- English patterns
          OR h.english_text ILIKE '%narrated%narrated%'
          OR h.english_text ILIKE '%reported%that%said%'
      )
    ORDER BY RANDOM()
    LIMIT 300
) TO 'relation_annotation_sample.csv' WITH CSV HEADER;

-- ============================================================================
-- 3. RANDOM SAMPLING - Simple random sample
-- ============================================================================

\copy (
    SELECT
        h.id,
        h.hadith_number,
        h.arabic_text,
        h.english_text,
        h.urdu_text,
        h.grade,
        c.name_english as collection_name,
        b.name_english as book_name
    FROM hadiths h
    JOIN collections c ON h.collection_id = c.id
    LEFT JOIN books b ON h.book_id = b.id
    WHERE h.arabic_text IS NOT NULL
      AND LENGTH(h.arabic_text) > 50
    ORDER BY RANDOM()
    LIMIT 500
) TO 'random_annotation_sample.csv' WITH CSV HEADER;

-- ============================================================================
-- 4. GRADE-STRATIFIED SAMPLING - Ensure representation of all grades
-- ============================================================================

\copy (
    WITH grade_samples AS (
        SELECT
            h.*,
            c.name_english as collection_name,
            b.name_english as book_name,
            ROW_NUMBER() OVER (PARTITION BY h.grade ORDER BY RANDOM()) as rn
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        LEFT JOIN books b ON h.book_id = b.id
        WHERE h.arabic_text IS NOT NULL
          AND LENGTH(h.arabic_text) > 50
    )
    SELECT
        id,
        hadith_number,
        arabic_text,
        english_text,
        grade,
        collection_name,
        book_name
    FROM grade_samples
    WHERE rn <= 100  -- Up to 100 per grade
    ORDER BY grade, RANDOM()
    LIMIT 500
) TO 'grade_stratified_sample.csv' WITH CSV HEADER;

-- ============================================================================
-- 5. COLLECTION-SPECIFIC EXPORTS
-- ============================================================================

-- Sahih Bukhari samples
\copy (
    SELECT
        h.id,
        h.hadith_number,
        h.arabic_text,
        h.english_text,
        h.grade,
        b.name_english as book_name
    FROM hadiths h
    JOIN collections c ON h.collection_id = c.id
    LEFT JOIN books b ON h.book_id = b.id
    WHERE c.name_english ILIKE '%bukhari%'
      AND h.arabic_text IS NOT NULL
    ORDER BY RANDOM()
    LIMIT 100
) TO 'bukhari_sample.csv' WITH CSV HEADER;

-- Sahih Muslim samples
\copy (
    SELECT
        h.id,
        h.hadith_number,
        h.arabic_text,
        h.english_text,
        h.grade,
        b.name_english as book_name
    FROM hadiths h
    JOIN collections c ON h.collection_id = c.id
    LEFT JOIN books b ON h.book_id = b.id
    WHERE c.name_english ILIKE '%muslim%'
      AND h.arabic_text IS NOT NULL
    ORDER BY RANDOM()
    LIMIT 100
) TO 'muslim_sample.csv' WITH CSV HEADER;

-- ============================================================================
-- 6. EXPORT AS JSON (for Label Studio)
-- ============================================================================

-- Create JSON export for Label Studio
\copy (
    SELECT json_build_object(
        'data', json_build_object(
            'hadith_id', h.id,
            'arabic_text', h.arabic_text,
            'english_text', COALESCE(h.english_text, ''),
            'urdu_text', COALESCE(h.urdu_text, ''),
            'collection', c.name_english,
            'collection_arabic', c.name_arabic,
            'book', COALESCE(b.name_english, ''),
            'hadith_number', h.hadith_number,
            'grade', COALESCE(h.grade, 'Unknown')
        )
    )
    FROM hadiths h
    JOIN collections c ON h.collection_id = c.id
    LEFT JOIN books b ON h.book_id = b.id
    WHERE h.arabic_text IS NOT NULL
      AND LENGTH(h.arabic_text) > 50
    ORDER BY RANDOM()
    LIMIT 500
) TO 'label_studio_tasks.jsonl';

-- ============================================================================
-- 7. STATISTICS QUERIES
-- ============================================================================

-- Distribution by collection
SELECT
    c.name_english as collection,
    COUNT(h.id) as total_hadiths,
    COUNT(h.id) FILTER (WHERE LENGTH(h.arabic_text) > 100) as with_arabic,
    COUNT(h.id) FILTER (WHERE LENGTH(h.english_text) > 100) as with_english,
    ROUND(AVG(LENGTH(h.arabic_text))) as avg_arabic_len
|
||||||
|
FROM hadiths h
|
||||||
|
JOIN collections c ON h.collection_id = c.id
|
||||||
|
GROUP BY c.id, c.name_english
|
||||||
|
ORDER BY total_hadiths DESC;
|
||||||
|
|
||||||
|
-- Distribution by grade
|
||||||
|
SELECT
|
||||||
|
COALESCE(grade, 'Unknown') as grade,
|
||||||
|
COUNT(*) as count,
|
||||||
|
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percentage
|
||||||
|
FROM hadiths
|
||||||
|
GROUP BY grade
|
||||||
|
ORDER BY count DESC;
|
||||||
|
|
||||||
|
-- Narrator chain pattern frequency
|
||||||
|
SELECT
|
||||||
|
'حدثنا (narrated to us)' as pattern,
|
||||||
|
COUNT(*) as count
|
||||||
|
FROM hadiths WHERE arabic_text LIKE '%حدثنا%'
|
||||||
|
UNION ALL
|
||||||
|
SELECT
|
||||||
|
'أخبرنا (informed us)' as pattern,
|
||||||
|
COUNT(*) as count
|
||||||
|
FROM hadiths WHERE arabic_text LIKE '%أخبرنا%'
|
||||||
|
UNION ALL
|
||||||
|
SELECT
|
||||||
|
'عن...عن (from...from)' as pattern,
|
||||||
|
COUNT(*) as count
|
||||||
|
FROM hadiths WHERE arabic_text LIKE '%عن%عن%'
|
||||||
|
UNION ALL
|
||||||
|
SELECT
|
||||||
|
'قال رسول الله (Prophet said)' as pattern,
|
||||||
|
COUNT(*) as count
|
||||||
|
FROM hadiths WHERE arabic_text LIKE '%قال رسول الله%';
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- 8. CREATE ANNOTATION TRACKING TABLE
|
||||||
|
-- ============================================================================
|
||||||
|
|
||||||
|
-- Table to track annotation progress
|
||||||
|
CREATE TABLE IF NOT EXISTS annotation_batches (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
batch_name VARCHAR(100) NOT NULL,
|
||||||
|
batch_type VARCHAR(50) NOT NULL, -- 'NER', 'RELATION', 'COMBINED'
|
||||||
|
hadith_ids INTEGER[] NOT NULL,
|
||||||
|
total_count INTEGER NOT NULL,
|
||||||
|
annotated_count INTEGER DEFAULT 0,
|
||||||
|
label_studio_project_id INTEGER,
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
completed_at TIMESTAMP,
|
||||||
|
notes TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Index for quick lookup
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_annotation_batches_type ON annotation_batches(batch_type);
|
||||||
|
|
||||||
|
-- Function to create a new annotation batch
|
||||||
|
CREATE OR REPLACE FUNCTION create_annotation_batch(
|
||||||
|
p_batch_name VARCHAR(100),
|
||||||
|
p_batch_type VARCHAR(50),
|
||||||
|
p_hadith_ids INTEGER[]
|
||||||
|
) RETURNS INTEGER AS $$
|
||||||
|
DECLARE
|
||||||
|
v_batch_id INTEGER;
|
||||||
|
BEGIN
|
||||||
|
INSERT INTO annotation_batches (batch_name, batch_type, hadith_ids, total_count)
|
||||||
|
VALUES (p_batch_name, p_batch_type, p_hadith_ids, array_length(p_hadith_ids, 1))
|
||||||
|
RETURNING id INTO v_batch_id;
|
||||||
|
|
||||||
|
RETURN v_batch_id;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
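
-- Example (illustrative, kept commented so running this script stays
-- side-effect free): register one of the exported samples as a batch.
-- The batch name and IDs below are placeholders; in practice the IDs come
-- from the CSV exports above.
-- SELECT create_annotation_batch('ner-batch-001', 'NER', ARRAY[101, 102, 103]);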

-- ============================================================================
-- 9. HELPER VIEWS
-- ============================================================================

-- View for unannotated hadiths
CREATE OR REPLACE VIEW unannotated_hadiths AS
SELECT
    h.id,
    h.hadith_number,
    h.arabic_text,
    h.english_text,
    c.name_english as collection,
    h.grade
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.entities_extracted
  AND h.arabic_text IS NOT NULL
  AND LENGTH(h.arabic_text) > 50;

-- View for partially annotated hadiths (entities done, relations pending)
CREATE OR REPLACE VIEW partial_annotations AS
SELECT
    h.id,
    h.hadith_number,
    h.arabic_text,
    h.english_text,
    c.name_english as collection
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.entities_extracted
  AND NOT h.relations_extracted;
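
-- Example (illustrative) progress check built on the two views above;
-- read-only, so it is safe to leave live in this script.
SELECT
    (SELECT COUNT(*) FROM unannotated_hadiths) AS awaiting_entities,
    (SELECT COUNT(*) FROM partial_annotations) AS awaiting_relations;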

-- ============================================================================
-- 10. SAMPLE QUERY FOR ACTIVE LEARNING
-- ============================================================================

-- Get hadiths similar to annotated ones (for active learning)
-- This requires embeddings to be available
-- Placeholder for when we implement active learning in Step 7

/*
SELECT
    h.id,
    h.arabic_text,
    h.english_text,
    similarity_score
FROM hadiths h
JOIN (
    -- Find similar hadiths based on embedding distance
    SELECT
        hadith_id,
        1 - (embedding <-> (SELECT AVG(embedding) FROM annotated_hadiths)) as similarity_score
    FROM hadith_embeddings
    WHERE hadith_id NOT IN (SELECT id FROM annotated_hadiths)
    ORDER BY similarity_score DESC
    LIMIT 100
) similar ON h.id = similar.hadith_id
ORDER BY similarity_score DESC;
*/

@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
Step 7: Label Studio API Client
================================
Manages Label Studio projects, imports/exports annotations.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from pathlib import Path

from label_studio_sdk import LabelStudio
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

console = Console()

LABEL_STUDIO_URL = os.getenv("LABEL_STUDIO_URL", "https://label.betelgeusebytes.io")
LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk")


@dataclass
class Project:
    """Label Studio project."""
    id: int
    title: str
    description: str
    task_count: int
    annotation_count: int
    created_at: str


@dataclass
class AnnotationStats:
    """Annotation statistics."""
    total_tasks: int
    annotated_tasks: int
    total_annotations: int
    agreement_score: Optional[float]


class LabelStudioClient:
    """Client for Label Studio API using official SDK."""

    def __init__(self, url: str = None, api_key: str = None):
        self.url = (url or LABEL_STUDIO_URL).rstrip('/')
        self.api_key = api_key or LABEL_STUDIO_API_KEY
        self.client = LabelStudio(base_url=self.url, api_key=self.api_key)

    async def list_projects(self) -> List[Project]:
        """List all projects."""
        projects_data = self.client.projects.list()
        projects = []
        for p in projects_data:
            projects.append(Project(
                id=p.id,
                title=p.title,
                description=p.description or "",
                task_count=getattr(p, 'task_number', 0) or 0,
                annotation_count=getattr(p, 'total_annotations_number', 0) or 0,
                created_at=getattr(p, 'created_at', '') or ""
            ))
        return projects

    async def get_project(self, project_id: int) -> Dict:
        """Get project details."""
        project = self.client.projects.get(id=project_id)
        return {
            "id": project.id,
            "title": project.title,
            "description": project.description,
            "task_number": getattr(project, 'task_number', 0),
            "total_annotations_number": getattr(project, 'total_annotations_number', 0),
            "num_tasks_with_annotations": getattr(project, 'num_tasks_with_annotations', 0),
            "created_at": getattr(project, 'created_at', '')
        }

    async def create_project(
        self,
        title: str,
        description: str = "",
        label_config: str = "",
        **kwargs
    ) -> Dict:
        """Create a new project."""
        project = self.client.projects.create(
            title=title,
            description=description,
            label_config=label_config
        )
        return {
            "id": project.id,
            "title": project.title,
            "description": project.description
        }

    async def update_project(self, project_id: int, **kwargs) -> Dict:
        """Update project settings."""
        project = self.client.projects.update(id=project_id, **kwargs)
        return {"id": project.id, "title": project.title}

    async def delete_project(self, project_id: int) -> None:
        """Delete a project."""
        self.client.projects.delete(id=project_id)

    async def import_tasks(
        self,
        project_id: int,
        tasks: List[Dict],
        batch_size: int = 100
    ) -> Dict:
        """Import tasks to a project in batches."""
        total_imported = 0

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            task_progress = progress.add_task(
                f"Importing {len(tasks)} tasks...",
                total=len(tasks)
            )

            for i in range(0, len(tasks), batch_size):
                batch = tasks[i:i + batch_size]
                try:
                    self.client.projects.import_tasks(id=project_id, request=batch)
                    total_imported += len(batch)
                    progress.update(task_progress, advance=len(batch))
                except Exception as e:
                    console.print(f"[yellow]Warning: Failed to import batch {i//batch_size + 1}: {e}[/yellow]")
                    continue

        return {"imported": total_imported}

    async def import_tasks_from_file(
        self,
        project_id: int,
        file_path: str,
        batch_size: int = 100
    ) -> Dict:
        """Import tasks from a JSON file in batches."""
        with open(file_path, 'r', encoding='utf-8') as f:
            tasks = json.load(f)

        console.print(f"[blue]Loading {len(tasks)} tasks from {file_path}[/blue]")
        return await self.import_tasks(project_id, tasks, batch_size)

    async def get_tasks(
        self,
        project_id: int,
        page: int = 1,
        page_size: int = 100
    ) -> Dict:
        """Get tasks from a project."""
        tasks = self.client.tasks.list(project=project_id, page=page, page_size=page_size)
        return {"tasks": list(tasks)}

    async def get_all_tasks(self, project_id: int) -> List[Dict]:
        """Get all tasks from a project."""
        all_tasks = []
        page = 1
        while True:
            result = await self.get_tasks(project_id, page=page, page_size=100)
            tasks = result.get("tasks", [])
            if not tasks:
                break
            all_tasks.extend(tasks)
            if len(tasks) < 100:
                break
            page += 1
        return all_tasks

    async def delete_all_tasks(self, project_id: int) -> None:
        """Delete all tasks from a project."""
        tasks = await self.get_all_tasks(project_id)
        for task in tasks:
            self.client.tasks.delete(id=task.id)

    async def get_annotations(self, task_id: int) -> List[Dict]:
        """Get annotations for a task."""
        annotations = self.client.annotations.list(task=task_id)
        return list(annotations)

    async def create_annotation(
        self,
        task_id: int,
        result: List[Dict],
        **kwargs
    ) -> Dict:
        """Create an annotation for a task."""
        annotation = self.client.annotations.create(
            task=task_id,
            result=result,
            **kwargs
        )
        return {"id": annotation.id}

    async def export_annotations(
        self,
        project_id: int,
        export_format: str = "JSON"
    ) -> List[Dict]:
        """Export all annotations from a project."""
        export_result = self.client.projects.exports.create(
            id=project_id,
            export_type=export_format
        )
        return export_result

    async def export_annotations_to_file(
        self,
        project_id: int,
        output_path: str,
        export_format: str = "JSON"
    ) -> str:
        """Export annotations to a file."""
        data = await self.export_annotations(project_id, export_format)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        return output_path

    async def get_project_stats(self, project_id: int) -> AnnotationStats:
        """Get annotation statistics for a project."""
        project_data = await self.get_project(project_id)

        return AnnotationStats(
            total_tasks=project_data.get("task_number", 0),
            annotated_tasks=project_data.get("num_tasks_with_annotations", 0),
            total_annotations=project_data.get("total_annotations_number", 0),
            agreement_score=None
        )


# ============================================================================
# Annotation Conversion Functions
# ============================================================================

def convert_label_studio_to_huggingface(annotations: List[Dict]) -> List[Dict]:
    """
    Convert Label Studio annotations to HuggingFace NER format.

    Output format:
    {
        "tokens": ["word1", "word2", ...],
        "ner_tags": ["O", "B-PERSON", "I-PERSON", ...]
    }
    """
    converted = []

    for task in annotations:
        if not task.get("annotations"):
            continue

        # Get the first annotation (or could handle multiple)
        annotation = task["annotations"][0]
        result = annotation.get("result", [])

        # Get the text
        data = task.get("data", {})
        text = data.get("arabic_text", "") or data.get("english_text", "")

        if not text:
            continue

        # Simple tokenization (space-based for now)
        tokens = text.split()
        ner_tags = ["O"] * len(tokens)

        # Apply annotations
        for item in result:
            if item.get("type") != "labels":
                continue

            value = item.get("value", {})
            label = value.get("labels", ["O"])[0]
            start = value.get("start", 0)
            end = value.get("end", 0)

            # Find tokens that overlap with this span; the first token of the
            # span gets a B- tag, every following token an I- tag.
            char_pos = 0
            inside_span = False
            for i, token in enumerate(tokens):
                token_start = char_pos
                token_end = char_pos + len(token)

                if token_start >= start and token_end <= end:
                    ner_tags[i] = f"I-{label}" if inside_span else f"B-{label}"
                    inside_span = True
                elif token_start < end and token_end > start:
                    # Partial overlap
                    ner_tags[i] = f"I-{label}" if inside_span else f"B-{label}"
                    inside_span = True

                char_pos = token_end + 1  # +1 for space

        converted.append({
            "id": task.get("id"),
            "hadith_id": data.get("hadith_id"),
            "tokens": tokens,
            "ner_tags": ner_tags
        })

    return converted
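
# Example (illustrative; values made up) of the conversion above. Given a
# task with one labelled span over "Abu Hurairah" (chars 0-12):
#
#   task = {
#       "id": 1,
#       "data": {"hadith_id": 42, "english_text": "Abu Hurairah narrated it"},
#       "annotations": [{"result": [{
#           "type": "labels",
#           "value": {"start": 0, "end": 12, "labels": ["NARRATOR"]},
#       }]}],
#   }
#   convert_label_studio_to_huggingface([task])
#   # -> [{"id": 1, "hadith_id": 42,
#   #      "tokens": ["Abu", "Hurairah", "narrated", "it"],
#   #      "ner_tags": ["B-NARRATOR", "I-NARRATOR", "O", "O"]}]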


def convert_label_studio_to_spacy(annotations: List[Dict]) -> List[tuple]:
    """
    Convert Label Studio annotations to spaCy training format.

    Output format:
    [
        ("text", {"entities": [(start, end, label), ...]})
    ]
    """
    converted = []

    for task in annotations:
        if not task.get("annotations"):
            continue

        annotation = task["annotations"][0]
        result = annotation.get("result", [])

        data = task.get("data", {})
        text = data.get("arabic_text", "") or data.get("english_text", "")

        if not text:
            continue

        entities = []
        for item in result:
            if item.get("type") != "labels":
                continue

            value = item.get("value", {})
            label = value.get("labels", ["O"])[0]
            start = value.get("start", 0)
            end = value.get("end", 0)

            entities.append((start, end, label))

        converted.append((text, {"entities": entities}))

    return converted


def convert_relations_to_graph(annotations: List[Dict]) -> List[Dict]:
    """
    Convert relation annotations to graph format for Neo4j.

    Output format:
    {
        "source": {"text": "...", "type": "...", "start": N, "end": N},
        "target": {"text": "...", "type": "...", "start": N, "end": N},
        "relation": "NARRATED_FROM"
    }
    """
    relations = []

    for task in annotations:
        if not task.get("annotations"):
            continue

        annotation = task["annotations"][0]
        result = annotation.get("result", [])

        # First pass: collect all entities by ID
        entities_by_id = {}
        for item in result:
            if item.get("type") == "labels":
                entities_by_id[item.get("id")] = {
                    "text": item.get("value", {}).get("text", ""),
                    "type": item.get("value", {}).get("labels", [""])[0],
                    "start": item.get("value", {}).get("start", 0),
                    "end": item.get("value", {}).get("end", 0)
                }

        # Second pass: extract relations
        for item in result:
            if item.get("type") == "relation":
                from_id = item.get("from_id")
                to_id = item.get("to_id")
                relation_type = item.get("labels", ["RELATED_TO"])[0]

                if from_id in entities_by_id and to_id in entities_by_id:
                    relations.append({
                        "hadith_id": task.get("data", {}).get("hadith_id"),
                        "source": entities_by_id[from_id],
                        "target": entities_by_id[to_id],
                        "relation": relation_type
                    })

    return relations
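

# Illustrative sketch (not part of the original pipeline): load the converted
# relations into Neo4j. Assumes the `neo4j` Python driver and a reachable
# instance; the URI, credentials, and node/relationship labels are placeholders.
def load_relations_into_neo4j(
    relations: List[Dict],
    uri: str = "bolt://localhost:7687",
    user: str = "neo4j",
    password: str = "password",
) -> None:
    """Create (:Entity)-[:RELATION]->(:Entity) edges from convert_relations_to_graph output."""
    from neo4j import GraphDatabase  # imported locally so the dependency stays optional

    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as session:
        for rel in relations:
            # MERGE keeps entities unique by (text, type); the relation type is
            # stored as a property because Cypher cannot parameterize reltypes.
            session.run(
                "MERGE (a:Entity {text: $src, type: $src_type}) "
                "MERGE (b:Entity {text: $dst, type: $dst_type}) "
                "MERGE (a)-[:RELATION {type: $rel_type, hadith_id: $hadith_id}]->(b)",
                src=rel["source"]["text"], src_type=rel["source"]["type"],
                dst=rel["target"]["text"], dst_type=rel["target"]["type"],
                rel_type=rel["relation"], hadith_id=rel.get("hadith_id"),
            )
    driver.close()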


# ============================================================================
# CLI Interface
# ============================================================================

async def main():
    """Main CLI interface."""
    import argparse

    parser = argparse.ArgumentParser(description="Label Studio Client")
    subparsers = parser.add_subparsers(dest="command", help="Command")

    # List projects
    list_parser = subparsers.add_parser("list", help="List all projects")

    # Create project
    create_parser = subparsers.add_parser("create", help="Create a project")
    create_parser.add_argument("--title", required=True)
    create_parser.add_argument("--config", required=True, help="Path to label config XML")
    create_parser.add_argument("--description", default="")

    # Import tasks
    import_parser = subparsers.add_parser("import", help="Import tasks")
    import_parser.add_argument("--project", type=int, required=True)
    import_parser.add_argument("--file", required=True, help="Path to tasks JSON")

    # Export annotations
    export_parser = subparsers.add_parser("export", help="Export annotations")
    export_parser.add_argument("--project", type=int, required=True)
    export_parser.add_argument("--output", required=True)
    export_parser.add_argument("--format", default="JSON", choices=["JSON", "CSV", "CONLL"])

    # Convert annotations
    convert_parser = subparsers.add_parser("convert", help="Convert annotations")
    convert_parser.add_argument("--input", required=True)
    convert_parser.add_argument("--output", required=True)
    convert_parser.add_argument("--format", required=True, choices=["huggingface", "spacy", "relations"])

    # Stats
    stats_parser = subparsers.add_parser("stats", help="Get project statistics")
    stats_parser.add_argument("--project", type=int, required=True)

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    client = LabelStudioClient()

    if args.command == "list":
        projects = await client.list_projects()

        table = Table(title="Label Studio Projects")
        table.add_column("ID", style="cyan")
        table.add_column("Title")
        table.add_column("Tasks", justify="right")
        table.add_column("Annotations", justify="right")

        for p in projects:
            table.add_row(
                str(p.id),
                p.title,
                str(p.task_count),
                str(p.annotation_count)
            )

        console.print(table)

    elif args.command == "create":
        with open(args.config, 'r') as f:
            label_config = f.read()

        project = await client.create_project(
            title=args.title,
            description=args.description,
            label_config=label_config
        )

        console.print(f"[green]Created project: {project['id']} - {project['title']}[/green]")

    elif args.command == "import":
        result = await client.import_tasks_from_file(args.project, args.file)
        console.print(f"[green]Imported tasks to project {args.project}[/green]")

    elif args.command == "export":
        path = await client.export_annotations_to_file(
            args.project,
            args.output,
            args.format
        )
        console.print(f"[green]Exported annotations to {path}[/green]")

    elif args.command == "convert":
        with open(args.input, 'r', encoding='utf-8') as f:
            annotations = json.load(f)

        if args.format == "huggingface":
            converted = convert_label_studio_to_huggingface(annotations)
        elif args.format == "spacy":
            converted = convert_label_studio_to_spacy(annotations)
        elif args.format == "relations":
            converted = convert_relations_to_graph(annotations)

        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(converted, f, ensure_ascii=False, indent=2)

        console.print(f"[green]Converted {len(converted)} items to {args.output}[/green]")

    elif args.command == "stats":
        stats = await client.get_project_stats(args.project)

        console.print(f"\n[bold]Project {args.project} Statistics:[/bold]")
        console.print(f"  Total tasks: {stats.total_tasks}")
        console.print(f"  Annotated: {stats.annotated_tasks}")
        console.print(f"  Total annotations: {stats.total_annotations}")
        if stats.agreement_score:
            console.print(f"  Agreement: {stats.agreement_score:.2%}")


if __name__ == "__main__":
    asyncio.run(main())

@ -0,0 +1,24 @@
# Step 7: Annotation Setup with Label Studio
# Requirements for hadith-phase3-step7

# Database
psycopg2-binary>=2.9.9

# HTTP client
httpx>=0.27.0

# Rich console output
rich>=13.7.0

# Data handling
numpy>=1.24.0
pandas>=2.0.0

# JSON handling
orjson>=3.9.0

# Date handling
python-dateutil>=2.8.2

# Label Studio SDK (required by label_studio_client.py)
label-studio-sdk>=0.0.32

@ -0,0 +1,203 @@
#!/bin/bash
# ============================================================================
# Step 7: Annotation Setup Runner
# ============================================================================
# Usage: ./run_step7.sh [setup|export|active|client|help]
# ============================================================================

set -e

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# Configuration
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-hadith_ingest}"

export LABEL_STUDIO_URL="${LABEL_STUDIO_URL:-https://label.betelgeusebytes.io}"
export LABEL_STUDIO_API_KEY="${LABEL_STUDIO_API_KEY:-eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk}"

export QDRANT_HOST="${QDRANT_HOST:-https://vector.betelgeusebytes.io}"
export QDRANT_PORT="${QDRANT_PORT:-443}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"

# Check password
check_password() {
    if [ -z "$POSTGRES_PASSWORD" ]; then
        echo -e "${RED}Error: POSTGRES_PASSWORD not set${NC}"
        echo "Set it with: export POSTGRES_PASSWORD='your_password'"
        exit 1
    fi
}

# Install dependencies
install_deps() {
    echo -e "${BLUE}Installing dependencies...${NC}"
    pip install -q -r requirements.txt
    echo -e "${GREEN}Dependencies installed.${NC}"
}

# Run annotation setup
run_setup() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Annotation Setup...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    # Parse arguments
    NER_COUNT="${1:-500}"
    RELATION_COUNT="${2:-300}"
    EXPORT_ONLY="${3:-}"

    if [ "$EXPORT_ONLY" == "--export-only" ]; then
        python annotation_setup.py \
            --ner-count "$NER_COUNT" \
            --relation-count "$RELATION_COUNT" \
            --export-only
    else
        python annotation_setup.py \
            --ner-count "$NER_COUNT" \
            --relation-count "$RELATION_COUNT"
    fi

    echo -e "\n${GREEN}✓ Annotation setup complete!${NC}"
    echo -e "\nOutput files in ./annotation_data/:"
    ls -la annotation_data/ 2>/dev/null || echo "  (directory will be created on first run)"
}

# Run export only
run_export() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Exporting Hadiths for Annotation...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    NER_COUNT="${1:-500}"
    RELATION_COUNT="${2:-300}"

    python annotation_setup.py \
        --ner-count "$NER_COUNT" \
        --relation-count "$RELATION_COUNT" \
        --export-only

    echo -e "\n${GREEN}✓ Export complete!${NC}"
}

# Run active learning sampler
run_active() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Active Learning Sampler...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    STRATEGY="${1:-hybrid}"
    COUNT="${2:-50}"
    OUTPUT="${3:-active_learning_samples.json}"

    python active_learning.py \
        --strategy "$STRATEGY" \
        --count "$COUNT" \
        --output "$OUTPUT"

    echo -e "\n${GREEN}✓ Active learning sampling complete!${NC}"
}

# Run Label Studio client commands
run_client() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Label Studio Client${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    # Pass all arguments to the client
    python label_studio_client.py "$@"
}

# Run SQL export
run_sql() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running SQL Export Queries...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    PGPASSWORD="$POSTGRES_PASSWORD" psql \
        -h "$POSTGRES_HOST" \
        -p "$POSTGRES_PORT" \
        -U "$POSTGRES_USER" \
        -d "$POSTGRES_DB" \
        -f export_queries.sql
}

# Show usage
show_usage() {
    echo "Step 7: Annotation Setup with Label Studio"
    echo ""
    echo "Usage: $0 [command] [options]"
    echo ""
    echo "Commands:"
    echo "  setup [ner_count] [rel_count] [--export-only]"
    echo "      Run full annotation setup"
    echo "  export [ner_count] [rel_count]"
    echo "      Export hadiths for annotation only"
    echo "  active [strategy] [count] [output]"
    echo "      Run active learning sampler"
    echo "      Strategies: diversity, representative, chain_complexity, random, hybrid"
    echo "  client [args...]   Run Label Studio client commands"
    echo "  sql                Run SQL export queries"
    echo "  install            Install Python dependencies"
    echo "  help               Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0 setup 500 300 --export-only   # Export 500 NER + 300 relation samples"
    echo "  $0 active hybrid 100             # Get 100 samples using hybrid strategy"
    echo "  $0 client list                   # List Label Studio projects"
    echo "  $0 client export --project 1 --output ann.json"
    echo ""
    echo "Environment variables:"
    echo "  POSTGRES_PASSWORD      Database password (required)"
    echo "  LABEL_STUDIO_API_KEY   Label Studio API key (for project creation)"
}

# Main
case "${1:-help}" in
    setup)
        check_password
        install_deps
        shift
        run_setup "$@"
        ;;
    export)
        check_password
        install_deps
        shift
        run_export "$@"
        ;;
    active)
        check_password
        install_deps
        shift
        run_active "$@"
        ;;
    client)
        install_deps
        shift
        run_client "$@"
        ;;
    sql)
        check_password
        run_sql
        ;;
    install)
        install_deps
        ;;
    help|--help|-h)
        show_usage
        ;;
    *)
        echo -e "${RED}Unknown command: $1${NC}"
        show_usage
        exit 1
        ;;
esac

@ -0,0 +1,25 @@
import httpx
from qdrant_client import QdrantClient
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Generate query embedding
query = "الصلاة"
response = httpx.post(
    "https://embeddings.betelgeusebytes.io/embed",
    json={"inputs": [query]},
    verify=False,
)
query_vector = response.json()[0]

# Search using internal Qdrant service
# qdrant = QdrantClient(url="http://qdrant.vector.svc.cluster.local:6333")
qdrant = QdrantClient(url="https://vector.betelgeusebytes.io:443/")
results = qdrant.query_points(
    collection_name="hadith_embeddings",
    query=query_vector,
    limit=5,
)

for i, r in enumerate(results.points, 1):
    print(f"{i}. Hadith {r.id} (score: {r.score:.4f})")

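# Illustrative extension (not in the original snippet): restrict the search to
# one collection with a payload filter. Assumes points carry a "collection"
# payload field; the field name and value here are placeholders.
from qdrant_client import models

filtered = qdrant.query_points(
    collection_name="hadith_embeddings",
    query=query_vector,
    query_filter=models.Filter(
        must=[models.FieldCondition(key="collection", match=models.MatchValue(value="Sahih Bukhari"))]
    ),
    limit=5,
)
for i, r in enumerate(filtered.points, 1):
    print(f"{i}. Hadith {r.id} (score: {r.score:.4f})")
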
@ -0,0 +1,37 @@
# ============================================================================
# Step 6: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================

# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=YOUR_PASSWORD_HERE

# Qdrant Configuration (internal cluster access)
export QDRANT_HOST=qdrant.vector.svc.cluster.local
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings

# Qdrant Configuration (external access - uncomment if needed)
# export QDRANT_HOST=qdrant.betelgeusebytes.io
# export QDRANT_PORT=443

# TEI Configuration (internal cluster access)
export TEI_HOST=tei.ml.svc.cluster.local
export TEI_PORT=80

# TEI Configuration (external access - uncomment if needed)
# export TEI_HOST=tei.betelgeusebytes.io
# export TEI_PORT=443

# vLLM Configuration (for later steps)
export VLLM_HOST=vllm.ml.svc.cluster.local
export VLLM_PORT=8000

# MLflow Configuration
export MLFLOW_TRACKING_URI=https://mlflow.betelgeusebytes.io

@ -0,0 +1,48 @@
# ============================================================================
# Hadith Semantic Search API - Dockerfile
# ============================================================================
# Build: docker build -t hadith-search-api:latest .
# Run:   docker run -p 8080:8080 --env-file .env hadith-search-api:latest
# ============================================================================

FROM python:3.11-slim

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd --create-home --shell /bin/bash appuser

# Set work directory
WORKDIR /app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY search_api.py .

# Change ownership
RUN chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Expose port
EXPOSE 8080

# Health check (raise_for_status() makes non-2xx responses fail the probe)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import httpx; httpx.get('http://localhost:8080/health', timeout=5).raise_for_status()"

# Run the application
CMD ["python", "-m", "uvicorn", "search_api:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]

@ -0,0 +1,407 @@
# Step 6: Verify Embeddings & Test Semantic Search

## 📋 Overview

This step validates that all ~40,000 hadiths have been properly embedded and stored in Qdrant, then builds and benchmarks a semantic search system.

**Target Performance:** <500ms per query

---

## 📁 Files Included

| File | Description |
|------|-------------|
| `verify_embeddings.py` | Validates all hadiths have embeddings in Qdrant |
| `semantic_search.py` | Tests semantic search with benchmarking |
| `search_api.py` | Production-ready FastAPI search service |
| `verification_queries.sql` | SQL queries for database verification |
| `k8s-search-api.yaml` | Kubernetes deployment manifests |
| `Dockerfile` | Container image for search API |
| `requirements.txt` | Python dependencies |
| `run_tests.sh` | Quick test runner script |

---

## 🔧 Prerequisites

1. **Python 3.10+** with pip
2. **Access to services:**
   - PostgreSQL at `pg.betelgeusebytes.io:5432`
   - Qdrant at `qdrant.vector.svc.cluster.local:6333`
   - TEI at `tei.ml.svc.cluster.local:80`

3. **Environment variables:**
   ```bash
   export POSTGRES_HOST=pg.betelgeusebytes.io
   export POSTGRES_PORT=5432
   export POSTGRES_DB=hadith_db
   export POSTGRES_USER=hadith_ingest
   export POSTGRES_PASSWORD=your_password

   export QDRANT_HOST=qdrant.vector.svc.cluster.local
   export QDRANT_PORT=6333
   export QDRANT_COLLECTION=hadith_embeddings

   export TEI_HOST=tei.ml.svc.cluster.local
   export TEI_PORT=80
   ```

---

## 🚀 Quick Start

### 1. Install Dependencies

```bash
pip install -r requirements.txt
```

### 2. Run Embedding Verification

```bash
python verify_embeddings.py
```

**Expected Output:**
```
┌───────────────────────────────────────────┐
│ Step 6.1 - Hadith Embeddings Verification │
└───────────────────────────────────────────┘

1. Checking PostgreSQL database...
   Total hadiths: 40,123
   Marked as embedded: 40,123

2. Collection breakdown:
   ┌────────────────┬────────┬──────────┐
   │ Collection     │ Total  │ Embedded │
   ├────────────────┼────────┼──────────┤
   │ Sahih Bukhari  │ 7,563  │ 7,563    │
   │ Sahih Muslim   │ 7,453  │ 7,453    │
   │ ...            │ ...    │ ...      │
   └────────────────┴────────┴──────────┘

✓ ALL EMBEDDINGS VERIFIED!
```

### 3. Run Semantic Search Benchmark

```bash
# Full benchmark
python semantic_search.py --mode benchmark

# Interactive mode
python semantic_search.py --mode interactive

# Single query
python semantic_search.py --query "الصلاة في المسجد"

# Demo mode
python semantic_search.py --mode demo
```

**Expected Benchmark Output:**
```
═══════════════════════════════════════════════════════
BENCHMARK RESULTS
═══════════════════════════════════════════════════════

Query Statistics:
  Total queries: 22
  Successful: 22
  Failed: 0

Timing Statistics:
  Average embedding time: 45.3ms
  Average search time: 12.8ms
  Average total time: 58.1ms

Percentiles:
  P50: 55.2ms
  P95: 89.4ms
  P99: 112.3ms

Performance Target (<500ms):
  Queries meeting target: 22/22 (100.0%)
  Status: ✓ TARGET MET
```

---

## 📊 Verification Queries (SQL)

Run these directly against PostgreSQL:

```bash
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -f verification_queries.sql
```

Or run individual queries:

```sql
-- Quick health check
SELECT
    'Database Health Check' AS check_type,
    (SELECT COUNT(*) FROM hadiths) AS total_hadiths,
    (SELECT COUNT(*) FROM collections) AS total_collections,
    (SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count;

-- Find missing embeddings
SELECT id, collection_id, hadith_number
FROM hadiths
WHERE NOT embedding_generated
LIMIT 10;
```

---

## 🔍 Qdrant Verification

```bash
# Check collection exists
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings

# Count points
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/count \
  -H "Content-Type: application/json" \
  -d '{"exact": true}'

# Sample search (requires the full 1024-dim query embedding; vector truncated here)
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/search \
  -H "Content-Type: application/json" \
  -d '{
    "vector": [0.1, 0.2, ...],
    "limit": 5,
    "with_payload": true
  }'
```

---

## 🌐 Deploy Search API

### Option 1: Run Locally

```bash
python search_api.py
# API available at http://localhost:8080
```

### Option 2: Deploy to Kubernetes

```bash
# Create namespace
kubectl create namespace hadith

# Create ConfigMap with API code
kubectl create configmap search-api-code \
  --from-file=search_api.py \
  -n hadith

# Update secrets in k8s-search-api.yaml with your password

# Deploy
kubectl apply -f k8s-search-api.yaml

# Check status
kubectl -n hadith get pods
kubectl -n hadith logs -f deployment/search-api
```

### Option 3: Build Docker Image

```bash
# Build image
docker build -t hadith-search-api:latest .

# Run locally
docker run -p 8080:8080 \
  -e POSTGRES_PASSWORD=your_password \
  -e POSTGRES_HOST=pg.betelgeusebytes.io \
  -e QDRANT_HOST=qdrant.betelgeusebytes.io \
  -e TEI_HOST=tei.betelgeusebytes.io \
  hadith-search-api:latest
```

---

## 🔌 API Endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/health` | Health check |
| GET | `/stats` | Database statistics |
| POST | `/search` | Semantic search |
| GET | `/search?q=query` | Simple search |
| GET | `/hadith/{id}` | Get hadith by ID |
| GET | `/similar/{id}` | Find similar hadiths |

### Example API Calls

```bash
# Health check
curl https://search.betelgeusebytes.io/health

# Get stats
curl https://search.betelgeusebytes.io/stats

# Search (GET)
curl "https://search.betelgeusebytes.io/search?q=five%20daily%20prayers&limit=5"

# Search (POST)
curl -X POST https://search.betelgeusebytes.io/search \
  -H "Content-Type: application/json" \
  -d '{
    "query": "الصلاة في المسجد الحرام",
    "limit": 10,
    "min_score": 0.5
  }'

# Get specific hadith
curl https://search.betelgeusebytes.io/hadith/1234

# Find similar hadiths
curl "https://search.betelgeusebytes.io/similar/1234?limit=5"
```
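
For scripted access, the same `/search` contract can be exercised from Python. A minimal sketch (the response field name `results` is an assumption based on the curl examples above; adjust to the actual schema of `search_api.py`):

```python
import httpx

# POST a semantic search query; mirrors the curl POST example above
resp = httpx.post(
    "https://search.betelgeusebytes.io/search",
    json={"query": "five daily prayers", "limit": 5, "min_score": 0.5},
    timeout=10,
)
resp.raise_for_status()
for hit in resp.json().get("results", []):  # "results" key assumed
    print(hit)
```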

---

## 📈 Sample Search Queries

### Arabic Queries

| Query | Description |
|-------|-------------|
| الصلاة في المسجد الحرام | Prayer in the Sacred Mosque |
| أبو هريرة رضي الله عنه | Abu Hurairah (RA) |
| الصيام في شهر رمضان | Fasting in Ramadan |
| الزكاة والصدقة | Zakat and charity |
| الحج والعمرة | Hajj and Umrah |
| الوضوء والطهارة | Ablution and purification |
| بر الوالدين | Honoring parents |
| الجنة والنار | Paradise and Hell |

### English Queries

| Query | Description |
|-------|-------------|
| five daily prayers | The five obligatory prayers |
| treatment of neighbors | Rights of neighbors |
| patience during hardship | Patience in trials |
| marriage and family | Islamic marriage guidance |
| honesty and truthfulness | Being truthful |
| Day of Judgment signs | Signs of the Last Day |
| companions of the Prophet | Sahaba and their virtues |
| seeking knowledge in Islam | Importance of knowledge |

---

## 🐛 Troubleshooting

### 1. "Connection refused" to services

**Problem:** Cannot connect to PostgreSQL/Qdrant/TEI

**Solution:**
```bash
# Check if running inside cluster
kubectl -n ml get pods
kubectl -n vector get pods

# For external access, use port-forward
kubectl port-forward -n vector svc/qdrant 6333:6333
kubectl port-forward -n ml svc/tei 8080:80
```

### 2. "Missing embeddings found"

**Problem:** Some hadiths don't have embeddings

**Solution:**
```bash
# Re-run embedding workflow for missing IDs
argo submit -n ml embedding-workflow.yaml \
  --parameter hadith-ids="[1,2,3,...]"

# Or update the database flag
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -c \
  "UPDATE hadiths SET embedding_generated = false WHERE id IN (1,2,3)"
```

### 3. Slow search performance (>500ms)

**Problem:** Queries taking too long

**Solutions:**
1. Check TEI service health:
   ```bash
   curl http://tei.ml.svc.cluster.local/health
   ```

2. Check Qdrant indexing:
   ```bash
   curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings | jq '.result.status'
   ```

3. Enable HNSW index in Qdrant:
   ```bash
   curl -X PATCH http://qdrant.betelgeusebytes.io/collections/hadith_embeddings \
     -H "Content-Type: application/json" \
     -d '{"optimizers_config": {"indexing_threshold": 10000}}'
   ```

### 4. "Embedding dimension mismatch"

**Problem:** Qdrant rejects embeddings

**Solution:** Verify BGE-M3 produces 1024-dim vectors (the response is a list of embeddings, so measure the first one):
```bash
curl -X POST http://tei.ml.svc.cluster.local/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": "test"}' | jq '.[0] | length'
# should print 1024
```

---

## ✅ Verification Checklist

Before proceeding to Step 7, ensure:

- [ ] `verify_embeddings.py` shows 0 missing embeddings
- [ ] All 8 collections have 100% embedding coverage
- [ ] Benchmark shows P95 < 500ms
- [ ] At least 95% of queries meet the target
- [ ] API endpoints respond correctly
- [ ] Sample Arabic and English queries return relevant results
- [ ] Results are enriched with full hadith data from PostgreSQL

---

## 📚 Next Steps

Once Step 6 is verified:

1. **Step 7:** Annotation Setup with Label Studio
   - Export 500 random hadiths for annotation
   - Configure NER labeling project
   - Create annotation guidelines

2. **Step 8:** NER Model Training
   - Use annotated data to train entity extraction model
   - Target F1 > 0.85

---

## 📝 Output Files

After running the scripts:

- `verification_results.json` - Embedding verification report
- `benchmark_results.json` - Performance benchmark results

These can be stored in MLflow for tracking. Create the experiment once:

```bash
mlflow experiments create --experiment-name "step6-verification"
```

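Then attach the reports from Python with the MLflow client API (a minimal sketch; the tracking URI comes from the environment file, and the experiment/run names mirror the CLI example above):

```python
import mlflow

mlflow.set_tracking_uri("https://mlflow.betelgeusebytes.io")
mlflow.set_experiment("step6-verification")

# Attach the two JSON reports produced by the Step 6 scripts to one run
with mlflow.start_run(run_name="embedding-verification"):
    mlflow.log_artifact("verification_results.json")
    mlflow.log_artifact("benchmark_results.json")
```
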

@@ -0,0 +1,260 @@
{
  "total_queries": 22,
  "successful_queries": 22,
  "failed_queries": 0,
  "avg_embedding_time_ms": 124.84432727135506,
  "avg_search_time_ms": 22.30660000085746,
  "avg_total_time_ms": 147.15092727221253,
  "p50_time_ms": 146.22399999643676,
  "p95_time_ms": 163.0423000169685,
  "p99_time_ms": 172.9122999822721,
  "min_time_ms": 130.5485999910161,
  "max_time_ms": 172.9122999822721,
  "queries_meeting_target": 22,
  "target_ms": 500,
  "query_results": [
    {
      "query": "\u0627\u0644\u0635\u0644\u0627\u0629 \u0641\u064a \u0627\u0644\u0645\u0633\u062c\u062f \u0627\u0644\u062d\u0631\u0627\u0645",
      "language": "arabic",
      "description": "Prayer in the Sacred Mosque",
      "embedding_time_ms": 137.83629999670666,
      "search_time_ms": 21.302799999830313,
      "total_time_ms": 159.13909999653697,
      "results_count": 10,
      "top_score": 0.7661493,
      "meets_target": true
    },
    {
      "query": "\u0623\u0628\u0648 \u0647\u0631\u064a\u0631\u0629 \u0631\u0636\u064a \u0627\u0644\u0644\u0647 \u0639\u0646\u0647",
      "language": "arabic",
      "description": "Abu Hurairah (RA)",
      "embedding_time_ms": 142.00750000600237,
      "search_time_ms": 21.034800010966137,
      "total_time_ms": 163.0423000169685,
      "results_count": 10,
      "top_score": 0.79470885,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u0635\u064a\u0627\u0645 \u0641\u064a \u0634\u0647\u0631 \u0631\u0645\u0636\u0627\u0646",
      "language": "arabic",
      "description": "Fasting in Ramadan",
      "embedding_time_ms": 141.40879998740274,
      "search_time_ms": 21.46490001177881,
      "total_time_ms": 162.87369999918155,
      "results_count": 10,
      "top_score": 0.81152785,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u0632\u0643\u0627\u0629 \u0648\u0627\u0644\u0635\u062f\u0642\u0629",
      "language": "arabic",
      "description": "Zakat and charity",
      "embedding_time_ms": 125.70800000685267,
      "search_time_ms": 21.658099998603575,
      "total_time_ms": 147.36610000545625,
      "results_count": 10,
      "top_score": 0.73705375,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u062d\u062c \u0648\u0627\u0644\u0639\u0645\u0631\u0629",
      "language": "arabic",
      "description": "Hajj and Umrah",
      "embedding_time_ms": 128.12189999385737,
      "search_time_ms": 21.92250000371132,
      "total_time_ms": 150.0443999975687,
      "results_count": 10,
      "top_score": 0.75435185,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u0646\u0628\u064a \u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645 \u0641\u064a \u0627\u0644\u0645\u062f\u064a\u0646\u0629",
      "language": "arabic",
      "description": "Prophet (PBUH) in Medina",
      "embedding_time_ms": 151.73289999074768,
      "search_time_ms": 21.179399991524406,
      "total_time_ms": 172.9122999822721,
      "results_count": 10,
      "top_score": 0.75164807,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u0648\u0636\u0648\u0621 \u0648\u0627\u0644\u0637\u0647\u0627\u0631\u0629",
      "language": "arabic",
      "description": "Ablution and purification",
      "embedding_time_ms": 131.21989999490324,
      "search_time_ms": 21.02040000318084,
      "total_time_ms": 152.24029999808408,
      "results_count": 10,
      "top_score": 0.6073998,
      "meets_target": true
    },
    {
      "query": "\u0628\u0631 \u0627\u0644\u0648\u0627\u0644\u062f\u064a\u0646",
      "language": "arabic",
      "description": "Honoring parents",
      "embedding_time_ms": 120.76360000355635,
      "search_time_ms": 22.190200004843064,
      "total_time_ms": 142.9538000083994,
      "results_count": 10,
      "top_score": 0.7476402,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u062c\u0646\u0629 \u0648\u0627\u0644\u0646\u0627\u0631",
      "language": "arabic",
      "description": "Paradise and Hell",
      "embedding_time_ms": 124.25219999568071,
      "search_time_ms": 23.127499996917322,
      "total_time_ms": 147.37969999259803,
      "results_count": 10,
      "top_score": 0.7781049,
      "meets_target": true
    },
    {
      "query": "\u0627\u0644\u0625\u064a\u0645\u0627\u0646 \u0648\u0627\u0644\u0625\u0633\u0644\u0627\u0645",
      "language": "arabic",
      "description": "Faith and Islam",
      "embedding_time_ms": 127.87359999492764,
      "search_time_ms": 21.657500008586794,
      "total_time_ms": 149.53110000351444,
      "results_count": 10,
      "top_score": 0.7572472,
      "meets_target": true
    },
    {
      "query": "five daily prayers",
      "language": "english",
      "description": "The five obligatory prayers",
      "embedding_time_ms": 109.2108000011649,
      "search_time_ms": 21.33779998985119,
      "total_time_ms": 130.5485999910161,
      "results_count": 10,
      "top_score": 0.759544,
      "meets_target": true
    },
    {
      "query": "Prophet Muhammad in Mecca",
      "language": "english",
      "description": "Prophet's life in Mecca",
      "embedding_time_ms": 113.58699999982491,
      "search_time_ms": 23.10490001400467,
      "total_time_ms": 136.69190001382958,
      "results_count": 10,
      "top_score": 0.80261445,
      "meets_target": true
    },
    {
      "query": "treatment of neighbors",
      "language": "english",
      "description": "Rights and treatment of neighbors",
      "embedding_time_ms": 115.10320000525098,
      "search_time_ms": 21.576900006039068,
      "total_time_ms": 136.68010001129005,
      "results_count": 10,
      "top_score": 0.61891544,
      "meets_target": true
    },
    {
      "query": "patience during hardship",
      "language": "english",
      "description": "Patience in difficult times",
      "embedding_time_ms": 122.27100000018254,
      "search_time_ms": 22.26110000628978,
      "total_time_ms": 144.53210000647232,
      "results_count": 10,
      "top_score": 0.697459,
      "meets_target": true
    },
    {
      "query": "marriage and family",
      "language": "english",
      "description": "Islamic marriage guidance",
      "embedding_time_ms": 116.44650000380352,
      "search_time_ms": 21.11739999963902,
      "total_time_ms": 137.56390000344254,
      "results_count": 10,
      "top_score": 0.6414789,
      "meets_target": true
    },
    {
      "query": "honesty and truthfulness",
      "language": "english",
      "description": "Importance of being truthful",
      "embedding_time_ms": 119.8251000023447,
      "search_time_ms": 22.77229999890551,
      "total_time_ms": 142.5974000012502,
      "results_count": 10,
      "top_score": 0.64781964,
      "meets_target": true
    },
    {
      "query": "Day of Judgment signs",
      "language": "english",
      "description": "Signs of the Last Day",
      "embedding_time_ms": 112.60979999497067,
      "search_time_ms": 22.185800000443123,
      "total_time_ms": 134.7955999954138,
      "results_count": 10,
      "top_score": 0.71163684,
      "meets_target": true
    },
    {
      "query": "charity and helping poor",
      "language": "english",
      "description": "Giving charity to the needy",
      "embedding_time_ms": 120.4487000068184,
      "search_time_ms": 22.555499992449768,
      "total_time_ms": 143.00419999926817,
      "results_count": 10,
      "top_score": 0.72138125,
      "meets_target": true
    },
    {
      "query": "companions of the Prophet",
      "language": "english",
      "description": "Sahaba and their virtues",
      "embedding_time_ms": 112.31199999747332,
      "search_time_ms": 23.779299997841008,
      "total_time_ms": 136.09129999531433,
      "results_count": 10,
      "top_score": 0.7868167,
      "meets_target": true
    },
    {
      "query": "seeking knowledge in Islam",
      "language": "english",
      "description": "Importance of knowledge",
      "embedding_time_ms": 119.9167999875499,
      "search_time_ms": 25.41219998965971,
      "total_time_ms": 145.3289999772096,
      "results_count": 10,
      "top_score": 0.76270455,
      "meets_target": true
    },
    {
      "query": "\u0642\u0627\u0644 \u0631\u0633\u0648\u0644 \u0627\u0644\u0644\u0647 about kindness",
      "language": "mixed",
      "description": "Prophet's sayings about kindness (mixed)",
      "embedding_time_ms": 134.37929999781772,
      "search_time_ms": 21.40019999933429,
      "total_time_ms": 155.77949999715202,
      "results_count": 10,
      "top_score": 0.8195741,
      "meets_target": true
    },
    {
      "query": "women rights \u0627\u0644\u0625\u0633\u0644\u0627\u0645",
      "language": "mixed",
      "description": "Women's rights in Islam (mixed)",
      "embedding_time_ms": 119.54030000197235,
      "search_time_ms": 26.683699994464405,
      "total_time_ms": 146.22399999643676,
      "results_count": 10,
      "top_score": 0.72458637,
      "meets_target": true
    }
  ],
  "timestamp": "2025-11-28T10:05:31.314855"
}

@@ -0,0 +1,37 @@
# ============================================================================
# Step 6: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================

# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=YOUR_PASSWORD_HERE

# Qdrant Configuration (internal cluster access)
export QDRANT_HOST=qdrant.vector.svc.cluster.local
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings

# Qdrant Configuration (external access - uncomment if needed)
# export QDRANT_HOST=qdrant.betelgeusebytes.io
# export QDRANT_PORT=443

# TEI Configuration (internal cluster access)
export TEI_HOST=tei.ml.svc.cluster.local
export TEI_PORT=80

# TEI Configuration (external access - uncomment if needed)
# export TEI_HOST=tei.betelgeusebytes.io
# export TEI_PORT=443

# vLLM Configuration (for later steps)
export VLLM_HOST=vllm.ml.svc.cluster.local
export VLLM_PORT=8000

# MLflow Configuration
export MLFLOW_TRACKING_URI=https://mlflow.betelgeusebytes.io

@@ -0,0 +1,48 @@
# ============================================================================
# Hadith Semantic Search API - Dockerfile
# ============================================================================
# Build: docker build -t hadith-search-api:latest .
# Run:   docker run -p 8080:8080 --env-file .env hadith-search-api:latest
# ============================================================================

FROM python:3.11-slim

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user
RUN useradd --create-home --shell /bin/bash appuser

# Set work directory
WORKDIR /app

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY search_api.py .

# Change ownership
RUN chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Expose port
EXPOSE 8080
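
# The slim base image ships without curl, so the health check below uses
# httpx, which is already installed from requirements.txt.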

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import httpx; httpx.get('http://localhost:8080/health', timeout=5)"

# Run the application
CMD ["python", "-m", "uvicorn", "search_api:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]

@@ -0,0 +1,407 @@
# Step 6: Verify Embeddings & Test Semantic Search

## 📋 Overview

This step validates that all ~40,000 hadiths have been properly embedded and stored in Qdrant, then builds and benchmarks a semantic search system.

**Target Performance:** <500ms per query

---

## 📁 Files Included

| File | Description |
|------|-------------|
| `verify_embeddings.py` | Validates all hadiths have embeddings in Qdrant |
| `semantic_search.py` | Tests semantic search with benchmarking |
| `search_api.py` | Production-ready FastAPI search service |
| `verification_queries.sql` | SQL queries for database verification |
| `k8s-search-api.yaml` | Kubernetes deployment manifests |
| `Dockerfile` | Container image for search API |
| `requirements.txt` | Python dependencies |
| `run_tests.sh` | Quick test runner script |

---

## 🔧 Prerequisites

1. **Python 3.10+** with pip
2. **Access to services:**
   - PostgreSQL at `pg.betelgeusebytes.io:5432`
   - Qdrant at `qdrant.vector.svc.cluster.local:6333`
   - TEI at `tei.ml.svc.cluster.local:80`
3. **Environment variables:**

   ```bash
   export POSTGRES_HOST=pg.betelgeusebytes.io
   export POSTGRES_PORT=5432
   export POSTGRES_DB=hadith_db
   export POSTGRES_USER=hadith_ingest
   export POSTGRES_PASSWORD=your_password

   export QDRANT_HOST=qdrant.vector.svc.cluster.local
   export QDRANT_PORT=6333
   export QDRANT_COLLECTION=hadith_embeddings

   export TEI_HOST=tei.ml.svc.cluster.local
   export TEI_PORT=80
   ```

---

## 🚀 Quick Start

### 1. Install Dependencies

```bash
pip install -r requirements.txt
```

### 2. Run Embedding Verification

```bash
python verify_embeddings.py
```

**Expected Output:**
```
┌───────────────────────────────────────────┐
│ Step 6.1 - Hadith Embeddings Verification │
└───────────────────────────────────────────┘

1. Checking PostgreSQL database...
   Total hadiths: 40,123
   Marked as embedded: 40,123

2. Collection breakdown:
   ┌────────────────┬────────┬──────────┐
   │ Collection     │ Total  │ Embedded │
   ├────────────────┼────────┼──────────┤
   │ Sahih Bukhari  │ 7,563  │ 7,563    │
   │ Sahih Muslim   │ 7,453  │ 7,453    │
   │ ...            │ ...    │ ...      │
   └────────────────┴────────┴──────────┘

✓ ALL EMBEDDINGS VERIFIED!
```

### 3. Run Semantic Search Benchmark

```bash
# Full benchmark
python semantic_search.py --mode benchmark

# Interactive mode
python semantic_search.py --mode interactive

# Single query
python semantic_search.py --query "الصلاة في المسجد"

# Demo mode
python semantic_search.py --mode demo
```

**Expected Benchmark Output:**
```
═══════════════════════════════════════════════════════
BENCHMARK RESULTS
═══════════════════════════════════════════════════════

Query Statistics:
  Total queries: 22
  Successful: 22
  Failed: 0

Timing Statistics:
  Average embedding time: 45.3ms
  Average search time: 12.8ms
  Average total time: 58.1ms

Percentiles:
  P50: 55.2ms
  P95: 89.4ms
  P99: 112.3ms

Performance Target (<500ms):
  Queries meeting target: 22/22 (100.0%)
  Status: ✓ TARGET MET
```

---

## 📊 Verification Queries (SQL)

Run these directly against PostgreSQL:

```bash
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -f verification_queries.sql
```

Or run individual queries:

```sql
-- Quick health check
SELECT
    'Database Health Check' AS check_type,
    (SELECT COUNT(*) FROM hadiths) AS total_hadiths,
    (SELECT COUNT(*) FROM collections) AS total_collections,
    (SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count;

-- Find missing embeddings
SELECT id, collection_id, hadith_number
FROM hadiths
WHERE NOT embedding_generated
LIMIT 10;
```

---

## 🔍 Qdrant Verification

```bash
# Check collection exists
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings

# Count points
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/count \
  -H "Content-Type: application/json" \
  -d '{"exact": true}'

# Sample search (requires an embedding; the vector must be the full 1024-dim list)
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/search \
  -H "Content-Type: application/json" \
  -d '{
    "vector": [0.1, 0.2, ...],
    "limit": 5,
    "with_payload": true
  }'
```
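
As a cross-check, the exact point count reported by Qdrant should match the embedded count in PostgreSQL. A minimal Python sketch of that comparison (assuming one Qdrant point per hadith and the environment variables from the Prerequisites section):

```python
import os

import httpx
import psycopg2

# Count hadiths marked as embedded in PostgreSQL
conn = psycopg2.connect(
    host=os.environ["POSTGRES_HOST"],
    port=os.getenv("POSTGRES_PORT", "5432"),
    dbname=os.getenv("POSTGRES_DB", "hadith_db"),
    user=os.getenv("POSTGRES_USER", "hadith_ingest"),
    password=os.environ["POSTGRES_PASSWORD"],
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated")
    pg_count = cur.fetchone()[0]

# Exact point count from Qdrant's REST API
qdrant = f"http://{os.getenv('QDRANT_HOST', 'qdrant.vector.svc.cluster.local')}:{os.getenv('QDRANT_PORT', '6333')}"
r = httpx.post(f"{qdrant}/collections/hadith_embeddings/points/count",
               json={"exact": True}, timeout=30.0)
qdrant_count = r.json()["result"]["count"]

print(f"PostgreSQL embedded: {pg_count}  |  Qdrant points: {qdrant_count}")
assert pg_count == qdrant_count, "Counts diverge - re-run verify_embeddings.py"
```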

---

## 🌐 Deploy Search API

### Option 1: Run Locally

```bash
python search_api.py
# API available at http://localhost:8080
```

### Option 2: Deploy to Kubernetes

```bash
# Create namespace
kubectl create namespace hadith

# Create ConfigMap with API code
kubectl create configmap search-api-code \
  --from-file=search_api.py \
  -n hadith

# Update secrets in k8s-search-api.yaml with your password

# Deploy
kubectl apply -f k8s-search-api.yaml

# Check status
kubectl -n hadith get pods
kubectl -n hadith logs -f deployment/search-api
```

### Option 3: Build Docker Image

```bash
# Build image
docker build -t hadith-search-api:latest .

# Run locally
docker run -p 8080:8080 \
  -e POSTGRES_PASSWORD=your_password \
  -e POSTGRES_HOST=pg.betelgeusebytes.io \
  -e QDRANT_HOST=qdrant.betelgeusebytes.io \
  -e TEI_HOST=tei.betelgeusebytes.io \
  hadith-search-api:latest
```

---

## 🔌 API Endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/health` | Health check |
| GET | `/stats` | Database statistics |
| POST | `/search` | Semantic search |
| GET | `/search?q=query` | Simple search |
| GET | `/hadith/{id}` | Get hadith by ID |
| GET | `/similar/{id}` | Find similar hadiths |

### Example API Calls

```bash
# Health check
curl https://search.betelgeusebytes.io/health

# Get stats
curl https://search.betelgeusebytes.io/stats

# Search (GET)
curl "https://search.betelgeusebytes.io/search?q=five%20daily%20prayers&limit=5"

# Search (POST)
curl -X POST https://search.betelgeusebytes.io/search \
  -H "Content-Type: application/json" \
  -d '{
    "query": "الصلاة في المسجد الحرام",
    "limit": 10,
    "min_score": 0.5
  }'

# Get specific hadith
curl https://search.betelgeusebytes.io/hadith/1234

# Find similar hadiths
curl "https://search.betelgeusebytes.io/similar/1234?limit=5"
```
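
The same endpoints are easy to drive from Python; a short sketch using `httpx` (already in `requirements.txt`), pointed at the same host as the curl examples:

```python
import httpx

BASE_URL = "https://search.betelgeusebytes.io"

with httpx.Client(base_url=BASE_URL, timeout=30.0) as client:
    # POST body mirrors the SearchQuery model in search_api.py
    resp = client.post("/search", json={
        "query": "five daily prayers",
        "limit": 5,
        "min_score": 0.5,
    })
    resp.raise_for_status()
    data = resp.json()

    print(f"{data['total_results']} results in {data['total_time_ms']:.0f}ms")
    for hit in data["results"]:
        print(f"  [{hit['score']:.3f}] {hit['collection']} #{hit['hadith_number']}")
```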

---

## 📈 Sample Search Queries

### Arabic Queries

| Query | Description |
|-------|-------------|
| الصلاة في المسجد الحرام | Prayer in the Sacred Mosque |
| أبو هريرة رضي الله عنه | Abu Hurairah (RA) |
| الصيام في شهر رمضان | Fasting in Ramadan |
| الزكاة والصدقة | Zakat and charity |
| الحج والعمرة | Hajj and Umrah |
| الوضوء والطهارة | Ablution and purification |
| بر الوالدين | Honoring parents |
| الجنة والنار | Paradise and Hell |

### English Queries

| Query | Description |
|-------|-------------|
| five daily prayers | The five obligatory prayers |
| treatment of neighbors | Rights of neighbors |
| patience during hardship | Patience in trials |
| marriage and family | Islamic marriage guidance |
| honesty and truthfulness | Being truthful |
| Day of Judgment signs | Signs of the Last Day |
| companions of the Prophet | Sahaba and their virtues |
| seeking knowledge in Islam | Importance of knowledge |

---

## 🐛 Troubleshooting

### 1. "Connection refused" to services

**Problem:** Cannot connect to PostgreSQL/Qdrant/TEI

**Solution:**
```bash
# Check if running inside cluster
kubectl -n ml get pods
kubectl -n vector get pods

# For external access, use port-forward
kubectl port-forward -n vector svc/qdrant 6333:6333
kubectl port-forward -n ml svc/tei 8080:80
```

### 2. "Missing embeddings found"

**Problem:** Some hadiths don't have embeddings

**Solution:**
```bash
# Re-run embedding workflow for missing IDs
argo submit -n ml embedding-workflow.yaml \
  --parameter hadith-ids="[1,2,3,...]"

# Or update the database flag
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -c \
  "UPDATE hadiths SET embedding_generated = false WHERE id IN (1,2,3)"
```
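
To collect the exact ID list for that re-run, a minimal sketch (same connection settings as in the Prerequisites section):

```python
import os

import psycopg2

conn = psycopg2.connect(
    host=os.environ["POSTGRES_HOST"],
    dbname=os.getenv("POSTGRES_DB", "hadith_db"),
    user=os.getenv("POSTGRES_USER", "hadith_ingest"),
    password=os.environ["POSTGRES_PASSWORD"],
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT id FROM hadiths WHERE NOT embedding_generated ORDER BY id")
    missing = [row[0] for row in cur.fetchall()]

# Print in the format expected by the --parameter flag above
print(f"{len(missing)} hadiths missing embeddings")
print("[" + ",".join(map(str, missing)) + "]")
```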

### 3. Slow search performance (>500ms)

**Problem:** Queries taking too long

**Solutions:**
1. Check TEI service health:
   ```bash
   curl http://tei.ml.svc.cluster.local/health
   ```

2. Check Qdrant indexing:
   ```bash
   curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings | jq '.result.status'
   ```

3. Enable HNSW index in Qdrant:
   ```bash
   curl -X PATCH http://qdrant.betelgeusebytes.io/collections/hadith_embeddings \
     -H "Content-Type: application/json" \
     -d '{"optimizers_config": {"indexing_threshold": 10000}}'
   ```

### 4. "Embedding dimension mismatch"

**Problem:** Qdrant rejects embeddings

**Solution:** Verify BGE-M3 produces 1024-dim vectors (TEI returns one embedding per input, so take the first element before counting):
```bash
curl -X POST http://tei.ml.svc.cluster.local/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": "test"}' | jq '.[0] | length'
```

---

## ✅ Verification Checklist

Before proceeding to Step 7, ensure:

- [ ] `verify_embeddings.py` shows 0 missing embeddings
- [ ] All 8 collections have 100% embedding coverage
- [ ] Benchmark shows P95 < 500ms
- [ ] At least 95% of queries meet the target
- [ ] API endpoints respond correctly
- [ ] Sample Arabic and English queries return relevant results
- [ ] Results are enriched with full hadith data from PostgreSQL

---

## 📚 Next Steps

Once Step 6 is verified:

1. **Step 7:** Annotation Setup with Label Studio
   - Export 500 random hadiths for annotation
   - Configure NER labeling project
   - Create annotation guidelines

2. **Step 8:** NER Model Training
   - Use annotated data to train entity extraction model
   - Target F1 > 0.85

---

## 📝 Output Files

After running the scripts:

- `verification_results.json` - Embedding verification report
- `benchmark_results.json` - Performance benchmark results

These can be stored in MLflow for tracking. The experiment is created once from the CLI; runs and artifacts are then attached via the Python API (see the sketch below):

```bash
mlflow experiments create -n "step6-verification"
```
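
A minimal sketch that records both reports on a run (tracking URI as configured in the .env file; the logged P95 value comes from `benchmark_results.json`):

```python
import mlflow

mlflow.set_tracking_uri("https://mlflow.betelgeusebytes.io")
mlflow.set_experiment("step6-verification")

with mlflow.start_run(run_name="embedding-verification"):
    # Attach the JSON reports produced by the two scripts
    mlflow.log_artifact("verification_results.json")
    mlflow.log_artifact("benchmark_results.json")
    # Headline latency number, easier to compare across runs as a metric
    mlflow.log_metric("p95_time_ms", 163.04)
```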

@@ -0,0 +1,183 @@
# ============================================================================
# Step 6: Semantic Search API - Kubernetes Deployment
# ============================================================================
# Deploy: kubectl apply -f k8s-search-api.yaml
# ============================================================================

---
# Namespace (if not exists)
apiVersion: v1
kind: Namespace
metadata:
  name: hadith
---
# ConfigMap for non-sensitive configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: search-api-config
  namespace: hadith
data:
  POSTGRES_HOST: "postgres.db.svc.cluster.local"
  POSTGRES_PORT: "5432"
  POSTGRES_DB: "hadith_db"
  POSTGRES_USER: "hadith_ingest"
  QDRANT_HOST: "qdrant.vector.svc.cluster.local"
  QDRANT_PORT: "6333"
  QDRANT_COLLECTION: "hadith_embeddings"
  TEI_HOST: "tei.ml.svc.cluster.local"
  TEI_PORT: "80"
---
# Secret for database password
apiVersion: v1
kind: Secret
metadata:
  name: search-api-secrets
  namespace: hadith
type: Opaque
stringData:
  POSTGRES_PASSWORD: "CHANGE_ME_TO_YOUR_PASSWORD"
---
# Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: search-api
  namespace: hadith
  labels:
    app: search-api
spec:
  replicas: 2
  selector:
    matchLabels:
      app: search-api
  template:
    metadata:
      labels:
        app: search-api
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        node: hetzner-2
      containers:
        - name: search-api
          image: python:3.11-slim
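          # Dependencies are installed at pod start to keep this manifest
          # self-contained; the bundled Dockerfile is the prebuilt alternative.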
          command:
            - /bin/bash
            - -c
            - |
              pip install --no-cache-dir \
                fastapi uvicorn httpx psycopg2-binary pydantic && \
              python /app/search_api.py
          ports:
            - containerPort: 8080
              name: http
          envFrom:
            - configMapRef:
                name: search-api-config
            - secretRef:
                name: search-api-secrets
          volumeMounts:
            - name: app-code
              mountPath: /app
          resources:
            requests:
              cpu: "250m"
              memory: "256Mi"
            limits:
              cpu: "1"
              memory: "512Mi"
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
      volumes:
        - name: app-code
          configMap:
            name: search-api-code
---
# Service
apiVersion: v1
kind: Service
metadata:
  name: search-api
  namespace: hadith
spec:
  selector:
    app: search-api
  ports:
    - name: http
      port: 80
      targetPort: 8080
  type: ClusterIP
---
# Ingress
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: search-api
  namespace: hadith
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - search.betelgeusebytes.io
      secretName: search-api-tls
  rules:
    - host: search.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: search-api
                port:
                  number: 80
---
# HorizontalPodAutoscaler (optional)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: search-api-hpa
  namespace: hadith
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: search-api
  minReplicas: 2
  maxReplicas: 5
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

@@ -0,0 +1,19 @@
# Step 6: Verify Embeddings & Semantic Search
# Requirements for hadith-phase3-step6

# Database
psycopg2-binary>=2.9.9

# HTTP client
httpx>=0.27.0

# Rich console output
rich>=13.7.0

# Data handling
python-dateutil>=2.8.2

# Optional: for running as web API
fastapi>=0.111.0
uvicorn>=0.30.0
pydantic>=2.7.0

@@ -0,0 +1,217 @@
#!/bin/bash
# ============================================================================
# Step 6: Quick Test Runner
# ============================================================================
# Usage: ./run_tests.sh [verify|benchmark|demo|api|all]
# ============================================================================

set -e
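
# Note: with set -e the script exits on any failing command, so the helper
# functions below test commands directly instead of inspecting $? afterwards.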

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration - Update these for your environment
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}"

export QDRANT_HOST="${QDRANT_HOST:-qdrant.vector.svc.cluster.local}"
export QDRANT_PORT="${QDRANT_PORT:-6333}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"

export TEI_HOST="${TEI_HOST:-tei.ml.svc.cluster.local}"
export TEI_PORT="${TEI_PORT:-80}"

# Check if password is set
check_password() {
    if [ -z "$POSTGRES_PASSWORD" ]; then
        echo -e "${RED}Error: POSTGRES_PASSWORD environment variable is not set${NC}"
        echo "Set it with: export POSTGRES_PASSWORD='your_password'"
        exit 1
    fi
}

# Install dependencies
install_deps() {
    echo -e "${BLUE}Installing dependencies...${NC}"
    pip install -q -r requirements.txt
    echo -e "${GREEN}Dependencies installed.${NC}"
}

# Run verification
run_verify() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Embedding Verification...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    if python verify_embeddings.py; then
        echo -e "\n${GREEN}✓ Verification passed!${NC}"
    else
        echo -e "\n${RED}✗ Verification failed - some embeddings are missing${NC}"
        exit 1
    fi
}

# Run benchmark
run_benchmark() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Semantic Search Benchmark...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    python semantic_search.py --mode benchmark --output benchmark_results.json

    echo -e "\n${GREEN}✓ Benchmark complete. Results saved to benchmark_results.json${NC}"
}

# Run demo
run_demo() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Search Demo...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    python semantic_search.py --mode demo
}

# Run API server
run_api() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Starting Search API Server...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    echo -e "${YELLOW}API will be available at: http://localhost:8080${NC}"
    echo -e "${YELLOW}Swagger docs at: http://localhost:8080/docs${NC}"
    echo -e "${YELLOW}Press Ctrl+C to stop${NC}\n"

    python search_api.py
}

# Run SQL verification
run_sql() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running SQL Verification Queries...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    PGPASSWORD="$POSTGRES_PASSWORD" psql \
        -h "$POSTGRES_HOST" \
        -p "$POSTGRES_PORT" \
        -U "$POSTGRES_USER" \
        -d "$POSTGRES_DB" \
        -f verification_queries.sql
}

# Quick connectivity test
test_connectivity() {
    echo -e "\n${BLUE}Testing Service Connectivity...${NC}\n"

    # Test PostgreSQL
    echo -n "PostgreSQL ($POSTGRES_HOST:$POSTGRES_PORT): "
    if PGPASSWORD="$POSTGRES_PASSWORD" psql -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT 1" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    # Test Qdrant
    echo -n "Qdrant ($QDRANT_HOST:$QDRANT_PORT): "
    if curl -s "http://$QDRANT_HOST:$QDRANT_PORT/collections" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    # Test TEI
    echo -n "TEI ($TEI_HOST:$TEI_PORT): "
    if curl -s "http://$TEI_HOST:$TEI_PORT/health" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    echo ""
}

# Show usage
show_usage() {
    echo "Usage: $0 [command]"
    echo ""
    echo "Commands:"
    echo "  verify      Run embedding verification"
    echo "  benchmark   Run semantic search benchmark"
    echo "  demo        Run search demo with sample queries"
    echo "  api         Start the search API server"
    echo "  sql         Run SQL verification queries"
    echo "  test        Test connectivity to all services"
    echo "  all         Run verify + benchmark + demo"
    echo "  install     Install Python dependencies"
    echo "  help        Show this help message"
    echo ""
    echo "Environment variables:"
    echo "  POSTGRES_HOST      PostgreSQL host (default: pg.betelgeusebytes.io)"
    echo "  POSTGRES_PORT      PostgreSQL port (default: 5432)"
    echo "  POSTGRES_DB        Database name (default: hadith_db)"
    echo "  POSTGRES_USER      Database user (default: hadith_ingest)"
    echo "  POSTGRES_PASSWORD  Database password (required)"
    echo "  QDRANT_HOST        Qdrant host (default: qdrant.vector.svc.cluster.local)"
    echo "  QDRANT_PORT        Qdrant port (default: 6333)"
    echo "  TEI_HOST           TEI host (default: tei.ml.svc.cluster.local)"
    echo "  TEI_PORT           TEI port (default: 80)"
}

# Main
case "${1:-help}" in
    verify)
        check_password
        install_deps
        run_verify
        ;;
    benchmark)
        check_password
        install_deps
        run_benchmark
        ;;
    demo)
        check_password
        install_deps
        run_demo
        ;;
    api)
        check_password
        install_deps
        run_api
        ;;
    sql)
        check_password
        run_sql
        ;;
    test)
        check_password
        test_connectivity
        ;;
    all)
        check_password
        install_deps
        test_connectivity
        run_verify
        run_benchmark
        run_demo
        ;;
    install)
        install_deps
        ;;
    help|--help|-h)
        show_usage
        ;;
    *)
        echo -e "${RED}Unknown command: $1${NC}"
        show_usage
        exit 1
        ;;
esac

@@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Step 6.3: Semantic Search API Service
======================================
Production-ready FastAPI service for hadith semantic search.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import time
import logging
from datetime import datetime
from typing import List, Optional
from contextlib import asynccontextmanager

import httpx
import psycopg2
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.extras import RealDictCursor
from fastapi import FastAPI, HTTPException, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")

QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

TEI_HOST = os.getenv("TEI_HOST", "tei.ml.svc.cluster.local")
TEI_PORT = int(os.getenv("TEI_PORT", "80"))


# ============================================================================
# Pydantic Models
# ============================================================================

class SearchQuery(BaseModel):
    """Search query input."""
    query: str = Field(..., min_length=1, max_length=1000, description="Search query text")
    limit: int = Field(default=10, ge=1, le=100, description="Number of results to return")
    min_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity score")
    collections: Optional[List[str]] = Field(default=None, description="Filter by collection names")
    grades: Optional[List[str]] = Field(default=None, description="Filter by hadith grades")


class HadithResult(BaseModel):
    """Individual hadith search result."""
    hadith_id: int
    score: float
    collection: str
    book: Optional[str]
    hadith_number: str
    arabic_text: Optional[str]
    arabic_normalized: Optional[str]
    english_text: Optional[str]
    urdu_text: Optional[str]
    grade: Optional[str]


class SearchResponse(BaseModel):
    """Search response."""
    query: str
    results: List[HadithResult]
    total_results: int
    embedding_time_ms: float
    search_time_ms: float
    total_time_ms: float
    timestamp: str


class HealthResponse(BaseModel):
    """Health check response."""
    status: str
    database: str
    qdrant: str
    tei: str
    timestamp: str


class CollectionStats(BaseModel):
    """Collection statistics."""
    name: str
    total_hadiths: int
    embedded_count: int


class StatsResponse(BaseModel):
    """Statistics response."""
    total_hadiths: int
    total_embedded: int
    collections: List[CollectionStats]
    timestamp: str


# ============================================================================
# Database Pool & Connections
# ============================================================================

db_pool: Optional[ThreadedConnectionPool] = None
http_client: Optional[httpx.AsyncClient] = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifecycle."""
    global db_pool, http_client

    # Startup
    logger.info("Starting up semantic search service...")

    # Initialize database pool
    try:
        db_pool = ThreadedConnectionPool(
            minconn=2,
            maxconn=10,
            host=POSTGRES_HOST,
            port=POSTGRES_PORT,
            database=POSTGRES_DB,
            user=POSTGRES_USER,
            password=POSTGRES_PASSWORD,
            sslmode='require'
        )
        logger.info("Database pool initialized")
    except Exception as e:
        logger.error(f"Failed to initialize database pool: {e}")
        db_pool = None

    # Initialize HTTP client
    http_client = httpx.AsyncClient(timeout=30.0)
    logger.info("HTTP client initialized")

    yield

    # Shutdown
    logger.info("Shutting down...")
    if db_pool:
        db_pool.closeall()
    if http_client:
        await http_client.aclose()


# ============================================================================
# FastAPI App
# ============================================================================

app = FastAPI(
    title="Hadith Semantic Search API",
    description="Semantic search service for Islamic hadith literature",
    version="1.0.0",
    lifespan=lifespan
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ============================================================================
# Helper Functions
# ============================================================================

def get_db_connection():
    """Get database connection from pool."""
    if db_pool is None:
        raise HTTPException(status_code=503, detail="Database pool not available")
    return db_pool.getconn()


def release_db_connection(conn):
    """Return connection to pool."""
    if db_pool and conn:
        db_pool.putconn(conn)


async def get_embedding(text: str) -> tuple[List[float], float]:
    """Get embedding from TEI service."""
    start = time.perf_counter()

    try:
        response = await http_client.post(
            f"http://{TEI_HOST}:{TEI_PORT}/embed",
            json={"inputs": text}
        )
        response.raise_for_status()

        elapsed_ms = (time.perf_counter() - start) * 1000

        embeddings = response.json()
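        # TEI returns one embedding per input string, so a single input
        # comes back as a nested list [[...]] holding the 1024 floats.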
|
||||||
|
if isinstance(embeddings, list) and len(embeddings) > 0:
|
||||||
|
if isinstance(embeddings[0], list):
|
||||||
|
return embeddings[0], elapsed_ms
|
||||||
|
return embeddings, elapsed_ms
|
||||||
|
|
||||||
|
raise ValueError("Unexpected embedding format")
|
||||||
|
|
||||||
|
except httpx.HTTPError as e:
|
||||||
|
logger.error(f"TEI request failed: {e}")
|
||||||
|
raise HTTPException(status_code=503, detail=f"Embedding service error: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def search_qdrant(
|
||||||
|
embedding: List[float],
|
||||||
|
limit: int = 10,
|
||||||
|
min_score: float = 0.0,
|
||||||
|
filters: Optional[dict] = None
|
||||||
|
) -> tuple[List[dict], float]:
|
||||||
|
"""Search Qdrant with embedding vector."""
|
||||||
|
start = time.perf_counter()
|
||||||
|
|
||||||
|
try:
|
||||||
|
payload = {
|
||||||
|
"vector": embedding,
|
||||||
|
"limit": limit,
|
||||||
|
"with_payload": True,
|
||||||
|
"score_threshold": min_score
|
||||||
|
}
|
||||||
|
|
||||||
|
if filters:
|
||||||
|
payload["filter"] = filters
|
||||||
|
|
||||||
|
response = await http_client.post(
|
||||||
|
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
|
||||||
|
json=payload
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
elapsed_ms = (time.perf_counter() - start) * 1000
|
||||||
|
results = response.json().get("result", [])
|
||||||
|
|
||||||
|
return results, elapsed_ms
|
||||||
|
|
||||||
|
except httpx.HTTPError as e:
|
||||||
|
logger.error(f"Qdrant request failed: {e}")
|
||||||
|
raise HTTPException(status_code=503, detail=f"Vector search service error: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_results_from_db(hadith_ids: List[int]) -> dict[int, dict]:
|
||||||
|
"""Fetch full hadith data from PostgreSQL."""
|
||||||
|
if not hadith_ids:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
conn = get_db_connection()
|
||||||
|
try:
|
||||||
|
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||||
|
cur.execute("""
|
||||||
|
SELECT
|
||||||
|
h.id,
|
||||||
|
h.hadith_number,
|
||||||
|
h.arabic_text,
|
||||||
|
h.arabic_normalized,
|
||||||
|
h.english_text,
|
||||||
|
h.urdu_text,
|
||||||
|
h.grade,
|
||||||
|
c.name_english as collection_name,
|
||||||
|
b.name_english as book_name
|
||||||
|
FROM hadiths h
|
||||||
|
JOIN collections c ON h.collection_id = c.id
|
||||||
|
LEFT JOIN books b ON h.book_id = b.id
|
||||||
|
WHERE h.id = ANY(%s)
|
||||||
|
""", (hadith_ids,))
|
||||||
|
|
||||||
|
return {row['id']: dict(row) for row in cur.fetchall()}
|
||||||
|
finally:
|
||||||
|
release_db_connection(conn)
|
||||||
|
|
||||||
|
|
||||||
|
def build_qdrant_filter(collections: Optional[List[str]], grades: Optional[List[str]]) -> Optional[dict]:
|
||||||
|
"""Build Qdrant filter from parameters."""
|
||||||
|
conditions = []
|
||||||
|
|
||||||
|
if collections:
|
||||||
|
conditions.append({
|
||||||
|
"key": "collection",
|
||||||
|
"match": {"any": collections}
|
||||||
|
})
|
||||||
|
|
||||||
|
if grades:
|
||||||
|
conditions.append({
|
||||||
|
"key": "grade",
|
||||||
|
"match": {"any": grades}
|
||||||
|
})
|
||||||
|
|
||||||
|
if not conditions:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if len(conditions) == 1:
|
||||||
|
return {"must": conditions}
|
||||||
|
|
||||||
|
return {"must": conditions}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# API Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@app.get("/health", response_model=HealthResponse)
|
||||||
|
async def health_check():
|
||||||
|
"""Check health of all services."""
|
||||||
|
db_status = "healthy"
|
||||||
|
qdrant_status = "healthy"
|
||||||
|
tei_status = "healthy"
|
||||||
|
|
||||||
|
# Check database
|
||||||
|
try:
|
||||||
|
conn = get_db_connection()
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT 1")
|
||||||
|
release_db_connection(conn)
|
||||||
|
except Exception as e:
|
||||||
|
db_status = f"unhealthy: {e}"
|
||||||
|
|
||||||
|
# Check Qdrant
|
||||||
|
try:
|
||||||
|
response = await http_client.get(
|
||||||
|
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
|
||||||
|
)
|
||||||
|
if response.status_code != 200:
|
||||||
|
qdrant_status = f"unhealthy: status {response.status_code}"
|
||||||
|
except Exception as e:
|
||||||
|
qdrant_status = f"unhealthy: {e}"
|
||||||
|
|
||||||
|
# Check TEI
|
||||||
|
try:
|
||||||
|
response = await http_client.get(f"http://{TEI_HOST}:{TEI_PORT}/health")
|
||||||
|
if response.status_code != 200:
|
||||||
|
tei_status = f"unhealthy: status {response.status_code}"
|
||||||
|
except Exception as e:
|
||||||
|
tei_status = f"unhealthy: {e}"
|
||||||
|
|
||||||
|
overall = "healthy" if all(
|
||||||
|
s == "healthy" for s in [db_status, qdrant_status, tei_status]
|
||||||
|
    ) else "degraded"

    return HealthResponse(
        status=overall,
        database=db_status,
        qdrant=qdrant_status,
        tei=tei_status,
        timestamp=datetime.now().isoformat()
    )


@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get database statistics."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Total counts
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded
                FROM hadiths
            """)
            totals = cur.fetchone()

            # By collection
            cur.execute("""
                SELECT
                    c.name_english as name,
                    COUNT(h.id) as total_hadiths,
                    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded_count
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                GROUP BY c.id, c.name_english
                ORDER BY total_hadiths DESC
            """)
            collections = [CollectionStats(**dict(row)) for row in cur.fetchall()]

        return StatsResponse(
            total_hadiths=totals['total'],
            total_embedded=totals['embedded'],
            collections=collections,
            timestamp=datetime.now().isoformat()
        )
    finally:
        release_db_connection(conn)


@app.post("/search", response_model=SearchResponse)
async def semantic_search(query: SearchQuery):
    """Perform semantic search on hadiths."""
    total_start = time.perf_counter()

    # Get embedding
    embedding, embed_time = await get_embedding(query.query)

    # Build filters
    filters = build_qdrant_filter(query.collections, query.grades)

    # Search Qdrant
    qdrant_results, search_time = await search_qdrant(
        embedding,
        limit=query.limit,
        min_score=query.min_score,
        filters=filters
    )

    # Extract hadith IDs
    hadith_ids = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")
        if hid:
            hadith_ids.append(int(hid))

    # Enrich from database
    db_data = enrich_results_from_db(hadith_ids)

    # Build results
    results = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")

        if hid and int(hid) in db_data:
            data = db_data[int(hid)]
            results.append(HadithResult(
                hadith_id=int(hid),
                score=r.get("score", 0),
                collection=data.get("collection_name", "Unknown"),
                book=data.get("book_name"),
                hadith_number=data.get("hadith_number", ""),
                arabic_text=data.get("arabic_text"),
                arabic_normalized=data.get("arabic_normalized"),
                english_text=data.get("english_text"),
                urdu_text=data.get("urdu_text"),
                grade=data.get("grade")
            ))
        else:
            # Fallback to payload
            results.append(HadithResult(
                hadith_id=int(hid) if hid else 0,
                score=r.get("score", 0),
                collection=payload.get("collection", "Unknown"),
                book=payload.get("book"),
                hadith_number=str(payload.get("hadith_number", "")),
                arabic_text=payload.get("arabic_text"),
                arabic_normalized=payload.get("arabic_normalized"),
                english_text=payload.get("english_text"),
                urdu_text=payload.get("urdu_text"),
                grade=payload.get("grade")
            ))

    total_time = (time.perf_counter() - total_start) * 1000

    return SearchResponse(
        query=query.query,
        results=results,
        total_results=len(results),
        embedding_time_ms=embed_time,
        search_time_ms=search_time,
        total_time_ms=total_time,
        timestamp=datetime.now().isoformat()
    )


@app.get("/search", response_model=SearchResponse)
async def semantic_search_get(
    q: str = Query(..., min_length=1, max_length=1000, description="Search query"),
    limit: int = Query(default=10, ge=1, le=100),
    min_score: float = Query(default=0.0, ge=0.0, le=1.0)
):
    """GET version of semantic search for simple queries."""
    query = SearchQuery(query=q, limit=limit, min_score=min_score)
    return await semantic_search(query)


@app.get("/hadith/{hadith_id}")
async def get_hadith(hadith_id: int):
    """Get a specific hadith by ID."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.arabic_normalized,
                    h.english_text,
                    h.urdu_text,
                    h.grade,
                    h.source_metadata,
                    h.embedding_generated,
                    h.entities_extracted,
                    h.relations_extracted,
                    h.created_at,
                    h.updated_at,
                    c.name_english as collection_name,
                    c.name_arabic as collection_arabic,
                    b.name_english as book_name,
                    b.name_arabic as book_arabic
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                LEFT JOIN books b ON h.book_id = b.id
                WHERE h.id = %s
            """, (hadith_id,))

            row = cur.fetchone()
            if not row:
                raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")

            return dict(row)
    finally:
        release_db_connection(conn)


@app.get("/similar/{hadith_id}", response_model=SearchResponse)
async def find_similar(
    hadith_id: int,
    limit: int = Query(default=10, ge=1, le=100)
):
    """Find hadiths similar to a given hadith."""
    # Get the hadith text
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT arabic_text, english_text
                FROM hadiths
                WHERE id = %s
            """, (hadith_id,))

            row = cur.fetchone()
            if not row:
                raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")

            # Prefer Arabic text; fall back to English
            text = row['arabic_text'] or row['english_text']
            if not text:
                raise HTTPException(status_code=400, detail="Hadith has no text content")
    finally:
        release_db_connection(conn)

    # Search for similar hadiths
    query = SearchQuery(query=text, limit=limit + 1)  # +1 to exclude self
    response = await semantic_search(query)

    # Filter out the source hadith
    response.results = [r for r in response.results if r.hadith_id != hadith_id][:limit]
    response.total_results = len(response.results)

    return response


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
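
# Example requests once the service is running (assumption: it is reachable on
# localhost:8080; hadith ID 123 is a hypothetical placeholder):
#   curl -X POST http://localhost:8080/search \
#        -H 'Content-Type: application/json' \
#        -d '{"query": "five daily prayers", "limit": 5}'
#   curl 'http://localhost:8080/search?q=patience+during+hardship&limit=5'
#   curl http://localhost:8080/hadith/123
#   curl 'http://localhost:8080/similar/123?limit=5'
#   curl http://localhost:8080/stats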
@@ -0,0 +1,666 @@
#!/usr/bin/env python3
"""
Step 6.2: Semantic Search Testing & Benchmarking
=================================================
Tests semantic search functionality and benchmarks performance.
Target: <500ms per query.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import sys
import json
import time
import asyncio
import statistics
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict, field

import httpx
import psycopg2
from psycopg2.extras import RealDictCursor
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")

QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

TEI_HOST = os.getenv("TEI_HOST", "tei.ml.svc.cluster.local")
TEI_PORT = int(os.getenv("TEI_PORT", "80"))

console = Console()


# ============================================================================
# Sample Queries for Testing
# ============================================================================

SAMPLE_QUERIES = {
    "arabic": [
        {
            "query": "الصلاة في المسجد الحرام",
            "description": "Prayer in the Sacred Mosque",
            "expected_topics": ["prayer", "mosque", "mecca"]
        },
        {
            "query": "أبو هريرة رضي الله عنه",
            "description": "Abu Hurairah (RA)",
            "expected_topics": ["narrator", "companion"]
        },
        {
            "query": "الصيام في شهر رمضان",
            "description": "Fasting in Ramadan",
            "expected_topics": ["fasting", "ramadan"]
        },
        {
            "query": "الزكاة والصدقة",
            "description": "Zakat and charity",
            "expected_topics": ["charity", "zakat"]
        },
        {
            "query": "الحج والعمرة",
            "description": "Hajj and Umrah",
            "expected_topics": ["pilgrimage", "hajj", "umrah"]
        },
        {
            "query": "النبي صلى الله عليه وسلم في المدينة",
            "description": "Prophet (PBUH) in Medina",
            "expected_topics": ["prophet", "medina"]
        },
        {
            "query": "الوضوء والطهارة",
            "description": "Ablution and purification",
            "expected_topics": ["ablution", "purification", "wudu"]
        },
        {
            "query": "بر الوالدين",
            "description": "Honoring parents",
            "expected_topics": ["parents", "kindness", "family"]
        },
        {
            "query": "الجنة والنار",
            "description": "Paradise and Hell",
            "expected_topics": ["afterlife", "paradise", "hell"]
        },
        {
            "query": "الإيمان والإسلام",
            "description": "Faith and Islam",
            "expected_topics": ["faith", "belief", "islam"]
        }
    ],
    "english": [
        {
            "query": "five daily prayers",
            "description": "The five obligatory prayers",
            "expected_topics": ["prayer", "salah", "obligation"]
        },
        {
            "query": "Prophet Muhammad in Mecca",
            "description": "Prophet's life in Mecca",
            "expected_topics": ["prophet", "mecca", "biography"]
        },
        {
            "query": "treatment of neighbors",
            "description": "Rights and treatment of neighbors",
            "expected_topics": ["neighbors", "rights", "ethics"]
        },
        {
            "query": "patience during hardship",
            "description": "Patience in difficult times",
            "expected_topics": ["patience", "sabr", "trials"]
        },
        {
            "query": "marriage and family",
            "description": "Islamic marriage guidance",
            "expected_topics": ["marriage", "family", "nikah"]
        },
        {
            "query": "honesty and truthfulness",
            "description": "Importance of being truthful",
            "expected_topics": ["honesty", "truth", "character"]
        },
        {
            "query": "Day of Judgment signs",
            "description": "Signs of the Last Day",
            "expected_topics": ["judgment", "signs", "eschatology"]
        },
        {
            "query": "charity and helping poor",
            "description": "Giving charity to the needy",
            "expected_topics": ["charity", "poor", "sadaqah"]
        },
        {
            "query": "companions of the Prophet",
            "description": "Sahaba and their virtues",
            "expected_topics": ["companions", "sahaba", "virtue"]
        },
        {
            "query": "seeking knowledge in Islam",
            "description": "Importance of knowledge",
            "expected_topics": ["knowledge", "learning", "education"]
        }
    ],
    "mixed": [
        {
            "query": "قال رسول الله about kindness",
            "description": "Prophet's sayings about kindness (mixed)",
            "expected_topics": ["prophet", "kindness", "ethics"]
        },
        {
            "query": "women rights الإسلام",
            "description": "Women's rights in Islam (mixed)",
            "expected_topics": ["women", "rights", "islam"]
        }
    ]
}


@dataclass
class SearchResult:
    """Individual search result."""
    hadith_id: int
    score: float
    collection: str
    hadith_number: str
    arabic_text: str
    english_text: str
    grade: str


@dataclass
class QueryBenchmark:
    """Benchmark results for a single query."""
    query: str
    language: str
    description: str
    embedding_time_ms: float
    search_time_ms: float
    total_time_ms: float
    results_count: int
    top_score: float
    meets_target: bool  # <500ms


@dataclass
class BenchmarkReport:
    """Full benchmark report."""
    total_queries: int
    successful_queries: int
    failed_queries: int
    avg_embedding_time_ms: float
    avg_search_time_ms: float
    avg_total_time_ms: float
    p50_time_ms: float
    p95_time_ms: float
    p99_time_ms: float
    min_time_ms: float
    max_time_ms: float
    queries_meeting_target: int
    target_ms: int
    query_results: List[QueryBenchmark] = field(default_factory=list)
    timestamp: str = ""


def get_db_connection():
    """Create PostgreSQL connection."""
    return psycopg2.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        sslmode='require'
    )

async def get_embedding(client: httpx.AsyncClient, text: str) -> Tuple[List[float], float]:
    """Get embedding from TEI service."""
    start = time.perf_counter()

    response = await client.post(
        f"http://{TEI_HOST}:{TEI_PORT}/embed",
        json={"inputs": text}
    )
    response.raise_for_status()

    elapsed_ms = (time.perf_counter() - start) * 1000

    # TEI returns a list of embeddings; we want the first one
    embeddings = response.json()
    if isinstance(embeddings, list) and len(embeddings) > 0:
        if isinstance(embeddings[0], list):
            return embeddings[0], elapsed_ms
        return embeddings, elapsed_ms

    raise ValueError(f"Unexpected embedding response format: {type(embeddings)}")


async def search_qdrant(
    client: httpx.AsyncClient,
    embedding: List[float],
    limit: int = 10
) -> Tuple[List[Dict], float]:
    """Search Qdrant with embedding vector."""
    start = time.perf_counter()

    response = await client.post(
        f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
        json={
            "vector": embedding,
            "limit": limit,
            "with_payload": True
        }
    )
    response.raise_for_status()

    elapsed_ms = (time.perf_counter() - start) * 1000
    results = response.json().get("result", [])

    return results, elapsed_ms
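
# Note: this targets Qdrant's /points/search REST endpoint with "with_payload"
# so each hit carries its metadata; a "score_threshold" field can be added to
# the JSON body (supported by Qdrant's search API) to drop low-scoring hits
# server-side instead of filtering them in Python.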


def enrich_results_from_db(hadith_ids: List[int]) -> Dict[int, Dict]:
    """Fetch full hadith data from PostgreSQL."""
    if not hadith_ids:
        return {}

    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.english_text,
                    h.grade,
                    c.name_english as collection_name
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                WHERE h.id = ANY(%s)
            """, (hadith_ids,))

            return {row['id']: dict(row) for row in cur.fetchall()}
    finally:
        conn.close()

async def semantic_search(
    client: httpx.AsyncClient,
    query: str,
    limit: int = 10
) -> Tuple[List[SearchResult], float, float]:
    """Perform semantic search and return results with timing."""

    # Step 1: Get embedding
    embedding, embed_time = await get_embedding(client, query)

    # Step 2: Search Qdrant
    qdrant_results, search_time = await search_qdrant(client, embedding, limit)

    # Step 3: Get hadith IDs and enrich from DB
    hadith_ids = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")
        if hid:
            hadith_ids.append(int(hid))

    db_data = enrich_results_from_db(hadith_ids)

    # Step 4: Build results
    results = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")

        if hid and int(hid) in db_data:
            data = db_data[int(hid)]
            results.append(SearchResult(
                hadith_id=int(hid),
                score=r.get("score", 0),
                collection=data.get("collection_name", "Unknown"),
                hadith_number=data.get("hadith_number", ""),
                arabic_text=(data.get("arabic_text", "")[:200] + "...") if data.get("arabic_text") else "",
                english_text=(data.get("english_text", "")[:200] + "...") if data.get("english_text") else "",
                grade=data.get("grade", "")
            ))
        else:
            # Fallback to payload data
            results.append(SearchResult(
                hadith_id=int(hid) if hid else 0,
                score=r.get("score", 0),
                collection=payload.get("collection", "Unknown"),
                hadith_number=str(payload.get("hadith_number", "")),
                arabic_text=(payload.get("arabic_text", "")[:200] + "...") if payload.get("arabic_text") else "",
                english_text=(payload.get("english_text", "")[:200] + "...") if payload.get("english_text") else "",
                grade=payload.get("grade", "")
            ))

    return results, embed_time, search_time

def display_search_results(query: str, results: List[SearchResult], embed_time: float, search_time: float):
    """Display search results in a nice format."""
    total_time = embed_time + search_time

    console.print(f"\n[bold cyan]Query:[/bold cyan] {query}")
    console.print(f"[dim]Embedding: {embed_time:.1f}ms | Search: {search_time:.1f}ms | Total: {total_time:.1f}ms[/dim]")

    if not results:
        console.print("[yellow]No results found.[/yellow]")
        return

    table = Table(title=f"Top {len(results)} Results", show_lines=True)
    table.add_column("#", style="dim", width=3)
    table.add_column("Score", justify="right", width=8)
    table.add_column("Collection", width=15)
    table.add_column("Hadith #", width=10)
    table.add_column("Text Preview", width=60)
    table.add_column("Grade", width=10)

    for i, r in enumerate(results, 1):
        text_preview = r.english_text if r.english_text else r.arabic_text
        table.add_row(
            str(i),
            f"{r.score:.4f}",
            r.collection,
            r.hadith_number,
            (text_preview[:80] + "...") if len(text_preview) > 80 else text_preview,
            r.grade or "-"
        )

    console.print(table)

async def run_benchmarks(warmup_count: int = 3) -> BenchmarkReport:
    """Run full benchmark suite."""
    console.print(Panel.fit(
        "[bold blue]Semantic Search Benchmark[/bold blue]\n"
        f"Target: <500ms per query\n"
        f"TEI: {TEI_HOST}:{TEI_PORT}\n"
        f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}",
        title="Step 6.2"
    ))

    all_queries = (
        [(q, "arabic") for q in SAMPLE_QUERIES["arabic"]] +
        [(q, "english") for q in SAMPLE_QUERIES["english"]] +
        [(q, "mixed") for q in SAMPLE_QUERIES["mixed"]]
    )

    query_results = []
    total_times = []
    successful = 0
    failed = 0

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Warmup queries
        console.print(f"\n[yellow]Running {warmup_count} warmup queries...[/yellow]")
        for i in range(warmup_count):
            try:
                await semantic_search(client, "test warmup query", limit=5)
            except Exception as e:
                console.print(f"[dim]Warmup {i+1} error: {e}[/dim]")

        console.print("[green]Warmup complete.[/green]\n")

        # Run benchmarks
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            console=console
        ) as progress:
            task = progress.add_task("Running benchmarks...", total=len(all_queries))

            for query_data, lang in all_queries:
                query = query_data["query"]
                description = query_data["description"]

                try:
                    results, embed_time, search_time = await semantic_search(
                        client, query, limit=10
                    )

                    total_time = embed_time + search_time
                    total_times.append(total_time)

                    benchmark = QueryBenchmark(
                        query=query,
                        language=lang,
                        description=description,
                        embedding_time_ms=embed_time,
                        search_time_ms=search_time,
                        total_time_ms=total_time,
                        results_count=len(results),
                        top_score=results[0].score if results else 0,
                        meets_target=total_time < 500
                    )

                    query_results.append(benchmark)
                    successful += 1

                except Exception as e:
                    console.print(f"[red]Error for query '{query[:30]}...': {e}[/red]")
                    failed += 1

                progress.advance(task)

    # Calculate statistics
    if total_times:
        sorted_times = sorted(total_times)
        p50_idx = int(len(sorted_times) * 0.50)
        p95_idx = int(len(sorted_times) * 0.95)
        p99_idx = int(len(sorted_times) * 0.99)

        report = BenchmarkReport(
            total_queries=len(all_queries),
            successful_queries=successful,
            failed_queries=failed,
            avg_embedding_time_ms=statistics.mean(q.embedding_time_ms for q in query_results),
            avg_search_time_ms=statistics.mean(q.search_time_ms for q in query_results),
            avg_total_time_ms=statistics.mean(total_times),
            p50_time_ms=sorted_times[p50_idx],
            p95_time_ms=sorted_times[p95_idx] if p95_idx < len(sorted_times) else sorted_times[-1],
            p99_time_ms=sorted_times[p99_idx] if p99_idx < len(sorted_times) else sorted_times[-1],
            min_time_ms=min(total_times),
            max_time_ms=max(total_times),
            queries_meeting_target=sum(1 for t in total_times if t < 500),
            target_ms=500,
            query_results=query_results,
            timestamp=datetime.now().isoformat()
        )
    else:
        report = BenchmarkReport(
            total_queries=len(all_queries),
            successful_queries=0,
            failed_queries=failed,
            avg_embedding_time_ms=0,
            avg_search_time_ms=0,
            avg_total_time_ms=0,
            p50_time_ms=0,
            p95_time_ms=0,
            p99_time_ms=0,
            min_time_ms=0,
            max_time_ms=0,
            queries_meeting_target=0,
            target_ms=500,
            query_results=[],
            timestamp=datetime.now().isoformat()
        )

    return report

def display_benchmark_report(report: BenchmarkReport):
    """Display benchmark report."""
    console.print("\n" + "="*60)
    console.print("[bold]BENCHMARK RESULTS[/bold]")
    console.print("="*60)

    # Summary stats
    console.print(f"\n[cyan]Query Statistics:[/cyan]")
    console.print(f"  Total queries: {report.total_queries}")
    console.print(f"  Successful: [green]{report.successful_queries}[/green]")
    console.print(f"  Failed: [red]{report.failed_queries}[/red]")

    console.print(f"\n[cyan]Timing Statistics:[/cyan]")
    console.print(f"  Average embedding time: {report.avg_embedding_time_ms:.1f}ms")
    console.print(f"  Average search time: {report.avg_search_time_ms:.1f}ms")
    console.print(f"  Average total time: {report.avg_total_time_ms:.1f}ms")

    console.print(f"\n[cyan]Percentiles:[/cyan]")
    console.print(f"  P50: {report.p50_time_ms:.1f}ms")
    console.print(f"  P95: {report.p95_time_ms:.1f}ms")
    console.print(f"  P99: {report.p99_time_ms:.1f}ms")
    console.print(f"  Min: {report.min_time_ms:.1f}ms")
    console.print(f"  Max: {report.max_time_ms:.1f}ms")

    # Target check
    target_pct = (report.queries_meeting_target / report.successful_queries * 100) if report.successful_queries else 0
    target_met = target_pct >= 95  # 95% of queries should meet target

    console.print(f"\n[cyan]Performance Target (<{report.target_ms}ms):[/cyan]")
    status = "[bold green]✓ TARGET MET[/bold green]" if target_met else "[bold red]✗ TARGET NOT MET[/bold red]"
    console.print(f"  Queries meeting target: {report.queries_meeting_target}/{report.successful_queries} ({target_pct:.1f}%)")
    console.print(f"  Status: {status}")

    # Detailed results table
    if report.query_results:
        console.print("\n[cyan]Detailed Results:[/cyan]")

        table = Table(show_lines=False)
        table.add_column("Language", width=8)
        table.add_column("Query", width=35)
        table.add_column("Embed", justify="right", width=8)
        table.add_column("Search", justify="right", width=8)
        table.add_column("Total", justify="right", width=8)
        table.add_column("Results", justify="right", width=7)
        table.add_column("Status", width=6)

        for r in report.query_results:
            status_icon = "✓" if r.meets_target else "✗"
            status_style = "green" if r.meets_target else "red"

            table.add_row(
                r.language,
                (r.query[:35] + "...") if len(r.query) > 35 else r.query,
                f"{r.embedding_time_ms:.0f}ms",
                f"{r.search_time_ms:.0f}ms",
                f"{r.total_time_ms:.0f}ms",
                str(r.results_count),
                f"[{status_style}]{status_icon}[/{status_style}]"
            )

        console.print(table)

async def interactive_search():
    """Interactive search mode."""
    console.print(Panel.fit(
        "[bold blue]Interactive Semantic Search[/bold blue]\n"
        "Type your query and press Enter. Type 'quit' to exit.",
        title="Interactive Mode"
    ))

    async with httpx.AsyncClient(timeout=30.0) as client:
        while True:
            try:
                query = input("\n🔍 Query: ").strip()

                if query.lower() in ('quit', 'exit', 'q'):
                    console.print("[dim]Goodbye![/dim]")
                    break

                if not query:
                    continue

                results, embed_time, search_time = await semantic_search(
                    client, query, limit=10
                )

                display_search_results(query, results, embed_time, search_time)

            except KeyboardInterrupt:
                console.print("\n[dim]Interrupted. Goodbye![/dim]")
                break
            except Exception as e:
                console.print(f"[red]Error: {e}[/red]")

async def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Hadith Semantic Search Testing")
    parser.add_argument("--mode", choices=["benchmark", "interactive", "demo"],
                        default="benchmark", help="Run mode")
    parser.add_argument("--query", type=str, help="Single query to run")
    parser.add_argument("--output", type=str, default="benchmark_results.json",
                        help="Output file for benchmark results")

    args = parser.parse_args()

    if args.query:
        # Single query mode
        async with httpx.AsyncClient(timeout=30.0) as client:
            results, embed_time, search_time = await semantic_search(
                client, args.query, limit=10
            )
            display_search_results(args.query, results, embed_time, search_time)

    elif args.mode == "benchmark":
        # Full benchmark
        report = await run_benchmarks()
        display_benchmark_report(report)

        # Save results
        with open(args.output, 'w') as f:
            json.dump(asdict(report), f, indent=2, default=str)
        console.print(f"\n[dim]Results saved to {args.output}[/dim]")

    elif args.mode == "interactive":
        await interactive_search()

    elif args.mode == "demo":
        # Demo with a few sample queries
        console.print(Panel.fit(
            "[bold blue]Semantic Search Demo[/bold blue]",
            title="Demo Mode"
        ))

        demo_queries = [
            "الصلاة في المسجد",
            "five daily prayers",
            "patience during hardship",
            "بر الوالدين"
        ]

        async with httpx.AsyncClient(timeout=30.0) as client:
            for query in demo_queries:
                try:
                    results, embed_time, search_time = await semantic_search(
                        client, query, limit=5
                    )
                    display_search_results(query, results, embed_time, search_time)
                except Exception as e:
                    console.print(f"[red]Error for '{query}': {e}[/red]")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,476 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 6: Verify Embeddings & Test Semantic Search\n",
    "\n",
    "This notebook provides interactive verification and testing of the hadith embedding system.\n",
    "\n",
    "**Prerequisites:**\n",
    "- PostgreSQL accessible at pg.betelgeusebytes.io\n",
    "- Qdrant accessible at qdrant.vector.svc.cluster.local\n",
    "- TEI accessible at tei.ml.svc.cluster.local"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup & Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies (pandas is used by the query cells below)\n",
    "!pip install -q psycopg2-binary httpx rich pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import time\n",
    "import httpx\n",
    "import psycopg2\n",
    "from psycopg2.extras import RealDictCursor\n",
    "import pandas as pd\n",
    "from IPython.display import display, HTML, Markdown\n",
    "\n",
    "# Configuration\n",
    "POSTGRES_CONFIG = {\n",
    "    'host': os.getenv('POSTGRES_HOST', 'pg.betelgeusebytes.io'),\n",
    "    'port': int(os.getenv('POSTGRES_PORT', '5432')),\n",
    "    'database': os.getenv('POSTGRES_DB', 'hadith_db'),\n",
    "    'user': os.getenv('POSTGRES_USER', 'hadith_ingest'),\n",
    "    'password': os.getenv('POSTGRES_PASSWORD', ''),  # SET THIS!\n",
    "    'sslmode': 'require'\n",
    "}\n",
    "\n",
    "QDRANT_URL = f\"http://{os.getenv('QDRANT_HOST', 'qdrant.vector.svc.cluster.local')}:{os.getenv('QDRANT_PORT', '6333')}\"\n",
    "QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'hadith_embeddings')\n",
    "\n",
    "TEI_URL = f\"http://{os.getenv('TEI_HOST', 'tei.ml.svc.cluster.local')}:{os.getenv('TEI_PORT', '80')}\"\n",
    "\n",
    "print(f\"PostgreSQL: {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}/{POSTGRES_CONFIG['database']}\")\n",
    "print(f\"Qdrant: {QDRANT_URL}\")\n",
    "print(f\"TEI: {TEI_URL}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ⚠️ SET YOUR PASSWORD HERE\n",
    "POSTGRES_CONFIG['password'] = 'YOUR_PASSWORD_HERE'  # CHANGE THIS!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Database Verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_db_connection():\n",
    "    return psycopg2.connect(**POSTGRES_CONFIG)\n",
    "\n",
    "def run_query(query):\n",
    "    conn = get_db_connection()\n",
    "    try:\n",
    "        df = pd.read_sql(query, conn)\n",
    "        return df\n",
    "    finally:\n",
    "        conn.close()\n",
    "\n",
    "# Test connection\n",
    "try:\n",
    "    conn = get_db_connection()\n",
    "    conn.close()\n",
    "    print(\"✅ Database connection successful!\")\n",
    "except Exception as e:\n",
    "    print(f\"❌ Database connection failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total hadith count and embedding status\n",
    "query = \"\"\"\n",
    "SELECT \n",
    "    COUNT(*) as total_hadiths,\n",
    "    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
    "    SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded,\n",
    "    ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) as pct_complete\n",
    "FROM hadiths\n",
    "\"\"\"\n",
    "\n",
    "df = run_query(query)\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get breakdown by collection\n",
    "query = \"\"\"\n",
    "SELECT \n",
    "    c.name_english as collection,\n",
    "    COUNT(h.id) as total,\n",
    "    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
    "    ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) as pct_embedded\n",
    "FROM hadiths h\n",
    "JOIN collections c ON h.collection_id = c.id\n",
    "GROUP BY c.id, c.name_english\n",
    "ORDER BY total DESC\n",
    "\"\"\"\n",
    "\n",
    "df_collections = run_query(query)\n",
    "display(df_collections)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Qdrant Verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check Qdrant collection\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    try:\n",
    "        response = client.get(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}\")\n",
    "        response.raise_for_status()\n",
    "        collection_info = response.json()\n",
    "        print(\"✅ Qdrant collection found!\")\n",
    "        print(f\"\\nCollection status: {collection_info['result']['status']}\")\n",
    "        print(f\"Vector dimension: {collection_info['result']['config']['params']['vectors']['size']}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Qdrant error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count points in Qdrant\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    response = client.post(\n",
    "        f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\",\n",
    "        json={\"exact\": True}\n",
    "    )\n",
    "    response.raise_for_status()\n",
    "    count = response.json()['result']['count']\n",
    "    print(f\"\\n📊 Total embeddings in Qdrant: {count:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. TEI Service Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test TEI embedding service\n",
    "test_text = \"الصلاة في المسجد الحرام\"\n",
    "\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    start = time.perf_counter()\n",
    "    response = client.post(\n",
    "        f\"{TEI_URL}/embed\",\n",
    "        json={\"inputs\": test_text}\n",
    "    )\n",
    "    elapsed = (time.perf_counter() - start) * 1000\n",
    "\n",
    "    response.raise_for_status()\n",
    "    embedding = response.json()[0]\n",
    "\n",
    "    print(f\"✅ TEI service working!\")\n",
    "    print(f\"\\nTest text: {test_text}\")\n",
    "    print(f\"Embedding dimension: {len(embedding)}\")\n",
    "    print(f\"Time: {elapsed:.1f}ms\")\n",
    "    print(f\"First 5 values: {embedding[:5]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Semantic Search Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def semantic_search(query_text, limit=10):\n",
    "    \"\"\"Perform semantic search and return results with timing.\"\"\"\n",
    "    with httpx.Client(timeout=30.0) as client:\n",
    "        # Get embedding\n",
    "        start = time.perf_counter()\n",
    "        embed_response = client.post(f\"{TEI_URL}/embed\", json={\"inputs\": query_text})\n",
    "        embed_response.raise_for_status()\n",
    "        embedding = embed_response.json()[0]\n",
    "        embed_time = (time.perf_counter() - start) * 1000\n",
    "\n",
    "        # Search Qdrant\n",
    "        start = time.perf_counter()\n",
    "        search_response = client.post(\n",
    "            f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search\",\n",
    "            json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True}\n",
    "        )\n",
    "        search_response.raise_for_status()\n",
    "        results = search_response.json()['result']\n",
    "        search_time = (time.perf_counter() - start) * 1000\n",
    "\n",
    "        return results, embed_time, search_time\n",
    "\n",
    "def display_results(query, results, embed_time, search_time):\n",
    "    \"\"\"Display search results nicely.\"\"\"\n",
    "    total_time = embed_time + search_time\n",
    "    status = \"✅\" if total_time < 500 else \"⚠️\"\n",
    "\n",
    "    print(f\"\\n{'='*60}\")\n",
    "    print(f\"Query: {query}\")\n",
    "    print(f\"Timing: {embed_time:.0f}ms (embed) + {search_time:.0f}ms (search) = {total_time:.0f}ms {status}\")\n",
    "    print(f\"{'='*60}\\n\")\n",
    "\n",
    "    for i, r in enumerate(results, 1):\n",
    "        score = r['score']\n",
    "        payload = r.get('payload', {})\n",
    "\n",
    "        text = payload.get('english_text') or payload.get('arabic_text', '')\n",
    "        text = text[:150] + '...' if len(text) > 150 else text\n",
    "\n",
    "        print(f\"{i}. [Score: {score:.4f}] {payload.get('collection', 'Unknown')} #{payload.get('hadith_number', 'N/A')}\")\n",
    "        print(f\"   {text}\")\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test Arabic query\n",
    "query = \"الصلاة في المسجد الحرام\"\n",
    "results, embed_time, search_time = semantic_search(query, limit=5)\n",
    "display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test English query\n",
    "query = \"five daily prayers\"\n",
    "results, embed_time, search_time = semantic_search(query, limit=5)\n",
    "display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test more queries\n",
    "test_queries = [\n",
    "    \"الصيام في شهر رمضان\",\n",
    "    \"patience during hardship\",\n",
    "    \"بر الوالدين\",\n",
    "    \"charity and helping poor\",\n",
    "    \"الجنة والنار\"\n",
    "]\n",
    "\n",
    "for q in test_queries:\n",
    "    results, embed_time, search_time = semantic_search(q, limit=3)\n",
    "    display_results(q, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Performance Benchmarking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statistics\n",
    "\n",
    "# Benchmark queries\n",
    "benchmark_queries = [\n",
    "    \"الصلاة في المسجد الحرام\",\n",
    "    \"أبو هريرة رضي الله عنه\",\n",
    "    \"الصيام في شهر رمضان\",\n",
    "    \"five daily prayers\",\n",
    "    \"treatment of neighbors\",\n",
    "    \"patience during hardship\",\n",
    "    \"marriage and family\",\n",
    "    \"honesty and truthfulness\",\n",
    "    \"الزكاة والصدقة\",\n",
    "    \"الحج والعمرة\"\n",
    "]\n",
    "\n",
    "# Warmup\n",
    "print(\"Warming up...\")\n",
    "for _ in range(3):\n",
    "    semantic_search(\"warmup query\", limit=5)\n",
    "\n",
    "# Run benchmark\n",
    "print(\"\\nRunning benchmark...\")\n",
    "times = []\n",
    "\n",
    "for q in benchmark_queries:\n",
    "    results, embed_time, search_time = semantic_search(q, limit=10)\n",
    "    total = embed_time + search_time\n",
    "    times.append(total)\n",
    "    status = \"✅\" if total < 500 else \"⚠️\"\n",
    "    print(f\"  {q[:40]:40s} → {total:6.1f}ms {status}\")\n",
    "\n",
    "# Statistics\n",
    "print(f\"\\n{'='*60}\")\n",
    "print(\"BENCHMARK RESULTS\")\n",
    "print(f\"{'='*60}\")\n",
    "print(f\"Queries: {len(times)}\")\n",
    "print(f\"Average: {statistics.mean(times):.1f}ms\")\n",
    "print(f\"Median: {statistics.median(times):.1f}ms\")\n",
    "print(f\"Min: {min(times):.1f}ms\")\n",
    "print(f\"Max: {max(times):.1f}ms\")\n",
    "print(f\"StdDev: {statistics.stdev(times):.1f}ms\")\n",
    "\n",
    "meeting_target = sum(1 for t in times if t < 500)\n",
    "print(f\"\\nMeeting <500ms target: {meeting_target}/{len(times)} ({100*meeting_target/len(times):.1f}%)\")\n",
    "\n",
    "if meeting_target == len(times):\n",
    "    print(\"\\n✅ TARGET MET!\")\n",
    "else:\n",
    "    print(\"\\n⚠️ Some queries exceeded target\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Interactive Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive search cell - run this and enter your query\n",
    "query = input(\"Enter your search query: \")\n",
    "if query:\n",
    "    results, embed_time, search_time = semantic_search(query, limit=10)\n",
    "    display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Verification Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final verification summary\n",
    "print(\"=\"*60)\n",
    "print(\"STEP 6 VERIFICATION SUMMARY\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "# Database check\n",
    "df = run_query(\"SELECT COUNT(*) as total, SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded FROM hadiths\")\n",
    "total = df['total'][0]\n",
    "embedded = df['embedded'][0]\n",
    "print(f\"\\n✅ Database: {total:,} hadiths, {embedded:,} embedded ({100*embedded/total:.1f}%)\")\n",
    "\n",
    "# Qdrant check\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    response = client.post(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\", json={\"exact\": True})\n",
    "    qdrant_count = response.json()['result']['count']\n",
    "    print(f\"✅ Qdrant: {qdrant_count:,} embeddings stored\")\n",
    "\n",
    "# Benchmark summary\n",
    "if 'times' in dir() and times:\n",
    "    print(f\"✅ Performance: Avg {statistics.mean(times):.0f}ms, P95 {sorted(times)[int(len(times)*0.95)]:.0f}ms\")\n",
    "\n",
    "# Missing check\n",
    "missing = total - qdrant_count\n",
    "if missing == 0:\n",
    "    print(f\"\\n🎉 ALL {total:,} HADITHS VERIFIED!\")\n",
    "else:\n",
    "    print(f\"\\n⚠️ {missing:,} embeddings potentially missing\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
@@ -0,0 +1,192 @@
-- ============================================================================
-- Step 6.1: PostgreSQL Verification Queries
-- Run these against hadith_db to verify data integrity
-- ============================================================================

-- Connect: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db

-- ============================================================================
-- 1. Basic Statistics
-- ============================================================================

-- Total hadith count
SELECT COUNT(*) AS total_hadiths FROM hadiths;

-- Hadiths by collection with embedding status
SELECT
    c.name_english AS collection,
    COUNT(h.id) AS total,
    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) AS embedded,
    ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) AS pct_embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total DESC;

-- ============================================================================
-- 2. Embedding Status Summary
-- ============================================================================

-- Overall embedding status
SELECT
    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) AS embedded,
    SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) AS not_embedded,
    COUNT(*) AS total,
    ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) AS pct_complete
FROM hadiths;

-- List hadiths without embeddings (if any)
SELECT
    h.id,
    c.name_english AS collection,
    h.hadith_number,
    LEFT(h.arabic_text, 100) AS arabic_preview
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.embedding_generated
ORDER BY h.id
LIMIT 20;

-- ============================================================================
-- 3. Text Quality Checks
-- ============================================================================

-- Hadiths with empty or null texts
SELECT
    'Empty Arabic' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0

UNION ALL

SELECT
    'Empty English' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE english_text IS NULL OR LENGTH(TRIM(english_text)) = 0

UNION ALL

SELECT
    'Empty Both' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE (arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0)
  AND (english_text IS NULL OR LENGTH(TRIM(english_text)) = 0);

-- ============================================================================
-- 4. Grade Distribution
-- ============================================================================

SELECT
    COALESCE(grade, 'Unknown') AS grade,
    COUNT(*) AS count,
    ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) AS percentage
FROM hadiths
GROUP BY grade
ORDER BY count DESC;

-- ============================================================================
-- 5. Arabic Normalization Verification
-- ============================================================================

-- Check that normalized column is populated
SELECT
    COUNT(*) AS total,
    SUM(CASE WHEN arabic_normalized IS NOT NULL AND LENGTH(arabic_normalized) > 0 THEN 1 ELSE 0 END) AS normalized,
    SUM(CASE WHEN arabic_normalized IS NULL OR LENGTH(arabic_normalized) = 0 THEN 1 ELSE 0 END) AS not_normalized
FROM hadiths
WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0;

-- Sample comparison of original vs normalized
SELECT
    id,
    LEFT(arabic_text, 100) AS original,
    LEFT(arabic_normalized, 100) AS normalized
FROM hadiths
WHERE arabic_text IS NOT NULL
LIMIT 5;

-- ============================================================================
-- 6. Metadata Completeness
-- ============================================================================

-- Check source_metadata JSON completeness
SELECT
    COUNT(*) AS total,
    SUM(CASE WHEN source_metadata IS NOT NULL THEN 1 ELSE 0 END) AS has_metadata,
    SUM(CASE WHEN source_metadata ? 'api_source' THEN 1 ELSE 0 END) AS has_api_source,
    SUM(CASE WHEN source_metadata ? 'ingested_at' THEN 1 ELSE 0 END) AS has_ingested_at
FROM hadiths;

-- ============================================================================
-- 7. ID Range Check (for Qdrant comparison)
-- ============================================================================

-- Get ID range
SELECT
    MIN(id) AS min_id,
    MAX(id) AS max_id,
    COUNT(*) AS total_ids,
    MAX(id) - MIN(id) + 1 AS expected_if_sequential,
    COUNT(*) = (MAX(id) - MIN(id) + 1) AS is_sequential
FROM hadiths;

-- Find gaps in IDs (if any)
WITH id_series AS (
    SELECT generate_series(
        (SELECT MIN(id) FROM hadiths),
        (SELECT MAX(id) FROM hadiths)
    ) AS expected_id
)
SELECT expected_id AS missing_id
FROM id_series
WHERE expected_id NOT IN (SELECT id FROM hadiths)
ORDER BY expected_id
LIMIT 50;
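
-- Note: on large tables the "NOT IN (SELECT id ...)" form above can be slow;
-- an equivalent anti-join over the same id_series CTE (a sketch, not verified
-- against this database) is usually faster:
--   SELECT s.expected_id AS missing_id
--   FROM id_series s
--   LEFT JOIN hadiths h ON h.id = s.expected_id
--   WHERE h.id IS NULL;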

-- ============================================================================
-- 8. Sample Data for Manual Verification
-- ============================================================================

-- Sample 10 hadiths with all fields
SELECT
    h.id,
    c.name_english AS collection,
    b.name_english AS book,
    h.hadith_number,
    h.grade,
    LENGTH(h.arabic_text) AS arabic_len,
    LENGTH(h.english_text) AS english_len,
    LENGTH(h.urdu_text) AS urdu_len,
    h.embedding_generated,
    h.created_at
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
ORDER BY RANDOM()
LIMIT 10;

-- ============================================================================
-- 9. NER/RE Preparation Status
-- ============================================================================

SELECT
    SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) AS entities_extracted,
    SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) AS relations_extracted,
    COUNT(*) AS total
FROM hadiths;

-- ============================================================================
-- 10. Quick Health Check Query (run this first)
-- ============================================================================

SELECT
    'Database Health Check' AS check_type,
    (SELECT COUNT(*) FROM hadiths) AS total_hadiths,
    (SELECT COUNT(*) FROM collections) AS total_collections,
    (SELECT COUNT(*) FROM books) AS total_books,
    (SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count,
    (SELECT COUNT(*) FROM hadiths WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0) AS has_arabic,
    (SELECT COUNT(*) FROM hadiths WHERE english_text IS NOT NULL AND LENGTH(english_text) > 0) AS has_english;
@@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
Step 6.1: Verify Embeddings in Qdrant
=====================================
Validates that all hadiths have embeddings stored in Qdrant vector database.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import sys
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict

import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.panel import Panel

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")

QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")

console = Console()


@dataclass
class VerificationResult:
    """Results from embedding verification."""
    total_hadiths_db: int
    total_embeddings_qdrant: int
    embeddings_with_payloads: int
    missing_embeddings: int
    embedding_dimension: int
    collection_exists: bool
    collection_status: str
    sample_ids_missing: List[int]
    verification_time_seconds: float
    timestamp: str


def get_db_connection():
    """Create PostgreSQL connection."""
    return psycopg2.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        sslmode='require'
    )


async def get_qdrant_collection_info(client: httpx.AsyncClient) -> Dict:
    """Get Qdrant collection information."""
    try:
        response = await client.get(
            f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
        )
        response.raise_for_status()
        return response.json()
    except httpx.HTTPError as e:
        console.print(f"[red]Error connecting to Qdrant: {e}[/red]")
        return {}


async def count_qdrant_points(client: httpx.AsyncClient) -> int:
    """Count total points in Qdrant collection."""
    try:
        response = await client.post(
            f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/count",
            json={"exact": True}
        )
        response.raise_for_status()
        return response.json().get("result", {}).get("count", 0)
    except httpx.HTTPError as e:
        console.print(f"[red]Error counting Qdrant points: {e}[/red]")
        return 0


async def get_qdrant_points_sample(
    client: httpx.AsyncClient,
    offset: int = 0,
    limit: int = 100
) -> List[Dict]:
    """Get a sample of points from Qdrant."""
    try:
        response = await client.post(
            f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
            json={
                "limit": limit,
                "offset": offset,
                "with_payload": True,
                "with_vector": False
|
||||||
|
}
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json().get("result", {}).get("points", [])
|
||||||
|
except httpx.HTTPError as e:
|
||||||
|
console.print(f"[red]Error fetching Qdrant points: {e}[/red]")
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
async def get_all_qdrant_ids(client: httpx.AsyncClient) -> set:
|
||||||
|
"""Get all point IDs from Qdrant (paginated)."""
|
||||||
|
all_ids = set()
|
||||||
|
offset = None
|
||||||
|
batch_size = 1000
|
||||||
|
|
||||||
|
with Progress(
|
||||||
|
SpinnerColumn(),
|
||||||
|
TextColumn("[progress.description]{task.description}"),
|
||||||
|
console=console
|
||||||
|
) as progress:
|
||||||
|
task = progress.add_task("Fetching Qdrant IDs...", total=None)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
payload = {
|
||||||
|
"limit": batch_size,
|
||||||
|
"with_payload": False,
|
||||||
|
"with_vector": False
|
||||||
|
}
|
||||||
|
if offset is not None:
|
||||||
|
payload["offset"] = offset
|
||||||
|
|
||||||
|
response = await client.post(
|
||||||
|
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
|
||||||
|
json=payload,
|
||||||
|
timeout=60.0
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
result = response.json().get("result", {})
|
||||||
|
points = result.get("points", [])
|
||||||
|
|
||||||
|
if not points:
|
||||||
|
break
|
||||||
|
|
||||||
|
for point in points:
|
||||||
|
all_ids.add(point["id"])
|
||||||
|
|
||||||
|
offset = result.get("next_page_offset")
|
||||||
|
progress.update(task, description=f"Fetched {len(all_ids)} IDs...")
|
||||||
|
|
||||||
|
if offset is None:
|
||||||
|
break
|
||||||
|
|
||||||
|
except httpx.HTTPError as e:
|
||||||
|
console.print(f"[red]Error during ID fetch: {e}[/red]")
|
||||||
|
break
|
||||||
|
|
||||||
|
return all_ids
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_hadith_ids_from_db() -> set:
|
||||||
|
"""Get all hadith IDs from PostgreSQL."""
|
||||||
|
conn = get_db_connection()
|
||||||
|
try:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT id FROM hadiths ORDER BY id")
|
||||||
|
return {row[0] for row in cur.fetchall()}
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_hadith_count_from_db() -> int:
|
||||||
|
"""Get total hadith count from PostgreSQL."""
|
||||||
|
conn = get_db_connection()
|
||||||
|
try:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT COUNT(*) FROM hadiths")
|
||||||
|
return cur.fetchone()[0]
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding_stats_from_db() -> Dict:
|
||||||
|
"""Get embedding generation stats from PostgreSQL."""
|
||||||
|
conn = get_db_connection()
|
||||||
|
try:
|
||||||
|
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||||
|
cur.execute("""
|
||||||
|
SELECT
|
||||||
|
COUNT(*) as total,
|
||||||
|
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,
|
||||||
|
SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded
|
||||||
|
FROM hadiths
|
||||||
|
""")
|
||||||
|
return dict(cur.fetchone())
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_collection_stats_by_source() -> List[Dict]:
|
||||||
|
"""Get hadith counts by collection/source."""
|
||||||
|
conn = get_db_connection()
|
||||||
|
try:
|
||||||
|
with conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||||
|
cur.execute("""
|
||||||
|
SELECT
|
||||||
|
c.name_english as collection,
|
||||||
|
COUNT(h.id) as count,
|
||||||
|
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded
|
||||||
|
FROM hadiths h
|
||||||
|
JOIN collections c ON h.collection_id = c.id
|
||||||
|
GROUP BY c.id, c.name_english
|
||||||
|
ORDER BY count DESC
|
||||||
|
""")
|
||||||
|
return [dict(row) for row in cur.fetchall()]
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_embeddings() -> VerificationResult:
|
||||||
|
"""Main verification function."""
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
console.print(Panel.fit(
|
||||||
|
"[bold blue]Hadith Embeddings Verification[/bold blue]\n"
|
||||||
|
f"Database: {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}\n"
|
||||||
|
f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}/{QDRANT_COLLECTION}",
|
||||||
|
title="Step 6.1"
|
||||||
|
))
|
||||||
|
|
||||||
|
# Step 1: Get PostgreSQL stats
|
||||||
|
console.print("\n[yellow]1. Checking PostgreSQL database...[/yellow]")
|
||||||
|
db_stats = get_embedding_stats_from_db()
|
||||||
|
total_hadiths = db_stats['total']
|
||||||
|
console.print(f" Total hadiths: [green]{total_hadiths:,}[/green]")
|
||||||
|
console.print(f" Marked as embedded: [green]{db_stats['embedded']:,}[/green]")
|
||||||
|
|
||||||
|
# Step 2: Get collection breakdown
|
||||||
|
console.print("\n[yellow]2. Collection breakdown:[/yellow]")
|
||||||
|
collection_stats = get_collection_stats_by_source()
|
||||||
|
|
||||||
|
table = Table(title="Hadiths by Collection")
|
||||||
|
table.add_column("Collection", style="cyan")
|
||||||
|
table.add_column("Total", justify="right")
|
||||||
|
table.add_column("Embedded", justify="right", style="green")
|
||||||
|
|
||||||
|
for stat in collection_stats:
|
||||||
|
table.add_row(
|
||||||
|
stat['collection'],
|
||||||
|
f"{stat['count']:,}",
|
||||||
|
f"{stat['embedded']:,}"
|
||||||
|
)
|
||||||
|
console.print(table)
|
||||||
|
|
||||||
|
# Step 3: Check Qdrant collection
|
||||||
|
console.print("\n[yellow]3. Checking Qdrant collection...[/yellow]")
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||||
|
collection_info = await get_qdrant_collection_info(client)
|
||||||
|
|
||||||
|
if not collection_info:
|
||||||
|
return VerificationResult(
|
||||||
|
total_hadiths_db=total_hadiths,
|
||||||
|
total_embeddings_qdrant=0,
|
||||||
|
embeddings_with_payloads=0,
|
||||||
|
missing_embeddings=total_hadiths,
|
||||||
|
embedding_dimension=0,
|
||||||
|
collection_exists=False,
|
||||||
|
collection_status="NOT_FOUND",
|
||||||
|
sample_ids_missing=[],
|
||||||
|
verification_time_seconds=time.time() - start_time,
|
||||||
|
timestamp=datetime.now().isoformat()
|
||||||
|
)
|
||||||
|
|
||||||
|
result = collection_info.get("result", {})
|
||||||
|
status = result.get("status", "unknown")
|
||||||
|
vectors_config = result.get("config", {}).get("params", {}).get("vectors", {})
|
||||||
|
embedding_dim = vectors_config.get("size", 0)
|
||||||
|
|
||||||
|
console.print(f" Collection status: [green]{status}[/green]")
|
||||||
|
console.print(f" Embedding dimension: [green]{embedding_dim}[/green]")
|
||||||
|
|
||||||
|
# Step 4: Count Qdrant points
|
||||||
|
console.print("\n[yellow]4. Counting Qdrant embeddings...[/yellow]")
|
||||||
|
qdrant_count = await count_qdrant_points(client)
|
||||||
|
console.print(f" Total embeddings: [green]{qdrant_count:,}[/green]")
|
||||||
|
|
||||||
|
# Step 5: Find missing embeddings
|
||||||
|
console.print("\n[yellow]5. Identifying missing embeddings...[/yellow]")
|
||||||
|
db_ids = get_all_hadith_ids_from_db()
|
||||||
|
qdrant_ids = await get_all_qdrant_ids(client)
|
||||||
|
|
||||||
|
missing_ids = db_ids - qdrant_ids
|
||||||
|
extra_ids = qdrant_ids - db_ids
|
||||||
|
|
||||||
|
console.print(f" IDs in DB: [blue]{len(db_ids):,}[/blue]")
|
||||||
|
console.print(f" IDs in Qdrant: [blue]{len(qdrant_ids):,}[/blue]")
|
||||||
|
console.print(f" Missing embeddings: [{'red' if missing_ids else 'green'}]{len(missing_ids):,}[/{'red' if missing_ids else 'green'}]")
|
||||||
|
|
||||||
|
if extra_ids:
|
||||||
|
console.print(f" Extra IDs in Qdrant (orphaned): [yellow]{len(extra_ids):,}[/yellow]")
|
||||||
|
|
||||||
|
# Get sample of missing IDs
|
||||||
|
sample_missing = sorted(list(missing_ids))[:20] if missing_ids else []
|
||||||
|
|
||||||
|
# Step 6: Verify sample payload integrity
|
||||||
|
console.print("\n[yellow]6. Verifying payload integrity...[/yellow]")
|
||||||
|
sample_points = await get_qdrant_points_sample(client, limit=100)
|
||||||
|
|
||||||
|
payloads_with_data = sum(
|
||||||
|
1 for p in sample_points
|
||||||
|
if p.get("payload") and p["payload"].get("hadith_id")
|
||||||
|
)
|
||||||
|
|
||||||
|
console.print(f" Sample size: {len(sample_points)}")
|
||||||
|
console.print(f" With valid payloads: [green]{payloads_with_data}[/green]")
|
||||||
|
|
||||||
|
verification_time = time.time() - start_time
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
console.print("\n" + "="*50)
|
||||||
|
console.print("[bold]VERIFICATION SUMMARY[/bold]")
|
||||||
|
console.print("="*50)
|
||||||
|
|
||||||
|
if len(missing_ids) == 0:
|
||||||
|
console.print("[bold green]✓ ALL EMBEDDINGS VERIFIED![/bold green]")
|
||||||
|
else:
|
||||||
|
console.print(f"[bold red]✗ {len(missing_ids):,} EMBEDDINGS MISSING[/bold red]")
|
||||||
|
if sample_missing:
|
||||||
|
console.print(f" Sample missing IDs: {sample_missing[:10]}")
|
||||||
|
|
||||||
|
console.print(f"\nVerification completed in {verification_time:.2f} seconds")
|
||||||
|
|
||||||
|
return VerificationResult(
|
||||||
|
total_hadiths_db=total_hadiths,
|
||||||
|
total_embeddings_qdrant=qdrant_count,
|
||||||
|
embeddings_with_payloads=payloads_with_data,
|
||||||
|
missing_embeddings=len(missing_ids),
|
||||||
|
embedding_dimension=embedding_dim,
|
||||||
|
collection_exists=True,
|
||||||
|
collection_status=status,
|
||||||
|
sample_ids_missing=sample_missing,
|
||||||
|
verification_time_seconds=verification_time,
|
||||||
|
timestamp=datetime.now().isoformat()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Main entry point."""
|
||||||
|
result = await verify_embeddings()
|
||||||
|
|
||||||
|
# Save results to JSON
|
||||||
|
output_file = "verification_results.json"
|
||||||
|
with open(output_file, 'w') as f:
|
||||||
|
json.dump(asdict(result), f, indent=2)
|
||||||
|
|
||||||
|
console.print(f"\n[dim]Results saved to {output_file}[/dim]")
|
||||||
|
|
||||||
|
# Exit with error code if missing embeddings
|
||||||
|
if result.missing_embeddings > 0:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
|
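The verifier writes its summary to verification_results.json; a minimal sketch of consuming that file (field names come from the VerificationResult dataclass above; the file name of the sketch is illustrative only):

# consume_verification_sketch.py (illustrative only)
import json

# Load the report written by main() above.
with open("verification_results.json") as f:
    report = json.load(f)

if report["missing_embeddings"] > 0:
    print(f"{report['missing_embeddings']} embeddings missing; "
          f"sample IDs: {report['sample_ids_missing']}")
else:
    print("All embeddings verified at", report["timestamp"])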
@@ -0,0 +1,183 @@
# ============================================================================
# Step 6: Semantic Search API - Kubernetes Deployment
# ============================================================================
# Deploy: kubectl apply -f k8s-search-api.yaml
# ============================================================================

---
# Namespace (if not exists)
apiVersion: v1
kind: Namespace
metadata:
  name: hadith
---
# ConfigMap for non-sensitive configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: search-api-config
  namespace: hadith
data:
  POSTGRES_HOST: "postgres.db.svc.cluster.local"
  POSTGRES_PORT: "5432"
  POSTGRES_DB: "hadith_db"
  POSTGRES_USER: "hadith_ingest"
  QDRANT_HOST: "qdrant.vector.svc.cluster.local"
  QDRANT_PORT: "6333"
  QDRANT_COLLECTION: "hadith_embeddings"
  TEI_HOST: "tei.ml.svc.cluster.local"
  TEI_PORT: "80"
---
# Secret for database password
apiVersion: v1
kind: Secret
metadata:
  name: search-api-secrets
  namespace: hadith
type: Opaque
stringData:
  POSTGRES_PASSWORD: "CHANGE_ME_TO_YOUR_PASSWORD"
---
# Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: search-api
  namespace: hadith
  labels:
    app: search-api
spec:
  replicas: 2
  selector:
    matchLabels:
      app: search-api
  template:
    metadata:
      labels:
        app: search-api
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8080"
        prometheus.io/path: "/metrics"
    spec:
      nodeSelector:
        node: hetzner-2
      containers:
        - name: search-api
          image: python:3.11-slim
          command:
            - /bin/bash
            - -c
            - |
              pip install --no-cache-dir \
                fastapi uvicorn httpx psycopg2-binary pydantic && \
              python /app/search_api.py
          ports:
            - containerPort: 8080
              name: http
          envFrom:
            - configMapRef:
                name: search-api-config
            - secretRef:
                name: search-api-secrets
          volumeMounts:
            - name: app-code
              mountPath: /app
          resources:
            requests:
              cpu: "250m"
              memory: "256Mi"
            limits:
              cpu: "1"
              memory: "512Mi"
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
      volumes:
        - name: app-code
          configMap:
            # Note: this ConfigMap is not defined in this manifest; create it
            # from the application source first, e.g.:
            #   kubectl create configmap search-api-code --from-file=search_api.py -n hadith
            name: search-api-code
---
# Service
apiVersion: v1
kind: Service
metadata:
  name: search-api
  namespace: hadith
spec:
  selector:
    app: search-api
  ports:
    - name: http
      port: 80
      targetPort: 8080
  type: ClusterIP
---
# Ingress
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: search-api
  namespace: hadith
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
    nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - search.betelgeusebytes.io
      secretName: search-api-tls
  rules:
    - host: search.betelgeusebytes.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: search-api
                port:
                  number: 80
---
# HorizontalPodAutoscaler (optional)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: search-api-hpa
  namespace: hadith
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: search-api
  minReplicas: 2
  maxReplicas: 5
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
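After `kubectl apply`, the deployment can be smoke-tested through the Ingress — a minimal sketch, assuming search.betelgeusebytes.io resolves and its TLS certificate has been issued:

# smoke_test_sketch.py (illustrative only)
import httpx

# /health and its status values come from search_api.py later in this commit.
health = httpx.get("https://search.betelgeusebytes.io/health", timeout=10.0).json()
print(health)  # expected status: "healthy", or "degraded" if a dependency is down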
@@ -0,0 +1,19 @@
# Step 6: Verify Embeddings & Semantic Search
# Requirements for hadith-phase3-step6

# Database
psycopg2-binary>=2.9.9

# HTTP client
httpx>=0.27.0

# Rich console output
rich>=13.7.0

# Data handling
python-dateutil>=2.8.2

# Optional: for running as web API
fastapi>=0.111.0
uvicorn>=0.30.0
pydantic>=2.7.0
@@ -0,0 +1,225 @@
#!/bin/bash
# ============================================================================
# Step 6: Quick Test Runner
# ============================================================================
# Usage: ./run_tests.sh [verify|benchmark|demo|api|all]
# ============================================================================

set -e

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration - Update these for your environment
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-hadith_ingest}"

# In-cluster defaults (uncomment when running inside the cluster):
# export QDRANT_HOST="${QDRANT_HOST:-qdrant.vector.svc.cluster.local}"
# export QDRANT_PORT="${QDRANT_PORT:-6333}"
# export TEI_HOST="${TEI_HOST:-tei.ml.svc.cluster.local}"
# export TEI_PORT="${TEI_PORT:-80}"

# External endpoints (note: the host values include the https:// scheme)
export QDRANT_HOST="${QDRANT_HOST:-https://vector.betelgeusebytes.io}"
export QDRANT_PORT="${QDRANT_PORT:-443}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"

export TEI_HOST="${TEI_HOST:-https://embeddings.betelgeusebytes.io}"
export TEI_PORT="${TEI_PORT:-443}"

# Check if password is set
check_password() {
    if [ -z "$POSTGRES_PASSWORD" ]; then
        echo -e "${RED}Error: POSTGRES_PASSWORD environment variable is not set${NC}"
        echo "Set it with: export POSTGRES_PASSWORD='your_password'"
        exit 1
    fi
}

# Install dependencies
install_deps() {
    echo -e "${BLUE}Installing dependencies...${NC}"
    pip install -q -r requirements.txt
    echo -e "${GREEN}Dependencies installed.${NC}"
}

# Run verification
run_verify() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Embedding Verification...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    # Test the command directly: with `set -e`, checking $? afterwards
    # would never run because a failing python call aborts the script.
    if python verify_embeddings.py; then
        echo -e "\n${GREEN}✓ Verification passed!${NC}"
    else
        echo -e "\n${RED}✗ Verification failed - some embeddings are missing${NC}"
        exit 1
    fi
}

# Run benchmark
run_benchmark() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Semantic Search Benchmark...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    python semantic_search.py --mode benchmark --output benchmark_results.json

    echo -e "\n${GREEN}✓ Benchmark complete. Results saved to benchmark_results.json${NC}"
}

# Run demo
run_demo() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running Search Demo...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    python semantic_search.py --mode demo
}

# Run API server
run_api() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Starting Search API Server...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    echo -e "${YELLOW}API will be available at: http://localhost:8080${NC}"
    echo -e "${YELLOW}Swagger docs at: http://localhost:8080/docs${NC}"
    echo -e "${YELLOW}Press Ctrl+C to stop${NC}\n"

    python search_api.py
}

# Run SQL verification
run_sql() {
    echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
    echo -e "${BLUE}Running SQL Verification Queries...${NC}"
    echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"

    PGPASSWORD="$POSTGRES_PASSWORD" psql \
        -h "$POSTGRES_HOST" \
        -p "$POSTGRES_PORT" \
        -U "$POSTGRES_USER" \
        -d "$POSTGRES_DB" \
        -f verification_queries.sql
}

# Quick connectivity test
test_connectivity() {
    echo -e "\n${BLUE}Testing Service Connectivity...${NC}\n"

    # Test PostgreSQL
    echo -n "PostgreSQL ($POSTGRES_HOST:$POSTGRES_PORT): "
    if PGPASSWORD="$POSTGRES_PASSWORD" psql -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT 1" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    # Test Qdrant (QDRANT_HOST already carries the scheme, so no http:// prefix)
    echo -n "Qdrant ($QDRANT_HOST:$QDRANT_PORT): "
    if curl -s "$QDRANT_HOST:$QDRANT_PORT/collections" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    # Test TEI (TEI_HOST already carries the scheme, so no http:// prefix)
    echo -n "TEI ($TEI_HOST:$TEI_PORT): "
    if curl -s "$TEI_HOST:$TEI_PORT/health" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi

    echo ""
}

# Show usage
show_usage() {
    echo "Usage: $0 [command]"
    echo ""
    echo "Commands:"
    echo "  verify     Run embedding verification"
    echo "  benchmark  Run semantic search benchmark"
    echo "  demo       Run search demo with sample queries"
    echo "  api        Start the search API server"
    echo "  sql        Run SQL verification queries"
    echo "  test       Test connectivity to all services"
    echo "  all        Run verify + benchmark + demo"
    echo "  install    Install Python dependencies"
    echo "  help       Show this help message"
    echo ""
    echo "Environment variables:"
    echo "  POSTGRES_HOST      PostgreSQL host (default: pg.betelgeusebytes.io)"
    echo "  POSTGRES_PORT      PostgreSQL port (default: 5432)"
    echo "  POSTGRES_DB        Database name (default: hadith_db)"
    echo "  POSTGRES_USER      Database user (default: hadith_ingest)"
    echo "  POSTGRES_PASSWORD  Database password (required)"
    echo "  QDRANT_HOST        Qdrant host (default: https://vector.betelgeusebytes.io)"
    echo "  QDRANT_PORT        Qdrant port (default: 443)"
    echo "  TEI_HOST           TEI host (default: https://embeddings.betelgeusebytes.io)"
    echo "  TEI_PORT           TEI port (default: 443)"
}

# Main
case "${1:-help}" in
    verify)
        check_password
        install_deps
        run_verify
        ;;
    benchmark)
        check_password
        install_deps
        run_benchmark
        ;;
    demo)
        check_password
        install_deps
        run_demo
        ;;
    api)
        check_password
        install_deps
        run_api
        ;;
    sql)
        check_password
        run_sql
        ;;
    test)
        check_password
        test_connectivity
        ;;
    all)
        check_password
        install_deps
        test_connectivity
        run_verify
        run_benchmark
        run_demo
        ;;
    install)
        install_deps
        ;;
    help|--help|-h)
        show_usage
        ;;
    *)
        echo -e "${RED}Unknown command: $1${NC}"
        show_usage
        exit 1
        ;;
esac
@@ -0,0 +1,578 @@
#!/usr/bin/env python3
"""
Step 6.3: Semantic Search API Service
======================================
Production-ready FastAPI service for hadith semantic search.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import sys
import time
import logging
from datetime import datetime
from typing import List, Optional
from contextlib import asynccontextmanager

import httpx
import psycopg2
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.extras import RealDictCursor
from fastapi import FastAPI, HTTPException, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field

if sys.platform == 'win32':
    os.environ['PYTHONIOENCODING'] = 'utf-8'
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding='utf-8')

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")

# External endpoints carry their scheme; URLs below are built as
# f"{HOST}:{PORT}/...", so in-cluster hosts need an explicit http:// prefix.
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))


# ============================================================================
# Pydantic Models
# ============================================================================

class SearchQuery(BaseModel):
    """Search query input."""
    query: str = Field(..., min_length=1, max_length=1000, description="Search query text")
    limit: int = Field(default=10, ge=1, le=100, description="Number of results to return")
    min_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity score")
    collections: Optional[List[str]] = Field(default=None, description="Filter by collection names")
    grades: Optional[List[str]] = Field(default=None, description="Filter by hadith grades")


class HadithResult(BaseModel):
    """Individual hadith search result."""
    hadith_id: int
    score: float
    collection: str
    book: Optional[str]
    hadith_number: str
    arabic_text: Optional[str]
    arabic_normalized: Optional[str]
    english_text: Optional[str]
    urdu_text: Optional[str]
    grade: Optional[str]


class SearchResponse(BaseModel):
    """Search response."""
    query: str
    results: List[HadithResult]
    total_results: int
    embedding_time_ms: float
    search_time_ms: float
    total_time_ms: float
    timestamp: str


class HealthResponse(BaseModel):
    """Health check response."""
    status: str
    database: str
    qdrant: str
    tei: str
    timestamp: str


class CollectionStats(BaseModel):
    """Collection statistics."""
    name: str
    total_hadiths: int
    embedded_count: int


class StatsResponse(BaseModel):
    """Statistics response."""
    total_hadiths: int
    total_embedded: int
    collections: List[CollectionStats]
    timestamp: str


# ============================================================================
# Database Pool & Connections
# ============================================================================

db_pool: Optional[ThreadedConnectionPool] = None
http_client: Optional[httpx.AsyncClient] = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application lifecycle."""
    global db_pool, http_client

    # Startup
    logger.info("Starting up semantic search service...")

    # Initialize database pool
    try:
        db_pool = ThreadedConnectionPool(
            minconn=2,
            maxconn=10,
            host=POSTGRES_HOST,
            port=POSTGRES_PORT,
            database=POSTGRES_DB,
            user=POSTGRES_USER,
            password=POSTGRES_PASSWORD,
            sslmode='require'
        )
        logger.info("Database pool initialized")
    except Exception as e:
        logger.error(f"Failed to initialize database pool: {e}")
        db_pool = None

    # Initialize HTTP client
    http_client = httpx.AsyncClient(timeout=30.0)
    logger.info("HTTP client initialized")

    yield

    # Shutdown
    logger.info("Shutting down...")
    if db_pool:
        db_pool.closeall()
    if http_client:
        await http_client.aclose()


# ============================================================================
# FastAPI App
# ============================================================================

app = FastAPI(
    title="Hadith Semantic Search API",
    description="Semantic search service for Islamic hadith literature",
    version="1.0.0",
    lifespan=lifespan
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# ============================================================================
# Helper Functions
# ============================================================================

def get_db_connection():
    """Get database connection from pool."""
    if db_pool is None:
        raise HTTPException(status_code=503, detail="Database pool not available")
    return db_pool.getconn()


def release_db_connection(conn):
    """Return connection to pool."""
    if db_pool and conn:
        db_pool.putconn(conn)


async def get_embedding(text: str) -> tuple[List[float], float]:
    """Get embedding from TEI service."""
    start = time.perf_counter()

    try:
        response = await http_client.post(
            f"{TEI_HOST}:{TEI_PORT}/embed",
            json={"inputs": text}
        )
        response.raise_for_status()

        elapsed_ms = (time.perf_counter() - start) * 1000

        embeddings = response.json()
        if isinstance(embeddings, list) and len(embeddings) > 0:
            if isinstance(embeddings[0], list):
                return embeddings[0], elapsed_ms
            return embeddings, elapsed_ms

        raise ValueError("Unexpected embedding format")

    except httpx.HTTPError as e:
        logger.error(f"TEI request failed: {e}")
        raise HTTPException(status_code=503, detail=f"Embedding service error: {e}")


async def search_qdrant(
    embedding: List[float],
    limit: int = 10,
    min_score: float = 0.0,
    filters: Optional[dict] = None
) -> tuple[List[dict], float]:
    """Search Qdrant with embedding vector."""
    start = time.perf_counter()

    try:
        payload = {
            "vector": embedding,
            "limit": limit,
            "with_payload": True,
            "score_threshold": min_score
        }

        if filters:
            payload["filter"] = filters

        response = await http_client.post(
            f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
            json=payload
        )
        response.raise_for_status()

        elapsed_ms = (time.perf_counter() - start) * 1000
        results = response.json().get("result", [])

        return results, elapsed_ms

    except httpx.HTTPError as e:
        logger.error(f"Qdrant request failed: {e}")
        raise HTTPException(status_code=503, detail=f"Vector search service error: {e}")


def enrich_results_from_db(hadith_ids: List[int]) -> dict[int, dict]:
    """Fetch full hadith data from PostgreSQL."""
    if not hadith_ids:
        return {}

    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.arabic_normalized,
                    h.english_text,
                    h.urdu_text,
                    h.grade,
                    c.name_english as collection_name,
                    b.name_english as book_name
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                LEFT JOIN books b ON h.book_id = b.id
                WHERE h.id = ANY(%s)
            """, (hadith_ids,))

            return {row['id']: dict(row) for row in cur.fetchall()}
    finally:
        release_db_connection(conn)


def build_qdrant_filter(collections: Optional[List[str]], grades: Optional[List[str]]) -> Optional[dict]:
    """Build Qdrant filter from parameters."""
    conditions = []

    if collections:
        conditions.append({
            "key": "collection",
            "match": {"any": collections}
        })

    if grades:
        conditions.append({
            "key": "grade",
            "match": {"any": grades}
        })

    if not conditions:
        return None

    return {"must": conditions}


# ============================================================================
# API Endpoints
# ============================================================================

@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Check health of all services."""
    db_status = "healthy"
    qdrant_status = "healthy"
    tei_status = "healthy"

    # Check database
    try:
        conn = get_db_connection()
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
        release_db_connection(conn)
    except Exception as e:
        db_status = f"unhealthy: {e}"

    # Check Qdrant
    try:
        response = await http_client.get(
            f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
        )
        if response.status_code != 200:
            qdrant_status = f"unhealthy: status {response.status_code}"
    except Exception as e:
        qdrant_status = f"unhealthy: {e}"

    # Check TEI
    try:
        response = await http_client.get(f"{TEI_HOST}:{TEI_PORT}/health")
        if response.status_code != 200:
            tei_status = f"unhealthy: status {response.status_code}"
    except Exception as e:
        tei_status = f"unhealthy: {e}"

    overall = "healthy" if all(
        s == "healthy" for s in [db_status, qdrant_status, tei_status]
    ) else "degraded"

    return HealthResponse(
        status=overall,
        database=db_status,
        qdrant=qdrant_status,
        tei=tei_status,
        timestamp=datetime.now().isoformat()
    )


@app.get("/stats", response_model=StatsResponse)
async def get_stats():
    """Get database statistics."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Total counts
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded
                FROM hadiths
            """)
            totals = cur.fetchone()

            # By collection
            cur.execute("""
                SELECT
                    c.name_english as name,
                    COUNT(h.id) as total_hadiths,
                    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded_count
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                GROUP BY c.id, c.name_english
                ORDER BY total_hadiths DESC
            """)
            collections = [CollectionStats(**dict(row)) for row in cur.fetchall()]

        return StatsResponse(
            total_hadiths=totals['total'],
            total_embedded=totals['embedded'],
            collections=collections,
            timestamp=datetime.now().isoformat()
        )
    finally:
        release_db_connection(conn)


@app.post("/search", response_model=SearchResponse)
async def semantic_search(query: SearchQuery):
    """Perform semantic search on hadiths."""
    total_start = time.perf_counter()

    # Get embedding
    embedding, embed_time = await get_embedding(query.query)

    # Build filters
    filters = build_qdrant_filter(query.collections, query.grades)

    # Search Qdrant
    qdrant_results, search_time = await search_qdrant(
        embedding,
        limit=query.limit,
        min_score=query.min_score,
        filters=filters
    )

    # Extract hadith IDs
    hadith_ids = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")
        if hid:
            hadith_ids.append(int(hid))

    # Enrich from database
    db_data = enrich_results_from_db(hadith_ids)

    # Build results
    results = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")

        if hid and int(hid) in db_data:
            data = db_data[int(hid)]
            results.append(HadithResult(
                hadith_id=int(hid),
                score=r.get("score", 0),
                collection=data.get("collection_name", "Unknown"),
                book=data.get("book_name"),
                hadith_number=data.get("hadith_number", ""),
                arabic_text=data.get("arabic_text"),
                arabic_normalized=data.get("arabic_normalized"),
                english_text=data.get("english_text"),
                urdu_text=data.get("urdu_text"),
                grade=data.get("grade")
            ))
        else:
            # Fallback to payload
            results.append(HadithResult(
                hadith_id=int(hid) if hid else 0,
                score=r.get("score", 0),
                collection=payload.get("collection", "Unknown"),
                book=payload.get("book"),
                hadith_number=str(payload.get("hadith_number", "")),
                arabic_text=payload.get("arabic_text"),
                arabic_normalized=payload.get("arabic_normalized"),
                english_text=payload.get("english_text"),
                urdu_text=payload.get("urdu_text"),
                grade=payload.get("grade")
            ))

    total_time = (time.perf_counter() - total_start) * 1000

    return SearchResponse(
        query=query.query,
        results=results,
        total_results=len(results),
        embedding_time_ms=embed_time,
        search_time_ms=search_time,
        total_time_ms=total_time,
        timestamp=datetime.now().isoformat()
    )


@app.get("/search", response_model=SearchResponse)
async def semantic_search_get(
    q: str = Query(..., min_length=1, max_length=1000, description="Search query"),
    limit: int = Query(default=10, ge=1, le=100),
    min_score: float = Query(default=0.0, ge=0.0, le=1.0)
):
    """GET version of semantic search for simple queries."""
    query = SearchQuery(query=q, limit=limit, min_score=min_score)
    return await semantic_search(query)


@app.get("/hadith/{hadith_id}")
async def get_hadith(hadith_id: int):
    """Get a specific hadith by ID."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.arabic_normalized,
                    h.english_text,
                    h.urdu_text,
                    h.grade,
                    h.source_metadata,
                    h.embedding_generated,
                    h.entities_extracted,
                    h.relations_extracted,
                    h.created_at,
                    h.updated_at,
                    c.name_english as collection_name,
                    c.name_arabic as collection_arabic,
                    b.name_english as book_name,
                    b.name_arabic as book_arabic
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                LEFT JOIN books b ON h.book_id = b.id
                WHERE h.id = %s
            """, (hadith_id,))

            row = cur.fetchone()
            if not row:
                raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")

            return dict(row)
    finally:
        release_db_connection(conn)


@app.get("/similar/{hadith_id}", response_model=SearchResponse)
async def find_similar(
    hadith_id: int,
    limit: int = Query(default=10, ge=1, le=100)
):
    """Find hadiths similar to a given hadith."""
    # Get the hadith text
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT arabic_text, english_text
                FROM hadiths
                WHERE id = %s
            """, (hadith_id,))

            row = cur.fetchone()
            if not row:
                raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")

            # Use Arabic text preferably, fall back to English
            text = row['arabic_text'] or row['english_text']
            if not text:
                raise HTTPException(status_code=400, detail="Hadith has no text content")
    finally:
        release_db_connection(conn)

    # Search for similar hadiths; +1 so the source hadith can be dropped below,
    # capped at 100 to stay within SearchQuery's `le=100` validation bound.
    query = SearchQuery(query=text, limit=min(limit + 1, 100))
    response = await semantic_search(query)

    # Filter out the source hadith
    response.results = [r for r in response.results if r.hadith_id != hadith_id][:limit]
    response.total_results = len(response.results)

    return response


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8080)
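A minimal client sketch for the POST /search endpoint above; the host comes from the Ingress manifest earlier in this commit, and http://localhost:8080 works when running the service locally (the sketch file name is illustrative only):

# search_client_sketch.py (illustrative only)
import httpx

# Request fields mirror the SearchQuery model above.
payload = {
    "query": "patience during hardship",
    "limit": 5,
    "min_score": 0.3,
}
resp = httpx.post("https://search.betelgeusebytes.io/search", json=payload, timeout=30.0)
resp.raise_for_status()

# Response fields mirror the SearchResponse model above.
body = resp.json()
print(f"{body['total_results']} hits in {body['total_time_ms']:.0f} ms")
for r in body["results"]:
    print(f"  [{r['score']:.3f}] {r['collection']} #{r['hadith_number']}")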
@ -0,0 +1,677 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 6.2: Semantic Search Testing & Benchmarking
|
||||||
|
=================================================
|
||||||
|
Tests semantic search functionality and benchmarks performance.
|
||||||
|
Target: <500ms per query.
|
||||||
|
|
||||||
|
Author: Hadith Scholar AI Project
|
||||||
|
Date: 2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import asyncio
|
||||||
|
import statistics
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import psycopg2
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.panel import Panel
|
||||||
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
||||||
|
|
||||||
|
if sys.platform == 'win32':
|
||||||
|
os.environ['PYTHONIOENCODING'] = 'utf-8'
|
||||||
|
if hasattr(sys.stdout, 'reconfigure'):
|
||||||
|
sys.stdout.reconfigure(encoding='utf-8')
|
||||||
|
if hasattr(sys.stderr, 'reconfigure'):
|
||||||
|
sys.stderr.reconfigure(encoding='utf-8')
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
|
||||||
|
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
|
||||||
|
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
|
||||||
|
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
|
||||||
|
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
|
||||||
|
# TEI_URL = "https://embeddings.betelgeusebytes.io"
|
||||||
|
# QDRANT_URL = "https://vector.betelgeusebytes.io"
|
||||||
|
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
|
||||||
|
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
|
||||||
|
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
|
||||||
|
|
||||||
|
# For external access
|
||||||
|
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
|
||||||
|
# TEI_URL = "https://embeddings.betelgeusebytes.io"
|
||||||
|
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
|
||||||
|
TEI_PORT = int(os.getenv("TEI_PORT", "443"))
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Sample Queries for Testing
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
SAMPLE_QUERIES = {
|
||||||
|
"arabic": [
|
||||||
|
{
|
||||||
|
"query": "الصلاة في المسجد الحرام",
|
||||||
|
"description": "Prayer in the Sacred Mosque",
|
||||||
|
"expected_topics": ["prayer", "mosque", "mecca"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "أبو هريرة رضي الله عنه",
|
||||||
|
"description": "Abu Hurairah (RA)",
|
||||||
|
"expected_topics": ["narrator", "companion"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الصيام في شهر رمضان",
|
||||||
|
"description": "Fasting in Ramadan",
|
||||||
|
"expected_topics": ["fasting", "ramadan"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الزكاة والصدقة",
|
||||||
|
"description": "Zakat and charity",
|
||||||
|
"expected_topics": ["charity", "zakat"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الحج والعمرة",
|
||||||
|
"description": "Hajj and Umrah",
|
||||||
|
"expected_topics": ["pilgrimage", "hajj", "umrah"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "النبي صلى الله عليه وسلم في المدينة",
|
||||||
|
"description": "Prophet (PBUH) in Medina",
|
||||||
|
"expected_topics": ["prophet", "medina"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الوضوء والطهارة",
|
||||||
|
"description": "Ablution and purification",
|
||||||
|
"expected_topics": ["ablution", "purification", "wudu"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "بر الوالدين",
|
||||||
|
"description": "Honoring parents",
|
||||||
|
"expected_topics": ["parents", "kindness", "family"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الجنة والنار",
|
||||||
|
"description": "Paradise and Hell",
|
||||||
|
"expected_topics": ["afterlife", "paradise", "hell"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "الإيمان والإسلام",
|
||||||
|
"description": "Faith and Islam",
|
||||||
|
"expected_topics": ["faith", "belief", "islam"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"english": [
|
||||||
|
{
|
||||||
|
"query": "five daily prayers",
|
||||||
|
"description": "The five obligatory prayers",
|
||||||
|
"expected_topics": ["prayer", "salah", "obligation"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "Prophet Muhammad in Mecca",
|
||||||
|
"description": "Prophet's life in Mecca",
|
||||||
|
"expected_topics": ["prophet", "mecca", "biography"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "treatment of neighbors",
|
||||||
|
"description": "Rights and treatment of neighbors",
|
||||||
|
"expected_topics": ["neighbors", "rights", "ethics"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "patience during hardship",
|
||||||
|
"description": "Patience in difficult times",
|
||||||
|
"expected_topics": ["patience", "sabr", "trials"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "marriage and family",
|
||||||
|
"description": "Islamic marriage guidance",
|
||||||
|
"expected_topics": ["marriage", "family", "nikah"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "honesty and truthfulness",
|
||||||
|
"description": "Importance of being truthful",
|
||||||
|
"expected_topics": ["honesty", "truth", "character"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "Day of Judgment signs",
|
||||||
|
"description": "Signs of the Last Day",
|
||||||
|
"expected_topics": ["judgment", "signs", "eschatology"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "charity and helping poor",
|
||||||
|
"description": "Giving charity to the needy",
|
||||||
|
"expected_topics": ["charity", "poor", "sadaqah"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "companions of the Prophet",
|
||||||
|
"description": "Sahaba and their virtues",
|
||||||
|
"expected_topics": ["companions", "sahaba", "virtue"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "seeking knowledge in Islam",
|
||||||
|
"description": "Importance of knowledge",
|
||||||
|
"expected_topics": ["knowledge", "learning", "education"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"mixed": [
|
||||||
|
{
|
||||||
|
"query": "قال رسول الله about kindness",
|
||||||
|
"description": "Prophet's sayings about kindness (mixed)",
|
||||||
|
"expected_topics": ["prophet", "kindness", "ethics"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"query": "women rights الإسلام",
|
||||||
|
"description": "Women's rights in Islam (mixed)",
|
||||||
|
"expected_topics": ["women", "rights", "islam"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SearchResult:
    """Individual search result."""
    hadith_id: int
    score: float
    collection: str
    hadith_number: str
    arabic_text: str
    english_text: str
    grade: str


@dataclass
class QueryBenchmark:
    """Benchmark results for a single query."""
    query: str
    language: str
    description: str
    embedding_time_ms: float
    search_time_ms: float
    total_time_ms: float
    results_count: int
    top_score: float
    meets_target: bool  # <500ms


@dataclass
class BenchmarkReport:
    """Full benchmark report."""
    total_queries: int
    successful_queries: int
    failed_queries: int
    avg_embedding_time_ms: float
    avg_search_time_ms: float
    avg_total_time_ms: float
    p50_time_ms: float
    p95_time_ms: float
    p99_time_ms: float
    min_time_ms: float
    max_time_ms: float
    queries_meeting_target: int
    target_ms: int
    query_results: List[QueryBenchmark] = field(default_factory=list)
    timestamp: str = ""


def get_db_connection():
    """Create PostgreSQL connection."""
    return psycopg2.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        sslmode='require'
    )

async def get_embedding(client: httpx.AsyncClient, text: str) -> Tuple[List[float], float]:
    """Get embedding from TEI service."""
    start = time.perf_counter()

    response = await client.post(
        f"{TEI_HOST}:{TEI_PORT}/embed",
        json={"inputs": text}
    )
    response.raise_for_status()

    elapsed_ms = (time.perf_counter() - start) * 1000

    # TEI returns a list of embeddings; we want the first one
    embeddings = response.json()
    if isinstance(embeddings, list) and len(embeddings) > 0:
        if isinstance(embeddings[0], list):
            return embeddings[0], elapsed_ms
        return embeddings, elapsed_ms

    raise ValueError(f"Unexpected embedding response format: {type(embeddings)}")

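# Usage note for get_embedding(): a minimal sketch, assuming the TEI_HOST/TEI_PORT
# constants configured above. TEI's /embed endpoint returns one vector per input,
# which is why the helper unwraps the first element:
#
#     async with httpx.AsyncClient(timeout=30.0) as client:
#         vector, elapsed_ms = await get_embedding(client, "five daily prayers")
#         print(len(vector), elapsed_ms)
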

async def search_qdrant(
    client: httpx.AsyncClient,
    embedding: List[float],
    limit: int = 10
) -> Tuple[List[Dict], float]:
    """Search Qdrant with embedding vector."""
    start = time.perf_counter()

    response = await client.post(
        f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
        json={
            "vector": embedding,
            "limit": limit,
            "with_payload": True
        }
    )
    response.raise_for_status()

    elapsed_ms = (time.perf_counter() - start) * 1000
    results = response.json().get("result", [])

    return results, elapsed_ms


def enrich_results_from_db(hadith_ids: List[int]) -> Dict[int, Dict]:
    """Fetch full hadith data from PostgreSQL."""
    if not hadith_ids:
        return {}

    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    h.id,
                    h.hadith_number,
                    h.arabic_text,
                    h.english_text,
                    h.grade,
                    c.name_english as collection_name
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                WHERE h.id = ANY(%s)
            """, (hadith_ids,))

            return {row['id']: dict(row) for row in cur.fetchall()}
    finally:
        conn.close()

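# Note on enrich_results_from_db(): passing a Python list as the parameter for
# "= ANY(%s)" works because psycopg2 adapts lists to PostgreSQL arrays, so no
# manual IN-clause string-building is needed.
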

async def semantic_search(
    client: httpx.AsyncClient,
    query: str,
    limit: int = 10
) -> Tuple[List[SearchResult], float, float]:
    """Perform semantic search and return results with timing."""

    # Step 1: Get embedding
    embedding, embed_time = await get_embedding(client, query)

    # Step 2: Search Qdrant
    qdrant_results, search_time = await search_qdrant(client, embedding, limit)

    # Step 3: Get hadith IDs and enrich from DB
    hadith_ids = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")
        if hid:
            hadith_ids.append(int(hid))

    db_data = enrich_results_from_db(hadith_ids)

    # Step 4: Build results
    results = []
    for r in qdrant_results:
        payload = r.get("payload", {})
        hid = payload.get("hadith_id") or payload.get("id")

        if hid and int(hid) in db_data:
            data = db_data[int(hid)]
            results.append(SearchResult(
                hadith_id=int(hid),
                score=r.get("score", 0),
                collection=data.get("collection_name", "Unknown"),
                hadith_number=data.get("hadith_number", ""),
                arabic_text=data.get("arabic_text", "")[:200] + "..." if data.get("arabic_text") else "",
                english_text=data.get("english_text", "")[:200] + "..." if data.get("english_text") else "",
                grade=data.get("grade", "")
            ))
        else:
            # Fallback to payload data
            results.append(SearchResult(
                hadith_id=int(hid) if hid else 0,
                score=r.get("score", 0),
                collection=payload.get("collection", "Unknown"),
                hadith_number=str(payload.get("hadith_number", "")),
                arabic_text=payload.get("arabic_text", "")[:200] + "..." if payload.get("arabic_text") else "",
                english_text=payload.get("english_text", "")[:200] + "..." if payload.get("english_text") else "",
                grade=payload.get("grade", "")
            ))

    return results, embed_time, search_time

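# semantic_search() standalone usage sketch (assumes the module-level TEI/Qdrant/
# Postgres configuration is set):
#
#     async def demo():
#         async with httpx.AsyncClient(timeout=30.0) as client:
#             results, embed_ms, search_ms = await semantic_search(
#                 client, "patience during hardship", limit=5
#             )
#             for r in results:
#                 print(f"{r.score:.4f} {r.collection} #{r.hadith_number}")
#
#     asyncio.run(demo())
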

def display_search_results(query: str, results: List[SearchResult], embed_time: float, search_time: float):
    """Display search results in a nice format."""
    total_time = embed_time + search_time

    console.print(f"\n[bold cyan]Query:[/bold cyan] {query}")
    console.print(f"[dim]Embedding: {embed_time:.1f}ms | Search: {search_time:.1f}ms | Total: {total_time:.1f}ms[/dim]")

    if not results:
        console.print("[yellow]No results found.[/yellow]")
        return

    table = Table(title=f"Top {len(results)} Results", show_lines=True)
    table.add_column("#", style="dim", width=3)
    table.add_column("Score", justify="right", width=8)
    table.add_column("Collection", width=15)
    table.add_column("Hadith #", width=10)
    table.add_column("Text Preview", width=60)
    table.add_column("Grade", width=10)

    for i, r in enumerate(results, 1):
        text_preview = r.english_text if r.english_text else r.arabic_text
        table.add_row(
            str(i),
            f"{r.score:.4f}",
            r.collection,
            r.hadith_number,
            text_preview[:80] + "..." if len(text_preview) > 80 else text_preview,
            r.grade or "-"
        )

    console.print(table)


async def run_benchmarks(warmup_count: int = 3) -> BenchmarkReport:
    """Run full benchmark suite."""
    console.print(Panel.fit(
        "[bold blue]Semantic Search Benchmark[/bold blue]\n"
        f"Target: <500ms per query\n"
        f"TEI: {TEI_HOST}:{TEI_PORT}\n"
        f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}",
        title="Step 6.2"
    ))

    all_queries = (
        [(q, "arabic") for q in SAMPLE_QUERIES["arabic"]] +
        [(q, "english") for q in SAMPLE_QUERIES["english"]] +
        [(q, "mixed") for q in SAMPLE_QUERIES["mixed"]]
    )

    query_results = []
    total_times = []
    successful = 0
    failed = 0

    async with httpx.AsyncClient(timeout=30.0) as client:
        # Warmup queries
        console.print(f"\n[yellow]Running {warmup_count} warmup queries...[/yellow]")
        for i in range(warmup_count):
            try:
                await semantic_search(client, "test warmup query", limit=5)
            except Exception as e:
                console.print(f"[dim]Warmup {i+1} error: {e}[/dim]")

        console.print("[green]Warmup complete.[/green]\n")

        # Run benchmarks
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            console=console
        ) as progress:
            task = progress.add_task("Running benchmarks...", total=len(all_queries))

            for query_data, lang in all_queries:
                query = query_data["query"]
                description = query_data["description"]

                try:
                    results, embed_time, search_time = await semantic_search(
                        client, query, limit=10
                    )

                    total_time = embed_time + search_time
                    total_times.append(total_time)

                    benchmark = QueryBenchmark(
                        query=query,
                        language=lang,
                        description=description,
                        embedding_time_ms=embed_time,
                        search_time_ms=search_time,
                        total_time_ms=total_time,
                        results_count=len(results),
                        top_score=results[0].score if results else 0,
                        meets_target=total_time < 500
                    )

                    query_results.append(benchmark)
                    successful += 1

                except Exception as e:
                    console.print(f"[red]Error for query '{query[:30]}...': {e}[/red]")
                    failed += 1

                progress.advance(task)

    # Calculate statistics
    if total_times:
        sorted_times = sorted(total_times)
        p50_idx = int(len(sorted_times) * 0.50)
        p95_idx = int(len(sorted_times) * 0.95)
        p99_idx = int(len(sorted_times) * 0.99)

        report = BenchmarkReport(
            total_queries=len(all_queries),
            successful_queries=successful,
            failed_queries=failed,
            avg_embedding_time_ms=statistics.mean(q.embedding_time_ms for q in query_results),
            avg_search_time_ms=statistics.mean(q.search_time_ms for q in query_results),
            avg_total_time_ms=statistics.mean(total_times),
            p50_time_ms=sorted_times[p50_idx],
            p95_time_ms=sorted_times[p95_idx] if p95_idx < len(sorted_times) else sorted_times[-1],
            p99_time_ms=sorted_times[p99_idx] if p99_idx < len(sorted_times) else sorted_times[-1],
            min_time_ms=min(total_times),
            max_time_ms=max(total_times),
            queries_meeting_target=sum(1 for t in total_times if t < 500),
            target_ms=500,
            query_results=query_results,
            timestamp=datetime.now().isoformat()
        )
    else:
        report = BenchmarkReport(
            total_queries=len(all_queries),
            successful_queries=0,
            failed_queries=failed,
            avg_embedding_time_ms=0,
            avg_search_time_ms=0,
            avg_total_time_ms=0,
            p50_time_ms=0,
            p95_time_ms=0,
            p99_time_ms=0,
            min_time_ms=0,
            max_time_ms=0,
            queries_meeting_target=0,
            target_ms=500,
            query_results=[],
            timestamp=datetime.now().isoformat()
        )

    return report


def display_benchmark_report(report: BenchmarkReport):
    """Display benchmark report."""
    console.print("\n" + "="*60)
    console.print("[bold]BENCHMARK RESULTS[/bold]")
    console.print("="*60)

    # Summary stats
    console.print(f"\n[cyan]Query Statistics:[/cyan]")
    console.print(f" Total queries: {report.total_queries}")
    console.print(f" Successful: [green]{report.successful_queries}[/green]")
    console.print(f" Failed: [red]{report.failed_queries}[/red]")

    console.print(f"\n[cyan]Timing Statistics:[/cyan]")
    console.print(f" Average embedding time: {report.avg_embedding_time_ms:.1f}ms")
    console.print(f" Average search time: {report.avg_search_time_ms:.1f}ms")
    console.print(f" Average total time: {report.avg_total_time_ms:.1f}ms")

    console.print(f"\n[cyan]Percentiles:[/cyan]")
    console.print(f" P50: {report.p50_time_ms:.1f}ms")
    console.print(f" P95: {report.p95_time_ms:.1f}ms")
    console.print(f" P99: {report.p99_time_ms:.1f}ms")
    console.print(f" Min: {report.min_time_ms:.1f}ms")
    console.print(f" Max: {report.max_time_ms:.1f}ms")

    # Target check
    target_pct = (report.queries_meeting_target / report.successful_queries * 100) if report.successful_queries else 0
    target_met = target_pct >= 95  # 95% of queries should meet target

    console.print(f"\n[cyan]Performance Target (<{report.target_ms}ms):[/cyan]")
    status = "[bold green]✓ TARGET MET[/bold green]" if target_met else "[bold red]✗ TARGET NOT MET[/bold red]"
    console.print(f" Queries meeting target: {report.queries_meeting_target}/{report.successful_queries} ({target_pct:.1f}%)")
    console.print(f" Status: {status}")

    # Detailed results table
    if report.query_results:
        console.print("\n[cyan]Detailed Results:[/cyan]")

        table = Table(show_lines=False)
        table.add_column("Language", width=8)
        table.add_column("Query", width=35)
        table.add_column("Embed", justify="right", width=8)
        table.add_column("Search", justify="right", width=8)
        table.add_column("Total", justify="right", width=8)
        table.add_column("Results", justify="right", width=7)
        table.add_column("Status", width=6)

        for r in report.query_results:
            status_icon = "✓" if r.meets_target else "✗"
            status_style = "green" if r.meets_target else "red"

            table.add_row(
                r.language,
                r.query[:35] + "..." if len(r.query) > 35 else r.query,
                f"{r.embedding_time_ms:.0f}ms",
                f"{r.search_time_ms:.0f}ms",
                f"{r.total_time_ms:.0f}ms",
                str(r.results_count),
                f"[{status_style}]{status_icon}[/{status_style}]"
            )

        console.print(table)


async def interactive_search():
    """Interactive search mode."""
    console.print(Panel.fit(
        "[bold blue]Interactive Semantic Search[/bold blue]\n"
        "Type your query and press Enter. Type 'quit' to exit.",
        title="Interactive Mode"
    ))

    async with httpx.AsyncClient(timeout=30.0) as client:
        while True:
            try:
                query = input("\n🔍 Query: ").strip()

                if query.lower() in ('quit', 'exit', 'q'):
                    console.print("[dim]Goodbye![/dim]")
                    break

                if not query:
                    continue

                results, embed_time, search_time = await semantic_search(
                    client, query, limit=10
                )

                display_search_results(query, results, embed_time, search_time)

            except KeyboardInterrupt:
                console.print("\n[dim]Interrupted. Goodbye![/dim]")
                break
            except Exception as e:
                console.print(f"[red]Error: {e}[/red]")


async def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Hadith Semantic Search Testing")
    parser.add_argument("--mode", choices=["benchmark", "interactive", "demo"],
                        default="benchmark", help="Run mode")
    parser.add_argument("--query", type=str, help="Single query to run")
    parser.add_argument("--output", type=str, default="benchmark_results.json",
                        help="Output file for benchmark results")

    args = parser.parse_args()

    if args.query:
        # Single query mode
        async with httpx.AsyncClient(timeout=30.0) as client:
            results, embed_time, search_time = await semantic_search(
                client, args.query, limit=10
            )
            display_search_results(args.query, results, embed_time, search_time)

    elif args.mode == "benchmark":
        # Full benchmark
        report = await run_benchmarks()
        display_benchmark_report(report)

        # Save results
        with open(args.output, 'w') as f:
            json.dump(asdict(report), f, indent=2, default=str)
        console.print(f"\n[dim]Results saved to {args.output}[/dim]")

    elif args.mode == "interactive":
        await interactive_search()

    elif args.mode == "demo":
        # Demo with a few sample queries
        console.print(Panel.fit(
            "[bold blue]Semantic Search Demo[/bold blue]",
            title="Demo Mode"
        ))

        demo_queries = [
            "الصلاة في المسجد",
            "five daily prayers",
            "patience during hardship",
            "بر الوالدين"
        ]

        async with httpx.AsyncClient(timeout=30.0) as client:
            for query in demo_queries:
                try:
                    results, embed_time, search_time = await semantic_search(
                        client, query, limit=5
                    )
                    display_search_results(query, results, embed_time, search_time)
                except Exception as e:
                    console.print(f"[red]Error for '{query}': {e}[/red]")


if __name__ == "__main__":
    asyncio.run(main())

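# Example invocations (a sketch; substitute the actual script filename):
#   python test_semantic_search.py --mode benchmark --output benchmark_results.json
#   python test_semantic_search.py --mode demo
#   python test_semantic_search.py --query "five daily prayers"
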
@ -0,0 +1,476 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Step 6: Verify Embeddings & Test Semantic Search\n",
    "\n",
    "This notebook provides interactive verification and testing of the hadith embedding system.\n",
    "\n",
    "**Prerequisites:**\n",
    "- PostgreSQL accessible at pg.betelgeusebytes.io\n",
    "- Qdrant accessible at qdrant.vector.svc.cluster.local\n",
    "- TEI accessible at tei.ml.svc.cluster.local"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Setup & Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies (pandas added: it is imported below)\n",
    "!pip install -q psycopg2-binary httpx rich pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import time\n",
    "import httpx\n",
    "import psycopg2\n",
    "from psycopg2.extras import RealDictCursor\n",
    "import pandas as pd\n",
    "from IPython.display import display, HTML, Markdown\n",
    "\n",
    "# Configuration\n",
    "POSTGRES_CONFIG = {\n",
    "    'host': os.getenv('POSTGRES_HOST', 'pg.betelgeusebytes.io'),\n",
    "    'port': int(os.getenv('POSTGRES_PORT', '5432')),\n",
    "    'database': os.getenv('POSTGRES_DB', 'hadith_db'),\n",
    "    'user': os.getenv('POSTGRES_USER', 'hadith_ingest'),\n",
    "    'password': os.getenv('POSTGRES_PASSWORD', ''),  # SET THIS!\n",
    "    'sslmode': 'require'\n",
    "}\n",
    "\n",
    "QDRANT_URL = f\"http://{os.getenv('QDRANT_HOST', 'qdrant.vector.svc.cluster.local')}:{os.getenv('QDRANT_PORT', '6333')}\"\n",
    "QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'hadith_embeddings')\n",
    "\n",
    "TEI_URL = f\"http://{os.getenv('TEI_HOST', 'tei.ml.svc.cluster.local')}:{os.getenv('TEI_PORT', '80')}\"\n",
    "\n",
    "print(f\"PostgreSQL: {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}/{POSTGRES_CONFIG['database']}\")\n",
    "print(f\"Qdrant: {QDRANT_URL}\")\n",
    "print(f\"TEI: {TEI_URL}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ⚠️ SET YOUR PASSWORD HERE\n",
    "POSTGRES_CONFIG['password'] = 'YOUR_PASSWORD_HERE'  # CHANGE THIS!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Database Verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_db_connection():\n",
    "    return psycopg2.connect(**POSTGRES_CONFIG)\n",
    "\n",
    "def run_query(query):\n",
    "    conn = get_db_connection()\n",
    "    try:\n",
    "        df = pd.read_sql(query, conn)\n",
    "        return df\n",
    "    finally:\n",
    "        conn.close()\n",
    "\n",
    "# Test connection\n",
    "try:\n",
    "    conn = get_db_connection()\n",
    "    conn.close()\n",
    "    print(\"✅ Database connection successful!\")\n",
    "except Exception as e:\n",
    "    print(f\"❌ Database connection failed: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total hadith count and embedding status\n",
    "query = \"\"\"\n",
    "SELECT\n",
    "    COUNT(*) as total_hadiths,\n",
    "    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
    "    SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded,\n",
    "    ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) as pct_complete\n",
    "FROM hadiths\n",
    "\"\"\"\n",
    "\n",
    "df = run_query(query)\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get breakdown by collection\n",
    "query = \"\"\"\n",
    "SELECT\n",
    "    c.name_english as collection,\n",
    "    COUNT(h.id) as total,\n",
    "    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
    "    ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) as pct_embedded\n",
    "FROM hadiths h\n",
    "JOIN collections c ON h.collection_id = c.id\n",
    "GROUP BY c.id, c.name_english\n",
    "ORDER BY total DESC\n",
    "\"\"\"\n",
    "\n",
    "df_collections = run_query(query)\n",
    "display(df_collections)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Qdrant Verification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check Qdrant collection\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    try:\n",
    "        response = client.get(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}\")\n",
    "        response.raise_for_status()\n",
    "        collection_info = response.json()\n",
    "        print(\"✅ Qdrant collection found!\")\n",
    "        print(f\"\\nCollection status: {collection_info['result']['status']}\")\n",
    "        print(f\"Vector dimension: {collection_info['result']['config']['params']['vectors']['size']}\")\n",
    "    except Exception as e:\n",
    "        print(f\"❌ Qdrant error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count points in Qdrant\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    response = client.post(\n",
    "        f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\",\n",
    "        json={\"exact\": True}\n",
    "    )\n",
    "    response.raise_for_status()\n",
    "    count = response.json()['result']['count']\n",
    "    print(f\"\\n📊 Total embeddings in Qdrant: {count:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. TEI Service Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test TEI embedding service\n",
    "test_text = \"الصلاة في المسجد الحرام\"\n",
    "\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    start = time.perf_counter()\n",
    "    response = client.post(\n",
    "        f\"{TEI_URL}/embed\",\n",
    "        json={\"inputs\": test_text}\n",
    "    )\n",
    "    elapsed = (time.perf_counter() - start) * 1000\n",
    "\n",
    "    response.raise_for_status()\n",
    "    embedding = response.json()[0]\n",
    "\n",
    "    print(f\"✅ TEI service working!\")\n",
    "    print(f\"\\nTest text: {test_text}\")\n",
    "    print(f\"Embedding dimension: {len(embedding)}\")\n",
    "    print(f\"Time: {elapsed:.1f}ms\")\n",
    "    print(f\"First 5 values: {embedding[:5]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Semantic Search Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def semantic_search(query_text, limit=10):\n",
    "    \"\"\"Perform semantic search and return results with timing.\"\"\"\n",
    "    with httpx.Client(timeout=30.0) as client:\n",
    "        # Get embedding\n",
    "        start = time.perf_counter()\n",
    "        embed_response = client.post(f\"{TEI_URL}/embed\", json={\"inputs\": query_text})\n",
    "        embed_response.raise_for_status()\n",
    "        embedding = embed_response.json()[0]\n",
    "        embed_time = (time.perf_counter() - start) * 1000\n",
    "\n",
    "        # Search Qdrant\n",
    "        start = time.perf_counter()\n",
    "        search_response = client.post(\n",
    "            f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search\",\n",
    "            json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True}\n",
    "        )\n",
    "        search_response.raise_for_status()\n",
    "        results = search_response.json()['result']\n",
    "        search_time = (time.perf_counter() - start) * 1000\n",
    "\n",
    "        return results, embed_time, search_time\n",
    "\n",
    "def display_results(query, results, embed_time, search_time):\n",
    "    \"\"\"Display search results nicely.\"\"\"\n",
    "    total_time = embed_time + search_time\n",
    "    status = \"✅\" if total_time < 500 else \"⚠️\"\n",
    "\n",
    "    print(f\"\\n{'='*60}\")\n",
    "    print(f\"Query: {query}\")\n",
    "    print(f\"Timing: {embed_time:.0f}ms (embed) + {search_time:.0f}ms (search) = {total_time:.0f}ms {status}\")\n",
    "    print(f\"{'='*60}\\n\")\n",
    "\n",
    "    for i, r in enumerate(results, 1):\n",
    "        score = r['score']\n",
    "        payload = r.get('payload', {})\n",
    "\n",
    "        text = payload.get('english_text') or payload.get('arabic_text', '')\n",
    "        text = text[:150] + '...' if len(text) > 150 else text\n",
    "\n",
    "        print(f\"{i}. [Score: {score:.4f}] {payload.get('collection', 'Unknown')} #{payload.get('hadith_number', 'N/A')}\")\n",
    "        print(f\"   {text}\")\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test Arabic query\n",
    "query = \"الصلاة في المسجد الحرام\"\n",
    "results, embed_time, search_time = semantic_search(query, limit=5)\n",
    "display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test English query\n",
    "query = \"five daily prayers\"\n",
    "results, embed_time, search_time = semantic_search(query, limit=5)\n",
    "display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test more queries\n",
    "test_queries = [\n",
    "    \"الصيام في شهر رمضان\",\n",
    "    \"patience during hardship\",\n",
    "    \"بر الوالدين\",\n",
    "    \"charity and helping poor\",\n",
    "    \"الجنة والنار\"\n",
    "]\n",
    "\n",
    "for q in test_queries:\n",
    "    results, embed_time, search_time = semantic_search(q, limit=3)\n",
    "    display_results(q, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Performance Benchmarking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import statistics\n",
    "\n",
    "# Benchmark queries\n",
    "benchmark_queries = [\n",
    "    \"الصلاة في المسجد الحرام\",\n",
    "    \"أبو هريرة رضي الله عنه\",\n",
    "    \"الصيام في شهر رمضان\",\n",
    "    \"five daily prayers\",\n",
    "    \"treatment of neighbors\",\n",
    "    \"patience during hardship\",\n",
    "    \"marriage and family\",\n",
    "    \"honesty and truthfulness\",\n",
    "    \"الزكاة والصدقة\",\n",
    "    \"الحج والعمرة\"\n",
    "]\n",
    "\n",
    "# Warmup\n",
    "print(\"Warming up...\")\n",
    "for _ in range(3):\n",
    "    semantic_search(\"warmup query\", limit=5)\n",
    "\n",
    "# Run benchmark\n",
    "print(\"\\nRunning benchmark...\")\n",
    "times = []\n",
    "\n",
    "for q in benchmark_queries:\n",
    "    results, embed_time, search_time = semantic_search(q, limit=10)\n",
    "    total = embed_time + search_time\n",
    "    times.append(total)\n",
    "    status = \"✅\" if total < 500 else \"⚠️\"\n",
    "    print(f\"  {q[:40]:40s} → {total:6.1f}ms {status}\")\n",
    "\n",
    "# Statistics\n",
    "print(f\"\\n{'='*60}\")\n",
    "print(\"BENCHMARK RESULTS\")\n",
    "print(f\"{'='*60}\")\n",
    "print(f\"Queries: {len(times)}\")\n",
    "print(f\"Average: {statistics.mean(times):.1f}ms\")\n",
    "print(f\"Median:  {statistics.median(times):.1f}ms\")\n",
    "print(f\"Min:     {min(times):.1f}ms\")\n",
    "print(f\"Max:     {max(times):.1f}ms\")\n",
    "print(f\"StdDev:  {statistics.stdev(times):.1f}ms\")\n",
    "\n",
    "meeting_target = sum(1 for t in times if t < 500)\n",
    "print(f\"\\nMeeting <500ms target: {meeting_target}/{len(times)} ({100*meeting_target/len(times):.1f}%)\")\n",
    "\n",
    "if meeting_target == len(times):\n",
    "    print(\"\\n✅ TARGET MET!\")\n",
    "else:\n",
    "    print(\"\\n⚠️ Some queries exceeded target\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Interactive Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Interactive search cell - run this and enter your query\n",
    "query = input(\"Enter your search query: \")\n",
    "if query:\n",
    "    results, embed_time, search_time = semantic_search(query, limit=10)\n",
    "    display_results(query, results, embed_time, search_time)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Verification Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final verification summary\n",
    "print(\"=\"*60)\n",
    "print(\"STEP 6 VERIFICATION SUMMARY\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "# Database check\n",
    "df = run_query(\"SELECT COUNT(*) as total, SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded FROM hadiths\")\n",
    "total = df['total'][0]\n",
    "embedded = df['embedded'][0]\n",
    "print(f\"\\n✅ Database: {total:,} hadiths, {embedded:,} embedded ({100*embedded/total:.1f}%)\")\n",
    "\n",
    "# Qdrant check\n",
    "with httpx.Client(timeout=30.0) as client:\n",
    "    response = client.post(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\", json={\"exact\": True})\n",
    "    qdrant_count = response.json()['result']['count']\n",
    "    print(f\"✅ Qdrant: {qdrant_count:,} embeddings stored\")\n",
    "\n",
    "# Benchmark summary\n",
    "if 'times' in dir() and times:\n",
    "    print(f\"✅ Performance: Avg {statistics.mean(times):.0f}ms, P95 {sorted(times)[int(len(times)*0.95)]:.0f}ms\")\n",
    "\n",
    "# Missing check\n",
    "missing = total - qdrant_count\n",
    "if missing == 0:\n",
    "    print(f\"\\n🎉 ALL {total:,} HADITHS VERIFIED!\")\n",
    "else:\n",
    "    print(f\"\\n⚠️ {missing:,} embeddings potentially missing\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

@ -0,0 +1,192 @@
-- ============================================================================
-- Step 6.1: PostgreSQL Verification Queries
-- Run these against hadith_db to verify data integrity
-- ============================================================================

-- Connect: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db

-- ============================================================================
-- 1. Basic Statistics
-- ============================================================================

-- Total hadith count
SELECT COUNT(*) AS total_hadiths FROM hadiths;

-- Hadiths by collection with embedding status
SELECT
    c.name_english AS collection,
    COUNT(h.id) AS total,
    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) AS embedded,
    ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) AS pct_embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total DESC;

-- ============================================================================
-- 2. Embedding Status Summary
-- ============================================================================

-- Overall embedding status
SELECT
    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) AS embedded,
    SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) AS not_embedded,
    COUNT(*) AS total,
    ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) AS pct_complete
FROM hadiths;

-- List hadiths without embeddings (if any)
SELECT
    h.id,
    c.name_english AS collection,
    h.hadith_number,
    LEFT(h.arabic_text, 100) AS arabic_preview
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.embedding_generated
ORDER BY h.id
LIMIT 20;

-- ============================================================================
-- 3. Text Quality Checks
-- ============================================================================

-- Hadiths with empty or null texts
SELECT
    'Empty Arabic' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0

UNION ALL

SELECT
    'Empty English' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE english_text IS NULL OR LENGTH(TRIM(english_text)) = 0

UNION ALL

SELECT
    'Empty Both' AS issue,
    COUNT(*) AS count
FROM hadiths
WHERE (arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0)
  AND (english_text IS NULL OR LENGTH(TRIM(english_text)) = 0);

-- ============================================================================
-- 4. Grade Distribution
-- ============================================================================

SELECT
    COALESCE(grade, 'Unknown') AS grade,
    COUNT(*) AS count,
    ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) AS percentage
FROM hadiths
GROUP BY grade
ORDER BY count DESC;

-- ============================================================================
-- 5. Arabic Normalization Verification
-- ============================================================================

-- Check that normalized column is populated
SELECT
    COUNT(*) AS total,
    SUM(CASE WHEN arabic_normalized IS NOT NULL AND LENGTH(arabic_normalized) > 0 THEN 1 ELSE 0 END) AS normalized,
    SUM(CASE WHEN arabic_normalized IS NULL OR LENGTH(arabic_normalized) = 0 THEN 1 ELSE 0 END) AS not_normalized
FROM hadiths
WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0;

-- Sample comparison of original vs normalized
SELECT
    id,
    LEFT(arabic_text, 100) AS original,
    LEFT(arabic_normalized, 100) AS normalized
FROM hadiths
WHERE arabic_text IS NOT NULL
LIMIT 5;

-- ============================================================================
-- 6. Metadata Completeness
-- ============================================================================

-- Check source_metadata JSON completeness
SELECT
    COUNT(*) AS total,
    SUM(CASE WHEN source_metadata IS NOT NULL THEN 1 ELSE 0 END) AS has_metadata,
    SUM(CASE WHEN source_metadata ? 'api_source' THEN 1 ELSE 0 END) AS has_api_source,
    SUM(CASE WHEN source_metadata ? 'ingested_at' THEN 1 ELSE 0 END) AS has_ingested_at
FROM hadiths;

-- ============================================================================
-- 7. ID Range Check (for Qdrant comparison)
-- ============================================================================

-- Get ID range
SELECT
    MIN(id) AS min_id,
    MAX(id) AS max_id,
    COUNT(*) AS total_ids,
    MAX(id) - MIN(id) + 1 AS expected_if_sequential,
    COUNT(*) = (MAX(id) - MIN(id) + 1) AS is_sequential
FROM hadiths;

-- Find gaps in IDs (if any)
WITH id_series AS (
    SELECT generate_series(
        (SELECT MIN(id) FROM hadiths),
        (SELECT MAX(id) FROM hadiths)
    ) AS expected_id
)
SELECT expected_id AS missing_id
FROM id_series
WHERE expected_id NOT IN (SELECT id FROM hadiths)
ORDER BY expected_id
LIMIT 50;
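-- Note: section 7 assumes sequential integer IDs. The saved verification report
-- in this commit lists missing IDs as UUID strings, in which case the MIN/MAX
-- arithmetic and generate_series() above do not apply and can be skipped.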

-- ============================================================================
-- 8. Sample Data for Manual Verification
-- ============================================================================

-- Sample 10 hadiths with all fields
SELECT
    h.id,
    c.name_english AS collection,
    b.name_english AS book,
    h.hadith_number,
    h.grade,
    LENGTH(h.arabic_text) AS arabic_len,
    LENGTH(h.english_text) AS english_len,
    LENGTH(h.urdu_text) AS urdu_len,
    h.embedding_generated,
    h.created_at
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
ORDER BY RANDOM()
LIMIT 10;

-- ============================================================================
-- 9. NER/RE Preparation Status
-- ============================================================================

SELECT
    SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) AS entities_extracted,
    SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) AS relations_extracted,
    COUNT(*) AS total
FROM hadiths;

-- ============================================================================
-- 10. Quick Health Check Query (run this first)
-- ============================================================================

SELECT
    'Database Health Check' AS check_type,
    (SELECT COUNT(*) FROM hadiths) AS total_hadiths,
    (SELECT COUNT(*) FROM collections) AS total_collections,
    (SELECT COUNT(*) FROM books) AS total_books,
    (SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count,
    (SELECT COUNT(*) FROM hadiths WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0) AS has_arabic,
    (SELECT COUNT(*) FROM hadiths WHERE english_text IS NOT NULL AND LENGTH(english_text) > 0) AS has_english;

@ -0,0 +1,33 @@
{
  "total_hadiths_db": 41349,
  "total_embeddings_qdrant": 41268,
  "embeddings_with_payloads": 0,
  "missing_embeddings": 81,
  "embedding_dimension": 1024,
  "collection_exists": true,
  "collection_status": "green",
  "sample_ids_missing": [
    "09895d61-26cf-444a-bc7b-163a765dd37a",
    "0990e5a0-7956-440c-9ee7-26dd0d5ecf8f",
    "09926a1f-925a-49c8-9ff1-6870dc78bb2b",
    "09951573-3a3e-4e4f-8fd9-268b9ba024eb",
    "09fe7958-5af2-47f6-8d74-4bc10816a9df",
    "09feef6b-c0ef-44a2-b9e8-bb40b13d7682",
    "0a0b2754-6be4-4831-9c4b-1d2c420523f5",
    "0a110e84-0ac3-4c50-8f04-8f8e8de876ce",
    "0a711aef-8ae6-4dcd-9800-4a140356827f",
    "0a71da11-bc62-4dca-b3a6-3d56b59dbb6b",
    "0a7baa50-58f3-4c96-8103-c4e6c1fdbac6",
    "0a7d0d2c-98c2-48b5-be84-a6a46f6151c2",
    "0a80949a-0bce-40ea-8b26-1a65ae72c39b",
    "0a82ae08-1d1f-49b8-97d2-2985c47fd79a",
    "0a8374e0-0a33-41cd-a9d6-8d54a7eef227",
    "0a843db5-23bc-4919-95cd-c88475d43ad9",
    "0ad5aa90-6bb7-438f-967a-41786a4e7b9b",
    "0ad626f3-0865-45c0-9ab8-14323351b278",
    "0ad671b9-cf0f-4f46-8a2d-e13ec8fd19f7",
    "0ad6de6b-763b-4874-ab3f-e0f6263749eb"
  ],
  "verification_time_seconds": 9.182509422302246,
  "timestamp": "2025-11-28T10:03:44.365382"
}

@ -0,0 +1,387 @@
#!/usr/bin/env python3
"""
Step 6.1: Verify Embeddings in Qdrant
=====================================
Validates that all hadiths have embeddings stored in the Qdrant vector database.

Author: Hadith Scholar AI Project
Date: 2025
"""

import os
import sys
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict

if sys.platform == 'win32':
    os.environ['PYTHONIOENCODING'] = 'utf-8'
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')
    if hasattr(sys.stderr, 'reconfigure'):
        sys.stderr.reconfigure(encoding='utf-8')

import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.panel import Panel

# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")

# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")

console = Console()

@dataclass
class VerificationResult:
    """Results from embedding verification."""
    total_hadiths_db: int
    total_embeddings_qdrant: int
    embeddings_with_payloads: int
    missing_embeddings: int
    embedding_dimension: int
    collection_exists: bool
    collection_status: str
    sample_ids_missing: List[str]  # point IDs are UUID strings in this deployment (see the saved verification report)
    verification_time_seconds: float
    timestamp: str


def get_db_connection():
    """Create PostgreSQL connection."""
    return psycopg2.connect(
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        sslmode='require'
    )

async def get_qdrant_collection_info(client: httpx.AsyncClient) -> Dict:
    """Get Qdrant collection information."""
    try:
        response = await client.get(
            f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
        )
        response.raise_for_status()
        return response.json()
    except httpx.HTTPError as e:
        console.print(f"[red]Error connecting to Qdrant: {e}[/red]")
        return {}


async def count_qdrant_points(client: httpx.AsyncClient) -> int:
    """Count total points in Qdrant collection."""
    try:
        response = await client.post(
            f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/count",
            json={"exact": True}
        )
        response.raise_for_status()
        return response.json().get("result", {}).get("count", 0)
    except httpx.HTTPError as e:
        console.print(f"[red]Error counting Qdrant points: {e}[/red]")
        return 0

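# Note on count_qdrant_points() above: {"exact": true} asks Qdrant for a precise
# count rather than its cheaper approximate estimate; slower on large collections,
# but that precision is the point of a verification pass.
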

async def get_qdrant_points_sample(
    client: httpx.AsyncClient,
    offset: int = 0,
    limit: int = 100
) -> List[Dict]:
    """Get a sample of points from Qdrant."""
    try:
        response = await client.post(
            f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
            json={
                "limit": limit,
                "offset": offset,
                "with_payload": True,
                "with_vector": False
            }
        )
        response.raise_for_status()
        return response.json().get("result", {}).get("points", [])
    except httpx.HTTPError as e:
        console.print(f"[red]Error fetching Qdrant points: {e}[/red]")
        return []


async def get_all_qdrant_ids(client: httpx.AsyncClient) -> set:
    """Get all point IDs from Qdrant (paginated)."""
    all_ids = set()
    offset = None
    batch_size = 1000

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console
    ) as progress:
        task = progress.add_task("Fetching Qdrant IDs...", total=None)

        while True:
            try:
                payload = {
                    "limit": batch_size,
                    "with_payload": False,
                    "with_vector": False
                }
                if offset is not None:
                    payload["offset"] = offset

                response = await client.post(
                    f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
                    json=payload,
                    timeout=60.0
                )
                response.raise_for_status()
                result = response.json().get("result", {})
                points = result.get("points", [])

                if not points:
                    break

                for point in points:
                    all_ids.add(point["id"])

                offset = result.get("next_page_offset")
                progress.update(task, description=f"Fetched {len(all_ids)} IDs...")
                # console.print(f" Fetched IDs: [green]{offset}[/green]")

                if offset is None:
                    break

            except httpx.HTTPError as e:
                console.print(f"[red]Error during ID fetch: {e}[/red]")
                break

    return all_ids

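# Pagination contract used by get_all_qdrant_ids() above: Qdrant's scroll API
# returns points in pages plus a "next_page_offset" cursor; a null cursor marks
# the final page, which is why the loop breaks when offset is None.
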

def get_all_hadith_ids_from_db() -> set:
    """Get all hadith IDs from PostgreSQL."""
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT id FROM hadiths ORDER BY id")
            return {row[0] for row in cur.fetchall()}
    finally:
        conn.close()


def get_hadith_count_from_db() -> int:
    """Get total hadith count from PostgreSQL."""
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM hadiths")
            return cur.fetchone()[0]
    finally:
        conn.close()


def get_embedding_stats_from_db() -> Dict:
    """Get embedding generation stats from PostgreSQL."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,
                    SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded
                FROM hadiths
            """)
            return dict(cur.fetchone())
    finally:
        conn.close()


def get_collection_stats_by_source() -> List[Dict]:
    """Get hadith counts by collection/source."""
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT
                    c.name_english as collection,
                    COUNT(h.id) as count,
                    SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded
                FROM hadiths h
                JOIN collections c ON h.collection_id = c.id
                GROUP BY c.id, c.name_english
                ORDER BY count DESC
            """)
            return [dict(row) for row in cur.fetchall()]
    finally:
        conn.close()


async def verify_embeddings() -> VerificationResult:
    """Main verification function."""
    start_time = time.time()

    console.print(Panel.fit(
        "[bold blue]Hadith Embeddings Verification[/bold blue]\n"
        f"Database: {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}\n"
        f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}/{QDRANT_COLLECTION}",
        title="Step 6.1"
    ))

    # Step 1: Get PostgreSQL stats
    console.print("\n[yellow]1. Checking PostgreSQL database...[/yellow]")
    db_stats = get_embedding_stats_from_db()
    total_hadiths = db_stats['total']
    console.print(f" Total hadiths: [green]{total_hadiths:,}[/green]")
    console.print(f" Marked as embedded: [green]{db_stats['embedded']:,}[/green]")

    # Step 2: Get collection breakdown
    console.print("\n[yellow]2. Collection breakdown:[/yellow]")
    collection_stats = get_collection_stats_by_source()

    table = Table(title="Hadiths by Collection")
    table.add_column("Collection", style="cyan")
    table.add_column("Total", justify="right")
    table.add_column("Embedded", justify="right", style="green")

    for stat in collection_stats:
        table.add_row(
            stat['collection'],
            f"{stat['count']:,}",
            f"{stat['embedded']:,}"
        )
    console.print(table)

    # Step 3: Check Qdrant collection
    console.print("\n[yellow]3. Checking Qdrant collection...[/yellow]")

    async with httpx.AsyncClient(timeout=30.0) as client:
        collection_info = await get_qdrant_collection_info(client)

        if not collection_info:
            return VerificationResult(
                total_hadiths_db=total_hadiths,
                total_embeddings_qdrant=0,
                embeddings_with_payloads=0,
                missing_embeddings=total_hadiths,
                embedding_dimension=0,
                collection_exists=False,
                collection_status="NOT_FOUND",
                sample_ids_missing=[],
                verification_time_seconds=time.time() - start_time,
                timestamp=datetime.now().isoformat()
            )

        result = collection_info.get("result", {})
        status = result.get("status", "unknown")
        vectors_config = result.get("config", {}).get("params", {}).get("vectors", {})
        embedding_dim = vectors_config.get("size", 0)

        console.print(f" Collection status: [green]{status}[/green]")
        console.print(f" Embedding dimension: [green]{embedding_dim}[/green]")

        # Step 4: Count Qdrant points
        console.print("\n[yellow]4. Counting Qdrant embeddings...[/yellow]")
        qdrant_count = await count_qdrant_points(client)
        console.print(f" Total embeddings: [green]{qdrant_count:,}[/green]")

        # Step 5: Find missing embeddings
        console.print("\n[yellow]5. Identifying missing embeddings...[/yellow]")
        db_ids = get_all_hadith_ids_from_db()
        qdrant_ids = await get_all_qdrant_ids(client)
        missing_ids = db_ids - qdrant_ids
        extra_ids = qdrant_ids - db_ids

        console.print(f"   IDs in DB: [blue]{len(db_ids):,}[/blue]")
        console.print(f"   IDs in Qdrant: [blue]{len(qdrant_ids):,}[/blue]")
        console.print(f"   Missing embeddings: [{'red' if missing_ids else 'green'}]{len(missing_ids):,}[/{'red' if missing_ids else 'green'}]")

        if extra_ids:
            console.print(f"   Extra IDs in Qdrant (orphaned): [yellow]{len(extra_ids):,}[/yellow]")

        # Get sample of missing IDs
        sample_missing = sorted(missing_ids)[:20] if missing_ids else []

        # Step 6: Verify sample payload integrity
        console.print("\n[yellow]6. Verifying payload integrity...[/yellow]")
        sample_points = await get_qdrant_points_sample(client, limit=100)

        payloads_with_data = sum(
            1 for p in sample_points
            if p.get("payload") and p["payload"].get("hadith_id")
        )

        console.print(f"   Sample size: {len(sample_points)}")
        console.print(f"   With valid payloads: [green]{payloads_with_data}[/green]")

    verification_time = time.time() - start_time

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold]VERIFICATION SUMMARY[/bold]")
    console.print("=" * 50)

    if len(missing_ids) == 0:
        console.print("[bold green]✓ ALL EMBEDDINGS VERIFIED![/bold green]")
    else:
        console.print(f"[bold red]✗ {len(missing_ids):,} EMBEDDINGS MISSING[/bold red]")
        if sample_missing:
            console.print(f"   Sample missing IDs: {sample_missing[:10]}")

    console.print(f"\nVerification completed in {verification_time:.2f} seconds")

    return VerificationResult(
        total_hadiths_db=total_hadiths,
        total_embeddings_qdrant=qdrant_count,
        embeddings_with_payloads=payloads_with_data,
        missing_embeddings=len(missing_ids),
        embedding_dimension=embedding_dim,
        collection_exists=True,
        collection_status=status,
        sample_ids_missing=sample_missing,
        verification_time_seconds=verification_time,
        timestamp=datetime.now().isoformat()
    )


async def main():
    """Main entry point."""
    result = await verify_embeddings()

    # Save results to JSON
    output_file = "verification_results.json"
    with open(output_file, 'w') as f:
        json.dump(asdict(result), f, indent=2)

    console.print(f"\n[dim]Results saved to {output_file}[/dim]")

    # Exit with a non-zero code if any embeddings are missing
    if result.missing_embeddings > 0:
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
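
# --- Illustrative output (hypothetical values) ---
# Running the script (e.g. `python verify_embeddings.py`) prints the summary above
# and writes verification_results.json; the keys mirror the VerificationResult
# dataclass fields returned by verify_embeddings(), roughly:
#
#     {
#       "total_hadiths_db": 40000,
#       "total_embeddings_qdrant": 40000,
#       "embeddings_with_payloads": 100,
#       "missing_embeddings": 0,
#       "embedding_dimension": 1024,
#       "collection_exists": true,
#       "collection_status": "green",
#       "sample_ids_missing": [],
#       "verification_time_seconds": 12.34,
#       "timestamp": "2025-01-01T00:00:00"
#     }
#
# The non-zero exit code on missing embeddings makes the script usable as a CI
# or workflow gate.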