update project.

update README.md: add a full recap that can be used as context for a prompt in Claude
This commit is contained in:
salah 2026-01-29 10:34:08 +01:00
parent bc2e9656a1
commit 2bbd4b571b
52 changed files with 36441 additions and 2020 deletions

.idea/.gitignore vendored Normal file

@ -0,0 +1,10 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Ignored default folder with query files
/queries/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.12 (hadith-ingestion)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>


@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

.idea/misc.xml Normal file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.12 (hadith-ingestion)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (hadith-ingestion)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml Normal file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/hadith-ingestion.iml" filepath="$PROJECT_DIR$/.idea/hadith-ingestion.iml" />
</modules>
</component>
</project>

.idea/vcs.xml Normal file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

File diff suppressed because it is too large


@ -0,0 +1,275 @@
# 🚀 HadithAPI.com Deployment - Quick Start
## What You Got
Three comprehensive guides:
1. **PHASE_2_IMPLEMENTATION_GUIDE.md** - Original guide with PostgreSQL schema
2. **HADITHAPI_INTEGRATION_GUIDE.md** - Complete HadithAPI.com implementation
3. **This summary** - Quick deployment steps
## 📦 Complete Package Structure
The HadithAPI guide includes everything you need:
### Production-Ready Code
- **hadithapi_client.py** - Full API client with pagination and rate limiting
- **main_hadithapi.py** - Complete ingestion service
- **settings.py** - Configuration with your API key
- **Dockerfile** - Container image
- **Argo Workflows** - Kubernetes automation
- **Test scripts** - Validation and troubleshooting
### Key Features
- ✅ Automatic pagination handling
- ✅ Rate limiting (30 req/min)
- ✅ Error handling and retries
- ✅ Progress tracking
- ✅ Structured logging
- ✅ Multi-language support (Arabic, English, Urdu)
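For orientation, a minimal sketch of the pagination-plus-rate-limiting pattern that `hadithapi_client.py` implements is shown below. This is not the guide's code: the endpoint path, query parameters (`apiKey`, `book`, `page`), and response fields are assumptions; use Section 2.1 of HADITHAPI_INTEGRATION_GUIDE.md for the real client.
```python
# Minimal sketch only (see Section 2.1 for the real hadithapi_client.py).
# Endpoint path, parameter names, and response shape below are assumptions.
import time
import httpx

API_BASE = "https://hadithapi.com/api"        # assumed base URL
RATE_LIMIT_PER_MIN = 30                       # matches the conservative default above

def fetch_hadiths(api_key: str, book_slug: str, limit: int | None = None):
    """Yield hadiths page by page while sleeping to respect the rate limit."""
    page, fetched, delay = 1, 0, 60.0 / RATE_LIMIT_PER_MIN
    with httpx.Client(timeout=30.0) as client:
        while True:
            resp = client.get(
                f"{API_BASE}/hadiths",
                params={"apiKey": api_key, "book": book_slug, "page": page},
            )
            resp.raise_for_status()
            payload = resp.json().get("hadiths", {})   # assumed response shape
            for item in payload.get("data", []):
                yield item
                fetched += 1
                if limit and fetched >= limit:
                    return
            if not payload.get("next_page_url"):       # assumed pagination marker
                return
            page += 1
            time.sleep(delay)                          # crude client-side rate limiting
```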
## 🎯 5-Minute Quick Start
### 1. Database Setup (2 min)
```bash
# Use schema from PHASE_2_IMPLEMENTATION_GUIDE.md Section 1
kubectl -n db exec -it postgres-0 -- psql -U app -d gitea
# Copy all SQL from Section 1.2 through 1.6
# This creates hadith_db with complete schema
```
### 2. Create Project Structure (1 min)
```bash
mkdir -p hadith-ingestion/{config,src/{api_clients,processors,database,utils},argo/workflows}
cd hadith-ingestion/
# Copy code from HADITHAPI_INTEGRATION_GUIDE.md:
# - Section 2.1 → src/api_clients/hadithapi_client.py
# - Section 4.1 → src/main_hadithapi.py
# - Section 5.1 → config/settings.py
# - Section 6.1 → Dockerfile
# - Section 6.4 → argo/workflows/ingest-hadithapi.yaml
# Also copy from PHASE_2_IMPLEMENTATION_GUIDE.md:
# - Section 3.4 → src/api_clients/base_client.py
# - Section 3.6 → src/processors/text_cleaner.py
# - Section 3.7 → src/database/repository.py
```
### 3. Build & Deploy (2 min)
```bash
# Build image
docker build -t hadith-ingestion:latest .
# Create secrets
kubectl -n argo create secret generic hadith-db-secret \
--from-literal=password='YOUR_PASSWORD'
kubectl -n argo create secret generic hadithapi-secret \
--from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK'
# Test with 10 hadiths
argo submit -n argo argo/workflows/ingest-hadithapi.yaml \
--parameter book-slug=sahih-bukhari \
--parameter limit=10 \
--watch
```
## 📊 Expected Results
### Available Collections
| Book | Hadiths | Time |
|------|---------|------|
| Sahih Bukhari | ~7,500 | 2-3h |
| Sahih Muslim | ~7,000 | 2-3h |
| Sunan Abu Dawood | ~5,000 | 1-2h |
| Jami` at-Tirmidhi | ~4,000 | 1-2h |
| Sunan an-Nasa'i | ~5,700 | 2h |
| Sunan Ibn Majah | ~4,300 | 1-2h |
| **TOTAL** | **~33,500** | **10-15h** |
## 🔧 Key Differences from Sunnah.com
| Feature | HadithAPI.com | Sunnah.com |
|---------|---------------|------------|
| **API Key** | ✅ Public (provided) | ❌ Requires PR |
| **Rate Limit** | Unknown (using 30/min) | 100/min |
| **Coverage** | 6 major books | 10+ books |
| **Languages** | Arabic, English, Urdu | Arabic, English |
| **Cost** | ✅ Free | Free |
| **Stability** | Good | Excellent |
## 📝 Complete File Checklist
Create these files from the guides:
```
hadith-ingestion/
├── Dockerfile                     ✓ Section 6.1
├── requirements.txt               ✓ Phase 2 Section 3.2
├── .env                           ✓ Section 5.2
├── build-hadithapi-ingestion.sh   ✓ Section 6.2
├── create-secrets.sh              ✓ Section 6.3
├── test-hadithapi-local.sh        ✓ Section 7.1
├── test-hadithapi-k8s.sh          ✓ Section 7.2
├── run-full-ingestion.sh          ✓ Section 7.3
├── config/
│   ├── __init__.py                (empty file)
│   └── settings.py                ✓ Section 5.1
├── src/
│   ├── __init__.py                (empty file)
│   ├── main_hadithapi.py          ✓ Section 4.1
│   ├── api_clients/
│   │   ├── __init__.py            (empty file)
│   │   ├── base_client.py         ✓ Phase 2 Sec 3.4
│   │   └── hadithapi_client.py    ✓ Section 2.1
│   ├── processors/
│   │   ├── __init__.py            (empty file)
│   │   └── text_cleaner.py        ✓ Phase 2 Sec 3.6
│   ├── database/
│   │   ├── __init__.py            (empty file)
│   │   ├── connection.py          (optional)
│   │   └── repository.py          ✓ Phase 2 Sec 3.7
│   └── utils/
│       ├── __init__.py            (empty file)
│       └── logger.py              (optional)
└── argo/
    └── workflows/
        └── ingest-hadithapi.yaml  ✓ Section 6.4
```
## 🎬 Step-by-Step Execution
### Day 1: Setup & Test (2-3 hours)
```bash
# 1. Create database schema
# 2. Set up project structure
# 3. Build Docker image
# 4. Create secrets
# 5. Run test with 10 hadiths
# 6. Verify data
```
### Day 2: Ingest Major Collections (10-15 hours)
```bash
# Ingest all 6 major collections sequentially
./run-full-ingestion.sh
# Or manually one by one:
argo submit ... --parameter book-slug=sahih-bukhari
argo submit ... --parameter book-slug=sahih-muslim
# etc...
```
### Day 3: Validation & Next Steps
```bash
# 1. Verify data quality
# 2. Check statistics
# 3. Proceed to Phase 3 (ML model development)
```
## ✅ Verification Checklist
After ingestion completes:
```bash
# 1. Check total hadiths
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT COUNT(*) FROM hadiths;
"
# Expected: ~33,500
# 2. Check per collection
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT
c.name_english,
COUNT(h.id) as count
FROM collections c
LEFT JOIN hadiths h ON c.id = h.collection_id
WHERE c.abbreviation IN ('bukhari', 'muslim', 'abudawud', 'tirmidhi', 'nasai', 'ibnmajah')
GROUP BY c.name_english;
"
# 3. Check for errors
kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
SELECT * FROM ingestion_jobs
WHERE status = 'failed'
ORDER BY created_at DESC;
"
```
## 🐛 Common Issues & Solutions
### Issue: Rate Limiting
```
Error: 429 Too Many Requests
Solution: Already set to conservative 30/min
If still hitting limits, edit settings.py:
API_RATE_LIMIT = 20
```
### Issue: Connection Timeout
```
Error: Connection timeout to database
Solution:
1. Check PostgreSQL is running
2. Verify credentials in secrets
3. Test connection manually
```
### Issue: Missing Chapters
```
Warning: chapters_fetch_failed
Solution: Script automatically falls back to fetching all hadiths
This is expected and not critical
```
## 📚 Documentation References
All details in the comprehensive guides:
1. **PHASE_2_IMPLEMENTATION_GUIDE.md**
- PostgreSQL schema (Section 1)
- Base utilities (Section 3)
- Database repository (Section 3.7)
2. **HADITHAPI_INTEGRATION_GUIDE.md**
- API client (Section 2)
- Main ingestion service (Section 4)
- Deployment (Section 6)
- Testing (Section 7)
## 🎯 Next Phase
After Phase 2 completion:
→ **Phase 3: ML Model Development**
- Annotate sample hadiths (Label Studio)
- Train NER model
- Train relation extraction model
- Fine-tune LLM with LoRA
## 💡 Pro Tips
1. **Start Small**: Test with `--limit 10` first
2. **Monitor Progress**: Use `argo logs -n argo <workflow> -f`
3. **Check Logs**: Structured JSON logs for easy debugging
4. **Backup Data**: Before major operations
5. **Rate Limiting**: Be conservative to avoid blocks
## 🎉 Success Criteria
Phase 2 is complete when:
- ✅ Database schema created
- ✅ 33,500+ hadiths ingested
- ✅ All 6 collections present
- ✅ No critical errors
- ✅ Data validated
- ✅ Ready for embedding generation
---
**Estimated Total Time: 1-2 days**
**Difficulty: Intermediate**
**Prerequisites: Phase 1 completed (all core services running)**
Ready to start? Begin with Section 1 of PHASE_2_IMPLEMENTATION_GUIDE.md!


@ -1,4 +1,4 @@
find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do
find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" | while read file; do
echo "=== $file ===" >> combined.txt
cat "$file" >> combined.txt
echo "" >> combined.txt

File diff suppressed because it is too large


@ -0,0 +1,33 @@
# ============================================================================
# Step 7: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================
# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=hadith_ingest
# Label Studio Configuration
export LABEL_STUDIO_URL=https://label.betelgeusebytes.io
export LABEL_STUDIO_API_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk
# To get API key:
# 1. Login to Label Studio
# 2. Go to Account & Settings (user icon top right)
# 3. Access Token section
# 4. Copy the token
# Qdrant Configuration (for active learning)
export QDRANT_HOST=https://vector.betelgeusebytes.io
# QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings
# For external access:
# export QDRANT_HOST=qdrant.betelgeusebytes.io


@ -0,0 +1,401 @@
# Hadith Named Entity Recognition (NER) Annotation Guidelines
## Table of Contents
1. [Introduction](#introduction)
2. [Entity Types](#entity-types)
3. [Annotation Rules](#annotation-rules)
4. [Arabic-Specific Guidelines](#arabic-specific-guidelines)
5. [Relation Types](#relation-types)
6. [Examples](#examples)
7. [Edge Cases](#edge-cases)
8. [Quality Standards](#quality-standards)
---
## Introduction
This document provides guidelines for annotating Islamic hadith texts with named entities and relations. The goal is to create high-quality training data for machine learning models that will automatically extract information from hadith literature.
### Purpose
- Extract narrator names from hadith chains (isnad)
- Identify places, dates, and other important entities
- Map relationships between narrators for knowledge graph construction
### Annotator Requirements
- Basic understanding of Arabic (for Arabic text annotation)
- Familiarity with Islamic terminology
- Understanding of hadith structure (isnad + matn)
---
## Entity Types
### 1. PERSON (شخص)
**Definition:** Full names of individuals, including prophets, companions, scholars, and narrators.
**Include:**
- Full names: محمد بن عبد الله (Muhammad ibn Abdullah)
- Prophet references: النبي صلى الله عليه وسلم (when used as a name reference)
- Companion names: عمر بن الخطاب (Umar ibn al-Khattab)
**Exclude:**
- Generic references like "a man" (رجل) or "someone" (أحد)
- Pronouns
**Color:** 🔴 Red (#FF6B6B)
**Hotkey:** P
### 2. KUNYA (كنية)
**Definition:** Honorific names starting with "Abu" (أبو - father of) or "Umm" (أم - mother of).
**Examples:**
- أبو هريرة (Abu Hurairah)
- أبو بكر (Abu Bakr)
- أم سلمة (Umm Salamah)
- أبو عبد الله (Abu Abdullah)
**Note:** Kunya may appear alone or with a full name. Tag only the Kunya portion.
**Color:** 🔵 Teal (#4ECDC4)
**Hotkey:** K
### 3. NISBA (نسبة)
**Definition:** Attributive names indicating tribe, place of origin, profession, or lineage.
**Examples:**
- البخاري (al-Bukhari - from Bukhara)
- القرشي (al-Qurashi - from Quraysh tribe)
- المدني (al-Madani - from Medina)
- الأنصاري (al-Ansari - of the Ansar)
- الحنبلي (al-Hanbali - Hanbali school)
**Note:** Often appears at the end of a name. Tag the nisba separately from PERSON.
**Color:** 🔷 Blue (#45B7D1)
**Hotkey:** N
### 4. PLACE (مكان)
**Definition:** Geographic locations including cities, regions, mosques, and landmarks.
**Examples:**
- مكة (Mecca)
- المدينة (Medina)
- المسجد الحرام (The Sacred Mosque)
- بيت المقدس (Jerusalem)
- الشام (Syria/Levant)
- خيبر (Khaybar)
**Include:**
- Cities, towns, villages
- Mosques and religious sites
- Regions and countries
- Mountains, valleys, wells
**Color:** 🟢 Green (#96CEB4)
**Hotkey:** L
### 5. DATE (تاريخ)
**Definition:** Temporal references including years, months, days, and events used as time markers.
**Examples:**
- سنة مئتين (year 200 AH)
- في رمضان (in Ramadan)
- يوم الجمعة (on Friday)
- بعد الهجرة (after the Hijra)
- غزوة بدر (Battle of Badr - as a time reference)
- يوم عرفة (Day of Arafah)
**Color:** 🟡 Yellow (#FFEAA7)
**Hotkey:** D
### 6. TRIBE (قبيلة)
**Definition:** Names of Arab tribes and clans.
**Examples:**
- قريش (Quraysh)
- بني هاشم (Banu Hashim)
- الأنصار (the Ansar)
- المهاجرين (the Muhajirun)
- خزاعة (Khuza'a)
**Note:** When part of a nisba (القرشي), tag as NISBA. When standalone, tag as TRIBE.
**Color:** 🟣 Purple (#DDA0DD)
**Hotkey:** T
### 7. TITLE (لقب)
**Definition:** Titles and honorifics that aren't kunyas.
**Examples:**
- أمير المؤمنين (Commander of the Faithful)
- رسول الله (Messenger of Allah)
- خليفة (Caliph)
- إمام (Imam)
- شيخ (Sheikh)
**Color:** 🟩 Mint (#98D8C8)
**Hotkey:** I
---
## Annotation Rules
### General Rules
1. **Tag the longest valid span**
- ✅ `[محمد بن عبد الله بن عبد المطلب]PERSON`
- ❌ `[محمد]PERSON [بن عبد الله]PERSON`
2. **Don't overlap tags**
- If a name contains a nisba, tag separately:
- `[محمد بن إسماعيل]PERSON [البخاري]NISBA`
3. **Include particles when part of name**
- Include "ال" (al-) when part of a name: `[البخاري]NISBA`
- Include "بن/ابن" (ibn/bin) within PERSON tags
4. **Exclude common words**
- Don't tag: عن (from), أن (that), قال (said), حدثنا (narrated)
5. **Be consistent**
- Same entity = same tag type throughout the document
### Boundary Rules
1. **Start boundary:** First character of the entity
2. **End boundary:** Last character of the entity (including diacritics)
3. **Don't include:**
- Leading/trailing spaces
- Punctuation marks
- Pronouns or suffixes
---
## Arabic-Specific Guidelines
### Handling Diacritics (تشكيل)
- Include diacritics within the tag span
- Don't let diacritics affect boundary decisions
### Common Patterns
| Pattern | Example | Tags |
|---------|---------|------|
| Name + bin/ibn + Name | محمد بن إسماعيل | [محمد بن إسماعيل]PERSON |
| Name + Kunya | أبو هريرة عبد الرحمن | [أبو هريرة]KUNYA [عبد الرحمن]PERSON |
| Name + Nisba | البخاري محمد | [البخاري]NISBA [محمد]PERSON |
| Full chain | حدثنا محمد بن إسماعيل البخاري | حدثنا [محمد بن إسماعيل]PERSON [البخاري]NISBA |
### Prophet References
- "النبي صلى الله عليه وسلم" → Tag "النبي" as TITLE
- "رسول الله صلى الله عليه وسلم" → Tag "رسول الله" as TITLE
- "محمد صلى الله عليه وسلم" → Tag "محمد" as PERSON
---
## Relation Types
### 1. NARRATED_FROM (روى عن)
**Definition:** A narrator received/heard the hadith from another narrator.
**Direction:** Source → Target (A NARRATED_FROM B = A heard from B)
**Indicators:**
- عن (from)
- حدثنا (narrated to us)
- أخبرنا (informed us)
- سمعت (I heard)
### 2. TEACHER_OF (أستاذ)
**Definition:** A scholar taught or trained another scholar.
**Direction:** Teacher → Student
### 3. STUDENT_OF (تلميذ)
**Definition:** Inverse of TEACHER_OF.
**Direction:** Student → Teacher
### 4. CONTEMPORARY_OF (معاصر)
**Definition:** Two people lived in the same era and potentially knew each other.
**Direction:** Bidirectional
### 5. RELATED_TO (قريب)
**Definition:** Family relationship (father, son, brother, etc.)
**Direction:** Bidirectional
**Indicators:**
- بن/ابن (son of)
- أخ/أخو (brother of)
- زوج/زوجة (spouse of)
### 6. LIVED_IN (سكن)
**Definition:** Person resided in or was associated with a place.
**Direction:** Person → Place
### 7. BORN_IN (ولد في)
**Definition:** Person's birthplace.
**Direction:** Person → Place
### 8. DIED_IN (توفي في)
**Definition:** Place of death.
**Direction:** Person → Place
---
## Examples
### Example 1: Simple Narrator Chain
**Arabic Text:**
```
حدثنا محمد بن إسماعيل البخاري عن أبي هريرة رضي الله عنه قال: قال رسول الله صلى الله عليه وسلم...
```
**Annotations:**
- `[محمد بن إسماعيل]PERSON`
- `[البخاري]NISBA`
- `[أبي هريرة]KUNYA`
- `[رسول الله]TITLE`
**Relations:**
- محمد بن إسماعيل NARRATED_FROM أبي هريرة
- أبي هريرة NARRATED_FROM رسول الله
### Example 2: Place Reference
**Arabic Text:**
```
كان النبي صلى الله عليه وسلم في المسجد الحرام بمكة
```
**Annotations:**
- `[النبي]TITLE`
- `[المسجد الحرام]PLACE`
- `[مكة]PLACE`
### Example 3: Complex Chain
**Arabic Text:**
```
حدثنا عبد الله بن يوسف التنيسي أخبرنا مالك عن نافع عن عبد الله بن عمر رضي الله عنهما
```
**Annotations:**
- `[عبد الله بن يوسف]PERSON`
- `[التنيسي]NISBA`
- `[مالك]PERSON`
- `[نافع]PERSON`
- `[عبد الله بن عمر]PERSON`
**Relations:**
- عبد الله بن يوسف NARRATED_FROM مالك
- مالك NARRATED_FROM نافع
- نافع NARRATED_FROM عبد الله بن عمر
### Example 4: English Text
**English Text:**
```
Narrated Abu Hurairah: The Prophet (ﷺ) said, "Whoever believes in Allah and the Last Day..."
```
**Annotations:**
- `[Abu Hurairah]KUNYA`
- `[The Prophet]TITLE`
---
## Edge Cases
### 1. Ambiguous References
- "رجل من الأنصار" (a man from the Ansar) → Tag only `[الأنصار]TRIBE`
- "بعض أصحاب النبي" (some companions) → Don't tag
### 2. Partial Names
- If only a first name is given and identity is clear → Tag as PERSON
- If unclear → Don't tag
### 3. Titles vs Names
- "الإمام البخاري" → `[الإمام]TITLE` `[البخاري]NISBA`
- If Imam is part of the known name → Consider context
### 4. Multiple Nisbas
```
محمد بن إسماعيل البخاري الجعفي
```
`[محمد بن إسماعيل]PERSON` `[البخاري]NISBA` `[الجعفي]NISBA`
### 5. Kunya Used as Name
- Some people are known primarily by Kunya
- Abu Bakr → `[Abu Bakr]KUNYA` (not PERSON, as it's a kunya form)
---
## Quality Standards
### Accuracy Requirements
- **Entity Detection:** >95% of entities should be identified
- **Entity Classification:** >90% should have correct type
- **Boundary Precision:** >95% should have exact boundaries
### Consistency Checks
- Same entity tagged consistently across document
- Related entities (person + nisba) both tagged
- No orphaned relations (both endpoints must be entities)
### Review Process
1. **Self-review:** Annotator reviews own work
2. **Peer review:** Second annotator reviews 20% of tasks
3. **Expert review:** Arabic/Islamic scholar reviews edge cases
### Inter-Annotator Agreement
- Target Cohen's Kappa > 0.8 for entity types
- Target > 0.75 for relations
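To make the κ targets concrete, pairwise agreement can be computed from two annotators' token-level labels, for example with scikit-learn (an illustration with made-up labels, not project code):
```python
# Illustration: pairwise Cohen's kappa over token-level entity labels (hypothetical data).
from sklearn.metrics import cohen_kappa_score

annotator_a = ["B-PERSON", "I-PERSON", "O", "B-NISBA", "O", "B-KUNYA"]
annotator_b = ["B-PERSON", "I-PERSON", "O", "B-NISBA", "O", "B-PERSON"]

kappa = cohen_kappa_score(annotator_a, annotator_b)
print(f"Cohen's kappa: {kappa:.2f}")  # compare against the 0.8 entity-type target above
```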
---
## Keyboard Shortcuts
| Action | Key |
|--------|-----|
| PERSON | P |
| KUNYA | K |
| NISBA | N |
| PLACE | L |
| DATE | D |
| TRIBE | T |
| TITLE | I |
| Submit | Ctrl+Enter |
| Skip | Ctrl+→ |
| Undo | Ctrl+Z |
---
## Common Mistakes to Avoid
1. ❌ Tagging common words (عن، أن، قال)
2. ❌ Missing nisbas at the end of names
3. ❌ Overlapping entity spans
4. ❌ Inconsistent tagging of same entity
5. ❌ Tagging pronouns
6. ❌ Missing entities in long chains
7. ❌ Incorrect relation directions
---
## Contact & Support
For questions or edge cases:
- Create a discussion in Label Studio
- Tag the project lead for complex cases
- Document unusual cases for guideline updates
---
*Version 1.0 - Last Updated: 2025*


@ -0,0 +1,240 @@
# Islamic Hadith Scholar AI - Phase 3 Implementation Summary
## Project Context Prompt
Use this prompt to continue the project or onboard a new AI assistant:
---
## 🎯 PROJECT OVERVIEW
I'm building an **Islamic Hadith Scholar AI System** - a production-grade AI platform for analyzing Islamic hadith literature. The system processes approximately **40,000 hadiths** from 8 major collections (Sahih Bukhari, Sahih Muslim, Abu Dawood, Tirmidhi, Ibn Majah, Nasa'i, Ahmad, Silsila Sahiha) in Arabic, English, and Urdu.
### Infrastructure
- **Kubernetes cluster** on Hetzner Cloud (2-node, 32 cores, 128GB RAM)
- **Domain:** betelgeusebytes.io
- **18+ deployed services** across 8 namespaces
---
## 🏗️ ARCHITECTURE & SERVICES
### Data Layer
| Service | Endpoint | Purpose |
|---------|----------|---------|
| PostgreSQL 18 | pg.betelgeusebytes.io:5432 | Main database (hadith_db) |
| Neo4j 5.20 | neo4j.betelgeusebytes.io | Graph database (Phase 4) |
| Redis 7 | redis.db.svc.cluster.local | Caching |
| Elasticsearch 8.14 | elasticsearch.elastic.svc.cluster.local | Search & logs |
### ML & Vector Services
| Service | Endpoint | Purpose |
|---------|----------|---------|
| TEI | tei.ml.svc.cluster.local:80 | BGE-M3 embeddings (1024-dim) |
| vLLM | vllm.ml.svc.cluster.local:8000 | Qwen2.5-7B inference |
| Qdrant | qdrant.vector.svc.cluster.local:6333 | Vector search |
| MLflow | mlflow.betelgeusebytes.io | Experiment tracking |
| Label Studio | label.betelgeusebytes.io | Annotation |
| JupyterLab | notebook.betelgeusebytes.io | Experimentation |
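As an illustration of how these pieces fit together, an in-cluster semantic search could call TEI for a query embedding and then search Qdrant. The `/embed` request body follows the standard text-embeddings-inference API; the Qdrant call mirrors the project's own scripts, but treat this as a sketch rather than the deployed code:
```python
# Sketch: embed a query via TEI, then search Qdrant (in-cluster endpoints from the tables above).
import httpx

TEI_URL = "http://tei.ml.svc.cluster.local:80"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"

def semantic_search(query: str, limit: int = 5):
    vector = httpx.post(f"{TEI_URL}/embed", json={"inputs": query}, timeout=30).json()[0]
    resp = httpx.post(
        f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
        json={"vector": vector, "limit": limit, "with_payload": True},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["result"]
```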
### Orchestration & Monitoring
- **Argo Workflows** (namespace: ml)
- **Prometheus + Grafana** for monitoring
- **Fluent Bit + OpenTelemetry** for logging
---
## 💾 DATABASE SCHEMA
### Main Tables
```sql
-- hadiths (~40k rows)
id, collection_id, book_id, hadith_number
arabic_text, arabic_normalized (auto-generated)
english_text, urdu_text, grade
embedding_generated = TRUE (all done)
entities_extracted = FALSE (pending NER)
relations_extracted = FALSE (pending RE)
source_metadata (JSONB)
created_at, updated_at
-- collections (8 rows)
id, name_english, name_arabic, total_hadiths
-- books
id, collection_id, name_english, name_arabic, book_number
-- narrators_metadata
id, name_arabic, name_english, kunya, nisba, birth_year, death_year
-- annotations
id, hadith_id, annotation_type, annotation_data (JSONB)
```
---
## ✅ COMPLETED PHASES
### Phase 1-2: Infrastructure & Data Ingestion (100%)
- ✅ All 18 services deployed and operational
- ✅ ~40,000 hadiths ingested from hadithapi.com
- ✅ Multi-language support (Arabic, English, Urdu)
- ✅ PostgreSQL schema with proper indices
### Phase 3: ML Pipeline
#### Step 1-5: Embeddings (100%)
- ✅ TEI deployed with BGE-M3 model
- ✅ vLLM deployed with Qwen2.5-7B-Instruct
- ✅ Embedding generator script created
- ✅ Argo workflow for batch processing
- ✅ **All ~40k hadiths embedded in Qdrant**
#### Step 6: Verify Embeddings & Semantic Search (100%)
**Delivered Files:**
- `verify_embeddings.py` - Validates all hadiths have embeddings
- `semantic_search.py` - Benchmark suite (target: <500ms)
- `search_api.py` - FastAPI service with endpoints
- `verification_queries.sql` - SQL verification queries
- `k8s-search-api.yaml` - Kubernetes deployment
- `step6_verification.ipynb` - Interactive Jupyter notebook
**API Endpoints:**
- `POST /search` - Semantic search
- `GET /search?q=` - Simple search
- `GET /hadith/{id}` - Get by ID
- `GET /similar/{id}` - Find similar
- `GET /health`, `GET /stats`
**Performance Target:** <500ms per query (achieved)
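As a usage illustration, a call against the deployed search API might look like the following; the service hostname and response field names are assumptions, and the authoritative schema lives in `search_api.py` and `k8s-search-api.yaml`:
```python
# Hypothetical client call to the Step 6 search API (hostname and fields assumed).
import httpx

SEARCH_API = "http://search-api.ml.svc.cluster.local"

resp = httpx.get(f"{SEARCH_API}/search", params={"q": "fasting on the day of Arafah", "limit": 5}, timeout=30)
resp.raise_for_status()
for hit in resp.json().get("results", []):   # field names assumed
    print(hit.get("hadith_id"), hit.get("score"))
```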
#### Step 7: Annotation Setup (100%)
**Delivered Files:**
- `annotation_setup.py` - Main setup script
- `label_studio_client.py` - API client
- `active_learning.py` - Smart sampling strategies
- `export_queries.sql` - SQL export queries
- `ANNOTATION_GUIDELINES.md` - Comprehensive guidelines
**Entity Types for NER:**
- PERSON, KUNYA, NISBA, PLACE, DATE, TRIBE, TITLE
**Relation Types:**
- NARRATED_FROM, TEACHER_OF, STUDENT_OF, CONTEMPORARY_OF
- RELATED_TO, LIVED_IN, BORN_IN, DIED_IN
**Sampling Strategies:**
- Stratified (proportional to collection size)
- Chain-focused (hadiths with clear isnad)
- Active Learning: diversity, representative, chain_complexity, hybrid
---
## 🔜 REMAINING STEPS (Phase 3)
### Step 8: NER Model Training (3-5 days)
**Need to implement:**
- Script to export Label Studio annotations to HuggingFace format
- Training pipeline for XLM-RoBERTa-large or AraBERT
- LoRA configuration for efficient fine-tuning
- Evaluation metrics (target: F1 > 0.85)
- MLflow experiment tracking integration
- Deployment as Kubernetes service
- Inference API endpoint
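For orientation only, a LoRA setup for token classification with the `peft` library could look roughly like this; labels follow the seven entity types in BIO form, and all hyperparameters are placeholders rather than the project's final configuration:
```python
# Rough sketch of a LoRA fine-tuning setup for Step 8 NER; values are placeholders.
from transformers import AutoModelForTokenClassification
from peft import LoraConfig, TaskType, get_peft_model

LABELS = ["O"] + [f"{p}-{t}" for t in
                  ["PERSON", "KUNYA", "NISBA", "PLACE", "DATE", "TRIBE", "TITLE"]
                  for p in ("B", "I")]

base = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large", num_labels=len(LABELS)
)
lora = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["query", "value"],   # XLM-R self-attention projections
)
model = get_peft_model(base, lora)
model.print_trainable_parameters()       # only the adapter + classifier head train
```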
### Step 9: Relation Extraction Model (3-5 days)
**Need to implement:**
- Pipeline to use NER outputs as input
- Training script for relation classification
- Model architecture (transformer-based)
- Evaluation (target: F1 > 0.80)
- Deployment configuration
### Step 10: LLM Fine-tuning with LoRA (5-7 days)
**Need to implement:**
- Instruction dataset format for:
- Entity extraction from hadith text
- Relation extraction between narrators
- Question answering about hadiths
- Hadith explanation/interpretation
- LoRA fine-tuning script for Qwen2.5-7B
- Training configuration
- Adapter merging/dynamic loading
- Deployment to vLLM with LoRA support
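One plausible record layout for the instruction dataset (illustrative only; the final schema gets fixed in Step 10) is:
```python
# Illustrative instruction-tuning record for entity extraction; the schema is an assumption.
import json

record = {
    "instruction": "Extract all PERSON, KUNYA and NISBA entities from the hadith text.",
    "input": "حدثنا محمد بن إسماعيل البخاري عن أبي هريرة رضي الله عنه قال...",
    "output": json.dumps(
        {"PERSON": ["محمد بن إسماعيل"], "NISBA": ["البخاري"], "KUNYA": ["أبي هريرة"]},
        ensure_ascii=False,
    ),
}

with open("instructions.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```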
---
## 📁 KEY COMMANDS
```bash
# Database access
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db
# Check Qdrant collection
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings
# Kubernetes pods
kubectl -n ml get pods
kubectl -n vector get pods
kubectl -n db get pods
# Argo workflows
argo list -n ml
argo logs -n ml <workflow-name>
# Label Studio (get API key from settings)
curl -H "Authorization: Token YOUR_KEY" https://label.betelgeusebytes.io/api/projects
```
---
## 📋 DELIVERABLES FORMAT
For each step, provide:
1. **Overview** - What we're building and why
2. **Prerequisites** - What needs to be ready
3. **Implementation** - Complete, copy-paste ready code
4. **Deployment** - Kubernetes YAML if applicable
5. **Testing** - How to verify it works
6. **Troubleshooting** - Common issues and fixes
---
## ⏳ TIMELINE
| Step | Task | Duration | Status |
|------|------|----------|--------|
| 6 | Verify Embeddings & Search | 1 hour | ✅ Complete |
| 7 | Annotation Setup | 1-2 days | ✅ Complete |
| 8 | NER Training | 3-5 days | 🔜 Next |
| 9 | RE Training | 3-5 days | Pending |
| 10 | LLM Fine-tuning | 5-7 days | Pending |
**Total Remaining:** ~2-3 weeks
---
## 🔑 IMPORTANT NOTES
1. **All services use internal K8s DNS** for communication
2. **External access** available via *.betelgeusebytes.io with TLS
3. **Security:** Default passwords need to be changed in production
4. **Embeddings:** BGE-M3 produces 1024-dimensional vectors
5. **Model:** Qwen2.5-7B-Instruct is the base LLM for fine-tuning
6. **Annotation:** Target 500 NER + 300 Relation samples before training
---
## 🎯 NEXT REQUEST
Please help me implement **Step 8: NER Model Training** with:
- Export script for Label Studio → HuggingFace format
- XLM-RoBERTa or AraBERT training pipeline
- LoRA configuration
- MLflow integration
- K8s deployment
- Inference API
---
*Generated: 2025 | Project: Islamic Hadith Scholar AI*


@ -0,0 +1,380 @@
# Step 7: Annotation Setup with Label Studio
## 📋 Overview
This step sets up the annotation infrastructure for training NER and Relation Extraction models. We export hadiths for annotation, configure Label Studio projects, and implement active learning to speed up the annotation process.
**Duration:** 1-2 days for setup, ongoing for annotation
---
## 📁 Files Included
| File | Description |
|------|-------------|
| `annotation_setup.py` | Main setup script - exports data and creates projects |
| `label_studio_client.py` | API client for Label Studio operations |
| `active_learning.py` | Active learning strategies for smart sampling |
| `export_queries.sql` | SQL queries for various sampling strategies |
| `ANNOTATION_GUIDELINES.md` | Comprehensive annotation guidelines for annotators |
| `requirements.txt` | Python dependencies |
---
## 🔧 Prerequisites
1. **Label Studio** running at `label.betelgeusebytes.io`
2. **PostgreSQL** access with hadith data
3. **Qdrant** with embeddings (for active learning)
4. **Python 3.10+** with pip
---
## 🚀 Quick Start
### 1. Install Dependencies
```bash
pip install -r requirements.txt
```
### 2. Set Environment Variables
```bash
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=your_password
export LABEL_STUDIO_URL=https://label.betelgeusebytes.io
export LABEL_STUDIO_API_KEY=your_api_key # Get from Label Studio settings
```
### 3. Export Data for Annotation
```bash
# Export 500 hadiths for NER annotation (stratified sampling)
python annotation_setup.py --ner-count 500 --relation-count 300 --export-only
# This creates:
# - annotation_data/ner_tasks.json
# - annotation_data/relation_tasks.json
# - annotation_data/ner_config.xml
# - annotation_data/relation_config.xml
```
### 4. Create Label Studio Projects (Optional)
If you have an API key:
```bash
python annotation_setup.py --ner-count 500 --relation-count 300
```
Or create projects manually and import the JSON files.
---
## 📊 Sampling Strategies
### 1. Stratified Sampling (Default for NER)
Proportional samples from each hadith collection.
```bash
python annotation_setup.py --strategy stratified
```
### 2. Chain-Focused Sampling (Default for Relations)
Focuses on hadiths with clear narrator chains (isnad).
```python
# Looks for patterns like:
# - حدثنا (narrated to us)
# - أخبرنا (informed us)
# - عن...عن (from...from)
```
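A lightweight way to spot such chains, independent of the SQL-based scoring in `active_learning.py`, is a marker-count heuristic (a sketch; note that عن also matches inside longer words, so it over-counts slightly):
```python
# Heuristic sketch: flag hadiths whose Arabic text contains enough isnad markers.
import re

ISNAD_MARKERS = re.compile(r"حدثنا|أخبرنا|سمعت|عن")

def looks_like_chain(arabic_text: str, min_markers: int = 3) -> bool:
    """Return True if the text has enough narration markers to suggest a full isnad."""
    return len(ISNAD_MARKERS.findall(arabic_text or "")) >= min_markers
```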
### 3. Active Learning Sampling
Uses embeddings to select informative samples.
```bash
# Diversity sampling - most different from annotated samples
python active_learning.py --strategy diversity --count 50
# Representative sampling - cluster-based selection
python active_learning.py --strategy representative --count 50
# Chain complexity - complex narrator chains
python active_learning.py --strategy chain_complexity --count 50
# Hybrid - combines all strategies
python active_learning.py --strategy hybrid --count 100
```
---
## 🏷️ Entity Types for NER
| Entity | Description | Example | Hotkey |
|--------|-------------|---------|--------|
| PERSON | Full names | محمد بن عبد الله | P |
| KUNYA | Abu/Umm names | أبو هريرة | K |
| NISBA | Attributions | البخاري، القرشي | N |
| PLACE | Locations | مكة، المدينة | L |
| DATE | Time references | سنة مئتين | D |
| TRIBE | Tribe names | قريش، بني هاشم | T |
| TITLE | Honorifics | رسول الله، أمير المؤمنين | I |
---
## 🔗 Relation Types
| Relation | Description | Direction |
|----------|-------------|-----------|
| NARRATED_FROM | Narrator chain link | A → B |
| TEACHER_OF | Teaching relationship | Teacher → Student |
| STUDENT_OF | Inverse of TEACHER_OF | Student → Teacher |
| CONTEMPORARY_OF | Same era | Bidirectional |
| RELATED_TO | Family relation | Bidirectional |
| LIVED_IN | Residence | Person → Place |
| BORN_IN | Birthplace | Person → Place |
| DIED_IN | Place of death | Person → Place |
---
## 📝 Label Studio Project Setup
### Manual Setup (Recommended)
1. **Login to Label Studio:** https://label.betelgeusebytes.io
2. **Create NER Project:**
- Click "Create Project"
- Name: "Hadith NER Annotation"
- Go to Settings → Labeling Interface
- Paste content from `annotation_data/ner_config.xml`
- Save
3. **Import NER Tasks:**
- Go to project → Import
- Upload `annotation_data/ner_tasks.json`
4. **Create Relation Project:**
- Create another project: "Hadith Relation Extraction"
- Use `annotation_data/relation_config.xml`
- Import `annotation_data/relation_tasks.json`
### Programmatic Setup
```python
from label_studio_client import LabelStudioClient
import asyncio

async def setup():
    client = LabelStudioClient()

    # Create NER project
    with open("annotation_data/ner_config.xml") as f:
        config = f.read()

    project = await client.create_project(
        title="Hadith NER Annotation",
        description="Named entity recognition for hadith texts",
        label_config=config,
    )

    # Import tasks
    await client.import_tasks_from_file(
        project["id"],
        "annotation_data/ner_tasks.json",
    )

asyncio.run(setup())
```
---
## 📤 Exporting Annotations
### Export to JSON
```bash
python label_studio_client.py export --project 1 --output annotations.json
```
### Convert to HuggingFace Format
```bash
python label_studio_client.py convert \
--input annotations.json \
--output ner_dataset.json \
--format huggingface
```
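The converted file is token-level; the record below shows the general shape such HuggingFace NER datasets take (the exact keys emitted by `label_studio_client.py` may differ):
```python
# Example of a token-level NER record in the common HuggingFace layout (keys assumed).
example = {
    "tokens": ["حدثنا", "محمد", "بن", "إسماعيل", "البخاري"],
    "ner_tags": ["O", "B-PERSON", "I-PERSON", "I-PERSON", "B-NISBA"],
}
```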
### Convert to spaCy Format
```bash
python label_studio_client.py convert \
--input annotations.json \
--output spacy_data.json \
--format spacy
```
### Convert Relations to Graph
```bash
python label_studio_client.py convert \
--input relation_annotations.json \
--output relations.json \
--format relations
```
---
## 📈 Active Learning Workflow
### Initial Annotation (Cold Start)
1. Start with a stratified sample of 100 hadiths
2. Annotate these completely
3. Train preliminary model
### Iterative Improvement
```bash
# After initial annotations, use active learning
python active_learning.py --strategy hybrid --count 50 --output next_batch.json
# Import to Label Studio
python label_studio_client.py import --project 1 --file next_batch.json
# Annotate, then repeat
```
### Strategy Recommendations
| Stage | Strategy | Rationale |
|-------|----------|-----------|
| Initial (0-100) | Stratified | Cover all collections |
| Early (100-300) | Diversity | Expand coverage |
| Middle (300-500) | Representative | Fill gaps in clusters |
| Later (500+) | Chain Complexity | Focus on hard cases |
---
## 📊 Annotation Progress Tracking
### Check Progress via SQL
```sql
-- Overall progress
SELECT
SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) as ner_done,
SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) as relations_done,
COUNT(*) as total
FROM hadiths;
-- Progress by collection
SELECT
c.name_english,
SUM(CASE WHEN h.entities_extracted THEN 1 ELSE 0 END) as ner_done,
COUNT(*) as total
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.name_english;
```
### Check via Label Studio API
```bash
python label_studio_client.py stats --project 1
```
---
## 🎯 Quality Guidelines
### Inter-Annotator Agreement Target
- Entity Detection: Cohen's κ > 0.8
- Entity Classification: κ > 0.75
- Relations: κ > 0.7
### Review Process
1. Each task annotated by 1-2 annotators
2. Disagreements reviewed by expert
3. Edge cases documented for guideline updates
### Quality Checks
- Walk annotators through `ANNOTATION_GUIDELINES.md` before they start
- Spot-check 10% of annotations weekly
- Track agreement scores over time
---
## 🐛 Troubleshooting
### "Connection refused" to Label Studio
```bash
# Check if Label Studio is running
curl https://label.betelgeusebytes.io/health
# Check API key
curl -H "Authorization: Token YOUR_API_KEY" \
https://label.betelgeusebytes.io/api/projects
```
### "No embeddings for active learning"
Ensure Step 6 completed successfully:
```bash
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings
```
### Export fails with encoding errors
Use UTF-8 encoding explicitly:
```python
with open(file, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False)
```
---
## ✅ Checklist Before Step 8
Before proceeding to NER model training:
- [ ] At least 300-500 hadiths annotated for NER
- [ ] At least 200-300 hadiths annotated for relations
- [ ] Annotation guidelines reviewed and finalized
- [ ] Inter-annotator agreement > 0.75
- [ ] Annotations exported in HuggingFace format
- [ ] Edge cases documented
---
## 📚 Next Steps
Once sufficient annotations are collected:
1. **Step 8:** Train NER model (XLM-RoBERTa or AraBERT)
- Use exported HuggingFace format data
- Target F1 > 0.85
2. **Step 9:** Train Relation Extraction model
- Use NER outputs as input
- Target F1 > 0.80
---
## 📎 Additional Resources
- [Label Studio Documentation](https://labelstud.io/guide/)
- [Active Learning for NER](https://arxiv.org/abs/2101.11112)
- [Arabic NER Guidelines](https://www.aclweb.org/anthology/)
---
*Version 1.0 - Last Updated: 2025*


@ -0,0 +1,599 @@
#!/usr/bin/env python3
"""
Step 7: Active Learning Strategy for Hadith Annotation
=======================================================
Implements active learning to speed up annotation by selecting
the most informative samples for labeling.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import json
import random
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import math
import sys
import numpy as np
import httpx
import psycopg2
from psycopg2.extras import RealDictCursor
from rich.console import Console
from rich.table import Table
from rich.progress import Progress
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
if hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))
console = Console()
@dataclass
class SampleCandidate:
"""A candidate sample for annotation."""
hadith_id: int
arabic_text: str
english_text: str
collection: str
score: float
strategy: str
metadata: Dict
class ActiveLearningSampler:
"""
Active learning sampler for hadith annotation.
Strategies:
1. Uncertainty Sampling - Select samples where model is least confident
2. Diversity Sampling - Select samples that are most different from annotated
3. Representative Sampling - Select samples that represent clusters
4. Hybrid - Combine multiple strategies
"""
def __init__(self):
self.db_conn = None
self.qdrant_client = None
def _get_db_connection(self):
"""Get database connection."""
if self.db_conn is None or self.db_conn.closed:
self.db_conn = psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
return self.db_conn
async def _search_qdrant(
self,
vector: List[float],
limit: int = 100,
filter_ids: List[int] = None
) -> List[Dict]:
"""Search Qdrant for similar vectors."""
async with httpx.AsyncClient(timeout=30.0) as client:
payload = {
"vector": vector,
"limit": limit,
"with_payload": True
}
if filter_ids:
payload["filter"] = {
"must_not": [
{"has_id": filter_ids}
]
}
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
json=payload
)
response.raise_for_status()
return response.json().get("result", [])
async def _get_random_vectors(self, count: int = 10) -> List[Dict]:
"""Get random vectors from Qdrant for centroid calculation."""
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
json={
"limit": count,
"with_vector": True,
"with_payload": True
}
)
response.raise_for_status()
return response.json().get("result", {}).get("points", [])
def get_annotated_hadith_ids(self) -> List[int]:
"""Get IDs of already annotated hadiths."""
conn = self._get_db_connection()
with conn.cursor() as cur:
cur.execute("""
SELECT id FROM hadiths
WHERE entities_extracted = true
""")
return [row[0] for row in cur.fetchall()]
def get_unannotated_hadiths(self, limit: int = 1000) -> List[Dict]:
"""Get unannotated hadiths with their metadata."""
conn = self._get_db_connection()
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
c.name_english as collection
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.entities_extracted
AND h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT %s
""", (limit,))
return [dict(row) for row in cur.fetchall()]
# ========================================================================
# Sampling Strategies
# ========================================================================
async def diversity_sampling(
self,
count: int = 50,
annotated_ids: List[int] = None
) -> List[SampleCandidate]:
"""
Select samples that are most different from already annotated samples.
Uses embedding distance to find diverse samples.
"""
if annotated_ids is None:
annotated_ids = self.get_annotated_hadith_ids()
if not annotated_ids:
# No annotations yet, use random sampling
return await self.random_sampling(count)
# Get centroid of annotated samples
annotated_vectors = []
async with httpx.AsyncClient(timeout=30.0) as client:
for batch_start in range(0, min(len(annotated_ids), 100), 10):
batch_ids = annotated_ids[batch_start:batch_start+10]
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points",
json={"ids": batch_ids, "with_vector": True}
)
if response.status_code == 200:
for point in response.json().get("result", []):
if "vector" in point:
annotated_vectors.append(point["vector"])
if not annotated_vectors:
return await self.random_sampling(count)
# Calculate centroid
centroid = np.mean(annotated_vectors, axis=0).tolist()
# Find points far from centroid (negative similarity search)
# We'll get many candidates and select the most distant
candidates = await self._search_qdrant(
centroid,
limit=count * 3,
filter_ids=annotated_ids
)
# Sort by distance (lower score = more distant for cosine similarity)
candidates.sort(key=lambda x: x.get("score", 1))
# Get hadith details
hadith_ids = [
c.get("payload", {}).get("hadith_id") or c.get("id")
for c in candidates[:count]
]
hadith_map = self._get_hadiths_by_ids(hadith_ids)
results = []
for i, c in enumerate(candidates[:count]):
hid = c.get("payload", {}).get("hadith_id") or c.get("id")
if hid in hadith_map:
h = hadith_map[hid]
results.append(SampleCandidate(
hadith_id=hid,
arabic_text=h.get("arabic_text", ""),
english_text=h.get("english_text", ""),
collection=h.get("collection", ""),
score=1 - c.get("score", 0), # Convert similarity to diversity
strategy="diversity",
metadata={"rank": i + 1}
))
return results
async def representative_sampling(
self,
count: int = 50,
n_clusters: int = 10
) -> List[SampleCandidate]:
"""
Select samples that are representative of different clusters.
Uses k-means-like approach on embeddings.
"""
# Get random sample of vectors to identify clusters
sample_points = await self._get_random_vectors(count=500)
if len(sample_points) < n_clusters:
return await self.random_sampling(count)
# Simple k-means clustering on vectors
vectors = np.array([p["vector"] for p in sample_points])
# Initialize centroids randomly
centroid_indices = random.sample(range(len(vectors)), n_clusters)
centroids = vectors[centroid_indices]
# Run k-means iterations
for _ in range(10):
# Assign points to nearest centroid
distances = np.linalg.norm(vectors[:, np.newaxis] - centroids, axis=2)
assignments = np.argmin(distances, axis=1)
# Update centroids
new_centroids = []
for k in range(n_clusters):
cluster_points = vectors[assignments == k]
if len(cluster_points) > 0:
new_centroids.append(cluster_points.mean(axis=0))
else:
new_centroids.append(centroids[k])
centroids = np.array(new_centroids)
# Select samples closest to each centroid
samples_per_cluster = max(1, count // n_clusters)
selected = []
annotated_ids = set(self.get_annotated_hadith_ids())
for k in range(n_clusters):
cluster_mask = assignments == k
cluster_indices = np.where(cluster_mask)[0]
if len(cluster_indices) == 0:
continue
# Sort by distance to centroid
cluster_vectors = vectors[cluster_mask]
distances = np.linalg.norm(cluster_vectors - centroids[k], axis=1)
sorted_indices = np.argsort(distances)
added = 0
for idx in sorted_indices:
point = sample_points[cluster_indices[idx]]
hid = point.get("payload", {}).get("hadith_id") or point.get("id")
if hid not in annotated_ids and added < samples_per_cluster:
selected.append({
"hadith_id": hid,
"cluster": k,
"distance": float(distances[idx])
})
added += 1
# Get hadith details
hadith_ids = [s["hadith_id"] for s in selected]
hadith_map = self._get_hadiths_by_ids(hadith_ids)
results = []
for s in selected[:count]:
hid = s["hadith_id"]
if hid in hadith_map:
h = hadith_map[hid]
results.append(SampleCandidate(
hadith_id=hid,
arabic_text=h.get("arabic_text", ""),
english_text=h.get("english_text", ""),
collection=h.get("collection", ""),
score=1.0 / (1.0 + s["distance"]),
strategy="representative",
metadata={"cluster": s["cluster"]}
))
return results
async def chain_complexity_sampling(
self,
count: int = 50
) -> List[SampleCandidate]:
"""
Select samples with complex narrator chains for relation annotation.
Uses heuristics based on chain patterns.
"""
conn = self._get_db_connection()
with conn.cursor(cursor_factory=RealDictCursor) as cur:
# Find hadiths with complex chains
cur.execute("""
WITH chain_scores AS (
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
c.name_english as collection,
-- Score based on chain indicators
(
-- Count "عن" occurrences (narrator chain links)
(LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'عن', ''))) / 2 * 2
-- Count "حدثنا" occurrences
+ (LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'حدثنا', ''))) / 5 * 3
-- Count "أخبرنا" occurrences
+ (LENGTH(h.arabic_text) - LENGTH(REPLACE(h.arabic_text, 'أخبرنا', ''))) / 6 * 3
-- Bonus for longer texts (more potential entities)
+ LEAST(LENGTH(h.arabic_text) / 100, 10)
) as complexity_score
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.entities_extracted
AND h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 100
)
SELECT *
FROM chain_scores
WHERE complexity_score > 5
ORDER BY complexity_score DESC, RANDOM()
LIMIT %s
""", (count,))
hadiths = cur.fetchall()
results = []
for h in hadiths:
results.append(SampleCandidate(
hadith_id=h["id"],
arabic_text=h.get("arabic_text", ""),
english_text=h.get("english_text", ""),
collection=h.get("collection", ""),
score=float(h.get("complexity_score", 0)) / 20.0, # Normalize
strategy="chain_complexity",
metadata={"complexity_score": h.get("complexity_score", 0)}
))
return results
async def random_sampling(self, count: int = 50) -> List[SampleCandidate]:
"""Simple random sampling as baseline."""
hadiths = self.get_unannotated_hadiths(limit=count)
results = []
for h in hadiths:
results.append(SampleCandidate(
hadith_id=h["id"],
arabic_text=h.get("arabic_text", ""),
english_text=h.get("english_text", ""),
collection=h.get("collection", ""),
score=random.random(),
strategy="random",
metadata={}
))
return results
async def hybrid_sampling(
self,
count: int = 50,
weights: Dict[str, float] = None
) -> List[SampleCandidate]:
"""
Combine multiple sampling strategies.
Default weights:
- diversity: 0.3
- representative: 0.3
- chain_complexity: 0.3
- random: 0.1
"""
if weights is None:
weights = {
"diversity": 0.3,
"representative": 0.3,
"chain_complexity": 0.3,
"random": 0.1
}
# Normalize weights
total_weight = sum(weights.values())
weights = {k: v / total_weight for k, v in weights.items()}
# Get samples from each strategy
all_candidates = []
for strategy, weight in weights.items():
strategy_count = max(1, int(count * weight * 1.5)) # Get extra for dedup
if strategy == "diversity":
candidates = await self.diversity_sampling(strategy_count)
elif strategy == "representative":
candidates = await self.representative_sampling(strategy_count)
elif strategy == "chain_complexity":
candidates = await self.chain_complexity_sampling(strategy_count)
else:
candidates = await self.random_sampling(strategy_count)
# Adjust scores by weight
for c in candidates:
c.score *= weight
all_candidates.extend(candidates)
# Deduplicate by hadith_id, keeping highest score
seen = {}
for c in all_candidates:
if c.hadith_id not in seen or c.score > seen[c.hadith_id].score:
seen[c.hadith_id] = c
# Sort by score and return top N
results = sorted(seen.values(), key=lambda x: -x.score)
return results[:count]
def _get_hadiths_by_ids(self, hadith_ids: List[int]) -> Dict[int, Dict]:
"""Get hadith details by IDs."""
if not hadith_ids:
return {}
conn = self._get_db_connection()
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
c.name_english as collection
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.id = ANY(%s::uuid[])
""", (list(hadith_ids),))
return {row["id"]: dict(row) for row in cur.fetchall()}
def close(self):
"""Close database connection."""
if self.db_conn and not self.db_conn.closed:
self.db_conn.close()
# ============================================================================
# Export Functions
# ============================================================================
def export_samples_for_label_studio(
samples: List[SampleCandidate],
output_path: str
) -> str:
"""Export samples in Label Studio format."""
tasks = []
for s in samples:
task = {
"data": {
"hadith_id": s.hadith_id,
"arabic_text": s.arabic_text,
"english_text": s.english_text,
"collection": s.collection,
"selection_score": s.score,
"selection_strategy": s.strategy
},
"meta": {
"strategy": s.strategy,
"metadata": s.metadata
}
}
tasks.append(task)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(tasks, f, ensure_ascii=False, indent=2)
return output_path
# ============================================================================
# CLI
# ============================================================================
async def main():
"""Main CLI interface."""
import argparse
parser = argparse.ArgumentParser(description="Active Learning Sampler")
parser.add_argument("--strategy", choices=[
"diversity", "representative", "chain_complexity", "random", "hybrid"
], default="hybrid", help="Sampling strategy")
parser.add_argument("--count", type=int, default=50, help="Number of samples")
parser.add_argument("--output", type=str, default="active_learning_samples.json")
args = parser.parse_args()
console.print(f"[bold]Active Learning Sampling[/bold]")
console.print(f"Strategy: {args.strategy}")
console.print(f"Count: {args.count}")
sampler = ActiveLearningSampler()
try:
if args.strategy == "diversity":
samples = await sampler.diversity_sampling(args.count)
elif args.strategy == "representative":
samples = await sampler.representative_sampling(args.count)
elif args.strategy == "chain_complexity":
samples = await sampler.chain_complexity_sampling(args.count)
elif args.strategy == "random":
samples = await sampler.random_sampling(args.count)
else:
samples = await sampler.hybrid_sampling(args.count)
# Display results
table = Table(title=f"Selected Samples ({args.strategy})")
table.add_column("ID", style="cyan")
table.add_column("Collection")
table.add_column("Score", justify="right")
table.add_column("Strategy")
table.add_column("Preview", width=40)
for s in samples[:20]: # Show first 20
preview = (s.arabic_text or s.english_text or "")[:40] + "..."
table.add_row(
str(s.hadith_id),
s.collection,
f"{s.score:.3f}",
s.strategy,
preview
)
console.print(table)
# Export
export_samples_for_label_studio(samples, args.output)
console.print(f"\n[green]Exported {len(samples)} samples to {args.output}[/green]")
finally:
sampler.close()
if __name__ == "__main__":
asyncio.run(main())

File diff suppressed because one or more lines are too long


@ -0,0 +1,47 @@
<View>
<Header value="Hadith Entity and Relation Annotation"/>
<Collapse defaultActiveKey="arabic">
<Panel key="arabic" header="Arabic Text">
<Labels name="ner_ar" toName="arabic_text">
<Label value="PERSON" background="#FF6B6B" hotkey="p"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
<Label value="NISBA" background="#45B7D1" hotkey="n"/>
<Label value="PLACE" background="#96CEB4" hotkey="l"/>
<Label value="DATE" background="#FFEAA7" hotkey="d"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
<Label value="TITLE" background="#98D8C8" hotkey="i"/>
</Labels>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
</Panel>
<Panel key="english" header="English Text">
<Labels name="ner_en" toName="english_text">
<Label value="PERSON" background="#FF6B6B"/>
<Label value="KUNYA" background="#4ECDC4"/>
<Label value="NISBA" background="#45B7D1"/>
<Label value="PLACE" background="#96CEB4"/>
<Label value="DATE" background="#FFEAA7"/>
<Label value="TRIBE" background="#DDA0DD"/>
<Label value="TITLE" background="#98D8C8"/>
</Labels>
<Text name="english_text" value="$english_text" granularity="word"/>
</Panel>
</Collapse>
<Header value="Relations between Entities" size="4"/>
<Relations>
<Relation value="NARRATED_FROM" hotkey="r"/>
<Relation value="TEACHER_OF" hotkey="e"/>
<Relation value="STUDENT_OF" hotkey="s"/>
<Relation value="CONTEMPORARY_OF" hotkey="c"/>
<Relation value="RELATED_TO"/>
<Relation value="LIVED_IN"/>
</Relations>
<View style="margin-top: 15px; padding: 10px; background: #e8f4f8; border-radius: 5px;">
<Header value="Hadith Info" size="5"/>
<Text name="meta" value="$collection | Hadith #$hadith_number | Grade: $grade"/>
</View>
</View>


@ -0,0 +1,41 @@
<View>
<Header value="Hadith Named Entity Recognition (NER)"/>
<View style="display: flex; flex-direction: row;">
<View style="flex: 1; margin-right: 10px;">
<Header value="Arabic Text" size="4"/>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
<Labels name="ner_arabic" toName="arabic_text">
<Label value="PERSON" background="#FF6B6B" hotkey="p"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
<Label value="NISBA" background="#45B7D1" hotkey="n"/>
<Label value="PLACE" background="#96CEB4" hotkey="l"/>
<Label value="DATE" background="#FFEAA7" hotkey="d"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
<Label value="TITLE" background="#98D8C8" hotkey="i"/>
</Labels>
</View>
<View style="flex: 1; margin-left: 10px;">
<Header value="English Text" size="4"/>
<Text name="english_text" value="$english_text" granularity="word"/>
<Labels name="ner_english" toName="english_text">
<Label value="PERSON" background="#FF6B6B" hotkey="1"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="2"/>
<Label value="NISBA" background="#45B7D1" hotkey="3"/>
<Label value="PLACE" background="#96CEB4" hotkey="4"/>
<Label value="DATE" background="#FFEAA7" hotkey="5"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="6"/>
<Label value="TITLE" background="#98D8C8" hotkey="7"/>
</Labels>
</View>
</View>
<View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
<Header value="Metadata" size="5"/>
<Text name="collection" value="Collection: $collection"/>
<Text name="hadith_num" value="Hadith #: $hadith_number"/>
<Text name="grade" value="Grade: $grade"/>
</View>
</View>

File diff suppressed because one or more lines are too long


@ -0,0 +1,42 @@
<View>
<Header value="Hadith Relation Extraction"/>
<View style="margin-bottom: 20px;">
<Header value="Arabic Text with Entities" size="4"/>
<Labels name="entities_ar" toName="arabic_text">
<Label value="NARRATOR" background="#FF6B6B"/>
<Label value="PERSON" background="#4ECDC4"/>
<Label value="PLACE" background="#96CEB4"/>
</Labels>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
</View>
<View style="margin-bottom: 20px;">
<Header value="English Text with Entities" size="4"/>
<Labels name="entities_en" toName="english_text">
<Label value="NARRATOR" background="#FF6B6B"/>
<Label value="PERSON" background="#4ECDC4"/>
<Label value="PLACE" background="#96CEB4"/>
</Labels>
<Text name="english_text" value="$english_text" granularity="word"/>
</View>
<Header value="Relations" size="4"/>
<Relations>
<Relation value="NARRATED_FROM"/>
<Relation value="TEACHER_OF"/>
<Relation value="STUDENT_OF"/>
<Relation value="CONTEMPORARY_OF"/>
<Relation value="RELATED_TO"/>
<Relation value="LIVED_IN"/>
<Relation value="DIED_IN"/>
<Relation value="BORN_IN"/>
</Relations>
<View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
<Header value="Metadata" size="5"/>
<Text name="collection" value="Collection: $collection"/>
<Text name="hadith_num" value="Hadith #: $hadith_number"/>
</View>
</View>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,635 @@
#!/usr/bin/env python3
"""
Step 7: Annotation Setup with Label Studio
===========================================
Exports hadiths for annotation and configures Label Studio projects.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import sys
import json
import random
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import hashlib
import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
if hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))
LABEL_STUDIO_URL = os.getenv("LABEL_STUDIO_URL", "https://label.betelgeusebytes.io")
LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk")
console = Console()
# ============================================================================
# Label Studio Project Configurations
# ============================================================================
# NER Labeling Configuration for Hadith Text
NER_LABELING_CONFIG = """
<View>
<Header value="Hadith Named Entity Recognition (NER)"/>
<View style="display: flex; flex-direction: row;">
<View style="flex: 1; margin-right: 10px;">
<Header value="Arabic Text" size="4"/>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
<Labels name="ner_arabic" toName="arabic_text">
<Label value="PERSON" background="#FF6B6B" hotkey="p"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
<Label value="NISBA" background="#45B7D1" hotkey="n"/>
<Label value="PLACE" background="#96CEB4" hotkey="l"/>
<Label value="DATE" background="#FFEAA7" hotkey="d"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
<Label value="TITLE" background="#98D8C8" hotkey="i"/>
</Labels>
</View>
<View style="flex: 1; margin-left: 10px;">
<Header value="English Text" size="4"/>
<Text name="english_text" value="$english_text" granularity="word"/>
<Labels name="ner_english" toName="english_text">
<Label value="PERSON" background="#FF6B6B" hotkey="1"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="2"/>
<Label value="NISBA" background="#45B7D1" hotkey="3"/>
<Label value="PLACE" background="#96CEB4" hotkey="4"/>
<Label value="DATE" background="#FFEAA7" hotkey="5"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="6"/>
<Label value="TITLE" background="#98D8C8" hotkey="7"/>
</Labels>
</View>
</View>
<View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
<Header value="Metadata" size="5"/>
<Text name="collection" value="Collection: $collection"/>
<Text name="hadith_num" value="Hadith #: $hadith_number"/>
<Text name="grade" value="Grade: $grade"/>
</View>
</View>
"""
# Relation Extraction Labeling Configuration
RELATION_LABELING_CONFIG = """
<View>
<Header value="Hadith Relation Extraction"/>
<View style="margin-bottom: 20px;">
<Header value="Arabic Text with Entities" size="4"/>
<Labels name="entities_ar" toName="arabic_text">
<Label value="NARRATOR" background="#FF6B6B"/>
<Label value="PERSON" background="#4ECDC4"/>
<Label value="PLACE" background="#96CEB4"/>
</Labels>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
</View>
<View style="margin-bottom: 20px;">
<Header value="English Text with Entities" size="4"/>
<Labels name="entities_en" toName="english_text">
<Label value="NARRATOR" background="#FF6B6B"/>
<Label value="PERSON" background="#4ECDC4"/>
<Label value="PLACE" background="#96CEB4"/>
</Labels>
<Text name="english_text" value="$english_text" granularity="word"/>
</View>
<Header value="Relations" size="4"/>
<Relations>
<Relation value="NARRATED_FROM"/>
<Relation value="TEACHER_OF"/>
<Relation value="STUDENT_OF"/>
<Relation value="CONTEMPORARY_OF"/>
<Relation value="RELATED_TO"/>
<Relation value="LIVED_IN"/>
<Relation value="DIED_IN"/>
<Relation value="BORN_IN"/>
</Relations>
<View style="margin-top: 20px; padding: 10px; background: #f5f5f5; border-radius: 5px;">
<Header value="Metadata" size="5"/>
<Text name="collection" value="Collection: $collection"/>
<Text name="hadith_num" value="Hadith #: $hadith_number"/>
</View>
</View>
"""
# Combined NER + Relations Configuration (for advanced annotators)
COMBINED_LABELING_CONFIG = """
<View>
<Header value="Hadith Entity and Relation Annotation"/>
<Collapse defaultActiveKey="arabic">
<Panel key="arabic" header="Arabic Text">
<Labels name="ner_ar" toName="arabic_text">
<Label value="PERSON" background="#FF6B6B" hotkey="p"/>
<Label value="KUNYA" background="#4ECDC4" hotkey="k"/>
<Label value="NISBA" background="#45B7D1" hotkey="n"/>
<Label value="PLACE" background="#96CEB4" hotkey="l"/>
<Label value="DATE" background="#FFEAA7" hotkey="d"/>
<Label value="TRIBE" background="#DDA0DD" hotkey="t"/>
<Label value="TITLE" background="#98D8C8" hotkey="i"/>
</Labels>
<Text name="arabic_text" value="$arabic_text" granularity="word"/>
</Panel>
<Panel key="english" header="English Text">
<Labels name="ner_en" toName="english_text">
<Label value="PERSON" background="#FF6B6B"/>
<Label value="KUNYA" background="#4ECDC4"/>
<Label value="NISBA" background="#45B7D1"/>
<Label value="PLACE" background="#96CEB4"/>
<Label value="DATE" background="#FFEAA7"/>
<Label value="TRIBE" background="#DDA0DD"/>
<Label value="TITLE" background="#98D8C8"/>
</Labels>
<Text name="english_text" value="$english_text" granularity="word"/>
</Panel>
</Collapse>
<Header value="Relations between Entities" size="4"/>
<Relations>
<Relation value="NARRATED_FROM" hotkey="r"/>
<Relation value="TEACHER_OF" hotkey="e"/>
<Relation value="STUDENT_OF" hotkey="s"/>
<Relation value="CONTEMPORARY_OF" hotkey="c"/>
<Relation value="RELATED_TO"/>
<Relation value="LIVED_IN"/>
</Relations>
<View style="margin-top: 15px; padding: 10px; background: #e8f4f8; border-radius: 5px;">
<Header value="Hadith Info" size="5"/>
<Text name="meta" value="$collection | Hadith #$hadith_number | Grade: $grade"/>
</View>
</View>
"""
# ============================================================================
# Database Functions
# ============================================================================
def get_db_connection():
"""Create PostgreSQL connection."""
return psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
def export_hadiths_for_annotation(
count: int = 500,
strategy: str = "stratified",
seed: int = 42
) -> List[Dict]:
"""
Export hadiths for annotation using various sampling strategies.
Strategies:
- random: Pure random sampling
- stratified: Proportional sampling from each collection
- chain_focused: Focus on hadiths with isnad (narrator chains)
- diverse: Maximize text diversity using embeddings (planned; not implemented yet, raises ValueError)
"""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
if strategy == "random":
# Simple random sampling
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection,
c.name_arabic as collection_arabic,
b.name_english as book,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT %s
""", (count,))
elif strategy == "stratified":
# Get collection distribution
cur.execute("""
SELECT c.id, c.name_english, COUNT(h.id) as cnt
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50
GROUP BY c.id, c.name_english
""")
collections = cur.fetchall()
total = sum(c['cnt'] for c in collections)
# Calculate samples per collection
all_hadiths = []
for coll in collections:
sample_count = max(1, int(count * coll['cnt'] / total))
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection,
c.name_arabic as collection_arabic,
b.name_english as book,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.collection_id = %s
AND h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT %s
""", (coll['id'], sample_count))
all_hadiths.extend(cur.fetchall())
return [dict(h) for h in all_hadiths[:count]]
elif strategy == "chain_focused":
# Focus on hadiths with clear isnad patterns
# Look for common narrator chain indicators
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection,
c.name_arabic as collection_arabic,
b.name_english as book,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 100
AND (
h.arabic_text LIKE '%%حدثنا%%'
OR h.arabic_text LIKE '%%أخبرنا%%'
OR h.arabic_text LIKE '%%عن%%عن%%'
OR h.english_text LIKE '%%narrated%%'
)
ORDER BY RANDOM()
LIMIT %s
""", (count,))
else:
raise ValueError(f"Unknown strategy: {strategy}")
results = cur.fetchall()
return [dict(h) for h in results]
finally:
conn.close()
def get_collection_statistics() -> List[Dict]:
"""Get statistics for each hadith collection."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
c.name_english as collection,
COUNT(h.id) as total,
SUM(CASE WHEN h.entities_extracted THEN 1 ELSE 0 END) as entities_done,
SUM(CASE WHEN h.relations_extracted THEN 1 ELSE 0 END) as relations_done,
AVG(LENGTH(h.arabic_text)) as avg_arabic_len,
AVG(LENGTH(h.english_text)) as avg_english_len
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total DESC
""")
return [dict(row) for row in cur.fetchall()]
finally:
conn.close()
# ============================================================================
# Label Studio API Functions
# ============================================================================
async def create_label_studio_project(
client: httpx.AsyncClient,
title: str,
description: str,
label_config: str
) -> Dict:
"""Create a new Label Studio project."""
response = await client.post(
f"{LABEL_STUDIO_URL}/api/projects",
headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"},
json={
"title": title,
"description": description,
"label_config": label_config,
"is_published": True,
"show_collab_predictions": True,
"evaluate_predictions_automatically": True
}
)
response.raise_for_status()
return response.json()
async def import_tasks_to_project(
client: httpx.AsyncClient,
project_id: int,
tasks: List[Dict]
) -> Dict:
"""Import annotation tasks to a Label Studio project."""
response = await client.post(
f"{LABEL_STUDIO_URL}/api/projects/{project_id}/import",
headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"},
json=tasks
)
response.raise_for_status()
return response.json()
async def get_project_stats(
client: httpx.AsyncClient,
project_id: int
) -> Dict:
"""Get annotation statistics for a project."""
response = await client.get(
f"{LABEL_STUDIO_URL}/api/projects/{project_id}",
headers={"Authorization": f"Token {LABEL_STUDIO_API_KEY}"}
)
response.raise_for_status()
return response.json()
def convert_hadiths_to_tasks(hadiths: List[Dict]) -> List[Dict]:
"""Convert hadith records to Label Studio task format."""
tasks = []
for h in hadiths:
task = {
"data": {
"hadith_id": h['id'],
"arabic_text": h.get('arabic_text', '') or '',
"english_text": h.get('english_text', '') or '',
"urdu_text": h.get('urdu_text', '') or '',
"collection": h.get('collection', ''),
"collection_arabic": h.get('collection_arabic', ''),
"book": h.get('book', '') or '',
"book_arabic": h.get('book_arabic', '') or '',
"hadith_number": str(h.get('hadith_number', '')),
"grade": h.get('grade', '') or 'Unknown'
},
"meta": {
"source": "hadith_db",
"exported_at": datetime.now().isoformat()
}
}
tasks.append(task)
return tasks
# ============================================================================
# Export Functions
# ============================================================================
def export_to_json(hadiths: List[Dict], output_path: str):
"""Export hadiths to JSON file for Label Studio import."""
tasks = convert_hadiths_to_tasks(hadiths)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(tasks, f, ensure_ascii=False, indent=2)
console.print(f"[green]Exported {len(tasks)} tasks to {output_path}[/green]")
return output_path
def export_to_csv(hadiths: List[Dict], output_path: str):
"""Export hadiths to CSV file."""
import csv
fieldnames = [
'hadith_id', 'collection', 'book', 'hadith_number',
'arabic_text', 'english_text', 'grade'
]
with open(output_path, 'w', encoding='utf-8', newline='') as f:
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
for h in hadiths:
writer.writerow({
'hadith_id': h['id'],
'collection': h.get('collection', ''),
'book': h.get('book', ''),
'hadith_number': h.get('hadith_number', ''),
'arabic_text': h.get('arabic_text', ''),
'english_text': h.get('english_text', ''),
'grade': h.get('grade', '')
})
console.print(f"[green]Exported {len(hadiths)} hadiths to {output_path}[/green]")
return output_path
# ============================================================================
# Main Setup Functions
# ============================================================================
async def setup_annotation_projects(
ner_count: int = 500,
relation_count: int = 300,
export_only: bool = False
):
"""
Set up Label Studio projects for NER and Relation annotation.
"""
console.print(Panel.fit(
"[bold blue]Step 7: Label Studio Annotation Setup[/bold blue]\n"
f"Label Studio: {LABEL_STUDIO_URL}\n"
f"NER samples: {ner_count} | Relation samples: {relation_count}",
title="Annotation Setup"
))
# Step 1: Export hadiths for NER annotation
console.print("\n[yellow]1. Exporting hadiths for NER annotation...[/yellow]")
ner_hadiths = export_hadiths_for_annotation(
count=ner_count,
strategy="stratified"
)
# Show distribution
collections = {}
for h in ner_hadiths:
coll = h.get('collection', 'Unknown')
collections[coll] = collections.get(coll, 0) + 1
table = Table(title="NER Sample Distribution")
table.add_column("Collection", style="cyan")
table.add_column("Count", justify="right")
for coll, cnt in sorted(collections.items(), key=lambda x: -x[1]):
table.add_row(coll, str(cnt))
console.print(table)
# Export NER tasks
ner_json_path = "annotation_data/ner_tasks.json"
os.makedirs("annotation_data", exist_ok=True)
export_to_json(ner_hadiths, ner_json_path)
# Step 2: Export hadiths for Relation annotation (chain-focused)
console.print("\n[yellow]2. Exporting hadiths for Relation annotation...[/yellow]")
relation_hadiths = export_hadiths_for_annotation(
count=relation_count,
strategy="chain_focused"
)
relation_json_path = "annotation_data/relation_tasks.json"
export_to_json(relation_hadiths, relation_json_path)
# Step 3: Save labeling configurations
console.print("\n[yellow]3. Saving Label Studio configurations...[/yellow]")
with open("annotation_data/ner_config.xml", 'w') as f:
f.write(NER_LABELING_CONFIG)
console.print(" Saved: annotation_data/ner_config.xml")
with open("annotation_data/relation_config.xml", 'w') as f:
f.write(RELATION_LABELING_CONFIG)
console.print(" Saved: annotation_data/relation_config.xml")
with open("annotation_data/combined_config.xml", 'w') as f:
f.write(COMBINED_LABELING_CONFIG)
console.print(" Saved: annotation_data/combined_config.xml")
if export_only:
console.print("\n[green]✓ Export complete! Import files manually to Label Studio.[/green]")
return
# Step 4: Create Label Studio projects (if API key provided)
if not LABEL_STUDIO_API_KEY:
console.print("\n[yellow]⚠ LABEL_STUDIO_API_KEY not set. Skipping project creation.[/yellow]")
console.print(" Set the API key and run again, or import tasks manually.")
return
console.print("\n[yellow]4. Creating Label Studio projects...[/yellow]")
async with httpx.AsyncClient(timeout=60.0) as client:
# Create NER project
try:
ner_project = await create_label_studio_project(
client,
title="Hadith NER Annotation",
description="Named Entity Recognition for Islamic hadith texts. "
"Label persons, places, dates, and other entities.",
label_config=NER_LABELING_CONFIG
)
console.print(f" [green]✓ Created NER project (ID: {ner_project['id']})[/green]")
# Import NER tasks
ner_tasks = convert_hadiths_to_tasks(ner_hadiths)
await import_tasks_to_project(client, ner_project['id'], ner_tasks)
console.print(f" [green]✓ Imported {len(ner_tasks)} NER tasks[/green]")
except Exception as e:
console.print(f" [red]✗ NER project error: {e}[/red]")
# Create Relation project
try:
relation_project = await create_label_studio_project(
client,
title="Hadith Relation Extraction",
description="Extract relations between narrators and entities in hadith texts.",
label_config=RELATION_LABELING_CONFIG
)
console.print(f" [green]✓ Created Relation project (ID: {relation_project['id']})[/green]")
# Import Relation tasks
relation_tasks = convert_hadiths_to_tasks(relation_hadiths)
await import_tasks_to_project(client, relation_project['id'], relation_tasks)
console.print(f" [green]✓ Imported {len(relation_tasks)} Relation tasks[/green]")
except Exception as e:
console.print(f" [red]✗ Relation project error: {e}[/red]")
console.print("\n[bold green]✓ Annotation setup complete![/bold green]")
async def main():
"""Main entry point."""
import argparse
parser = argparse.ArgumentParser(description="Hadith Annotation Setup")
parser.add_argument("--ner-count", type=int, default=500,
help="Number of hadiths for NER annotation")
parser.add_argument("--relation-count", type=int, default=300,
help="Number of hadiths for relation annotation")
parser.add_argument("--export-only", action="store_true",
help="Only export files, don't create Label Studio projects")
parser.add_argument("--strategy", choices=["random", "stratified", "chain_focused"],
default="stratified", help="Sampling strategy")
args = parser.parse_args()
await setup_annotation_projects(
ner_count=args.ner_count,
relation_count=args.relation_count,
export_only=args.export_only
)
if __name__ == "__main__":
asyncio.run(main())
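
For a quick sanity check of the export path, a minimal sketch such as the following can be run next to the script. It assumes the file above is saved as `annotation_setup.py` (as `run_step7.sh` expects) and that the `POSTGRES_*` variables point at a reachable `hadith_db`; it prints one generated task so the `data` keys can be compared against the `$...` variables in the labeling configs:

```python
# Sketch only: exercise the sampling + task-conversion path and inspect one task.
import json

from annotation_setup import convert_hadiths_to_tasks, export_hadiths_for_annotation

pilot = export_hadiths_for_annotation(count=10, strategy="chain_focused")
tasks = convert_hadiths_to_tasks(pilot)

# Each task's "data" block carries the fields bound by the labeling configs:
# arabic_text, english_text, collection, hadith_number, grade, ...
print(json.dumps(tasks[0], ensure_ascii=False, indent=2))
```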

View File

@ -0,0 +1,373 @@
-- ============================================================================
-- Step 7: SQL Queries for Hadith Annotation Export
-- ============================================================================
-- Run: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -f export_queries.sql
-- Note: psql's \copy meta-command must be written on a single line; when running the
--       multi-line \copy statements below interactively, join each one onto one line first.
-- ============================================================================
-- ============================================================================
-- 1. STRATIFIED SAMPLING - 500 hadiths proportional to collection size
-- ============================================================================
-- First, create a temporary function for stratified sampling
CREATE OR REPLACE FUNCTION sample_hadiths_stratified(total_sample INT)
RETURNS TABLE (
id INT,
hadith_number VARCHAR,
arabic_text TEXT,
english_text TEXT,
urdu_text TEXT,
grade VARCHAR,
collection_name VARCHAR,
collection_arabic VARCHAR,
book_name VARCHAR,
book_arabic VARCHAR
) AS $$
DECLARE
coll RECORD;
total_hadiths INT;
sample_count INT;
BEGIN
-- Get total count
SELECT COUNT(*) INTO total_hadiths
FROM hadiths h
WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50;
-- Sample from each collection proportionally
FOR coll IN
SELECT c.id as coll_id, c.name_english, COUNT(h.id) as cnt
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.arabic_text IS NOT NULL AND LENGTH(h.arabic_text) > 50
GROUP BY c.id, c.name_english
LOOP
sample_count := GREATEST(1, (total_sample * coll.cnt / total_hadiths));
RETURN QUERY
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english,
c.name_arabic,
b.name_english,
b.name_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.collection_id = coll.coll_id
AND h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT sample_count;
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- Export 500 stratified samples
\copy (SELECT * FROM sample_hadiths_stratified(500) LIMIT 500) TO 'ner_annotation_sample.csv' WITH CSV HEADER;
-- ============================================================================
-- 2. CHAIN-FOCUSED SAMPLING - Hadiths with clear narrator chains (isnad)
-- ============================================================================
-- Export 300 hadiths with clear narrator chain patterns
\copy (
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection_name,
c.name_arabic as collection_arabic,
b.name_english as book_name,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 100
AND (
-- Common narrator chain indicators in Arabic
h.arabic_text LIKE '%حدثنا%' -- "narrated to us"
OR h.arabic_text LIKE '%أخبرنا%' -- "informed us"
OR h.arabic_text LIKE '%عن%عن%عن%' -- chain pattern "from...from...from"
OR h.arabic_text LIKE '%سمعت%' -- "I heard"
OR h.arabic_text LIKE '%قال رسول الله%' -- "The Messenger of Allah said"
-- English patterns
OR h.english_text ILIKE '%narrated%narrated%'
OR h.english_text ILIKE '%reported%that%said%'
)
ORDER BY RANDOM()
LIMIT 300
) TO 'relation_annotation_sample.csv' WITH CSV HEADER;
-- ============================================================================
-- 3. RANDOM SAMPLING - Simple random sample
-- ============================================================================
\copy (
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection_name,
b.name_english as book_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT 500
) TO 'random_annotation_sample.csv' WITH CSV HEADER;
-- ============================================================================
-- 4. GRADE-STRATIFIED SAMPLING - Ensure representation of all grades
-- ============================================================================
\copy (
WITH grade_samples AS (
SELECT
h.*,
c.name_english as collection_name,
b.name_english as book_name,
ROW_NUMBER() OVER (PARTITION BY h.grade ORDER BY RANDOM()) as rn
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
)
SELECT
id,
hadith_number,
arabic_text,
english_text,
grade,
collection_name,
book_name
FROM grade_samples
WHERE rn <= 100 -- Up to 100 per grade
ORDER BY grade, RANDOM()
LIMIT 500
) TO 'grade_stratified_sample.csv' WITH CSV HEADER;
-- ============================================================================
-- 5. COLLECTION-SPECIFIC EXPORTS
-- ============================================================================
-- Sahih Bukhari samples
\copy (
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
b.name_english as book_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE c.name_english ILIKE '%bukhari%'
AND h.arabic_text IS NOT NULL
ORDER BY RANDOM()
LIMIT 100
) TO 'bukhari_sample.csv' WITH CSV HEADER;
-- Sahih Muslim samples
\copy (
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
b.name_english as book_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE c.name_english ILIKE '%muslim%'
AND h.arabic_text IS NOT NULL
ORDER BY RANDOM()
LIMIT 100
) TO 'muslim_sample.csv' WITH CSV HEADER;
-- ============================================================================
-- 6. EXPORT AS JSON (for Label Studio)
-- ============================================================================
-- Create JSON export for Label Studio
\copy (
SELECT json_build_object(
'data', json_build_object(
'hadith_id', h.id,
'arabic_text', h.arabic_text,
'english_text', COALESCE(h.english_text, ''),
'urdu_text', COALESCE(h.urdu_text, ''),
'collection', c.name_english,
'collection_arabic', c.name_arabic,
'book', COALESCE(b.name_english, ''),
'hadith_number', h.hadith_number,
'grade', COALESCE(h.grade, 'Unknown')
)
)
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50
ORDER BY RANDOM()
LIMIT 500
) TO 'label_studio_tasks.jsonl';
-- ============================================================================
-- 7. STATISTICS QUERIES
-- ============================================================================
-- Distribution by collection
SELECT
c.name_english as collection,
COUNT(h.id) as total_hadiths,
COUNT(h.id) FILTER (WHERE LENGTH(h.arabic_text) > 100) as with_arabic,
COUNT(h.id) FILTER (WHERE LENGTH(h.english_text) > 100) as with_english,
ROUND(AVG(LENGTH(h.arabic_text))) as avg_arabic_len
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total_hadiths DESC;
-- Distribution by grade
SELECT
COALESCE(grade, 'Unknown') as grade,
COUNT(*) as count,
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percentage
FROM hadiths
GROUP BY grade
ORDER BY count DESC;
-- Narrator chain pattern frequency
SELECT
'حدثنا (narrated to us)' as pattern,
COUNT(*) as count
FROM hadiths WHERE arabic_text LIKE '%حدثنا%'
UNION ALL
SELECT
'أخبرنا (informed us)' as pattern,
COUNT(*) as count
FROM hadiths WHERE arabic_text LIKE '%أخبرنا%'
UNION ALL
SELECT
'عن...عن (from...from)' as pattern,
COUNT(*) as count
FROM hadiths WHERE arabic_text LIKE '%عن%عن%'
UNION ALL
SELECT
'قال رسول الله (Prophet said)' as pattern,
COUNT(*) as count
FROM hadiths WHERE arabic_text LIKE '%قال رسول الله%';
-- ============================================================================
-- 8. CREATE ANNOTATION TRACKING TABLE
-- ============================================================================
-- Table to track annotation progress
CREATE TABLE IF NOT EXISTS annotation_batches (
id SERIAL PRIMARY KEY,
batch_name VARCHAR(100) NOT NULL,
batch_type VARCHAR(50) NOT NULL, -- 'NER', 'RELATION', 'COMBINED'
hadith_ids INTEGER[] NOT NULL,
total_count INTEGER NOT NULL,
annotated_count INTEGER DEFAULT 0,
label_studio_project_id INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
completed_at TIMESTAMP,
notes TEXT
);
-- Index for quick lookup
CREATE INDEX IF NOT EXISTS idx_annotation_batches_type ON annotation_batches(batch_type);
-- Function to create a new annotation batch
CREATE OR REPLACE FUNCTION create_annotation_batch(
p_batch_name VARCHAR(100),
p_batch_type VARCHAR(50),
p_hadith_ids INTEGER[]
) RETURNS INTEGER AS $$
DECLARE
v_batch_id INTEGER;
BEGIN
INSERT INTO annotation_batches (batch_name, batch_type, hadith_ids, total_count)
VALUES (p_batch_name, p_batch_type, p_hadith_ids, array_length(p_hadith_ids, 1))
RETURNING id INTO v_batch_id;
RETURN v_batch_id;
END;
$$ LANGUAGE plpgsql;
-- ============================================================================
-- 9. HELPER VIEWS
-- ============================================================================
-- View for unannotated hadiths
CREATE OR REPLACE VIEW unannotated_hadiths AS
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
c.name_english as collection,
h.grade
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.entities_extracted
AND h.arabic_text IS NOT NULL
AND LENGTH(h.arabic_text) > 50;
-- View for partially annotated hadiths (entities done, relations pending)
CREATE OR REPLACE VIEW partial_annotations AS
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
c.name_english as collection
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.entities_extracted
AND NOT h.relations_extracted;
-- ============================================================================
-- 10. SAMPLE QUERY FOR ACTIVE LEARNING
-- ============================================================================
-- Get hadiths similar to annotated ones (for active learning)
-- This requires embeddings to be available
-- Placeholder for when we implement active learning in Step 7
/*
SELECT
h.id,
h.arabic_text,
h.english_text,
similarity_score
FROM hadiths h
JOIN (
-- Find similar hadiths based on embedding distance
SELECT
hadith_id,
1 - (embedding <-> (SELECT AVG(embedding) FROM annotated_hadiths)) as similarity_score
FROM hadith_embeddings
WHERE hadith_id NOT IN (SELECT id FROM annotated_hadiths)
ORDER BY similarity_score DESC
LIMIT 100
) similar ON h.id = similar.hadith_id
ORDER BY similarity_score DESC;
*/
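
Because psql's `\copy` cannot span lines, the same export can also be driven from Python. A rough sketch using the `sample_hadiths_stratified()` function defined above (the connection defaults mirror the rest of the repo and are assumptions to adjust):

```python
# Sketch: run the stratified sampler defined in this SQL file and write a CSV,
# sidestepping psql's single-line \copy restriction.
import csv
import os

import psycopg2

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io"),
    port=int(os.getenv("POSTGRES_PORT", "5432")),
    dbname=os.getenv("POSTGRES_DB", "hadith_db"),
    user=os.getenv("POSTGRES_USER", "hadith_ingest"),
    password=os.getenv("POSTGRES_PASSWORD", ""),
    sslmode="require",
)
with conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM sample_hadiths_stratified(%s)", (500,))
    rows = cur.fetchall()
    headers = [col.name for col in cur.description]
conn.close()

with open("ner_annotation_sample.csv", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    writer.writerows(rows)
```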

View File

@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
Step 7: Label Studio API Client
================================
Manages Label Studio projects, imports/exports annotations.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from pathlib import Path
from label_studio_sdk import LabelStudio
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
console = Console()
LABEL_STUDIO_URL = os.getenv("LABEL_STUDIO_URL", "https://label.betelgeusebytes.io")
LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY", "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk")
@dataclass
class Project:
"""Label Studio project."""
id: int
title: str
description: str
task_count: int
annotation_count: int
created_at: str
@dataclass
class AnnotationStats:
"""Annotation statistics."""
total_tasks: int
annotated_tasks: int
total_annotations: int
agreement_score: Optional[float]
class LabelStudioClient:
"""Client for Label Studio API using official SDK."""
def __init__(self, url: str = None, api_key: str = None):
self.url = (url or LABEL_STUDIO_URL).rstrip('/')
self.api_key = api_key or LABEL_STUDIO_API_KEY
self.client = LabelStudio(base_url=self.url, api_key=self.api_key)
async def list_projects(self) -> List[Project]:
"""List all projects."""
projects_data = self.client.projects.list()
projects = []
for p in projects_data:
projects.append(Project(
id=p.id,
title=p.title,
description=p.description or "",
task_count=getattr(p, 'task_number', 0) or 0,
annotation_count=getattr(p, 'total_annotations_number', 0) or 0,
created_at=getattr(p, 'created_at', '') or ""
))
return projects
async def get_project(self, project_id: int) -> Dict:
"""Get project details."""
project = self.client.projects.get(id=project_id)
return {
"id": project.id,
"title": project.title,
"description": project.description,
"task_number": getattr(project, 'task_number', 0),
"total_annotations_number": getattr(project, 'total_annotations_number', 0),
"num_tasks_with_annotations": getattr(project, 'num_tasks_with_annotations', 0),
"created_at": getattr(project, 'created_at', '')
}
async def create_project(
self,
title: str,
description: str = "",
label_config: str = "",
**kwargs
) -> Dict:
"""Create a new project."""
project = self.client.projects.create(
title=title,
description=description,
label_config=label_config
)
return {
"id": project.id,
"title": project.title,
"description": project.description
}
async def update_project(self, project_id: int, **kwargs) -> Dict:
"""Update project settings."""
project = self.client.projects.update(id=project_id, **kwargs)
return {"id": project.id, "title": project.title}
async def delete_project(self, project_id: int) -> None:
"""Delete a project."""
self.client.projects.delete(id=project_id)
async def import_tasks(
self,
project_id: int,
tasks: List[Dict],
batch_size: int = 100
) -> Dict:
"""Import tasks to a project in batches."""
total_imported = 0
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task_progress = progress.add_task(
f"Importing {len(tasks)} tasks...",
total=len(tasks)
)
for i in range(0, len(tasks), batch_size):
batch = tasks[i:i + batch_size]
try:
self.client.projects.import_tasks(id=project_id, request=batch)
total_imported += len(batch)
progress.update(task_progress, advance=len(batch))
except Exception as e:
console.print(f"[yellow]Warning: Failed to import batch {i//batch_size + 1}: {e}[/yellow]")
continue
return {"imported": total_imported}
async def import_tasks_from_file(
self,
project_id: int,
file_path: str,
batch_size: int = 100
) -> Dict:
"""Import tasks from a JSON file in batches."""
with open(file_path, 'r', encoding='utf-8') as f:
tasks = json.load(f)
console.print(f"[blue]Loading {len(tasks)} tasks from {file_path}[/blue]")
return await self.import_tasks(project_id, tasks, batch_size)
async def get_tasks(
self,
project_id: int,
page: int = 1,
page_size: int = 100
) -> Dict:
"""Get tasks from a project."""
tasks = self.client.tasks.list(project=project_id, page=page, page_size=page_size)
return {"tasks": list(tasks)}
async def get_all_tasks(self, project_id: int) -> List[Dict]:
"""Get all tasks from a project."""
all_tasks = []
page = 1
while True:
result = await self.get_tasks(project_id, page=page, page_size=100)
tasks = result.get("tasks", [])
if not tasks:
break
all_tasks.extend(tasks)
if len(tasks) < 100:
break
page += 1
return all_tasks
async def delete_all_tasks(self, project_id: int) -> None:
"""Delete all tasks from a project."""
tasks = await self.get_all_tasks(project_id)
for task in tasks:
self.client.tasks.delete(id=task.id)
async def get_annotations(self, task_id: int) -> List[Dict]:
"""Get annotations for a task."""
annotations = self.client.annotations.list(task=task_id)
return list(annotations)
async def create_annotation(
self,
task_id: int,
result: List[Dict],
**kwargs
) -> Dict:
"""Create an annotation for a task."""
annotation = self.client.annotations.create(
task=task_id,
result=result,
**kwargs
)
return {"id": annotation.id}
async def export_annotations(
self,
project_id: int,
export_format: str = "JSON"
) -> List[Dict]:
"""Export all annotations from a project."""
export_result = self.client.projects.exports.create(
id=project_id,
export_type=export_format
)
return export_result
async def export_annotations_to_file(
self,
project_id: int,
output_path: str,
export_format: str = "JSON"
) -> str:
"""Export annotations to a file."""
data = await self.export_annotations(project_id, export_format)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
return output_path
async def get_project_stats(self, project_id: int) -> AnnotationStats:
"""Get annotation statistics for a project."""
project_data = await self.get_project(project_id)
return AnnotationStats(
total_tasks=project_data.get("task_number", 0),
annotated_tasks=project_data.get("num_tasks_with_annotations", 0),
total_annotations=project_data.get("total_annotations_number", 0),
agreement_score=None
)
# ============================================================================
# Annotation Conversion Functions
# ============================================================================
def convert_label_studio_to_huggingface(annotations: List[Dict]) -> List[Dict]:
"""
Convert Label Studio annotations to HuggingFace NER format.
Output format:
{
"tokens": ["word1", "word2", ...],
"ner_tags": ["O", "B-PERSON", "I-PERSON", ...]
}
"""
converted = []
for task in annotations:
if not task.get("annotations"):
continue
# Get the first annotation (or could handle multiple)
annotation = task["annotations"][0]
result = annotation.get("result", [])
# Get the text
data = task.get("data", {})
text = data.get("arabic_text", "") or data.get("english_text", "")
if not text:
continue
# Simple tokenization (space-based for now)
tokens = text.split()
ner_tags = ["O"] * len(tokens)
# Apply annotations
for item in result:
if item.get("type") != "labels":
continue
value = item.get("value", {})
label = value.get("labels", ["O"])[0]
start = value.get("start", 0)
end = value.get("end", 0)
# Find tokens that overlap with this span and tag them in the BIO scheme:
# the first overlapping token gets B-<label>, later tokens in the span get I-<label>
char_pos = 0
first_in_span = True
for i, token in enumerate(tokens):
token_start = char_pos
token_end = char_pos + len(token)
if token_start < end and token_end > start and ner_tags[i] == "O":
ner_tags[i] = f"B-{label}" if first_in_span else f"I-{label}"
first_in_span = False
char_pos = token_end + 1 # +1 for space
converted.append({
"id": task.get("id"),
"hadith_id": data.get("hadith_id"),
"tokens": tokens,
"ner_tags": ner_tags
})
return converted
def convert_label_studio_to_spacy(annotations: List[Dict]) -> List[tuple]:
"""
Convert Label Studio annotations to spaCy training format.
Output format:
[
("text", {"entities": [(start, end, label), ...]})
]
"""
converted = []
for task in annotations:
if not task.get("annotations"):
continue
annotation = task["annotations"][0]
result = annotation.get("result", [])
data = task.get("data", {})
text = data.get("arabic_text", "") or data.get("english_text", "")
if not text:
continue
entities = []
for item in result:
if item.get("type") != "labels":
continue
value = item.get("value", {})
label = value.get("labels", ["O"])[0]
start = value.get("start", 0)
end = value.get("end", 0)
entities.append((start, end, label))
converted.append((text, {"entities": entities}))
return converted
def convert_relations_to_graph(annotations: List[Dict]) -> List[Dict]:
"""
Convert relation annotations to graph format for Neo4j.
Output format:
{
"source": {"text": "...", "type": "...", "start": N, "end": N},
"target": {"text": "...", "type": "...", "start": N, "end": N},
"relation": "NARRATED_FROM"
}
"""
relations = []
for task in annotations:
if not task.get("annotations"):
continue
annotation = task["annotations"][0]
result = annotation.get("result", [])
# First pass: collect all entities by ID
entities_by_id = {}
for item in result:
if item.get("type") == "labels":
entities_by_id[item.get("id")] = {
"text": item.get("value", {}).get("text", ""),
"type": item.get("value", {}).get("labels", [""])[0],
"start": item.get("value", {}).get("start", 0),
"end": item.get("value", {}).get("end", 0)
}
# Second pass: extract relations
for item in result:
if item.get("type") == "relation":
from_id = item.get("from_id")
to_id = item.get("to_id")
relation_type = item.get("labels", ["RELATED_TO"])[0]
if from_id in entities_by_id and to_id in entities_by_id:
relations.append({
"hadith_id": task.get("data", {}).get("hadith_id"),
"source": entities_by_id[from_id],
"target": entities_by_id[to_id],
"relation": relation_type
})
return relations
# ============================================================================
# CLI Interface
# ============================================================================
async def main():
"""Main CLI interface."""
import argparse
parser = argparse.ArgumentParser(description="Label Studio Client")
subparsers = parser.add_subparsers(dest="command", help="Command")
# List projects
list_parser = subparsers.add_parser("list", help="List all projects")
# Create project
create_parser = subparsers.add_parser("create", help="Create a project")
create_parser.add_argument("--title", required=True)
create_parser.add_argument("--config", required=True, help="Path to label config XML")
create_parser.add_argument("--description", default="")
# Import tasks
import_parser = subparsers.add_parser("import", help="Import tasks")
import_parser.add_argument("--project", type=int, required=True)
import_parser.add_argument("--file", required=True, help="Path to tasks JSON")
# Export annotations
export_parser = subparsers.add_parser("export", help="Export annotations")
export_parser.add_argument("--project", type=int, required=True)
export_parser.add_argument("--output", required=True)
export_parser.add_argument("--format", default="JSON", choices=["JSON", "CSV", "CONLL"])
# Convert annotations
convert_parser = subparsers.add_parser("convert", help="Convert annotations")
convert_parser.add_argument("--input", required=True)
convert_parser.add_argument("--output", required=True)
convert_parser.add_argument("--format", choices=["huggingface", "spacy", "relations"])
# Stats
stats_parser = subparsers.add_parser("stats", help="Get project statistics")
stats_parser.add_argument("--project", type=int, required=True)
args = parser.parse_args()
if not args.command:
parser.print_help()
return
client = LabelStudioClient()
if args.command == "list":
projects = await client.list_projects()
table = Table(title="Label Studio Projects")
table.add_column("ID", style="cyan")
table.add_column("Title")
table.add_column("Tasks", justify="right")
table.add_column("Annotations", justify="right")
for p in projects:
table.add_row(
str(p.id),
p.title,
str(p.task_count),
str(p.annotation_count)
)
console.print(table)
elif args.command == "create":
with open(args.config, 'r') as f:
label_config = f.read()
project = await client.create_project(
title=args.title,
description=args.description,
label_config=label_config
)
console.print(f"[green]Created project: {project['id']} - {project['title']}[/green]")
elif args.command == "import":
result = await client.import_tasks_from_file(args.project, args.file)
console.print(f"[green]Imported tasks to project {args.project}[/green]")
elif args.command == "export":
path = await client.export_annotations_to_file(
args.project,
args.output,
args.format
)
console.print(f"[green]Exported annotations to {path}[/green]")
elif args.command == "convert":
with open(args.input, 'r', encoding='utf-8') as f:
annotations = json.load(f)
if args.format == "huggingface":
converted = convert_label_studio_to_huggingface(annotations)
elif args.format == "spacy":
converted = convert_label_studio_to_spacy(annotations)
elif args.format == "relations":
converted = convert_relations_to_graph(annotations)
with open(args.output, 'w', encoding='utf-8') as f:
json.dump(converted, f, ensure_ascii=False, indent=2)
console.print(f"[green]Converted {len(converted)} items to {args.output}[/green]")
elif args.command == "stats":
stats = await client.get_project_stats(args.project)
console.print(f"\n[bold]Project {args.project} Statistics:[/bold]")
console.print(f" Total tasks: {stats.total_tasks}")
console.print(f" Annotated: {stats.annotated_tasks}")
console.print(f" Total annotations: {stats.total_annotations}")
if stats.agreement_score:
console.print(f" Agreement: {stats.agreement_score:.2%}")
if __name__ == "__main__":
asyncio.run(main())
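
To see what the BIO converter produces without touching the API, a small self-contained sketch can be used (it assumes the file above is saved as `label_studio_client.py`, as `run_step7.sh` expects, and that `label-studio-sdk` is installed since the module imports it at load time):

```python
# Sketch: feed one hand-made "exported task" through the BIO converter and
# print the resulting token / ner_tag pairs.
from label_studio_client import convert_label_studio_to_huggingface

sample_export = [{
    "id": 1,
    "data": {"hadith_id": 42, "english_text": "Narrated Abu Huraira in Medina"},
    "annotations": [{
        "result": [
            # character span 9-20 covers "Abu Huraira"
            {"type": "labels", "value": {"start": 9, "end": 20, "labels": ["PERSON"]}},
            # character span 24-30 covers "Medina"
            {"type": "labels", "value": {"start": 24, "end": 30, "labels": ["PLACE"]}},
        ]
    }],
}]

for row in convert_label_studio_to_huggingface(sample_export):
    print(list(zip(row["tokens"], row["ner_tags"])))
```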

View File

@ -0,0 +1,24 @@
# Step 7: Annotation Setup with Label Studio
# Requirements for hadith-phase3-step7
# Database
psycopg2-binary>=2.9.9
# HTTP client
httpx>=0.27.0
# Rich console output
rich>=13.7.0
# Data handling
numpy>=1.24.0
pandas>=2.0.0
# JSON handling
orjson>=3.9.0
# Date handling
python-dateutil>=2.8.2
# Label Studio SDK (required by label_studio_client.py; the LabelStudio client class ships with the 1.x SDK)
label-studio-sdk>=1.0.0

View File

@ -0,0 +1,203 @@
#!/bin/bash
# ============================================================================
# Step 7: Annotation Setup Runner
# ============================================================================
# Usage: ./run_step7.sh [setup|export|active|client|help]
# ============================================================================
set -e
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Configuration
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-hadith_ingest}"
export LABEL_STUDIO_URL="${LABEL_STUDIO_URL:-https://label.betelgeusebytes.io}"
export LABEL_STUDIO_API_KEY="${LABEL_STUDIO_API_KEY:-eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6ODA3MTUyMjgzMSwiaWF0IjoxNzY0MzIyODMxLCJqdGkiOiJhYWVkMjNjODdmODc0MmY2OWJmMmFjZDc5YTVjMzMyMiIsInVzZXJfaWQiOjF9.4B_ZAPL6TmIcA6-zcKJ8JDRI3FsikX3HgTK3bbmK0mk}"
export QDRANT_HOST="${QDRANT_HOST:-https://vector.betelgeusebytes.io}"
export QDRANT_PORT="${QDRANT_PORT:-443}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"
# Check password
check_password() {
if [ -z "$POSTGRES_PASSWORD" ]; then
echo -e "${RED}Error: POSTGRES_PASSWORD not set${NC}"
echo "Set it with: export POSTGRES_PASSWORD='your_password'"
exit 1
fi
}
# Install dependencies
install_deps() {
echo -e "${BLUE}Installing dependencies...${NC}"
pip install -q -r requirements.txt
echo -e "${GREEN}Dependencies installed.${NC}"
}
# Run annotation setup
run_setup() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Annotation Setup...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
# Parse arguments
NER_COUNT="${1:-500}"
RELATION_COUNT="${2:-300}"
EXPORT_ONLY="${3:-}"
if [ "$EXPORT_ONLY" == "--export-only" ]; then
python annotation_setup.py \
--ner-count "$NER_COUNT" \
--relation-count "$RELATION_COUNT" \
--export-only
else
python annotation_setup.py \
--ner-count "$NER_COUNT" \
--relation-count "$RELATION_COUNT"
fi
echo -e "\n${GREEN}✓ Annotation setup complete!${NC}"
echo -e "\nOutput files in ./annotation_data/:"
ls -la annotation_data/ 2>/dev/null || echo " (directory will be created on first run)"
}
# Run export only
run_export() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Exporting Hadiths for Annotation...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
NER_COUNT="${1:-500}"
RELATION_COUNT="${2:-300}"
python annotation_setup.py \
--ner-count "$NER_COUNT" \
--relation-count "$RELATION_COUNT" \
--export-only
echo -e "\n${GREEN}✓ Export complete!${NC}"
}
# Run active learning sampler
run_active() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Active Learning Sampler...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
STRATEGY="${1:-hybrid}"
COUNT="${2:-50}"
OUTPUT="${3:-active_learning_samples.json}"
python active_learning.py \
--strategy "$STRATEGY" \
--count "$COUNT" \
--output "$OUTPUT"
echo -e "\n${GREEN}✓ Active learning sampling complete!${NC}"
}
# Run Label Studio client commands
run_client() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Label Studio Client${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
# Pass all arguments to the client
python label_studio_client.py "$@"
}
# Run SQL export
run_sql() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running SQL Export Queries...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
PGPASSWORD="$POSTGRES_PASSWORD" psql \
-h "$POSTGRES_HOST" \
-p "$POSTGRES_PORT" \
-U "$POSTGRES_USER" \
-d "$POSTGRES_DB" \
-f export_queries.sql
}
# Show usage
show_usage() {
echo "Step 7: Annotation Setup with Label Studio"
echo ""
echo "Usage: $0 [command] [options]"
echo ""
echo "Commands:"
echo " setup [ner_count] [rel_count] [--export-only]"
echo " Run full annotation setup"
echo " export [ner_count] [rel_count]"
echo " Export hadiths for annotation only"
echo " active [strategy] [count] [output]"
echo " Run active learning sampler"
echo " Strategies: diversity, representative, chain_complexity, random, hybrid"
echo " client [args...] Run Label Studio client commands"
echo " sql Run SQL export queries"
echo " install Install Python dependencies"
echo " help Show this help message"
echo ""
echo "Examples:"
echo " $0 setup 500 300 --export-only # Export 500 NER + 300 relation samples"
echo " $0 active hybrid 100 # Get 100 samples using hybrid strategy"
echo " $0 client list # List Label Studio projects"
echo " $0 client export --project 1 --output ann.json"
echo ""
echo "Environment variables:"
echo " POSTGRES_PASSWORD Database password (required)"
echo " LABEL_STUDIO_API_KEY Label Studio API key (for project creation)"
}
# Main
case "${1:-help}" in
setup)
check_password
install_deps
shift
run_setup "$@"
;;
export)
check_password
install_deps
shift
run_export "$@"
;;
active)
check_password
install_deps
shift
run_active "$@"
;;
client)
install_deps
shift
run_client "$@"
;;
sql)
check_password
run_sql
;;
install)
install_deps
;;
help|--help|-h)
show_usage
;;
*)
echo -e "${RED}Unknown command: $1${NC}"
show_usage
exit 1
;;
esac

View File

@ -0,0 +1,25 @@
import httpx
from qdrant_client import QdrantClient
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Generate query embedding
query = "الصلاة"
response = httpx.post(
"https://embeddings.betelgeusebytes.io/embed",
json={"inputs": [query]},
verify=False,
)
query_vector = response.json()[0]
# Search using internal Qdrant service
# qdrant = QdrantClient(url="http://qdrant.vector.svc.cluster.local:6333")
qdrant = QdrantClient(url="https://vector.betelgeusebytes.io:443/")
results = qdrant.query_points(
collection_name="hadith_embeddings",
query=query_vector,
limit=5,
)
for i, r in enumerate(results.points, 1):
print(f"{i}. Hadith {r.id} (score: {r.score:.4f})")

View File

@ -0,0 +1,37 @@
# ============================================================================
# Step 6: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================
# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=YOUR_PASSWORD_HERE
# Qdrant Configuration (internal cluster access)
export QDRANT_HOST=qdrant.vector.svc.cluster.local
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings
# Qdrant Configuration (external access - uncomment if needed)
# export QDRANT_HOST=qdrant.betelgeusebytes.io
# export QDRANT_PORT=443
# TEI Configuration (internal cluster access)
export TEI_HOST=tei.ml.svc.cluster.local
export TEI_PORT=80
# TEI Configuration (external access - uncomment if needed)
# export TEI_HOST=tei.betelgeusebytes.io
# export TEI_PORT=443
# vLLM Configuration (for later steps)
export VLLM_HOST=vllm.ml.svc.cluster.local
export VLLM_PORT=8000
# MLflow Configuration
export MLFLOW_TRACKING_URI=https://mlflow.betelgeusebytes.io
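
After `source .env`, a short connectivity check along these lines can confirm the values before running the verification scripts (a sketch, not part of the repo; it assumes the internal service names resolve from where it runs, otherwise switch to the external hosts commented above):

```python
# Sketch: verify PostgreSQL and TEI are reachable with the sourced .env values.
import os

import httpx
import psycopg2

conn = psycopg2.connect(
    host=os.environ["POSTGRES_HOST"],
    port=int(os.environ["POSTGRES_PORT"]),
    dbname=os.environ["POSTGRES_DB"],
    user=os.environ["POSTGRES_USER"],
    password=os.environ["POSTGRES_PASSWORD"],
)
with conn.cursor() as cur:
    cur.execute("SELECT COUNT(*) FROM hadiths")
    print("hadiths:", cur.fetchone()[0])
conn.close()

tei_url = f"http://{os.environ['TEI_HOST']}:{os.environ['TEI_PORT']}"
vector = httpx.post(f"{tei_url}/embed", json={"inputs": ["test"]}).json()[0]
print("TEI embedding dimension:", len(vector))
```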

View File

@ -0,0 +1,48 @@
# ============================================================================
# Hadith Semantic Search API - Dockerfile
# ============================================================================
# Build: docker build -t hadith-search-api:latest .
# Run: docker run -p 8080:8080 --env-file .env hadith-search-api:latest
# ============================================================================
FROM python:3.11-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1
# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
# Create non-root user
RUN useradd --create-home --shell /bin/bash appuser
# Set work directory
WORKDIR /app
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY search_api.py .
# Change ownership
RUN chown -R appuser:appuser /app
# Switch to non-root user
USER appuser
# Expose port
EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import httpx; httpx.get('http://localhost:8080/health', timeout=5)"
# Run the application
CMD ["python", "-m", "uvicorn", "search_api:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "2"]

View File

@ -0,0 +1,407 @@
# Step 6: Verify Embeddings & Test Semantic Search
## 📋 Overview
This step validates that all ~40,000 hadiths have been properly embedded and stored in Qdrant, then builds and benchmarks a semantic search system.
**Target Performance:** <500ms per query
---
## 📁 Files Included
| File | Description |
|------|-------------|
| `verify_embeddings.py` | Validates all hadiths have embeddings in Qdrant |
| `semantic_search.py` | Tests semantic search with benchmarking |
| `search_api.py` | Production-ready FastAPI search service |
| `verification_queries.sql` | SQL queries for database verification |
| `k8s-search-api.yaml` | Kubernetes deployment manifests |
| `Dockerfile` | Container image for search API |
| `requirements.txt` | Python dependencies |
| `run_tests.sh` | Quick test runner script |
---
## 🔧 Prerequisites
1. **Python 3.10+** with pip
2. **Access to services:**
- PostgreSQL at `pg.betelgeusebytes.io:5432`
- Qdrant at `qdrant.vector.svc.cluster.local:6333`
- TEI at `tei.ml.svc.cluster.local:80`
3. **Environment variables:**
```bash
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=your_password
export QDRANT_HOST=qdrant.vector.svc.cluster.local
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings
export TEI_HOST=tei.ml.svc.cluster.local
export TEI_PORT=80
```
---
## 🚀 Quick Start
### 1. Install Dependencies
```bash
pip install -r requirements.txt
```
### 2. Run Embedding Verification
```bash
python verify_embeddings.py
```
**Expected Output:**
```
┌───────────────────────────────────────────┐
│ Step 6.1 - Hadith Embeddings Verification │
└───────────────────────────────────────────┘
1. Checking PostgreSQL database...
Total hadiths: 40,123
Marked as embedded: 40,123
2. Collection breakdown:
┌────────────────┬────────┬──────────┐
│ Collection │ Total │ Embedded │
├────────────────┼────────┼──────────┤
│ Sahih Bukhari │ 7,563 │ 7,563 │
│ Sahih Muslim │ 7,453 │ 7,453 │
│ ... │ ... │ ... │
└────────────────┴────────┴──────────┘
✓ ALL EMBEDDINGS VERIFIED!
```
### 3. Run Semantic Search Benchmark
```bash
# Full benchmark
python semantic_search.py --mode benchmark
# Interactive mode
python semantic_search.py --mode interactive
# Single query
python semantic_search.py --query "الصلاة في المسجد"
# Demo mode
python semantic_search.py --mode demo
```
**Expected Benchmark Output:**
```
═══════════════════════════════════════════════════════
BENCHMARK RESULTS
═══════════════════════════════════════════════════════
Query Statistics:
Total queries: 22
Successful: 22
Failed: 0
Timing Statistics:
Average embedding time: 45.3ms
Average search time: 12.8ms
Average total time: 58.1ms
Percentiles:
P50: 55.2ms
P95: 89.4ms
P99: 112.3ms
Performance Target (<500ms):
Queries meeting target: 22/22 (100.0%)
Status: ✓ TARGET MET
```
---
## 📊 Verification Queries (SQL)
Run these directly against PostgreSQL:
```bash
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -f verification_queries.sql
```
Or run individual queries:
```sql
-- Quick health check
SELECT
'Database Health Check' AS check_type,
(SELECT COUNT(*) FROM hadiths) AS total_hadiths,
(SELECT COUNT(*) FROM collections) AS total_collections,
(SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count;
-- Find missing embeddings
SELECT id, collection_id, hadith_number
FROM hadiths
WHERE NOT embedding_generated
LIMIT 10;
```
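If you prefer to run the health check from Python rather than `psql`, a minimal sketch (assuming the same `POSTGRES_*` environment variables from the Prerequisites) looks like this; the query mirrors the one above:
```python
# Minimal sketch: run the database health check from Python.
import os
import psycopg2

conn = psycopg2.connect(
    host=os.environ["POSTGRES_HOST"],
    port=int(os.getenv("POSTGRES_PORT", "5432")),
    dbname=os.environ["POSTGRES_DB"],
    user=os.environ["POSTGRES_USER"],
    password=os.environ["POSTGRES_PASSWORD"],
)
with conn, conn.cursor() as cur:
    cur.execute("""
        SELECT
            (SELECT COUNT(*) FROM hadiths) AS total_hadiths,
            (SELECT COUNT(*) FROM collections) AS total_collections,
            (SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END)
               FROM hadiths) AS embedded_count
    """)
    total, collections, embedded = cur.fetchone()
    print(f"hadiths={total}  collections={collections}  embedded={embedded}")
conn.close()
```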
---
## 🔍 Qdrant Verification
```bash
# Check collection exists
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings
# Count points
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/count \
-H "Content-Type: application/json" \
-d '{"exact": true}'
# Sample search (replace the placeholder vector with a full 1024-dim embedding)
curl -X POST http://qdrant.betelgeusebytes.io/collections/hadith_embeddings/points/search \
-H "Content-Type: application/json" \
-d '{
"vector": [0.1, 0.2, ...], # 1024-dim vector
"limit": 5,
"with_payload": true
}'
```
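The sample search above needs a real 1024-dimensional vector. A minimal sketch that fetches one from TEI and feeds it to the Qdrant search endpoint (same hosts and collection as elsewhere in this guide; the `hadith_id` payload key follows the convention used in `search_api.py`):
```python
# Minimal sketch: embed a query with TEI, then search Qdrant with the result.
import httpx

TEI_URL = "http://tei.ml.svc.cluster.local:80/embed"
QDRANT_URL = ("http://qdrant.vector.svc.cluster.local:6333"
              "/collections/hadith_embeddings/points/search")

query = "الصلاة في المسجد"

# TEI returns a list of embeddings (one per input); take the first.
vector = httpx.post(TEI_URL, json={"inputs": query}, timeout=30).json()[0]
assert len(vector) == 1024, f"unexpected dimension: {len(vector)}"

hits = httpx.post(
    QDRANT_URL,
    json={"vector": vector, "limit": 5, "with_payload": True},
    timeout=30,
).json()["result"]

for hit in hits:
    print(f"{hit['score']:.3f}  hadith_id={hit['payload'].get('hadith_id')}")
```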
---
## 🌐 Deploy Search API
### Option 1: Run Locally
```bash
python search_api.py
# API available at http://localhost:8080
```
### Option 2: Deploy to Kubernetes
```bash
# Create namespace
kubectl create namespace hadith
# Create ConfigMap with API code
kubectl create configmap search-api-code \
--from-file=search_api.py \
-n hadith
# Update secrets in k8s-search-api.yaml with your password
# Deploy
kubectl apply -f k8s-search-api.yaml
# Check status
kubectl -n hadith get pods
kubectl -n hadith logs -f deployment/search-api
```
### Option 3: Build Docker Image
```bash
# Build image
docker build -t hadith-search-api:latest .
# Run locally
docker run -p 8080:8080 \
-e POSTGRES_PASSWORD=your_password \
-e POSTGRES_HOST=pg.betelgeusebytes.io \
-e QDRANT_HOST=qdrant.betelgeusebytes.io \
-e TEI_HOST=tei.betelgeusebytes.io \
hadith-search-api:latest
```
---
## 🔌 API Endpoints
| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/health` | Health check |
| GET | `/stats` | Database statistics |
| POST | `/search` | Semantic search |
| GET | `/search?q=query` | Simple search |
| GET | `/hadith/{id}` | Get hadith by ID |
| GET | `/similar/{id}` | Find similar hadiths |
### Example API Calls
```bash
# Health check
curl https://search.betelgeusebytes.io/health
# Get stats
curl https://search.betelgeusebytes.io/stats
# Search (GET)
curl "https://search.betelgeusebytes.io/search?q=five%20daily%20prayers&limit=5"
# Search (POST)
curl -X POST https://search.betelgeusebytes.io/search \
-H "Content-Type: application/json" \
-d '{
"query": "الصلاة في المسجد الحرام",
"limit": 10,
"min_score": 0.5
}'
# Get specific hadith
curl https://search.betelgeusebytes.io/hadith/1234
# Find similar hadiths
curl "https://search.betelgeusebytes.io/similar/1234?limit=5"
```
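From application code, the same `POST /search` call can be made with a few lines of `httpx`. This is only a sketch against the endpoints listed above; the response fields follow the `SearchResponse` model in `search_api.py`:
```python
# Minimal sketch: call the deployed search API from Python.
import httpx

resp = httpx.post(
    "https://search.betelgeusebytes.io/search",
    json={"query": "five daily prayers", "limit": 5, "min_score": 0.5},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()

print(f"{data['total_results']} results in {data['total_time_ms']:.0f}ms")
for r in data["results"]:
    print(f"  {r['score']:.3f}  {r['collection']} #{r['hadith_number']}")
```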
---
## 📈 Sample Search Queries
### Arabic Queries
| Query | Description |
|-------|-------------|
| الصلاة في المسجد الحرام | Prayer in the Sacred Mosque |
| أبو هريرة رضي الله عنه | Abu Hurairah (RA) |
| الصيام في شهر رمضان | Fasting in Ramadan |
| الزكاة والصدقة | Zakat and charity |
| الحج والعمرة | Hajj and Umrah |
| الوضوء والطهارة | Ablution and purification |
| بر الوالدين | Honoring parents |
| الجنة والنار | Paradise and Hell |
### English Queries
| Query | Description |
|-------|-------------|
| five daily prayers | The five obligatory prayers |
| treatment of neighbors | Rights of neighbors |
| patience during hardship | Patience in trials |
| marriage and family | Islamic marriage guidance |
| honesty and truthfulness | Being truthful |
| Day of Judgment signs | Signs of the Last Day |
| companions of the Prophet | Sahaba and their virtues |
| seeking knowledge in Islam | Importance of knowledge |
---
## 🐛 Troubleshooting
### 1. "Connection refused" to services
**Problem:** Cannot connect to PostgreSQL/Qdrant/TEI
**Solution:**
```bash
# Check if running inside cluster
kubectl -n ml get pods
kubectl -n vector get pods
# For external access, use port-forward
kubectl port-forward -n vector svc/qdrant 6333:6333
kubectl port-forward -n ml svc/tei 8080:80
```
### 2. "Missing embeddings found"
**Problem:** Some hadiths don't have embeddings
**Solution:**
```bash
# Re-run embedding workflow for missing IDs
argo submit -n ml embedding-workflow.yaml \
--parameter hadith-ids="[1,2,3,...]"
# Or reset the embedding flag so the next workflow run re-processes these hadiths
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -c \
"UPDATE hadiths SET embedding_generated = false WHERE id IN (1,2,3)"
```
### 3. Slow search performance (>500ms)
**Problem:** Queries taking too long
**Solutions:**
1. Check TEI service health:
```bash
curl http://tei.ml.svc.cluster.local/health
```
2. Check Qdrant indexing:
```bash
curl http://qdrant.betelgeusebytes.io/collections/hadith_embeddings | jq '.result.status'
```
3. Make sure the HNSW index is built (lower the indexing threshold if needed):
```bash
curl -X PATCH http://qdrant.betelgeusebytes.io/collections/hadith_embeddings \
-H "Content-Type: application/json" \
-d '{"optimizers_config": {"indexing_threshold": 10000}}'
```
### 4. "Embedding dimension mismatch"
**Problem:** Qdrant rejects embeddings
**Solution:** Verify BGE-M3 produces 1024-dim vectors:
```bash
curl -X POST http://tei.ml.svc.cluster.local/embed \
-H "Content-Type: application/json" \
  -d '{"inputs": "test"}' | jq '.[0] | length'   # should print 1024
```
---
## ✅ Verification Checklist
Before proceeding to Step 7, ensure:
- [ ] `verify_embeddings.py` shows 0 missing embeddings
- [ ] All 8 collections have 100% embedding coverage
- [ ] Benchmark shows P95 < 500ms
- [ ] At least 95% of queries meet the target
- [ ] API endpoints respond correctly
- [ ] Sample Arabic and English queries return relevant results
- [ ] Results are enriched with full hadith data from PostgreSQL
---
## 📚 Next Steps
Once Step 6 is verified:
1. **Step 7:** Annotation Setup with Label Studio
- Export 500 random hadiths for annotation
- Configure NER labeling project
- Create annotation guidelines
2. **Step 8:** NER Model Training
- Use annotated data to train entity extraction model
- Target F1 > 0.85
---
## 📝 Output Files
After running the scripts:
- `verification_results.json` - Embedding verification report
- `benchmark_results.json` - Performance benchmark results
These can be stored in MLflow for tracking:
```bash
mlflow experiments create -n "step6-verification"
mlflow runs create -e step6-verification --run-name "embedding-verification"
```
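Alternatively, a minimal Python sketch using the MLflow client (metric names match the keys in `benchmark_results.json`):
```python
# Minimal sketch: log the verification and benchmark reports to MLflow.
import json
import mlflow

mlflow.set_tracking_uri("https://mlflow.betelgeusebytes.io")
mlflow.set_experiment("step6-verification")

with mlflow.start_run(run_name="embedding-verification"):
    with open("benchmark_results.json") as f:
        bench = json.load(f)
    # Key latency numbers as metrics, full reports as artifacts.
    mlflow.log_metric("p95_time_ms", bench["p95_time_ms"])
    mlflow.log_metric("avg_total_time_ms", bench["avg_total_time_ms"])
    mlflow.log_artifact("verification_results.json")
    mlflow.log_artifact("benchmark_results.json")
```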

View File

@ -0,0 +1,260 @@
{
"total_queries": 22,
"successful_queries": 22,
"failed_queries": 0,
"avg_embedding_time_ms": 124.84432727135506,
"avg_search_time_ms": 22.30660000085746,
"avg_total_time_ms": 147.15092727221253,
"p50_time_ms": 146.22399999643676,
"p95_time_ms": 163.0423000169685,
"p99_time_ms": 172.9122999822721,
"min_time_ms": 130.5485999910161,
"max_time_ms": 172.9122999822721,
"queries_meeting_target": 22,
"target_ms": 500,
"query_results": [
{
"query": "\u0627\u0644\u0635\u0644\u0627\u0629 \u0641\u064a \u0627\u0644\u0645\u0633\u062c\u062f \u0627\u0644\u062d\u0631\u0627\u0645",
"language": "arabic",
"description": "Prayer in the Sacred Mosque",
"embedding_time_ms": 137.83629999670666,
"search_time_ms": 21.302799999830313,
"total_time_ms": 159.13909999653697,
"results_count": 10,
"top_score": 0.7661493,
"meets_target": true
},
{
"query": "\u0623\u0628\u0648 \u0647\u0631\u064a\u0631\u0629 \u0631\u0636\u064a \u0627\u0644\u0644\u0647 \u0639\u0646\u0647",
"language": "arabic",
"description": "Abu Hurairah (RA)",
"embedding_time_ms": 142.00750000600237,
"search_time_ms": 21.034800010966137,
"total_time_ms": 163.0423000169685,
"results_count": 10,
"top_score": 0.79470885,
"meets_target": true
},
{
"query": "\u0627\u0644\u0635\u064a\u0627\u0645 \u0641\u064a \u0634\u0647\u0631 \u0631\u0645\u0636\u0627\u0646",
"language": "arabic",
"description": "Fasting in Ramadan",
"embedding_time_ms": 141.40879998740274,
"search_time_ms": 21.46490001177881,
"total_time_ms": 162.87369999918155,
"results_count": 10,
"top_score": 0.81152785,
"meets_target": true
},
{
"query": "\u0627\u0644\u0632\u0643\u0627\u0629 \u0648\u0627\u0644\u0635\u062f\u0642\u0629",
"language": "arabic",
"description": "Zakat and charity",
"embedding_time_ms": 125.70800000685267,
"search_time_ms": 21.658099998603575,
"total_time_ms": 147.36610000545625,
"results_count": 10,
"top_score": 0.73705375,
"meets_target": true
},
{
"query": "\u0627\u0644\u062d\u062c \u0648\u0627\u0644\u0639\u0645\u0631\u0629",
"language": "arabic",
"description": "Hajj and Umrah",
"embedding_time_ms": 128.12189999385737,
"search_time_ms": 21.92250000371132,
"total_time_ms": 150.0443999975687,
"results_count": 10,
"top_score": 0.75435185,
"meets_target": true
},
{
"query": "\u0627\u0644\u0646\u0628\u064a \u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645 \u0641\u064a \u0627\u0644\u0645\u062f\u064a\u0646\u0629",
"language": "arabic",
"description": "Prophet (PBUH) in Medina",
"embedding_time_ms": 151.73289999074768,
"search_time_ms": 21.179399991524406,
"total_time_ms": 172.9122999822721,
"results_count": 10,
"top_score": 0.75164807,
"meets_target": true
},
{
"query": "\u0627\u0644\u0648\u0636\u0648\u0621 \u0648\u0627\u0644\u0637\u0647\u0627\u0631\u0629",
"language": "arabic",
"description": "Ablution and purification",
"embedding_time_ms": 131.21989999490324,
"search_time_ms": 21.02040000318084,
"total_time_ms": 152.24029999808408,
"results_count": 10,
"top_score": 0.6073998,
"meets_target": true
},
{
"query": "\u0628\u0631 \u0627\u0644\u0648\u0627\u0644\u062f\u064a\u0646",
"language": "arabic",
"description": "Honoring parents",
"embedding_time_ms": 120.76360000355635,
"search_time_ms": 22.190200004843064,
"total_time_ms": 142.9538000083994,
"results_count": 10,
"top_score": 0.7476402,
"meets_target": true
},
{
"query": "\u0627\u0644\u062c\u0646\u0629 \u0648\u0627\u0644\u0646\u0627\u0631",
"language": "arabic",
"description": "Paradise and Hell",
"embedding_time_ms": 124.25219999568071,
"search_time_ms": 23.127499996917322,
"total_time_ms": 147.37969999259803,
"results_count": 10,
"top_score": 0.7781049,
"meets_target": true
},
{
"query": "\u0627\u0644\u0625\u064a\u0645\u0627\u0646 \u0648\u0627\u0644\u0625\u0633\u0644\u0627\u0645",
"language": "arabic",
"description": "Faith and Islam",
"embedding_time_ms": 127.87359999492764,
"search_time_ms": 21.657500008586794,
"total_time_ms": 149.53110000351444,
"results_count": 10,
"top_score": 0.7572472,
"meets_target": true
},
{
"query": "five daily prayers",
"language": "english",
"description": "The five obligatory prayers",
"embedding_time_ms": 109.2108000011649,
"search_time_ms": 21.33779998985119,
"total_time_ms": 130.5485999910161,
"results_count": 10,
"top_score": 0.759544,
"meets_target": true
},
{
"query": "Prophet Muhammad in Mecca",
"language": "english",
"description": "Prophet's life in Mecca",
"embedding_time_ms": 113.58699999982491,
"search_time_ms": 23.10490001400467,
"total_time_ms": 136.69190001382958,
"results_count": 10,
"top_score": 0.80261445,
"meets_target": true
},
{
"query": "treatment of neighbors",
"language": "english",
"description": "Rights and treatment of neighbors",
"embedding_time_ms": 115.10320000525098,
"search_time_ms": 21.576900006039068,
"total_time_ms": 136.68010001129005,
"results_count": 10,
"top_score": 0.61891544,
"meets_target": true
},
{
"query": "patience during hardship",
"language": "english",
"description": "Patience in difficult times",
"embedding_time_ms": 122.27100000018254,
"search_time_ms": 22.26110000628978,
"total_time_ms": 144.53210000647232,
"results_count": 10,
"top_score": 0.697459,
"meets_target": true
},
{
"query": "marriage and family",
"language": "english",
"description": "Islamic marriage guidance",
"embedding_time_ms": 116.44650000380352,
"search_time_ms": 21.11739999963902,
"total_time_ms": 137.56390000344254,
"results_count": 10,
"top_score": 0.6414789,
"meets_target": true
},
{
"query": "honesty and truthfulness",
"language": "english",
"description": "Importance of being truthful",
"embedding_time_ms": 119.8251000023447,
"search_time_ms": 22.77229999890551,
"total_time_ms": 142.5974000012502,
"results_count": 10,
"top_score": 0.64781964,
"meets_target": true
},
{
"query": "Day of Judgment signs",
"language": "english",
"description": "Signs of the Last Day",
"embedding_time_ms": 112.60979999497067,
"search_time_ms": 22.185800000443123,
"total_time_ms": 134.7955999954138,
"results_count": 10,
"top_score": 0.71163684,
"meets_target": true
},
{
"query": "charity and helping poor",
"language": "english",
"description": "Giving charity to the needy",
"embedding_time_ms": 120.4487000068184,
"search_time_ms": 22.555499992449768,
"total_time_ms": 143.00419999926817,
"results_count": 10,
"top_score": 0.72138125,
"meets_target": true
},
{
"query": "companions of the Prophet",
"language": "english",
"description": "Sahaba and their virtues",
"embedding_time_ms": 112.31199999747332,
"search_time_ms": 23.779299997841008,
"total_time_ms": 136.09129999531433,
"results_count": 10,
"top_score": 0.7868167,
"meets_target": true
},
{
"query": "seeking knowledge in Islam",
"language": "english",
"description": "Importance of knowledge",
"embedding_time_ms": 119.9167999875499,
"search_time_ms": 25.41219998965971,
"total_time_ms": 145.3289999772096,
"results_count": 10,
"top_score": 0.76270455,
"meets_target": true
},
{
"query": "\u0642\u0627\u0644 \u0631\u0633\u0648\u0644 \u0627\u0644\u0644\u0647 about kindness",
"language": "mixed",
"description": "Prophet's sayings about kindness (mixed)",
"embedding_time_ms": 134.37929999781772,
"search_time_ms": 21.40019999933429,
"total_time_ms": 155.77949999715202,
"results_count": 10,
"top_score": 0.8195741,
"meets_target": true
},
{
"query": "women rights \u0627\u0644\u0625\u0633\u0644\u0627\u0645",
"language": "mixed",
"description": "Women's rights in Islam (mixed)",
"embedding_time_ms": 119.54030000197235,
"search_time_ms": 26.683699994464405,
"total_time_ms": 146.22399999643676,
"results_count": 10,
"top_score": 0.72458637,
"meets_target": true
}
],
"timestamp": "2025-11-28T10:05:31.314855"
}

View File

@ -0,0 +1,37 @@
# ============================================================================
# Step 6: Environment Configuration
# ============================================================================
# Copy this file to .env and update with your values
# Usage: source .env
# ============================================================================
# PostgreSQL Configuration
export POSTGRES_HOST=pg.betelgeusebytes.io
export POSTGRES_PORT=5432
export POSTGRES_DB=hadith_db
export POSTGRES_USER=hadith_ingest
export POSTGRES_PASSWORD=YOUR_PASSWORD_HERE
# Qdrant Configuration (internal cluster access)
export QDRANT_HOST=qdrant.vector.svc.cluster.local
export QDRANT_PORT=6333
export QDRANT_COLLECTION=hadith_embeddings
# Qdrant Configuration (external access - uncomment if needed)
# export QDRANT_HOST=qdrant.betelgeusebytes.io
# export QDRANT_PORT=443
# TEI Configuration (internal cluster access)
export TEI_HOST=tei.ml.svc.cluster.local
export TEI_PORT=80
# TEI Configuration (external access - uncomment if needed)
# export TEI_HOST=tei.betelgeusebytes.io
# export TEI_PORT=443
# vLLM Configuration (for later steps)
export VLLM_HOST=vllm.ml.svc.cluster.local
export VLLM_PORT=8000
# MLflow Configuration
export MLFLOW_TRACKING_URI=https://mlflow.betelgeusebytes.io

View File

@ -0,0 +1,183 @@
# ============================================================================
# Step 6: Semantic Search API - Kubernetes Deployment
# ============================================================================
# Deploy: kubectl apply -f k8s-search-api.yaml
# ============================================================================
---
# Namespace (if not exists)
apiVersion: v1
kind: Namespace
metadata:
name: hadith
---
# ConfigMap for non-sensitive configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: search-api-config
namespace: hadith
data:
POSTGRES_HOST: "postgres.db.svc.cluster.local"
POSTGRES_PORT: "5432"
POSTGRES_DB: "hadith_db"
POSTGRES_USER: "hadith_ingest"
QDRANT_HOST: "qdrant.vector.svc.cluster.local"
QDRANT_PORT: "6333"
QDRANT_COLLECTION: "hadith_embeddings"
TEI_HOST: "tei.ml.svc.cluster.local"
TEI_PORT: "80"
---
# Secret for database password
apiVersion: v1
kind: Secret
metadata:
name: search-api-secrets
namespace: hadith
type: Opaque
stringData:
POSTGRES_PASSWORD: "CHANGE_ME_TO_YOUR_PASSWORD"
---
# Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: search-api
namespace: hadith
labels:
app: search-api
spec:
replicas: 2
selector:
matchLabels:
app: search-api
template:
metadata:
labels:
app: search-api
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
nodeSelector:
node: hetzner-2
containers:
- name: search-api
image: python:3.11-slim
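        # Note: dependencies are installed at container start (see command below);
        # for faster, reproducible starts, consider building the provided Dockerfile instead.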
command:
- /bin/bash
- -c
- |
pip install --no-cache-dir \
fastapi uvicorn httpx psycopg2-binary pydantic && \
python /app/search_api.py
ports:
- containerPort: 8080
name: http
envFrom:
- configMapRef:
name: search-api-config
- secretRef:
name: search-api-secrets
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 3
volumes:
- name: app-code
configMap:
name: search-api-code
---
# Service
apiVersion: v1
kind: Service
metadata:
name: search-api
namespace: hadith
spec:
selector:
app: search-api
ports:
- name: http
port: 80
targetPort: 8080
type: ClusterIP
---
# Ingress
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: search-api
namespace: hadith
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
spec:
ingressClassName: nginx
tls:
- hosts:
- search.betelgeusebytes.io
secretName: search-api-tls
rules:
- host: search.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: search-api
port:
number: 80
---
# HorizontalPodAutoscaler (optional)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: search-api-hpa
namespace: hadith
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: search-api
minReplicas: 2
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80

View File

@ -0,0 +1,19 @@
# Step 6: Verify Embeddings & Semantic Search
# Requirements for hadith-phase3-step6
# Database
psycopg2-binary>=2.9.9
# HTTP client
httpx>=0.27.0
# Rich console output
rich>=13.7.0
# Data handling
python-dateutil>=2.8.2
# Optional: for running as web API
fastapi>=0.111.0
uvicorn>=0.30.0
pydantic>=2.7.0

View File

@ -0,0 +1,217 @@
#!/bin/bash
# ============================================================================
# Step 6: Quick Test Runner
# ============================================================================
# Usage: ./run_tests.sh [verify|benchmark|demo|api|all]
# ============================================================================
set -e
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration - Update these for your environment
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-}"
export QDRANT_HOST="${QDRANT_HOST:-qdrant.vector.svc.cluster.local}"
export QDRANT_PORT="${QDRANT_PORT:-6333}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"
export TEI_HOST="${TEI_HOST:-tei.ml.svc.cluster.local}"
export TEI_PORT="${TEI_PORT:-80}"
# Check if password is set
check_password() {
if [ -z "$POSTGRES_PASSWORD" ]; then
echo -e "${RED}Error: POSTGRES_PASSWORD environment variable is not set${NC}"
echo "Set it with: export POSTGRES_PASSWORD='your_password'"
exit 1
fi
}
# Install dependencies
install_deps() {
echo -e "${BLUE}Installing dependencies...${NC}"
pip install -q -r requirements.txt
echo -e "${GREEN}Dependencies installed.${NC}"
}
# Run verification
run_verify() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Embedding Verification...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
    # Run the command inside the if condition so `set -e` does not abort before we can report failure
    if python verify_embeddings.py; then
        echo -e "\n${GREEN}✓ Verification passed!${NC}"
    else
        echo -e "\n${RED}✗ Verification failed - some embeddings are missing${NC}"
        exit 1
    fi
}
# Run benchmark
run_benchmark() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Semantic Search Benchmark...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
python semantic_search.py --mode benchmark --output benchmark_results.json
echo -e "\n${GREEN}✓ Benchmark complete. Results saved to benchmark_results.json${NC}"
}
# Run demo
run_demo() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Search Demo...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
python semantic_search.py --mode demo
}
# Run API server
run_api() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Starting Search API Server...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
echo -e "${YELLOW}API will be available at: http://localhost:8080${NC}"
echo -e "${YELLOW}Swagger docs at: http://localhost:8080/docs${NC}"
echo -e "${YELLOW}Press Ctrl+C to stop${NC}\n"
python search_api.py
}
# Run SQL verification
run_sql() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running SQL Verification Queries...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
PGPASSWORD="$POSTGRES_PASSWORD" psql \
-h "$POSTGRES_HOST" \
-p "$POSTGRES_PORT" \
-U "$POSTGRES_USER" \
-d "$POSTGRES_DB" \
-f verification_queries.sql
}
# Quick connectivity test
test_connectivity() {
echo -e "\n${BLUE}Testing Service Connectivity...${NC}\n"
# Test PostgreSQL
echo -n "PostgreSQL ($POSTGRES_HOST:$POSTGRES_PORT): "
if PGPASSWORD="$POSTGRES_PASSWORD" psql -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT 1" > /dev/null 2>&1; then
echo -e "${GREEN}✓ Connected${NC}"
else
echo -e "${RED}✗ Failed${NC}"
fi
# Test Qdrant
echo -n "Qdrant ($QDRANT_HOST:$QDRANT_PORT): "
if curl -s "http://$QDRANT_HOST:$QDRANT_PORT/collections" > /dev/null 2>&1; then
echo -e "${GREEN}✓ Connected${NC}"
else
echo -e "${RED}✗ Failed${NC}"
fi
# Test TEI
echo -n "TEI ($TEI_HOST:$TEI_PORT): "
if curl -s "http://$TEI_HOST:$TEI_PORT/health" > /dev/null 2>&1; then
echo -e "${GREEN}✓ Connected${NC}"
else
echo -e "${RED}✗ Failed${NC}"
fi
echo ""
}
# Show usage
show_usage() {
echo "Usage: $0 [command]"
echo ""
echo "Commands:"
echo " verify Run embedding verification"
echo " benchmark Run semantic search benchmark"
echo " demo Run search demo with sample queries"
echo " api Start the search API server"
echo " sql Run SQL verification queries"
echo " test Test connectivity to all services"
echo " all Run verify + benchmark + demo"
echo " install Install Python dependencies"
echo " help Show this help message"
echo ""
echo "Environment variables:"
echo " POSTGRES_HOST PostgreSQL host (default: pg.betelgeusebytes.io)"
echo " POSTGRES_PORT PostgreSQL port (default: 5432)"
echo " POSTGRES_DB Database name (default: hadith_db)"
echo " POSTGRES_USER Database user (default: hadith_ingest)"
echo " POSTGRES_PASSWORD Database password (required)"
echo " QDRANT_HOST Qdrant host (default: qdrant.vector.svc.cluster.local)"
echo " QDRANT_PORT Qdrant port (default: 6333)"
echo " TEI_HOST TEI host (default: tei.ml.svc.cluster.local)"
echo " TEI_PORT TEI port (default: 80)"
}
# Main
case "${1:-help}" in
verify)
check_password
install_deps
run_verify
;;
benchmark)
check_password
install_deps
run_benchmark
;;
demo)
check_password
install_deps
run_demo
;;
api)
check_password
install_deps
run_api
;;
sql)
check_password
run_sql
;;
test)
check_password
test_connectivity
;;
all)
check_password
install_deps
test_connectivity
run_verify
run_benchmark
run_demo
;;
install)
install_deps
;;
help|--help|-h)
show_usage
;;
*)
echo -e "${RED}Unknown command: $1${NC}"
show_usage
exit 1
;;
esac

View File

@ -0,0 +1,567 @@
#!/usr/bin/env python3
"""
Step 6.3: Semantic Search API Service
======================================
Production-ready FastAPI service for hadith semantic search.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import time
import logging
from datetime import datetime
from typing import List, Optional
from contextlib import asynccontextmanager
import httpx
import psycopg2
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.extras import RealDictCursor
from fastapi import FastAPI, HTTPException, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
TEI_HOST = os.getenv("TEI_HOST", "tei.ml.svc.cluster.local")
TEI_PORT = int(os.getenv("TEI_PORT", "80"))
# ============================================================================
# Pydantic Models
# ============================================================================
class SearchQuery(BaseModel):
"""Search query input."""
query: str = Field(..., min_length=1, max_length=1000, description="Search query text")
limit: int = Field(default=10, ge=1, le=100, description="Number of results to return")
min_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity score")
collections: Optional[List[str]] = Field(default=None, description="Filter by collection names")
grades: Optional[List[str]] = Field(default=None, description="Filter by hadith grades")
class HadithResult(BaseModel):
"""Individual hadith search result."""
hadith_id: int
score: float
collection: str
book: Optional[str]
hadith_number: str
arabic_text: Optional[str]
arabic_normalized: Optional[str]
english_text: Optional[str]
urdu_text: Optional[str]
grade: Optional[str]
class SearchResponse(BaseModel):
"""Search response."""
query: str
results: List[HadithResult]
total_results: int
embedding_time_ms: float
search_time_ms: float
total_time_ms: float
timestamp: str
class HealthResponse(BaseModel):
"""Health check response."""
status: str
database: str
qdrant: str
tei: str
timestamp: str
class CollectionStats(BaseModel):
"""Collection statistics."""
name: str
total_hadiths: int
embedded_count: int
class StatsResponse(BaseModel):
"""Statistics response."""
total_hadiths: int
total_embedded: int
collections: List[CollectionStats]
timestamp: str
# ============================================================================
# Database Pool & Connections
# ============================================================================
db_pool: Optional[ThreadedConnectionPool] = None
http_client: Optional[httpx.AsyncClient] = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Manage application lifecycle."""
global db_pool, http_client
# Startup
logger.info("Starting up semantic search service...")
# Initialize database pool
try:
db_pool = ThreadedConnectionPool(
minconn=2,
maxconn=10,
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
logger.info("Database pool initialized")
except Exception as e:
logger.error(f"Failed to initialize database pool: {e}")
db_pool = None
# Initialize HTTP client
http_client = httpx.AsyncClient(timeout=30.0)
logger.info("HTTP client initialized")
yield
# Shutdown
logger.info("Shutting down...")
if db_pool:
db_pool.closeall()
if http_client:
await http_client.aclose()
# ============================================================================
# FastAPI App
# ============================================================================
app = FastAPI(
title="Hadith Semantic Search API",
description="Semantic search service for Islamic hadith literature",
version="1.0.0",
lifespan=lifespan
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ============================================================================
# Helper Functions
# ============================================================================
def get_db_connection():
"""Get database connection from pool."""
if db_pool is None:
raise HTTPException(status_code=503, detail="Database pool not available")
return db_pool.getconn()
def release_db_connection(conn):
"""Return connection to pool."""
if db_pool and conn:
db_pool.putconn(conn)
async def get_embedding(text: str) -> tuple[List[float], float]:
"""Get embedding from TEI service."""
start = time.perf_counter()
try:
response = await http_client.post(
f"http://{TEI_HOST}:{TEI_PORT}/embed",
json={"inputs": text}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
embeddings = response.json()
if isinstance(embeddings, list) and len(embeddings) > 0:
if isinstance(embeddings[0], list):
return embeddings[0], elapsed_ms
return embeddings, elapsed_ms
raise ValueError("Unexpected embedding format")
except httpx.HTTPError as e:
logger.error(f"TEI request failed: {e}")
raise HTTPException(status_code=503, detail=f"Embedding service error: {e}")
async def search_qdrant(
embedding: List[float],
limit: int = 10,
min_score: float = 0.0,
filters: Optional[dict] = None
) -> tuple[List[dict], float]:
"""Search Qdrant with embedding vector."""
start = time.perf_counter()
try:
payload = {
"vector": embedding,
"limit": limit,
"with_payload": True,
"score_threshold": min_score
}
if filters:
payload["filter"] = filters
response = await http_client.post(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
json=payload
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
results = response.json().get("result", [])
return results, elapsed_ms
except httpx.HTTPError as e:
logger.error(f"Qdrant request failed: {e}")
raise HTTPException(status_code=503, detail=f"Vector search service error: {e}")
def enrich_results_from_db(hadith_ids: List[int]) -> dict[int, dict]:
"""Fetch full hadith data from PostgreSQL."""
if not hadith_ids:
return {}
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.arabic_normalized,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection_name,
b.name_english as book_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.id = ANY(%s)
""", (hadith_ids,))
return {row['id']: dict(row) for row in cur.fetchall()}
finally:
release_db_connection(conn)
def build_qdrant_filter(collections: Optional[List[str]], grades: Optional[List[str]]) -> Optional[dict]:
"""Build Qdrant filter from parameters."""
conditions = []
if collections:
conditions.append({
"key": "collection",
"match": {"any": collections}
})
if grades:
conditions.append({
"key": "grade",
"match": {"any": grades}
})
    if not conditions:
        return None
    # One or more conditions are combined with an implicit AND via "must"
    return {"must": conditions}
# ============================================================================
# API Endpoints
# ============================================================================
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Check health of all services."""
db_status = "healthy"
qdrant_status = "healthy"
tei_status = "healthy"
# Check database
try:
conn = get_db_connection()
with conn.cursor() as cur:
cur.execute("SELECT 1")
release_db_connection(conn)
except Exception as e:
db_status = f"unhealthy: {e}"
# Check Qdrant
try:
response = await http_client.get(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
)
if response.status_code != 200:
qdrant_status = f"unhealthy: status {response.status_code}"
except Exception as e:
qdrant_status = f"unhealthy: {e}"
# Check TEI
try:
response = await http_client.get(f"http://{TEI_HOST}:{TEI_PORT}/health")
if response.status_code != 200:
tei_status = f"unhealthy: status {response.status_code}"
except Exception as e:
tei_status = f"unhealthy: {e}"
overall = "healthy" if all(
s == "healthy" for s in [db_status, qdrant_status, tei_status]
) else "degraded"
return HealthResponse(
status=overall,
database=db_status,
qdrant=qdrant_status,
tei=tei_status,
timestamp=datetime.now().isoformat()
)
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
"""Get database statistics."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
# Total counts
cur.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded
FROM hadiths
""")
totals = cur.fetchone()
# By collection
cur.execute("""
SELECT
c.name_english as name,
COUNT(h.id) as total_hadiths,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded_count
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total_hadiths DESC
""")
collections = [CollectionStats(**dict(row)) for row in cur.fetchall()]
return StatsResponse(
total_hadiths=totals['total'],
total_embedded=totals['embedded'],
collections=collections,
timestamp=datetime.now().isoformat()
)
finally:
release_db_connection(conn)
@app.post("/search", response_model=SearchResponse)
async def semantic_search(query: SearchQuery):
"""Perform semantic search on hadiths."""
total_start = time.perf_counter()
# Get embedding
embedding, embed_time = await get_embedding(query.query)
# Build filters
filters = build_qdrant_filter(query.collections, query.grades)
# Search Qdrant
qdrant_results, search_time = await search_qdrant(
embedding,
limit=query.limit,
min_score=query.min_score,
filters=filters
)
# Extract hadith IDs
hadith_ids = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid:
hadith_ids.append(int(hid))
# Enrich from database
db_data = enrich_results_from_db(hadith_ids)
# Build results
results = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid and int(hid) in db_data:
data = db_data[int(hid)]
results.append(HadithResult(
hadith_id=int(hid),
score=r.get("score", 0),
collection=data.get("collection_name", "Unknown"),
book=data.get("book_name"),
hadith_number=data.get("hadith_number", ""),
arabic_text=data.get("arabic_text"),
arabic_normalized=data.get("arabic_normalized"),
english_text=data.get("english_text"),
urdu_text=data.get("urdu_text"),
grade=data.get("grade")
))
else:
# Fallback to payload
results.append(HadithResult(
hadith_id=int(hid) if hid else 0,
score=r.get("score", 0),
collection=payload.get("collection", "Unknown"),
book=payload.get("book"),
hadith_number=str(payload.get("hadith_number", "")),
arabic_text=payload.get("arabic_text"),
arabic_normalized=payload.get("arabic_normalized"),
english_text=payload.get("english_text"),
urdu_text=payload.get("urdu_text"),
grade=payload.get("grade")
))
total_time = (time.perf_counter() - total_start) * 1000
return SearchResponse(
query=query.query,
results=results,
total_results=len(results),
embedding_time_ms=embed_time,
search_time_ms=search_time,
total_time_ms=total_time,
timestamp=datetime.now().isoformat()
)
@app.get("/search", response_model=SearchResponse)
async def semantic_search_get(
q: str = Query(..., min_length=1, max_length=1000, description="Search query"),
limit: int = Query(default=10, ge=1, le=100),
min_score: float = Query(default=0.0, ge=0.0, le=1.0)
):
"""GET version of semantic search for simple queries."""
query = SearchQuery(query=q, limit=limit, min_score=min_score)
return await semantic_search(query)
@app.get("/hadith/{hadith_id}")
async def get_hadith(hadith_id: int):
"""Get a specific hadith by ID."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.arabic_normalized,
h.english_text,
h.urdu_text,
h.grade,
h.source_metadata,
h.embedding_generated,
h.entities_extracted,
h.relations_extracted,
h.created_at,
h.updated_at,
c.name_english as collection_name,
c.name_arabic as collection_arabic,
b.name_english as book_name,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.id = %s
""", (hadith_id,))
row = cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")
return dict(row)
finally:
release_db_connection(conn)
@app.get("/similar/{hadith_id}", response_model=SearchResponse)
async def find_similar(
hadith_id: int,
limit: int = Query(default=10, ge=1, le=100)
):
"""Find hadiths similar to a given hadith."""
# Get the hadith text
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT arabic_text, english_text
FROM hadiths
WHERE id = %s
""", (hadith_id,))
row = cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")
# Use Arabic text preferably, fall back to English
text = row['arabic_text'] or row['english_text']
if not text:
raise HTTPException(status_code=400, detail="Hadith has no text content")
finally:
release_db_connection(conn)
# Search for similar hadiths
    query = SearchQuery(query=text[:1000], limit=limit + 1)  # +1 to exclude self; truncate to the 1000-char query limit
response = await semantic_search(query)
# Filter out the source hadith
response.results = [r for r in response.results if r.hadith_id != hadith_id][:limit]
response.total_results = len(response.results)
return response
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)

View File

@ -0,0 +1,666 @@
#!/usr/bin/env python3
"""
Step 6.2: Semantic Search Testing & Benchmarking
=================================================
Tests semantic search functionality and benchmarks performance.
Target: <500ms per query.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import sys
import json
import time
import asyncio
import statistics
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict, field
import httpx
import psycopg2
from psycopg2.extras import RealDictCursor
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
TEI_HOST = os.getenv("TEI_HOST", "tei.ml.svc.cluster.local")
TEI_PORT = int(os.getenv("TEI_PORT", "80"))
console = Console()
# ============================================================================
# Sample Queries for Testing
# ============================================================================
SAMPLE_QUERIES = {
"arabic": [
{
"query": "الصلاة في المسجد الحرام",
"description": "Prayer in the Sacred Mosque",
"expected_topics": ["prayer", "mosque", "mecca"]
},
{
"query": "أبو هريرة رضي الله عنه",
"description": "Abu Hurairah (RA)",
"expected_topics": ["narrator", "companion"]
},
{
"query": "الصيام في شهر رمضان",
"description": "Fasting in Ramadan",
"expected_topics": ["fasting", "ramadan"]
},
{
"query": "الزكاة والصدقة",
"description": "Zakat and charity",
"expected_topics": ["charity", "zakat"]
},
{
"query": "الحج والعمرة",
"description": "Hajj and Umrah",
"expected_topics": ["pilgrimage", "hajj", "umrah"]
},
{
"query": "النبي صلى الله عليه وسلم في المدينة",
"description": "Prophet (PBUH) in Medina",
"expected_topics": ["prophet", "medina"]
},
{
"query": "الوضوء والطهارة",
"description": "Ablution and purification",
"expected_topics": ["ablution", "purification", "wudu"]
},
{
"query": "بر الوالدين",
"description": "Honoring parents",
"expected_topics": ["parents", "kindness", "family"]
},
{
"query": "الجنة والنار",
"description": "Paradise and Hell",
"expected_topics": ["afterlife", "paradise", "hell"]
},
{
"query": "الإيمان والإسلام",
"description": "Faith and Islam",
"expected_topics": ["faith", "belief", "islam"]
}
],
"english": [
{
"query": "five daily prayers",
"description": "The five obligatory prayers",
"expected_topics": ["prayer", "salah", "obligation"]
},
{
"query": "Prophet Muhammad in Mecca",
"description": "Prophet's life in Mecca",
"expected_topics": ["prophet", "mecca", "biography"]
},
{
"query": "treatment of neighbors",
"description": "Rights and treatment of neighbors",
"expected_topics": ["neighbors", "rights", "ethics"]
},
{
"query": "patience during hardship",
"description": "Patience in difficult times",
"expected_topics": ["patience", "sabr", "trials"]
},
{
"query": "marriage and family",
"description": "Islamic marriage guidance",
"expected_topics": ["marriage", "family", "nikah"]
},
{
"query": "honesty and truthfulness",
"description": "Importance of being truthful",
"expected_topics": ["honesty", "truth", "character"]
},
{
"query": "Day of Judgment signs",
"description": "Signs of the Last Day",
"expected_topics": ["judgment", "signs", "eschatology"]
},
{
"query": "charity and helping poor",
"description": "Giving charity to the needy",
"expected_topics": ["charity", "poor", "sadaqah"]
},
{
"query": "companions of the Prophet",
"description": "Sahaba and their virtues",
"expected_topics": ["companions", "sahaba", "virtue"]
},
{
"query": "seeking knowledge in Islam",
"description": "Importance of knowledge",
"expected_topics": ["knowledge", "learning", "education"]
}
],
"mixed": [
{
"query": "قال رسول الله about kindness",
"description": "Prophet's sayings about kindness (mixed)",
"expected_topics": ["prophet", "kindness", "ethics"]
},
{
"query": "women rights الإسلام",
"description": "Women's rights in Islam (mixed)",
"expected_topics": ["women", "rights", "islam"]
}
]
}
@dataclass
class SearchResult:
"""Individual search result."""
hadith_id: int
score: float
collection: str
hadith_number: str
arabic_text: str
english_text: str
grade: str
@dataclass
class QueryBenchmark:
"""Benchmark results for a single query."""
query: str
language: str
description: str
embedding_time_ms: float
search_time_ms: float
total_time_ms: float
results_count: int
top_score: float
meets_target: bool # <500ms
@dataclass
class BenchmarkReport:
"""Full benchmark report."""
total_queries: int
successful_queries: int
failed_queries: int
avg_embedding_time_ms: float
avg_search_time_ms: float
avg_total_time_ms: float
p50_time_ms: float
p95_time_ms: float
p99_time_ms: float
min_time_ms: float
max_time_ms: float
queries_meeting_target: int
target_ms: int
query_results: List[QueryBenchmark] = field(default_factory=list)
timestamp: str = ""
def get_db_connection():
"""Create PostgreSQL connection."""
return psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
async def get_embedding(client: httpx.AsyncClient, text: str) -> Tuple[List[float], float]:
"""Get embedding from TEI service."""
start = time.perf_counter()
response = await client.post(
f"http://{TEI_HOST}:{TEI_PORT}/embed",
json={"inputs": text}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
# TEI returns list of embeddings, we want the first one
embeddings = response.json()
if isinstance(embeddings, list) and len(embeddings) > 0:
if isinstance(embeddings[0], list):
return embeddings[0], elapsed_ms
return embeddings, elapsed_ms
raise ValueError(f"Unexpected embedding response format: {type(embeddings)}")
async def search_qdrant(
client: httpx.AsyncClient,
embedding: List[float],
limit: int = 10
) -> Tuple[List[Dict], float]:
"""Search Qdrant with embedding vector."""
start = time.perf_counter()
response = await client.post(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
json={
"vector": embedding,
"limit": limit,
"with_payload": True
}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
results = response.json().get("result", [])
return results, elapsed_ms
def enrich_results_from_db(hadith_ids: List[int]) -> Dict[int, Dict]:
"""Fetch full hadith data from PostgreSQL."""
if not hadith_ids:
return {}
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
c.name_english as collection_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.id = ANY(%s)
""", (hadith_ids,))
return {row['id']: dict(row) for row in cur.fetchall()}
finally:
conn.close()
async def semantic_search(
client: httpx.AsyncClient,
query: str,
limit: int = 10
) -> Tuple[List[SearchResult], float, float]:
"""Perform semantic search and return results with timing."""
# Step 1: Get embedding
embedding, embed_time = await get_embedding(client, query)
# Step 2: Search Qdrant
qdrant_results, search_time = await search_qdrant(client, embedding, limit)
# Step 3: Get hadith IDs and enrich from DB
hadith_ids = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid:
hadith_ids.append(int(hid))
db_data = enrich_results_from_db(hadith_ids)
# Step 4: Build results
results = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid and int(hid) in db_data:
data = db_data[int(hid)]
results.append(SearchResult(
hadith_id=int(hid),
score=r.get("score", 0),
collection=data.get("collection_name", "Unknown"),
hadith_number=data.get("hadith_number", ""),
arabic_text=data.get("arabic_text", "")[:200] + "..." if data.get("arabic_text") else "",
english_text=data.get("english_text", "")[:200] + "..." if data.get("english_text") else "",
grade=data.get("grade", "")
))
else:
# Fallback to payload data
results.append(SearchResult(
hadith_id=int(hid) if hid else 0,
score=r.get("score", 0),
collection=payload.get("collection", "Unknown"),
hadith_number=str(payload.get("hadith_number", "")),
arabic_text=payload.get("arabic_text", "")[:200] + "..." if payload.get("arabic_text") else "",
english_text=payload.get("english_text", "")[:200] + "..." if payload.get("english_text") else "",
grade=payload.get("grade", "")
))
return results, embed_time, search_time
def display_search_results(query: str, results: List[SearchResult], embed_time: float, search_time: float):
"""Display search results in a nice format."""
total_time = embed_time + search_time
console.print(f"\n[bold cyan]Query:[/bold cyan] {query}")
console.print(f"[dim]Embedding: {embed_time:.1f}ms | Search: {search_time:.1f}ms | Total: {total_time:.1f}ms[/dim]")
if not results:
console.print("[yellow]No results found.[/yellow]")
return
table = Table(title=f"Top {len(results)} Results", show_lines=True)
table.add_column("#", style="dim", width=3)
table.add_column("Score", justify="right", width=8)
table.add_column("Collection", width=15)
table.add_column("Hadith #", width=10)
table.add_column("Text Preview", width=60)
table.add_column("Grade", width=10)
for i, r in enumerate(results, 1):
text_preview = r.english_text if r.english_text else r.arabic_text
table.add_row(
str(i),
f"{r.score:.4f}",
r.collection,
r.hadith_number,
text_preview[:80] + "..." if len(text_preview) > 80 else text_preview,
r.grade or "-"
)
console.print(table)
async def run_benchmarks(warmup_count: int = 3) -> BenchmarkReport:
"""Run full benchmark suite."""
console.print(Panel.fit(
"[bold blue]Semantic Search Benchmark[/bold blue]\n"
f"Target: <500ms per query\n"
f"TEI: {TEI_HOST}:{TEI_PORT}\n"
f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}",
title="Step 6.2"
))
all_queries = (
[(q, "arabic") for q in SAMPLE_QUERIES["arabic"]] +
[(q, "english") for q in SAMPLE_QUERIES["english"]] +
[(q, "mixed") for q in SAMPLE_QUERIES["mixed"]]
)
query_results = []
total_times = []
successful = 0
failed = 0
async with httpx.AsyncClient(timeout=30.0) as client:
# Warmup queries
console.print(f"\n[yellow]Running {warmup_count} warmup queries...[/yellow]")
for i in range(warmup_count):
try:
await semantic_search(client, "test warmup query", limit=5)
except Exception as e:
console.print(f"[dim]Warmup {i+1} error: {e}[/dim]")
console.print("[green]Warmup complete.[/green]\n")
# Run benchmarks
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
console=console
) as progress:
task = progress.add_task("Running benchmarks...", total=len(all_queries))
for query_data, lang in all_queries:
query = query_data["query"]
description = query_data["description"]
try:
results, embed_time, search_time = await semantic_search(
client, query, limit=10
)
total_time = embed_time + search_time
total_times.append(total_time)
benchmark = QueryBenchmark(
query=query,
language=lang,
description=description,
embedding_time_ms=embed_time,
search_time_ms=search_time,
total_time_ms=total_time,
results_count=len(results),
top_score=results[0].score if results else 0,
meets_target=total_time < 500
)
query_results.append(benchmark)
successful += 1
except Exception as e:
console.print(f"[red]Error for query '{query[:30]}...': {e}[/red]")
failed += 1
progress.advance(task)
# Calculate statistics
if total_times:
sorted_times = sorted(total_times)
p50_idx = int(len(sorted_times) * 0.50)
p95_idx = int(len(sorted_times) * 0.95)
p99_idx = int(len(sorted_times) * 0.99)
report = BenchmarkReport(
total_queries=len(all_queries),
successful_queries=successful,
failed_queries=failed,
avg_embedding_time_ms=statistics.mean(q.embedding_time_ms for q in query_results),
avg_search_time_ms=statistics.mean(q.search_time_ms for q in query_results),
avg_total_time_ms=statistics.mean(total_times),
p50_time_ms=sorted_times[p50_idx],
p95_time_ms=sorted_times[p95_idx] if p95_idx < len(sorted_times) else sorted_times[-1],
p99_time_ms=sorted_times[p99_idx] if p99_idx < len(sorted_times) else sorted_times[-1],
min_time_ms=min(total_times),
max_time_ms=max(total_times),
queries_meeting_target=sum(1 for t in total_times if t < 500),
target_ms=500,
query_results=query_results,
timestamp=datetime.now().isoformat()
)
else:
report = BenchmarkReport(
total_queries=len(all_queries),
successful_queries=0,
failed_queries=failed,
avg_embedding_time_ms=0,
avg_search_time_ms=0,
avg_total_time_ms=0,
p50_time_ms=0,
p95_time_ms=0,
p99_time_ms=0,
min_time_ms=0,
max_time_ms=0,
queries_meeting_target=0,
target_ms=500,
query_results=[],
timestamp=datetime.now().isoformat()
)
return report
def display_benchmark_report(report: BenchmarkReport):
"""Display benchmark report."""
console.print("\n" + "="*60)
console.print("[bold]BENCHMARK RESULTS[/bold]")
console.print("="*60)
# Summary stats
console.print(f"\n[cyan]Query Statistics:[/cyan]")
console.print(f" Total queries: {report.total_queries}")
console.print(f" Successful: [green]{report.successful_queries}[/green]")
console.print(f" Failed: [red]{report.failed_queries}[/red]")
console.print(f"\n[cyan]Timing Statistics:[/cyan]")
console.print(f" Average embedding time: {report.avg_embedding_time_ms:.1f}ms")
console.print(f" Average search time: {report.avg_search_time_ms:.1f}ms")
console.print(f" Average total time: {report.avg_total_time_ms:.1f}ms")
console.print(f"\n[cyan]Percentiles:[/cyan]")
console.print(f" P50: {report.p50_time_ms:.1f}ms")
console.print(f" P95: {report.p95_time_ms:.1f}ms")
console.print(f" P99: {report.p99_time_ms:.1f}ms")
console.print(f" Min: {report.min_time_ms:.1f}ms")
console.print(f" Max: {report.max_time_ms:.1f}ms")
# Target check
target_pct = (report.queries_meeting_target / report.successful_queries * 100) if report.successful_queries else 0
target_met = target_pct >= 95 # 95% of queries should meet target
console.print(f"\n[cyan]Performance Target (<{report.target_ms}ms):[/cyan]")
status = "[bold green]✓ TARGET MET[/bold green]" if target_met else "[bold red]✗ TARGET NOT MET[/bold red]"
console.print(f" Queries meeting target: {report.queries_meeting_target}/{report.successful_queries} ({target_pct:.1f}%)")
console.print(f" Status: {status}")
# Detailed results table
if report.query_results:
console.print("\n[cyan]Detailed Results:[/cyan]")
table = Table(show_lines=False)
table.add_column("Language", width=8)
table.add_column("Query", width=35)
table.add_column("Embed", justify="right", width=8)
table.add_column("Search", justify="right", width=8)
table.add_column("Total", justify="right", width=8)
table.add_column("Results", justify="right", width=7)
table.add_column("Status", width=6)
for r in report.query_results:
status_icon = "" if r.meets_target else ""
status_style = "green" if r.meets_target else "red"
table.add_row(
r.language,
r.query[:35] + "..." if len(r.query) > 35 else r.query,
f"{r.embedding_time_ms:.0f}ms",
f"{r.search_time_ms:.0f}ms",
f"{r.total_time_ms:.0f}ms",
str(r.results_count),
f"[{status_style}]{status_icon}[/{status_style}]"
)
console.print(table)
async def interactive_search():
"""Interactive search mode."""
console.print(Panel.fit(
"[bold blue]Interactive Semantic Search[/bold blue]\n"
"Type your query and press Enter. Type 'quit' to exit.",
title="Interactive Mode"
))
async with httpx.AsyncClient(timeout=30.0) as client:
while True:
try:
query = input("\n🔍 Query: ").strip()
if query.lower() in ('quit', 'exit', 'q'):
console.print("[dim]Goodbye![/dim]")
break
if not query:
continue
results, embed_time, search_time = await semantic_search(
client, query, limit=10
)
display_search_results(query, results, embed_time, search_time)
except KeyboardInterrupt:
console.print("\n[dim]Interrupted. Goodbye![/dim]")
break
except Exception as e:
console.print(f"[red]Error: {e}[/red]")
async def main():
"""Main entry point."""
import argparse
parser = argparse.ArgumentParser(description="Hadith Semantic Search Testing")
parser.add_argument("--mode", choices=["benchmark", "interactive", "demo"],
default="benchmark", help="Run mode")
parser.add_argument("--query", type=str, help="Single query to run")
parser.add_argument("--output", type=str, default="benchmark_results.json",
help="Output file for benchmark results")
args = parser.parse_args()
if args.query:
# Single query mode
async with httpx.AsyncClient(timeout=30.0) as client:
results, embed_time, search_time = await semantic_search(
client, args.query, limit=10
)
display_search_results(args.query, results, embed_time, search_time)
elif args.mode == "benchmark":
# Full benchmark
report = await run_benchmarks()
display_benchmark_report(report)
# Save results
with open(args.output, 'w') as f:
json.dump(asdict(report), f, indent=2, default=str)
console.print(f"\n[dim]Results saved to {args.output}[/dim]")
elif args.mode == "interactive":
await interactive_search()
elif args.mode == "demo":
# Demo with a few sample queries
console.print(Panel.fit(
"[bold blue]Semantic Search Demo[/bold blue]",
title="Demo Mode"
))
demo_queries = [
"الصلاة في المسجد",
"five daily prayers",
"patience during hardship",
"بر الوالدين"
]
async with httpx.AsyncClient(timeout=30.0) as client:
for query in demo_queries:
try:
results, embed_time, search_time = await semantic_search(
client, query, limit=5
)
display_search_results(query, results, embed_time, search_time)
except Exception as e:
console.print(f"[red]Error for '{query}': {e}[/red]")
if __name__ == "__main__":
asyncio.run(main())
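# Example usage (flags defined in main() above):
#   python semantic_search.py --mode benchmark --output benchmark_results.json
#   python semantic_search.py --mode demo
#   python semantic_search.py --mode interactive
#   python semantic_search.py --query "five daily prayers"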

View File

@ -0,0 +1,476 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 6: Verify Embeddings & Test Semantic Search\n",
"\n",
"This notebook provides interactive verification and testing of the hadith embedding system.\n",
"\n",
"**Prerequisites:**\n",
"- PostgreSQL accessible at pg.betelgeusebytes.io\n",
"- Qdrant accessible at qdrant.vector.svc.cluster.local\n",
"- TEI accessible at tei.ml.svc.cluster.local"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup & Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies\n",
"!pip install -q psycopg2-binary httpx rich"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import time\n",
"import httpx\n",
"import psycopg2\n",
"from psycopg2.extras import RealDictCursor\n",
"import pandas as pd\n",
"from IPython.display import display, HTML, Markdown\n",
"\n",
"# Configuration\n",
"POSTGRES_CONFIG = {\n",
" 'host': os.getenv('POSTGRES_HOST', 'pg.betelgeusebytes.io'),\n",
" 'port': int(os.getenv('POSTGRES_PORT', '5432')),\n",
" 'database': os.getenv('POSTGRES_DB', 'hadith_db'),\n",
" 'user': os.getenv('POSTGRES_USER', 'hadith_ingest'),\n",
" 'password': os.getenv('POSTGRES_PASSWORD', ''), # SET THIS!\n",
" 'sslmode': 'require'\n",
"}\n",
"\n",
"QDRANT_URL = f\"http://{os.getenv('QDRANT_HOST', 'qdrant.vector.svc.cluster.local')}:{os.getenv('QDRANT_PORT', '6333')}\"\n",
"QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'hadith_embeddings')\n",
"\n",
"TEI_URL = f\"http://{os.getenv('TEI_HOST', 'tei.ml.svc.cluster.local')}:{os.getenv('TEI_PORT', '80')}\"\n",
"\n",
"print(f\"PostgreSQL: {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}/{POSTGRES_CONFIG['database']}\")\n",
"print(f\"Qdrant: {QDRANT_URL}\")\n",
"print(f\"TEI: {TEI_URL}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ⚠️ SET YOUR PASSWORD HERE\n",
"POSTGRES_CONFIG['password'] = 'YOUR_PASSWORD_HERE' # CHANGE THIS!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Database Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_db_connection():\n",
" return psycopg2.connect(**POSTGRES_CONFIG)\n",
"\n",
"def run_query(query):\n",
" conn = get_db_connection()\n",
" try:\n",
" df = pd.read_sql(query, conn)\n",
" return df\n",
" finally:\n",
" conn.close()\n",
"\n",
"# Test connection\n",
"try:\n",
" conn = get_db_connection()\n",
" conn.close()\n",
" print(\"✅ Database connection successful!\")\n",
"except Exception as e:\n",
" print(f\"❌ Database connection failed: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get total hadith count and embedding status\n",
"query = \"\"\"\n",
"SELECT \n",
" COUNT(*) as total_hadiths,\n",
" SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
" SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded,\n",
" ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) as pct_complete\n",
"FROM hadiths\n",
"\"\"\"\n",
"\n",
"df = run_query(query)\n",
"display(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get breakdown by collection\n",
"query = \"\"\"\n",
"SELECT \n",
" c.name_english as collection,\n",
" COUNT(h.id) as total,\n",
" SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
" ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) as pct_embedded\n",
"FROM hadiths h\n",
"JOIN collections c ON h.collection_id = c.id\n",
"GROUP BY c.id, c.name_english\n",
"ORDER BY total DESC\n",
"\"\"\"\n",
"\n",
"df_collections = run_query(query)\n",
"display(df_collections)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Qdrant Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check Qdrant collection\n",
"with httpx.Client(timeout=30.0) as client:\n",
" try:\n",
" response = client.get(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}\")\n",
" response.raise_for_status()\n",
" collection_info = response.json()\n",
" print(\"✅ Qdrant collection found!\")\n",
" print(f\"\\nCollection status: {collection_info['result']['status']}\")\n",
" print(f\"Vector dimension: {collection_info['result']['config']['params']['vectors']['size']}\")\n",
" except Exception as e:\n",
" print(f\"❌ Qdrant error: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count points in Qdrant\n",
"with httpx.Client(timeout=30.0) as client:\n",
" response = client.post(\n",
" f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\",\n",
" json={\"exact\": True}\n",
" )\n",
" response.raise_for_status()\n",
" count = response.json()['result']['count']\n",
" print(f\"\\n📊 Total embeddings in Qdrant: {count:,}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. TEI Service Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test TEI embedding service\n",
"test_text = \"الصلاة في المسجد الحرام\"\n",
"\n",
"with httpx.Client(timeout=30.0) as client:\n",
" start = time.perf_counter()\n",
" response = client.post(\n",
" f\"{TEI_URL}/embed\",\n",
" json={\"inputs\": test_text}\n",
" )\n",
" elapsed = (time.perf_counter() - start) * 1000\n",
" \n",
" response.raise_for_status()\n",
" embedding = response.json()[0]\n",
" \n",
" print(f\"✅ TEI service working!\")\n",
" print(f\"\\nTest text: {test_text}\")\n",
" print(f\"Embedding dimension: {len(embedding)}\")\n",
" print(f\"Time: {elapsed:.1f}ms\")\n",
" print(f\"First 5 values: {embedding[:5]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Semantic Search Testing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def semantic_search(query_text, limit=10):\n",
" \"\"\"Perform semantic search and return results with timing.\"\"\"\n",
" with httpx.Client(timeout=30.0) as client:\n",
" # Get embedding\n",
" start = time.perf_counter()\n",
" embed_response = client.post(f\"{TEI_URL}/embed\", json={\"inputs\": query_text})\n",
" embed_response.raise_for_status()\n",
" embedding = embed_response.json()[0]\n",
" embed_time = (time.perf_counter() - start) * 1000\n",
" \n",
" # Search Qdrant\n",
" start = time.perf_counter()\n",
" search_response = client.post(\n",
" f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search\",\n",
" json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True}\n",
" )\n",
" search_response.raise_for_status()\n",
" results = search_response.json()['result']\n",
" search_time = (time.perf_counter() - start) * 1000\n",
" \n",
" return results, embed_time, search_time\n",
"\n",
"def display_results(query, results, embed_time, search_time):\n",
" \"\"\"Display search results nicely.\"\"\"\n",
" total_time = embed_time + search_time\n",
" status = \"✅\" if total_time < 500 else \"⚠️\"\n",
" \n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Query: {query}\")\n",
" print(f\"Timing: {embed_time:.0f}ms (embed) + {search_time:.0f}ms (search) = {total_time:.0f}ms {status}\")\n",
" print(f\"{'='*60}\\n\")\n",
" \n",
" for i, r in enumerate(results, 1):\n",
" score = r['score']\n",
" payload = r.get('payload', {})\n",
" \n",
" text = payload.get('english_text') or payload.get('arabic_text', '')\n",
" text = text[:150] + '...' if len(text) > 150 else text\n",
" \n",
" print(f\"{i}. [Score: {score:.4f}] {payload.get('collection', 'Unknown')} #{payload.get('hadith_number', 'N/A')}\")\n",
" print(f\" {text}\")\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test Arabic query\n",
"query = \"الصلاة في المسجد الحرام\"\n",
"results, embed_time, search_time = semantic_search(query, limit=5)\n",
"display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test English query\n",
"query = \"five daily prayers\"\n",
"results, embed_time, search_time = semantic_search(query, limit=5)\n",
"display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test more queries\n",
"test_queries = [\n",
" \"الصيام في شهر رمضان\",\n",
" \"patience during hardship\",\n",
" \"بر الوالدين\",\n",
" \"charity and helping poor\",\n",
" \"الجنة والنار\"\n",
"]\n",
"\n",
"for q in test_queries:\n",
" results, embed_time, search_time = semantic_search(q, limit=3)\n",
" display_results(q, results, embed_time, search_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Performance Benchmarking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import statistics\n",
"\n",
"# Benchmark queries\n",
"benchmark_queries = [\n",
" \"الصلاة في المسجد الحرام\",\n",
" \"أبو هريرة رضي الله عنه\",\n",
" \"الصيام في شهر رمضان\",\n",
" \"five daily prayers\",\n",
" \"treatment of neighbors\",\n",
" \"patience during hardship\",\n",
" \"marriage and family\",\n",
" \"honesty and truthfulness\",\n",
" \"الزكاة والصدقة\",\n",
" \"الحج والعمرة\"\n",
"]\n",
"\n",
"# Warmup\n",
"print(\"Warming up...\")\n",
"for _ in range(3):\n",
" semantic_search(\"warmup query\", limit=5)\n",
"\n",
"# Run benchmark\n",
"print(\"\\nRunning benchmark...\")\n",
"times = []\n",
"\n",
"for q in benchmark_queries:\n",
" results, embed_time, search_time = semantic_search(q, limit=10)\n",
" total = embed_time + search_time\n",
" times.append(total)\n",
" status = \"✅\" if total < 500 else \"⚠️\"\n",
" print(f\" {q[:40]:40s} → {total:6.1f}ms {status}\")\n",
"\n",
"# Statistics\n",
"print(f\"\\n{'='*60}\")\n",
"print(\"BENCHMARK RESULTS\")\n",
"print(f\"{'='*60}\")\n",
"print(f\"Queries: {len(times)}\")\n",
"print(f\"Average: {statistics.mean(times):.1f}ms\")\n",
"print(f\"Median: {statistics.median(times):.1f}ms\")\n",
"print(f\"Min: {min(times):.1f}ms\")\n",
"print(f\"Max: {max(times):.1f}ms\")\n",
"print(f\"StdDev: {statistics.stdev(times):.1f}ms\")\n",
"\n",
"meeting_target = sum(1 for t in times if t < 500)\n",
"print(f\"\\nMeeting <500ms target: {meeting_target}/{len(times)} ({100*meeting_target/len(times):.1f}%)\")\n",
"\n",
"if meeting_target == len(times):\n",
" print(\"\\n✅ TARGET MET!\")\n",
"else:\n",
" print(\"\\n⚠ Some queries exceeded target\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Interactive Search"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Interactive search cell - run this and enter your query\n",
"query = input(\"Enter your search query: \")\n",
"if query:\n",
" results, embed_time, search_time = semantic_search(query, limit=10)\n",
" display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Verification Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final verification summary\n",
"print(\"=\"*60)\n",
"print(\"STEP 6 VERIFICATION SUMMARY\")\n",
"print(\"=\"*60)\n",
"\n",
"# Database check\n",
"df = run_query(\"SELECT COUNT(*) as total, SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded FROM hadiths\")\n",
"total = df['total'][0]\n",
"embedded = df['embedded'][0]\n",
"print(f\"\\n✅ Database: {total:,} hadiths, {embedded:,} embedded ({100*embedded/total:.1f}%)\")\n",
"\n",
"# Qdrant check\n",
"with httpx.Client(timeout=30.0) as client:\n",
" response = client.post(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\", json={\"exact\": True})\n",
" qdrant_count = response.json()['result']['count']\n",
" print(f\"✅ Qdrant: {qdrant_count:,} embeddings stored\")\n",
"\n",
"# Benchmark summary\n",
"if 'times' in dir() and times:\n",
" print(f\"✅ Performance: Avg {statistics.mean(times):.0f}ms, P95 {sorted(times)[int(len(times)*0.95)]:.0f}ms\")\n",
"\n",
"# Missing check\n",
"missing = total - qdrant_count\n",
"if missing == 0:\n",
" print(f\"\\n🎉 ALL {total:,} HADITHS VERIFIED!\")\n",
"else:\n",
" print(f\"\\n⚠ {missing:,} embeddings potentially missing\")\n",
"\n",
"print(\"\\n\" + \"=\"*60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,192 @@
-- ============================================================================
-- Step 6.1: PostgreSQL Verification Queries
-- Run these against hadith_db to verify data integrity
-- ============================================================================
-- Connect: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db
-- ============================================================================
-- 1. Basic Statistics
-- ============================================================================
-- Total hadith count
SELECT COUNT(*) AS total_hadiths FROM hadiths;
-- Hadiths by collection with embedding status
SELECT
c.name_english AS collection,
COUNT(h.id) AS total,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) AS embedded,
ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) AS pct_embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total DESC;
-- ============================================================================
-- 2. Embedding Status Summary
-- ============================================================================
-- Overall embedding status
SELECT
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) AS embedded,
SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) AS not_embedded,
COUNT(*) AS total,
ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) AS pct_complete
FROM hadiths;
-- List hadiths without embeddings (if any)
SELECT
h.id,
c.name_english AS collection,
h.hadith_number,
LEFT(h.arabic_text, 100) AS arabic_preview
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.embedding_generated
ORDER BY h.id
LIMIT 20;
-- ============================================================================
-- 3. Text Quality Checks
-- ============================================================================
-- Hadiths with empty or null texts
SELECT
'Empty Arabic' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0
UNION ALL
SELECT
'Empty English' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE english_text IS NULL OR LENGTH(TRIM(english_text)) = 0
UNION ALL
SELECT
'Empty Both' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE (arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0)
AND (english_text IS NULL OR LENGTH(TRIM(english_text)) = 0);
-- ============================================================================
-- 4. Grade Distribution
-- ============================================================================
SELECT
COALESCE(grade, 'Unknown') AS grade,
COUNT(*) AS count,
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) AS percentage
FROM hadiths
GROUP BY grade
ORDER BY count DESC;
-- ============================================================================
-- 5. Arabic Normalization Verification
-- ============================================================================
-- Check that normalized column is populated
SELECT
COUNT(*) AS total,
SUM(CASE WHEN arabic_normalized IS NOT NULL AND LENGTH(arabic_normalized) > 0 THEN 1 ELSE 0 END) AS normalized,
SUM(CASE WHEN arabic_normalized IS NULL OR LENGTH(arabic_normalized) = 0 THEN 1 ELSE 0 END) AS not_normalized
FROM hadiths
WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0;
-- Sample comparison of original vs normalized
SELECT
id,
LEFT(arabic_text, 100) AS original,
LEFT(arabic_normalized, 100) AS normalized
FROM hadiths
WHERE arabic_text IS NOT NULL
LIMIT 5;
-- ============================================================================
-- 6. Metadata Completeness
-- ============================================================================
-- Check source_metadata JSON completeness
SELECT
COUNT(*) AS total,
SUM(CASE WHEN source_metadata IS NOT NULL THEN 1 ELSE 0 END) AS has_metadata,
SUM(CASE WHEN source_metadata ? 'api_source' THEN 1 ELSE 0 END) AS has_api_source,
SUM(CASE WHEN source_metadata ? 'ingested_at' THEN 1 ELSE 0 END) AS has_ingested_at
FROM hadiths;
-- ============================================================================
-- 7. ID Range Check (for Qdrant comparison)
-- ============================================================================
-- Get ID range
SELECT
MIN(id) AS min_id,
MAX(id) AS max_id,
COUNT(*) AS total_ids,
MAX(id) - MIN(id) + 1 AS expected_if_sequential,
COUNT(*) = (MAX(id) - MIN(id) + 1) AS is_sequential
FROM hadiths;
-- Find gaps in IDs (if any)
WITH id_series AS (
SELECT generate_series(
(SELECT MIN(id) FROM hadiths),
(SELECT MAX(id) FROM hadiths)
) AS expected_id
)
SELECT expected_id AS missing_id
FROM id_series
WHERE expected_id NOT IN (SELECT id FROM hadiths)
ORDER BY expected_id
LIMIT 50;
-- ============================================================================
-- 8. Sample Data for Manual Verification
-- ============================================================================
-- Sample 10 hadiths with all fields
SELECT
h.id,
c.name_english AS collection,
b.name_english AS book,
h.hadith_number,
h.grade,
LENGTH(h.arabic_text) AS arabic_len,
LENGTH(h.english_text) AS english_len,
LENGTH(h.urdu_text) AS urdu_len,
h.embedding_generated,
h.created_at
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
ORDER BY RANDOM()
LIMIT 10;
-- ============================================================================
-- 9. NER/RE Preparation Status
-- ============================================================================
SELECT
SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) AS entities_extracted,
SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) AS relations_extracted,
COUNT(*) AS total
FROM hadiths;
-- ============================================================================
-- 10. Quick Health Check Query (run this first)
-- ============================================================================
SELECT
'Database Health Check' AS check_type,
(SELECT COUNT(*) FROM hadiths) AS total_hadiths,
(SELECT COUNT(*) FROM collections) AS total_collections,
(SELECT COUNT(*) FROM books) AS total_books,
(SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count,
(SELECT COUNT(*) FROM hadiths WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0) AS has_arabic,
(SELECT COUNT(*) FROM hadiths WHERE english_text IS NOT NULL AND LENGTH(english_text) > 0) AS has_english;

View File

@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
Step 6.1: Verify Embeddings in Qdrant
=====================================
Validates that all hadiths have embeddings stored in Qdrant vector database.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import sys
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.panel import Panel
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "")
QDRANT_HOST = os.getenv("QDRANT_HOST", "qdrant.vector.svc.cluster.local")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
console = Console()
@dataclass
class VerificationResult:
"""Results from embedding verification."""
total_hadiths_db: int
total_embeddings_qdrant: int
embeddings_with_payloads: int
missing_embeddings: int
embedding_dimension: int
collection_exists: bool
collection_status: str
sample_ids_missing: List[int]
verification_time_seconds: float
timestamp: str
def get_db_connection():
"""Create PostgreSQL connection."""
return psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
async def get_qdrant_collection_info(client: httpx.AsyncClient) -> Dict:
"""Get Qdrant collection information."""
try:
response = await client.get(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
console.print(f"[red]Error connecting to Qdrant: {e}[/red]")
return {}
async def count_qdrant_points(client: httpx.AsyncClient) -> int:
"""Count total points in Qdrant collection."""
try:
response = await client.post(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/count",
json={"exact": True}
)
response.raise_for_status()
return response.json().get("result", {}).get("count", 0)
except httpx.HTTPError as e:
console.print(f"[red]Error counting Qdrant points: {e}[/red]")
return 0
async def get_qdrant_points_sample(
client: httpx.AsyncClient,
offset: int = 0,
limit: int = 100
) -> List[Dict]:
"""Get a sample of points from Qdrant."""
try:
response = await client.post(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
json={
"limit": limit,
"offset": offset,
"with_payload": True,
"with_vector": False
}
)
response.raise_for_status()
return response.json().get("result", {}).get("points", [])
except httpx.HTTPError as e:
console.print(f"[red]Error fetching Qdrant points: {e}[/red]")
return []
async def get_all_qdrant_ids(client: httpx.AsyncClient) -> set:
"""Get all point IDs from Qdrant (paginated)."""
all_ids = set()
offset = None
batch_size = 1000
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Fetching Qdrant IDs...", total=None)
while True:
try:
payload = {
"limit": batch_size,
"with_payload": False,
"with_vector": False
}
if offset is not None:
payload["offset"] = offset
response = await client.post(
f"http://{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
json=payload,
timeout=60.0
)
response.raise_for_status()
result = response.json().get("result", {})
points = result.get("points", [])
if not points:
break
for point in points:
all_ids.add(point["id"])
offset = result.get("next_page_offset")
progress.update(task, description=f"Fetched {len(all_ids)} IDs...")
if offset is None:
break
except httpx.HTTPError as e:
console.print(f"[red]Error during ID fetch: {e}[/red]")
break
return all_ids
def get_all_hadith_ids_from_db() -> set:
"""Get all hadith IDs from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor() as cur:
cur.execute("SELECT id FROM hadiths ORDER BY id")
return {row[0] for row in cur.fetchall()}
finally:
conn.close()
def get_hadith_count_from_db() -> int:
"""Get total hadith count from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM hadiths")
return cur.fetchone()[0]
finally:
conn.close()
def get_embedding_stats_from_db() -> Dict:
"""Get embedding generation stats from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,
SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded
FROM hadiths
""")
return dict(cur.fetchone())
finally:
conn.close()
def get_collection_stats_by_source() -> List[Dict]:
"""Get hadith counts by collection/source."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
c.name_english as collection,
COUNT(h.id) as count,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY count DESC
""")
return [dict(row) for row in cur.fetchall()]
finally:
conn.close()
async def verify_embeddings() -> VerificationResult:
"""Main verification function."""
start_time = time.time()
console.print(Panel.fit(
"[bold blue]Hadith Embeddings Verification[/bold blue]\n"
f"Database: {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}\n"
f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}/{QDRANT_COLLECTION}",
title="Step 6.1"
))
# Step 1: Get PostgreSQL stats
console.print("\n[yellow]1. Checking PostgreSQL database...[/yellow]")
db_stats = get_embedding_stats_from_db()
total_hadiths = db_stats['total']
console.print(f" Total hadiths: [green]{total_hadiths:,}[/green]")
console.print(f" Marked as embedded: [green]{db_stats['embedded']:,}[/green]")
# Step 2: Get collection breakdown
console.print("\n[yellow]2. Collection breakdown:[/yellow]")
collection_stats = get_collection_stats_by_source()
table = Table(title="Hadiths by Collection")
table.add_column("Collection", style="cyan")
table.add_column("Total", justify="right")
table.add_column("Embedded", justify="right", style="green")
for stat in collection_stats:
table.add_row(
stat['collection'],
f"{stat['count']:,}",
f"{stat['embedded']:,}"
)
console.print(table)
# Step 3: Check Qdrant collection
console.print("\n[yellow]3. Checking Qdrant collection...[/yellow]")
async with httpx.AsyncClient(timeout=30.0) as client:
collection_info = await get_qdrant_collection_info(client)
if not collection_info:
return VerificationResult(
total_hadiths_db=total_hadiths,
total_embeddings_qdrant=0,
embeddings_with_payloads=0,
missing_embeddings=total_hadiths,
embedding_dimension=0,
collection_exists=False,
collection_status="NOT_FOUND",
sample_ids_missing=[],
verification_time_seconds=time.time() - start_time,
timestamp=datetime.now().isoformat()
)
result = collection_info.get("result", {})
status = result.get("status", "unknown")
vectors_config = result.get("config", {}).get("params", {}).get("vectors", {})
embedding_dim = vectors_config.get("size", 0)
console.print(f" Collection status: [green]{status}[/green]")
console.print(f" Embedding dimension: [green]{embedding_dim}[/green]")
# Step 4: Count Qdrant points
console.print("\n[yellow]4. Counting Qdrant embeddings...[/yellow]")
qdrant_count = await count_qdrant_points(client)
console.print(f" Total embeddings: [green]{qdrant_count:,}[/green]")
# Step 5: Find missing embeddings
console.print("\n[yellow]5. Identifying missing embeddings...[/yellow]")
db_ids = get_all_hadith_ids_from_db()
qdrant_ids = await get_all_qdrant_ids(client)
missing_ids = db_ids - qdrant_ids
extra_ids = qdrant_ids - db_ids
console.print(f" IDs in DB: [blue]{len(db_ids):,}[/blue]")
console.print(f" IDs in Qdrant: [blue]{len(qdrant_ids):,}[/blue]")
console.print(f" Missing embeddings: [{'red' if missing_ids else 'green'}]{len(missing_ids):,}[/{'red' if missing_ids else 'green'}]")
if extra_ids:
console.print(f" Extra IDs in Qdrant (orphaned): [yellow]{len(extra_ids):,}[/yellow]")
# Get sample of missing IDs
sample_missing = sorted(list(missing_ids))[:20] if missing_ids else []
# Step 6: Verify sample payload integrity
console.print("\n[yellow]6. Verifying payload integrity...[/yellow]")
sample_points = await get_qdrant_points_sample(client, limit=100)
payloads_with_data = sum(
1 for p in sample_points
if p.get("payload") and p["payload"].get("hadith_id")
)
console.print(f" Sample size: {len(sample_points)}")
console.print(f" With valid payloads: [green]{payloads_with_data}[/green]")
verification_time = time.time() - start_time
# Summary
console.print("\n" + "="*50)
console.print("[bold]VERIFICATION SUMMARY[/bold]")
console.print("="*50)
if len(missing_ids) == 0:
console.print("[bold green]✓ ALL EMBEDDINGS VERIFIED![/bold green]")
else:
console.print(f"[bold red]✗ {len(missing_ids):,} EMBEDDINGS MISSING[/bold red]")
if sample_missing:
console.print(f" Sample missing IDs: {sample_missing[:10]}")
console.print(f"\nVerification completed in {verification_time:.2f} seconds")
return VerificationResult(
total_hadiths_db=total_hadiths,
total_embeddings_qdrant=qdrant_count,
embeddings_with_payloads=payloads_with_data,
missing_embeddings=len(missing_ids),
embedding_dimension=embedding_dim,
collection_exists=True,
collection_status=status,
sample_ids_missing=sample_missing,
verification_time_seconds=verification_time,
timestamp=datetime.now().isoformat()
)
async def main():
"""Main entry point."""
result = await verify_embeddings()
# Save results to JSON
output_file = "verification_results.json"
with open(output_file, 'w') as f:
json.dump(asdict(result), f, indent=2)
console.print(f"\n[dim]Results saved to {output_file}[/dim]")
# Exit with error code if missing embeddings
if result.missing_embeddings > 0:
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())
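# Example usage:
#   POSTGRES_PASSWORD='...' python verify_embeddings.py
# The script exits with status 1 when embeddings are missing (see main() above),
# so it can be used to gate automated pipelines.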

View File

@ -0,0 +1,183 @@
# ============================================================================
# Step 6: Semantic Search API - Kubernetes Deployment
# ============================================================================
# Deploy: kubectl apply -f k8s-search-api.yaml
# ============================================================================
---
# Namespace (if not exists)
apiVersion: v1
kind: Namespace
metadata:
name: hadith
---
# ConfigMap for non-sensitive configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: search-api-config
namespace: hadith
data:
POSTGRES_HOST: "postgres.db.svc.cluster.local"
POSTGRES_PORT: "5432"
POSTGRES_DB: "hadith_db"
POSTGRES_USER: "hadith_ingest"
QDRANT_HOST: "qdrant.vector.svc.cluster.local"
QDRANT_PORT: "6333"
QDRANT_COLLECTION: "hadith_embeddings"
TEI_HOST: "tei.ml.svc.cluster.local"
TEI_PORT: "80"
---
# Secret for database password
apiVersion: v1
kind: Secret
metadata:
name: search-api-secrets
namespace: hadith
type: Opaque
stringData:
POSTGRES_PASSWORD: "CHANGE_ME_TO_YOUR_PASSWORD"
---
# Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: search-api
namespace: hadith
labels:
app: search-api
spec:
replicas: 2
selector:
matchLabels:
app: search-api
template:
metadata:
labels:
app: search-api
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
spec:
nodeSelector:
node: hetzner-2
containers:
- name: search-api
image: python:3.11-slim
command:
- /bin/bash
- -c
- |
pip install --no-cache-dir \
fastapi uvicorn httpx psycopg2-binary pydantic && \
python /app/search_api.py
ports:
- containerPort: 8080
name: http
envFrom:
- configMapRef:
name: search-api-config
- secretRef:
name: search-api-secrets
volumeMounts:
- name: app-code
mountPath: /app
resources:
requests:
cpu: "250m"
memory: "256Mi"
limits:
cpu: "1"
memory: "512Mi"
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 3
volumes:
- name: app-code
configMap:
name: search-api-code
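      # NOTE: the search-api-code ConfigMap is not defined in this manifest.
      # Create it from the script before deploying, for example:
      #   kubectl -n hadith create configmap search-api-code --from-file=search_api.py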
---
# Service
apiVersion: v1
kind: Service
metadata:
name: search-api
namespace: hadith
spec:
selector:
app: search-api
ports:
- name: http
port: 80
targetPort: 8080
type: ClusterIP
---
# Ingress
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: search-api
namespace: hadith
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
nginx.ingress.kubernetes.io/proxy-read-timeout: "60"
nginx.ingress.kubernetes.io/proxy-send-timeout: "60"
spec:
ingressClassName: nginx
tls:
- hosts:
- search.betelgeusebytes.io
secretName: search-api-tls
rules:
- host: search.betelgeusebytes.io
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: search-api
port:
number: 80
---
# HorizontalPodAutoscaler (optional)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: search-api-hpa
namespace: hadith
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: search-api
minReplicas: 2
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
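# Once deployed, a quick smoke test against the /health endpoint defined in search_api.py:
#   curl https://search.betelgeusebytes.io/health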

View File

@ -0,0 +1,19 @@
# Step 6: Verify Embeddings & Semantic Search
# Requirements for hadith-phase3-step6
# Database
psycopg2-binary>=2.9.9
# HTTP client
httpx>=0.27.0
# Rich console output
rich>=13.7.0
# Data handling
python-dateutil>=2.8.2
# Optional: for running as web API
fastapi>=0.111.0
uvicorn>=0.30.0
pydantic>=2.7.0

View File

@ -0,0 +1,225 @@
#!/bin/bash
# ============================================================================
# Step 6: Quick Test Runner
# ============================================================================
# Usage: ./run_tests.sh [verify|benchmark|demo|api|all]
# ============================================================================
set -e
# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration - Update these for your environment
export POSTGRES_HOST="${POSTGRES_HOST:-pg.betelgeusebytes.io}"
export POSTGRES_PORT="${POSTGRES_PORT:-5432}"
export POSTGRES_DB="${POSTGRES_DB:-hadith_db}"
export POSTGRES_USER="${POSTGRES_USER:-hadith_ingest}"
export POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-hadith_ingest}"
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
# export QDRANT_HOST="${QDRANT_HOST:-qdrant.vector.svc.cluster.local}"
# export QDRANT_PORT="${QDRANT_PORT:-6333}"
# export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"
# export TEI_HOST="${TEI_HOST:-tei.ml.svc.cluster.local}"
# export TEI_PORT="${TEI_PORT:-80}"
export QDRANT_HOST="${QDRANT_HOST:-https://vector.betelgeusebytes.io}"
export QDRANT_PORT="${QDRANT_PORT:-443}"
export QDRANT_COLLECTION="${QDRANT_COLLECTION:-hadith_embeddings}"
export TEI_HOST="${TEI_HOST:-https://embeddings.betelgeusebytes.io}"
export TEI_PORT="${TEI_PORT:-443}"
# Check if password is set
check_password() {
if [ -z "$POSTGRES_PASSWORD" ]; then
echo -e "${RED}Error: POSTGRES_PASSWORD environment variable is not set${NC}"
echo "Set it with: export POSTGRES_PASSWORD='your_password'"
exit 1
fi
}
# Install dependencies
install_deps() {
echo -e "${BLUE}Installing dependencies...${NC}"
pip install -q -r requirements.txt
echo -e "${GREEN}Dependencies installed.${NC}"
}
# Run verification
run_verify() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Embedding Verification...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
    # set -e would abort before the failure branch runs, so test the exit status inline
    if python verify_embeddings.py; then
        echo -e "\n${GREEN}✓ Verification passed!${NC}"
    else
        echo -e "\n${RED}✗ Verification failed - some embeddings are missing${NC}"
        exit 1
    fi
}
# Run benchmark
run_benchmark() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Semantic Search Benchmark...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
python semantic_search.py --mode benchmark --output benchmark_results.json
echo -e "\n${GREEN}✓ Benchmark complete. Results saved to benchmark_results.json${NC}"
}
# Run demo
run_demo() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running Search Demo...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
python semantic_search.py --mode demo
}
# Run API server
run_api() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Starting Search API Server...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
echo -e "${YELLOW}API will be available at: http://localhost:8080${NC}"
echo -e "${YELLOW}Swagger docs at: http://localhost:8080/docs${NC}"
echo -e "${YELLOW}Press Ctrl+C to stop${NC}\n"
python search_api.py
}
# Run SQL verification
run_sql() {
echo -e "\n${BLUE}═══════════════════════════════════════════════════${NC}"
echo -e "${BLUE}Running SQL Verification Queries...${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════${NC}\n"
PGPASSWORD="$POSTGRES_PASSWORD" psql \
-h "$POSTGRES_HOST" \
-p "$POSTGRES_PORT" \
-U "$POSTGRES_USER" \
-d "$POSTGRES_DB" \
-f verification_queries.sql
}
# Quick connectivity test
test_connectivity() {
echo -e "\n${BLUE}Testing Service Connectivity...${NC}\n"
# Test PostgreSQL
echo -n "PostgreSQL ($POSTGRES_HOST:$POSTGRES_PORT): "
if PGPASSWORD="$POSTGRES_PASSWORD" psql -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "SELECT 1" > /dev/null 2>&1; then
echo -e "${GREEN}✓ Connected${NC}"
else
echo -e "${RED}✗ Failed${NC}"
fi
    # Test Qdrant (host may already include a scheme, e.g. https://)
    echo -n "Qdrant ($QDRANT_HOST:$QDRANT_PORT): "
    if curl -s "$QDRANT_HOST:$QDRANT_PORT/collections" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi
    # Test TEI (host may already include a scheme, e.g. https://)
    echo -n "TEI ($TEI_HOST:$TEI_PORT): "
    if curl -s "$TEI_HOST:$TEI_PORT/health" > /dev/null 2>&1; then
        echo -e "${GREEN}✓ Connected${NC}"
    else
        echo -e "${RED}✗ Failed${NC}"
    fi
echo ""
}
# Show usage
show_usage() {
echo "Usage: $0 [command]"
echo ""
echo "Commands:"
echo " verify Run embedding verification"
echo " benchmark Run semantic search benchmark"
echo " demo Run search demo with sample queries"
echo " api Start the search API server"
echo " sql Run SQL verification queries"
echo " test Test connectivity to all services"
echo " all Run verify + benchmark + demo"
echo " install Install Python dependencies"
echo " help Show this help message"
echo ""
echo "Environment variables:"
echo " POSTGRES_HOST PostgreSQL host (default: pg.betelgeusebytes.io)"
echo " POSTGRES_PORT PostgreSQL port (default: 5432)"
echo " POSTGRES_DB Database name (default: hadith_db)"
echo " POSTGRES_USER Database user (default: hadith_ingest)"
echo " POSTGRES_PASSWORD Database password (required)"
echo " QDRANT_HOST Qdrant host (default: qdrant.vector.svc.cluster.local)"
echo " QDRANT_PORT Qdrant port (default: 6333)"
echo " TEI_HOST TEI host (default: tei.ml.svc.cluster.local)"
echo " TEI_PORT TEI port (default: 80)"
}
# Main
case "${1:-help}" in
verify)
check_password
install_deps
run_verify
;;
benchmark)
check_password
install_deps
run_benchmark
;;
demo)
check_password
install_deps
run_demo
;;
api)
check_password
install_deps
run_api
;;
sql)
check_password
run_sql
;;
test)
check_password
test_connectivity
;;
all)
check_password
install_deps
test_connectivity
run_verify
run_benchmark
run_demo
;;
install)
install_deps
;;
help|--help|-h)
show_usage
;;
*)
echo -e "${RED}Unknown command: $1${NC}"
show_usage
exit 1
;;
esac

View File

@ -0,0 +1,578 @@
#!/usr/bin/env python3
"""
Step 6.3: Semantic Search API Service
======================================
Production-ready FastAPI service for hadith semantic search.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import time
import logging
from datetime import datetime
from typing import List, Optional
from contextlib import asynccontextmanager
import sys
import httpx
import psycopg2
from psycopg2.pool import ThreadedConnectionPool
from psycopg2.extras import RealDictCursor
from fastapi import FastAPI, HTTPException, Query, Depends
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
if hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))
# ============================================================================
# Pydantic Models
# ============================================================================
class SearchQuery(BaseModel):
"""Search query input."""
query: str = Field(..., min_length=1, max_length=1000, description="Search query text")
limit: int = Field(default=10, ge=1, le=100, description="Number of results to return")
min_score: float = Field(default=0.0, ge=0.0, le=1.0, description="Minimum similarity score")
collections: Optional[List[str]] = Field(default=None, description="Filter by collection names")
grades: Optional[List[str]] = Field(default=None, description="Filter by hadith grades")
class HadithResult(BaseModel):
"""Individual hadith search result."""
hadith_id: int
score: float
collection: str
book: Optional[str]
hadith_number: str
arabic_text: Optional[str]
arabic_normalized: Optional[str]
english_text: Optional[str]
urdu_text: Optional[str]
grade: Optional[str]
class SearchResponse(BaseModel):
"""Search response."""
query: str
results: List[HadithResult]
total_results: int
embedding_time_ms: float
search_time_ms: float
total_time_ms: float
timestamp: str
class HealthResponse(BaseModel):
"""Health check response."""
status: str
database: str
qdrant: str
tei: str
timestamp: str
class CollectionStats(BaseModel):
"""Collection statistics."""
name: str
total_hadiths: int
embedded_count: int
class StatsResponse(BaseModel):
"""Statistics response."""
total_hadiths: int
total_embedded: int
collections: List[CollectionStats]
timestamp: str
# ============================================================================
# Database Pool & Connections
# ============================================================================
db_pool: Optional[ThreadedConnectionPool] = None
http_client: Optional[httpx.AsyncClient] = None
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Manage application lifecycle."""
global db_pool, http_client
# Startup
logger.info("Starting up semantic search service...")
# Initialize database pool
try:
db_pool = ThreadedConnectionPool(
minconn=2,
maxconn=10,
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
logger.info("Database pool initialized")
except Exception as e:
logger.error(f"Failed to initialize database pool: {e}")
db_pool = None
# Initialize HTTP client
http_client = httpx.AsyncClient(timeout=30.0)
logger.info("HTTP client initialized")
yield
# Shutdown
logger.info("Shutting down...")
if db_pool:
db_pool.closeall()
if http_client:
await http_client.aclose()
# ============================================================================
# FastAPI App
# ============================================================================
app = FastAPI(
title="Hadith Semantic Search API",
description="Semantic search service for Islamic hadith literature",
version="1.0.0",
lifespan=lifespan
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ============================================================================
# Helper Functions
# ============================================================================
def get_db_connection():
"""Get database connection from pool."""
if db_pool is None:
raise HTTPException(status_code=503, detail="Database pool not available")
return db_pool.getconn()
def release_db_connection(conn):
"""Return connection to pool."""
if db_pool and conn:
db_pool.putconn(conn)
async def get_embedding(text: str) -> tuple[List[float], float]:
"""Get embedding from TEI service."""
start = time.perf_counter()
try:
response = await http_client.post(
f"{TEI_HOST}:{TEI_PORT}/embed",
json={"inputs": text}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
embeddings = response.json()
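        # TEI returns a list of vectors (one per input); the checks below handle both nested [[...]] and flat [...] shapes.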
if isinstance(embeddings, list) and len(embeddings) > 0:
if isinstance(embeddings[0], list):
return embeddings[0], elapsed_ms
return embeddings, elapsed_ms
raise ValueError("Unexpected embedding format")
except httpx.HTTPError as e:
logger.error(f"TEI request failed: {e}")
raise HTTPException(status_code=503, detail=f"Embedding service error: {e}")
async def search_qdrant(
embedding: List[float],
limit: int = 10,
min_score: float = 0.0,
filters: Optional[dict] = None
) -> tuple[List[dict], float]:
"""Search Qdrant with embedding vector."""
start = time.perf_counter()
try:
payload = {
"vector": embedding,
"limit": limit,
"with_payload": True,
"score_threshold": min_score
}
if filters:
payload["filter"] = filters
response = await http_client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
json=payload
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
results = response.json().get("result", [])
return results, elapsed_ms
except httpx.HTTPError as e:
logger.error(f"Qdrant request failed: {e}")
raise HTTPException(status_code=503, detail=f"Vector search service error: {e}")
def enrich_results_from_db(hadith_ids: List[int]) -> dict[int, dict]:
"""Fetch full hadith data from PostgreSQL."""
if not hadith_ids:
return {}
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.arabic_normalized,
h.english_text,
h.urdu_text,
h.grade,
c.name_english as collection_name,
b.name_english as book_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.id = ANY(%s)
""", (hadith_ids,))
return {row['id']: dict(row) for row in cur.fetchall()}
finally:
release_db_connection(conn)
def build_qdrant_filter(collections: Optional[List[str]], grades: Optional[List[str]]) -> Optional[dict]:
"""Build Qdrant filter from parameters."""
conditions = []
if collections:
conditions.append({
"key": "collection",
"match": {"any": collections}
})
if grades:
conditions.append({
"key": "grade",
"match": {"any": grades}
})
if not conditions:
return None
    return {"must": conditions}
# ============================================================================
# API Endpoints
# ============================================================================
@app.get("/health", response_model=HealthResponse)
async def health_check():
"""Check health of all services."""
db_status = "healthy"
qdrant_status = "healthy"
tei_status = "healthy"
# Check database
try:
conn = get_db_connection()
with conn.cursor() as cur:
cur.execute("SELECT 1")
release_db_connection(conn)
except Exception as e:
db_status = f"unhealthy: {e}"
# Check Qdrant
try:
response = await http_client.get(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
)
if response.status_code != 200:
qdrant_status = f"unhealthy: status {response.status_code}"
except Exception as e:
qdrant_status = f"unhealthy: {e}"
# Check TEI
try:
response = await http_client.get(f"{TEI_HOST}:{TEI_PORT}/health")
if response.status_code != 200:
tei_status = f"unhealthy: status {response.status_code}"
except Exception as e:
tei_status = f"unhealthy: {e}"
overall = "healthy" if all(
s == "healthy" for s in [db_status, qdrant_status, tei_status]
) else "degraded"
return HealthResponse(
status=overall,
database=db_status,
qdrant=qdrant_status,
tei=tei_status,
timestamp=datetime.now().isoformat()
)
@app.get("/stats", response_model=StatsResponse)
async def get_stats():
"""Get database statistics."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
# Total counts
cur.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded
FROM hadiths
""")
totals = cur.fetchone()
# By collection
cur.execute("""
SELECT
c.name_english as name,
COUNT(h.id) as total_hadiths,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded_count
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total_hadiths DESC
""")
collections = [CollectionStats(**dict(row)) for row in cur.fetchall()]
return StatsResponse(
total_hadiths=totals['total'],
total_embedded=totals['embedded'],
collections=collections,
timestamp=datetime.now().isoformat()
)
finally:
release_db_connection(conn)
@app.post("/search", response_model=SearchResponse)
async def semantic_search(query: SearchQuery):
"""Perform semantic search on hadiths."""
total_start = time.perf_counter()
# Get embedding
embedding, embed_time = await get_embedding(query.query)
# Build filters
filters = build_qdrant_filter(query.collections, query.grades)
# Search Qdrant
qdrant_results, search_time = await search_qdrant(
embedding,
limit=query.limit,
min_score=query.min_score,
filters=filters
)
# Extract hadith IDs
hadith_ids = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid:
hadith_ids.append(int(hid))
# Enrich from database
db_data = enrich_results_from_db(hadith_ids)
# Build results
results = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid and int(hid) in db_data:
data = db_data[int(hid)]
results.append(HadithResult(
hadith_id=int(hid),
score=r.get("score", 0),
collection=data.get("collection_name", "Unknown"),
book=data.get("book_name"),
hadith_number=data.get("hadith_number", ""),
arabic_text=data.get("arabic_text"),
arabic_normalized=data.get("arabic_normalized"),
english_text=data.get("english_text"),
urdu_text=data.get("urdu_text"),
grade=data.get("grade")
))
else:
# Fallback to payload
results.append(HadithResult(
hadith_id=int(hid) if hid else 0,
score=r.get("score", 0),
collection=payload.get("collection", "Unknown"),
book=payload.get("book"),
hadith_number=str(payload.get("hadith_number", "")),
arabic_text=payload.get("arabic_text"),
arabic_normalized=payload.get("arabic_normalized"),
english_text=payload.get("english_text"),
urdu_text=payload.get("urdu_text"),
grade=payload.get("grade")
))
total_time = (time.perf_counter() - total_start) * 1000
return SearchResponse(
query=query.query,
results=results,
total_results=len(results),
embedding_time_ms=embed_time,
search_time_ms=search_time,
total_time_ms=total_time,
timestamp=datetime.now().isoformat()
)
@app.get("/search", response_model=SearchResponse)
async def semantic_search_get(
q: str = Query(..., min_length=1, max_length=1000, description="Search query"),
limit: int = Query(default=10, ge=1, le=100),
min_score: float = Query(default=0.0, ge=0.0, le=1.0)
):
"""GET version of semantic search for simple queries."""
query = SearchQuery(query=q, limit=limit, min_score=min_score)
return await semantic_search(query)
@app.get("/hadith/{hadith_id}")
async def get_hadith(hadith_id: int):
"""Get a specific hadith by ID."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.arabic_normalized,
h.english_text,
h.urdu_text,
h.grade,
h.source_metadata,
h.embedding_generated,
h.entities_extracted,
h.relations_extracted,
h.created_at,
h.updated_at,
c.name_english as collection_name,
c.name_arabic as collection_arabic,
b.name_english as book_name,
b.name_arabic as book_arabic
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
WHERE h.id = %s
""", (hadith_id,))
row = cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")
return dict(row)
finally:
release_db_connection(conn)
@app.get("/similar/{hadith_id}", response_model=SearchResponse)
async def find_similar(
hadith_id: int,
limit: int = Query(default=10, ge=1, le=100)
):
"""Find hadiths similar to a given hadith."""
# Get the hadith text
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT arabic_text, english_text
FROM hadiths
WHERE id = %s
""", (hadith_id,))
row = cur.fetchone()
if not row:
raise HTTPException(status_code=404, detail=f"Hadith {hadith_id} not found")
            # Prefer the Arabic text; fall back to English
text = row['arabic_text'] or row['english_text']
if not text:
raise HTTPException(status_code=400, detail="Hadith has no text content")
finally:
release_db_connection(conn)
# Search for similar hadiths
query = SearchQuery(query=text, limit=limit + 1) # +1 to exclude self
response = await semantic_search(query)
# Filter out the source hadith
response.results = [r for r in response.results if r.hadith_id != hadith_id][:limit]
response.total_results = len(response.results)
return response
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8080)
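# Example usage (sketch, assuming the service is reachable at http://localhost:8080 as started above):
#   import httpx
#   resp = httpx.post("http://localhost:8080/search",
#                     json={"query": "five daily prayers", "limit": 5})
#   print(resp.json()["results"])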

View File

@@ -0,0 +1,677 @@
#!/usr/bin/env python3
"""
Step 6.2: Semantic Search Testing & Benchmarking
=================================================
Tests semantic search functionality and benchmarks performance.
Target: <500ms per query.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import sys
import json
import time
import asyncio
import statistics
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict, field
import httpx
import psycopg2
from psycopg2.extras import RealDictCursor
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
if hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
TEI_HOST = os.getenv("TEI_HOST", "https://embeddings.betelgeusebytes.io")
TEI_PORT = int(os.getenv("TEI_PORT", "443"))
console = Console()
# ============================================================================
# Sample Queries for Testing
# ============================================================================
SAMPLE_QUERIES = {
"arabic": [
{
"query": "الصلاة في المسجد الحرام",
"description": "Prayer in the Sacred Mosque",
"expected_topics": ["prayer", "mosque", "mecca"]
},
{
"query": "أبو هريرة رضي الله عنه",
"description": "Abu Hurairah (RA)",
"expected_topics": ["narrator", "companion"]
},
{
"query": "الصيام في شهر رمضان",
"description": "Fasting in Ramadan",
"expected_topics": ["fasting", "ramadan"]
},
{
"query": "الزكاة والصدقة",
"description": "Zakat and charity",
"expected_topics": ["charity", "zakat"]
},
{
"query": "الحج والعمرة",
"description": "Hajj and Umrah",
"expected_topics": ["pilgrimage", "hajj", "umrah"]
},
{
"query": "النبي صلى الله عليه وسلم في المدينة",
"description": "Prophet (PBUH) in Medina",
"expected_topics": ["prophet", "medina"]
},
{
"query": "الوضوء والطهارة",
"description": "Ablution and purification",
"expected_topics": ["ablution", "purification", "wudu"]
},
{
"query": "بر الوالدين",
"description": "Honoring parents",
"expected_topics": ["parents", "kindness", "family"]
},
{
"query": "الجنة والنار",
"description": "Paradise and Hell",
"expected_topics": ["afterlife", "paradise", "hell"]
},
{
"query": "الإيمان والإسلام",
"description": "Faith and Islam",
"expected_topics": ["faith", "belief", "islam"]
}
],
"english": [
{
"query": "five daily prayers",
"description": "The five obligatory prayers",
"expected_topics": ["prayer", "salah", "obligation"]
},
{
"query": "Prophet Muhammad in Mecca",
"description": "Prophet's life in Mecca",
"expected_topics": ["prophet", "mecca", "biography"]
},
{
"query": "treatment of neighbors",
"description": "Rights and treatment of neighbors",
"expected_topics": ["neighbors", "rights", "ethics"]
},
{
"query": "patience during hardship",
"description": "Patience in difficult times",
"expected_topics": ["patience", "sabr", "trials"]
},
{
"query": "marriage and family",
"description": "Islamic marriage guidance",
"expected_topics": ["marriage", "family", "nikah"]
},
{
"query": "honesty and truthfulness",
"description": "Importance of being truthful",
"expected_topics": ["honesty", "truth", "character"]
},
{
"query": "Day of Judgment signs",
"description": "Signs of the Last Day",
"expected_topics": ["judgment", "signs", "eschatology"]
},
{
"query": "charity and helping poor",
"description": "Giving charity to the needy",
"expected_topics": ["charity", "poor", "sadaqah"]
},
{
"query": "companions of the Prophet",
"description": "Sahaba and their virtues",
"expected_topics": ["companions", "sahaba", "virtue"]
},
{
"query": "seeking knowledge in Islam",
"description": "Importance of knowledge",
"expected_topics": ["knowledge", "learning", "education"]
}
],
"mixed": [
{
"query": "قال رسول الله about kindness",
"description": "Prophet's sayings about kindness (mixed)",
"expected_topics": ["prophet", "kindness", "ethics"]
},
{
"query": "women rights الإسلام",
"description": "Women's rights in Islam (mixed)",
"expected_topics": ["women", "rights", "islam"]
}
]
}
@dataclass
class SearchResult:
"""Individual search result."""
hadith_id: int
score: float
collection: str
hadith_number: str
arabic_text: str
english_text: str
grade: str
@dataclass
class QueryBenchmark:
"""Benchmark results for a single query."""
query: str
language: str
description: str
embedding_time_ms: float
search_time_ms: float
total_time_ms: float
results_count: int
top_score: float
meets_target: bool # <500ms
@dataclass
class BenchmarkReport:
"""Full benchmark report."""
total_queries: int
successful_queries: int
failed_queries: int
avg_embedding_time_ms: float
avg_search_time_ms: float
avg_total_time_ms: float
p50_time_ms: float
p95_time_ms: float
p99_time_ms: float
min_time_ms: float
max_time_ms: float
queries_meeting_target: int
target_ms: int
query_results: List[QueryBenchmark] = field(default_factory=list)
timestamp: str = ""
def get_db_connection():
"""Create PostgreSQL connection."""
return psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
async def get_embedding(client: httpx.AsyncClient, text: str) -> Tuple[List[float], float]:
"""Get embedding from TEI service."""
start = time.perf_counter()
response = await client.post(
f"{TEI_HOST}:{TEI_PORT}/embed",
json={"inputs": text}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
    # TEI returns a list of embeddings; we want the first one
embeddings = response.json()
if isinstance(embeddings, list) and len(embeddings) > 0:
if isinstance(embeddings[0], list):
return embeddings[0], elapsed_ms
return embeddings, elapsed_ms
raise ValueError(f"Unexpected embedding response format: {type(embeddings)}")
async def search_qdrant(
client: httpx.AsyncClient,
embedding: List[float],
limit: int = 10
) -> Tuple[List[Dict], float]:
"""Search Qdrant with embedding vector."""
start = time.perf_counter()
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/search",
json={
"vector": embedding,
"limit": limit,
"with_payload": True
}
)
response.raise_for_status()
elapsed_ms = (time.perf_counter() - start) * 1000
results = response.json().get("result", [])
return results, elapsed_ms
def enrich_results_from_db(hadith_ids: List[int]) -> Dict[int, Dict]:
"""Fetch full hadith data from PostgreSQL."""
if not hadith_ids:
return {}
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
h.id,
h.hadith_number,
h.arabic_text,
h.english_text,
h.grade,
c.name_english as collection_name
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE h.id = ANY(%s)
""", (hadith_ids,))
return {row['id']: dict(row) for row in cur.fetchall()}
finally:
conn.close()
async def semantic_search(
client: httpx.AsyncClient,
query: str,
limit: int = 10
) -> Tuple[List[SearchResult], float, float]:
"""Perform semantic search and return results with timing."""
# Step 1: Get embedding
embedding, embed_time = await get_embedding(client, query)
# Step 2: Search Qdrant
qdrant_results, search_time = await search_qdrant(client, embedding, limit)
# Step 3: Get hadith IDs and enrich from DB
hadith_ids = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid:
hadith_ids.append(int(hid))
db_data = enrich_results_from_db(hadith_ids)
# Step 4: Build results
results = []
for r in qdrant_results:
payload = r.get("payload", {})
hid = payload.get("hadith_id") or payload.get("id")
if hid and int(hid) in db_data:
data = db_data[int(hid)]
results.append(SearchResult(
hadith_id=int(hid),
score=r.get("score", 0),
collection=data.get("collection_name", "Unknown"),
hadith_number=data.get("hadith_number", ""),
arabic_text=data.get("arabic_text", "")[:200] + "..." if data.get("arabic_text") else "",
english_text=data.get("english_text", "")[:200] + "..." if data.get("english_text") else "",
grade=data.get("grade", "")
))
else:
# Fallback to payload data
results.append(SearchResult(
hadith_id=int(hid) if hid else 0,
score=r.get("score", 0),
collection=payload.get("collection", "Unknown"),
hadith_number=str(payload.get("hadith_number", "")),
arabic_text=payload.get("arabic_text", "")[:200] + "..." if payload.get("arabic_text") else "",
english_text=payload.get("english_text", "")[:200] + "..." if payload.get("english_text") else "",
grade=payload.get("grade", "")
))
return results, embed_time, search_time
def display_search_results(query: str, results: List[SearchResult], embed_time: float, search_time: float):
"""Display search results in a nice format."""
total_time = embed_time + search_time
console.print(f"\n[bold cyan]Query:[/bold cyan] {query}")
console.print(f"[dim]Embedding: {embed_time:.1f}ms | Search: {search_time:.1f}ms | Total: {total_time:.1f}ms[/dim]")
if not results:
console.print("[yellow]No results found.[/yellow]")
return
table = Table(title=f"Top {len(results)} Results", show_lines=True)
table.add_column("#", style="dim", width=3)
table.add_column("Score", justify="right", width=8)
table.add_column("Collection", width=15)
table.add_column("Hadith #", width=10)
table.add_column("Text Preview", width=60)
table.add_column("Grade", width=10)
for i, r in enumerate(results, 1):
text_preview = r.english_text if r.english_text else r.arabic_text
table.add_row(
str(i),
f"{r.score:.4f}",
r.collection,
r.hadith_number,
text_preview[:80] + "..." if len(text_preview) > 80 else text_preview,
r.grade or "-"
)
console.print(table)
async def run_benchmarks(warmup_count: int = 3) -> BenchmarkReport:
"""Run full benchmark suite."""
console.print(Panel.fit(
"[bold blue]Semantic Search Benchmark[/bold blue]\n"
f"Target: <500ms per query\n"
f"TEI: {TEI_HOST}:{TEI_PORT}\n"
f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}",
title="Step 6.2"
))
all_queries = (
[(q, "arabic") for q in SAMPLE_QUERIES["arabic"]] +
[(q, "english") for q in SAMPLE_QUERIES["english"]] +
[(q, "mixed") for q in SAMPLE_QUERIES["mixed"]]
)
query_results = []
total_times = []
successful = 0
failed = 0
async with httpx.AsyncClient(timeout=30.0) as client:
# Warmup queries
console.print(f"\n[yellow]Running {warmup_count} warmup queries...[/yellow]")
for i in range(warmup_count):
try:
await semantic_search(client, "test warmup query", limit=5)
except Exception as e:
console.print(f"[dim]Warmup {i+1} error: {e}[/dim]")
console.print("[green]Warmup complete.[/green]\n")
# Run benchmarks
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
console=console
) as progress:
task = progress.add_task("Running benchmarks...", total=len(all_queries))
for query_data, lang in all_queries:
query = query_data["query"]
description = query_data["description"]
try:
results, embed_time, search_time = await semantic_search(
client, query, limit=10
)
total_time = embed_time + search_time
total_times.append(total_time)
benchmark = QueryBenchmark(
query=query,
language=lang,
description=description,
embedding_time_ms=embed_time,
search_time_ms=search_time,
total_time_ms=total_time,
results_count=len(results),
top_score=results[0].score if results else 0,
meets_target=total_time < 500
)
query_results.append(benchmark)
successful += 1
except Exception as e:
console.print(f"[red]Error for query '{query[:30]}...': {e}[/red]")
failed += 1
progress.advance(task)
# Calculate statistics
if total_times:
sorted_times = sorted(total_times)
p50_idx = int(len(sorted_times) * 0.50)
p95_idx = int(len(sorted_times) * 0.95)
p99_idx = int(len(sorted_times) * 0.99)
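        # Nearest-rank percentile indices; p95/p99 are clamped below to the slowest sample if the index overflows.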
report = BenchmarkReport(
total_queries=len(all_queries),
successful_queries=successful,
failed_queries=failed,
avg_embedding_time_ms=statistics.mean(q.embedding_time_ms for q in query_results),
avg_search_time_ms=statistics.mean(q.search_time_ms for q in query_results),
avg_total_time_ms=statistics.mean(total_times),
p50_time_ms=sorted_times[p50_idx],
p95_time_ms=sorted_times[p95_idx] if p95_idx < len(sorted_times) else sorted_times[-1],
p99_time_ms=sorted_times[p99_idx] if p99_idx < len(sorted_times) else sorted_times[-1],
min_time_ms=min(total_times),
max_time_ms=max(total_times),
queries_meeting_target=sum(1 for t in total_times if t < 500),
target_ms=500,
query_results=query_results,
timestamp=datetime.now().isoformat()
)
else:
report = BenchmarkReport(
total_queries=len(all_queries),
successful_queries=0,
failed_queries=failed,
avg_embedding_time_ms=0,
avg_search_time_ms=0,
avg_total_time_ms=0,
p50_time_ms=0,
p95_time_ms=0,
p99_time_ms=0,
min_time_ms=0,
max_time_ms=0,
queries_meeting_target=0,
target_ms=500,
query_results=[],
timestamp=datetime.now().isoformat()
)
return report
def display_benchmark_report(report: BenchmarkReport):
"""Display benchmark report."""
console.print("\n" + "="*60)
console.print("[bold]BENCHMARK RESULTS[/bold]")
console.print("="*60)
# Summary stats
console.print(f"\n[cyan]Query Statistics:[/cyan]")
console.print(f" Total queries: {report.total_queries}")
console.print(f" Successful: [green]{report.successful_queries}[/green]")
console.print(f" Failed: [red]{report.failed_queries}[/red]")
console.print(f"\n[cyan]Timing Statistics:[/cyan]")
console.print(f" Average embedding time: {report.avg_embedding_time_ms:.1f}ms")
console.print(f" Average search time: {report.avg_search_time_ms:.1f}ms")
console.print(f" Average total time: {report.avg_total_time_ms:.1f}ms")
console.print(f"\n[cyan]Percentiles:[/cyan]")
console.print(f" P50: {report.p50_time_ms:.1f}ms")
console.print(f" P95: {report.p95_time_ms:.1f}ms")
console.print(f" P99: {report.p99_time_ms:.1f}ms")
console.print(f" Min: {report.min_time_ms:.1f}ms")
console.print(f" Max: {report.max_time_ms:.1f}ms")
# Target check
target_pct = (report.queries_meeting_target / report.successful_queries * 100) if report.successful_queries else 0
target_met = target_pct >= 95 # 95% of queries should meet target
console.print(f"\n[cyan]Performance Target (<{report.target_ms}ms):[/cyan]")
status = "[bold green]✓ TARGET MET[/bold green]" if target_met else "[bold red]✗ TARGET NOT MET[/bold red]"
console.print(f" Queries meeting target: {report.queries_meeting_target}/{report.successful_queries} ({target_pct:.1f}%)")
console.print(f" Status: {status}")
# Detailed results table
if report.query_results:
console.print("\n[cyan]Detailed Results:[/cyan]")
table = Table(show_lines=False)
table.add_column("Language", width=8)
table.add_column("Query", width=35)
table.add_column("Embed", justify="right", width=8)
table.add_column("Search", justify="right", width=8)
table.add_column("Total", justify="right", width=8)
table.add_column("Results", justify="right", width=7)
table.add_column("Status", width=6)
for r in report.query_results:
status_icon = "" if r.meets_target else ""
status_style = "green" if r.meets_target else "red"
table.add_row(
r.language,
r.query[:35] + "..." if len(r.query) > 35 else r.query,
f"{r.embedding_time_ms:.0f}ms",
f"{r.search_time_ms:.0f}ms",
f"{r.total_time_ms:.0f}ms",
str(r.results_count),
f"[{status_style}]{status_icon}[/{status_style}]"
)
console.print(table)
async def interactive_search():
"""Interactive search mode."""
console.print(Panel.fit(
"[bold blue]Interactive Semantic Search[/bold blue]\n"
"Type your query and press Enter. Type 'quit' to exit.",
title="Interactive Mode"
))
async with httpx.AsyncClient(timeout=30.0) as client:
while True:
try:
query = input("\n🔍 Query: ").strip()
if query.lower() in ('quit', 'exit', 'q'):
console.print("[dim]Goodbye![/dim]")
break
if not query:
continue
results, embed_time, search_time = await semantic_search(
client, query, limit=10
)
display_search_results(query, results, embed_time, search_time)
except KeyboardInterrupt:
console.print("\n[dim]Interrupted. Goodbye![/dim]")
break
except Exception as e:
console.print(f"[red]Error: {e}[/red]")
async def main():
"""Main entry point."""
import argparse
parser = argparse.ArgumentParser(description="Hadith Semantic Search Testing")
parser.add_argument("--mode", choices=["benchmark", "interactive", "demo"],
default="benchmark", help="Run mode")
parser.add_argument("--query", type=str, help="Single query to run")
parser.add_argument("--output", type=str, default="benchmark_results.json",
help="Output file for benchmark results")
args = parser.parse_args()
if args.query:
# Single query mode
async with httpx.AsyncClient(timeout=30.0) as client:
results, embed_time, search_time = await semantic_search(
client, args.query, limit=10
)
display_search_results(args.query, results, embed_time, search_time)
elif args.mode == "benchmark":
# Full benchmark
report = await run_benchmarks()
display_benchmark_report(report)
# Save results
with open(args.output, 'w') as f:
json.dump(asdict(report), f, indent=2, default=str)
console.print(f"\n[dim]Results saved to {args.output}[/dim]")
elif args.mode == "interactive":
await interactive_search()
elif args.mode == "demo":
# Demo with a few sample queries
console.print(Panel.fit(
"[bold blue]Semantic Search Demo[/bold blue]",
title="Demo Mode"
))
demo_queries = [
"الصلاة في المسجد",
"five daily prayers",
"patience during hardship",
"بر الوالدين"
]
async with httpx.AsyncClient(timeout=30.0) as client:
for query in demo_queries:
try:
results, embed_time, search_time = await semantic_search(
client, query, limit=5
)
display_search_results(query, results, embed_time, search_time)
except Exception as e:
console.print(f"[red]Error for '{query}': {e}[/red]")
if __name__ == "__main__":
asyncio.run(main())
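# Example invocations (sketch; substitute the actual filename this script is saved as):
#   python this_script.py --mode benchmark --output benchmark_results.json
#   python this_script.py --query "five daily prayers"
#   python this_script.py --mode interactive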

View File

@@ -0,0 +1,476 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 6: Verify Embeddings & Test Semantic Search\n",
"\n",
"This notebook provides interactive verification and testing of the hadith embedding system.\n",
"\n",
"**Prerequisites:**\n",
"- PostgreSQL accessible at pg.betelgeusebytes.io\n",
"- Qdrant accessible at qdrant.vector.svc.cluster.local\n",
"- TEI accessible at tei.ml.svc.cluster.local"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Setup & Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies\n",
"!pip install -q psycopg2-binary httpx rich"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import time\n",
"import httpx\n",
"import psycopg2\n",
"from psycopg2.extras import RealDictCursor\n",
"import pandas as pd\n",
"from IPython.display import display, HTML, Markdown\n",
"\n",
"# Configuration\n",
"POSTGRES_CONFIG = {\n",
" 'host': os.getenv('POSTGRES_HOST', 'pg.betelgeusebytes.io'),\n",
" 'port': int(os.getenv('POSTGRES_PORT', '5432')),\n",
" 'database': os.getenv('POSTGRES_DB', 'hadith_db'),\n",
" 'user': os.getenv('POSTGRES_USER', 'hadith_ingest'),\n",
" 'password': os.getenv('POSTGRES_PASSWORD', ''), # SET THIS!\n",
" 'sslmode': 'require'\n",
"}\n",
"\n",
"QDRANT_URL = f\"http://{os.getenv('QDRANT_HOST', 'qdrant.vector.svc.cluster.local')}:{os.getenv('QDRANT_PORT', '6333')}\"\n",
"QDRANT_COLLECTION = os.getenv('QDRANT_COLLECTION', 'hadith_embeddings')\n",
"\n",
"TEI_URL = f\"http://{os.getenv('TEI_HOST', 'tei.ml.svc.cluster.local')}:{os.getenv('TEI_PORT', '80')}\"\n",
"\n",
"print(f\"PostgreSQL: {POSTGRES_CONFIG['host']}:{POSTGRES_CONFIG['port']}/{POSTGRES_CONFIG['database']}\")\n",
"print(f\"Qdrant: {QDRANT_URL}\")\n",
"print(f\"TEI: {TEI_URL}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ⚠️ SET YOUR PASSWORD HERE\n",
"POSTGRES_CONFIG['password'] = 'YOUR_PASSWORD_HERE' # CHANGE THIS!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Database Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_db_connection():\n",
" return psycopg2.connect(**POSTGRES_CONFIG)\n",
"\n",
"def run_query(query):\n",
" conn = get_db_connection()\n",
" try:\n",
" df = pd.read_sql(query, conn)\n",
" return df\n",
" finally:\n",
" conn.close()\n",
"\n",
"# Test connection\n",
"try:\n",
" conn = get_db_connection()\n",
" conn.close()\n",
" print(\"✅ Database connection successful!\")\n",
"except Exception as e:\n",
" print(f\"❌ Database connection failed: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get total hadith count and embedding status\n",
"query = \"\"\"\n",
"SELECT \n",
" COUNT(*) as total_hadiths,\n",
" SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
" SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded,\n",
" ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) as pct_complete\n",
"FROM hadiths\n",
"\"\"\"\n",
"\n",
"df = run_query(query)\n",
"display(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get breakdown by collection\n",
"query = \"\"\"\n",
"SELECT \n",
" c.name_english as collection,\n",
" COUNT(h.id) as total,\n",
" SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded,\n",
" ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) as pct_embedded\n",
"FROM hadiths h\n",
"JOIN collections c ON h.collection_id = c.id\n",
"GROUP BY c.id, c.name_english\n",
"ORDER BY total DESC\n",
"\"\"\"\n",
"\n",
"df_collections = run_query(query)\n",
"display(df_collections)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Qdrant Verification"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check Qdrant collection\n",
"with httpx.Client(timeout=30.0) as client:\n",
" try:\n",
" response = client.get(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}\")\n",
" response.raise_for_status()\n",
" collection_info = response.json()\n",
" print(\"✅ Qdrant collection found!\")\n",
" print(f\"\\nCollection status: {collection_info['result']['status']}\")\n",
" print(f\"Vector dimension: {collection_info['result']['config']['params']['vectors']['size']}\")\n",
" except Exception as e:\n",
" print(f\"❌ Qdrant error: {e}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count points in Qdrant\n",
"with httpx.Client(timeout=30.0) as client:\n",
" response = client.post(\n",
" f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\",\n",
" json={\"exact\": True}\n",
" )\n",
" response.raise_for_status()\n",
" count = response.json()['result']['count']\n",
" print(f\"\\n📊 Total embeddings in Qdrant: {count:,}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. TEI Service Test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test TEI embedding service\n",
"test_text = \"الصلاة في المسجد الحرام\"\n",
"\n",
"with httpx.Client(timeout=30.0) as client:\n",
" start = time.perf_counter()\n",
" response = client.post(\n",
" f\"{TEI_URL}/embed\",\n",
" json={\"inputs\": test_text}\n",
" )\n",
" elapsed = (time.perf_counter() - start) * 1000\n",
" \n",
" response.raise_for_status()\n",
" embedding = response.json()[0]\n",
" \n",
" print(f\"✅ TEI service working!\")\n",
" print(f\"\\nTest text: {test_text}\")\n",
" print(f\"Embedding dimension: {len(embedding)}\")\n",
" print(f\"Time: {elapsed:.1f}ms\")\n",
" print(f\"First 5 values: {embedding[:5]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Semantic Search Testing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def semantic_search(query_text, limit=10):\n",
" \"\"\"Perform semantic search and return results with timing.\"\"\"\n",
" with httpx.Client(timeout=30.0) as client:\n",
" # Get embedding\n",
" start = time.perf_counter()\n",
" embed_response = client.post(f\"{TEI_URL}/embed\", json={\"inputs\": query_text})\n",
" embed_response.raise_for_status()\n",
" embedding = embed_response.json()[0]\n",
" embed_time = (time.perf_counter() - start) * 1000\n",
" \n",
" # Search Qdrant\n",
" start = time.perf_counter()\n",
" search_response = client.post(\n",
" f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search\",\n",
" json={\"vector\": embedding, \"limit\": limit, \"with_payload\": True}\n",
" )\n",
" search_response.raise_for_status()\n",
" results = search_response.json()['result']\n",
" search_time = (time.perf_counter() - start) * 1000\n",
" \n",
" return results, embed_time, search_time\n",
"\n",
"def display_results(query, results, embed_time, search_time):\n",
" \"\"\"Display search results nicely.\"\"\"\n",
" total_time = embed_time + search_time\n",
" status = \"✅\" if total_time < 500 else \"⚠️\"\n",
" \n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Query: {query}\")\n",
" print(f\"Timing: {embed_time:.0f}ms (embed) + {search_time:.0f}ms (search) = {total_time:.0f}ms {status}\")\n",
" print(f\"{'='*60}\\n\")\n",
" \n",
" for i, r in enumerate(results, 1):\n",
" score = r['score']\n",
" payload = r.get('payload', {})\n",
" \n",
" text = payload.get('english_text') or payload.get('arabic_text', '')\n",
" text = text[:150] + '...' if len(text) > 150 else text\n",
" \n",
" print(f\"{i}. [Score: {score:.4f}] {payload.get('collection', 'Unknown')} #{payload.get('hadith_number', 'N/A')}\")\n",
" print(f\" {text}\")\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test Arabic query\n",
"query = \"الصلاة في المسجد الحرام\"\n",
"results, embed_time, search_time = semantic_search(query, limit=5)\n",
"display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test English query\n",
"query = \"five daily prayers\"\n",
"results, embed_time, search_time = semantic_search(query, limit=5)\n",
"display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test more queries\n",
"test_queries = [\n",
" \"الصيام في شهر رمضان\",\n",
" \"patience during hardship\",\n",
" \"بر الوالدين\",\n",
" \"charity and helping poor\",\n",
" \"الجنة والنار\"\n",
"]\n",
"\n",
"for q in test_queries:\n",
" results, embed_time, search_time = semantic_search(q, limit=3)\n",
" display_results(q, results, embed_time, search_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Performance Benchmarking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import statistics\n",
"\n",
"# Benchmark queries\n",
"benchmark_queries = [\n",
" \"الصلاة في المسجد الحرام\",\n",
" \"أبو هريرة رضي الله عنه\",\n",
" \"الصيام في شهر رمضان\",\n",
" \"five daily prayers\",\n",
" \"treatment of neighbors\",\n",
" \"patience during hardship\",\n",
" \"marriage and family\",\n",
" \"honesty and truthfulness\",\n",
" \"الزكاة والصدقة\",\n",
" \"الحج والعمرة\"\n",
"]\n",
"\n",
"# Warmup\n",
"print(\"Warming up...\")\n",
"for _ in range(3):\n",
" semantic_search(\"warmup query\", limit=5)\n",
"\n",
"# Run benchmark\n",
"print(\"\\nRunning benchmark...\")\n",
"times = []\n",
"\n",
"for q in benchmark_queries:\n",
" results, embed_time, search_time = semantic_search(q, limit=10)\n",
" total = embed_time + search_time\n",
" times.append(total)\n",
" status = \"✅\" if total < 500 else \"⚠️\"\n",
" print(f\" {q[:40]:40s} → {total:6.1f}ms {status}\")\n",
"\n",
"# Statistics\n",
"print(f\"\\n{'='*60}\")\n",
"print(\"BENCHMARK RESULTS\")\n",
"print(f\"{'='*60}\")\n",
"print(f\"Queries: {len(times)}\")\n",
"print(f\"Average: {statistics.mean(times):.1f}ms\")\n",
"print(f\"Median: {statistics.median(times):.1f}ms\")\n",
"print(f\"Min: {min(times):.1f}ms\")\n",
"print(f\"Max: {max(times):.1f}ms\")\n",
"print(f\"StdDev: {statistics.stdev(times):.1f}ms\")\n",
"\n",
"meeting_target = sum(1 for t in times if t < 500)\n",
"print(f\"\\nMeeting <500ms target: {meeting_target}/{len(times)} ({100*meeting_target/len(times):.1f}%)\")\n",
"\n",
"if meeting_target == len(times):\n",
" print(\"\\n✅ TARGET MET!\")\n",
"else:\n",
" print(\"\\n⚠ Some queries exceeded target\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Interactive Search"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Interactive search cell - run this and enter your query\n",
"query = input(\"Enter your search query: \")\n",
"if query:\n",
" results, embed_time, search_time = semantic_search(query, limit=10)\n",
" display_results(query, results, embed_time, search_time)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Verification Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Final verification summary\n",
"print(\"=\"*60)\n",
"print(\"STEP 6 VERIFICATION SUMMARY\")\n",
"print(\"=\"*60)\n",
"\n",
"# Database check\n",
"df = run_query(\"SELECT COUNT(*) as total, SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded FROM hadiths\")\n",
"total = df['total'][0]\n",
"embedded = df['embedded'][0]\n",
"print(f\"\\n✅ Database: {total:,} hadiths, {embedded:,} embedded ({100*embedded/total:.1f}%)\")\n",
"\n",
"# Qdrant check\n",
"with httpx.Client(timeout=30.0) as client:\n",
" response = client.post(f\"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/count\", json={\"exact\": True})\n",
" qdrant_count = response.json()['result']['count']\n",
" print(f\"✅ Qdrant: {qdrant_count:,} embeddings stored\")\n",
"\n",
"# Benchmark summary\n",
"if 'times' in dir() and times:\n",
" print(f\"✅ Performance: Avg {statistics.mean(times):.0f}ms, P95 {sorted(times)[int(len(times)*0.95)]:.0f}ms\")\n",
"\n",
"# Missing check\n",
"missing = total - qdrant_count\n",
"if missing == 0:\n",
" print(f\"\\n🎉 ALL {total:,} HADITHS VERIFIED!\")\n",
"else:\n",
" print(f\"\\n⚠ {missing:,} embeddings potentially missing\")\n",
"\n",
"print(\"\\n\" + \"=\"*60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,192 @@
-- ============================================================================
-- Step 6.1: PostgreSQL Verification Queries
-- Run these against hadith_db to verify data integrity
-- ============================================================================
-- Connect: psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db
-- ============================================================================
-- 1. Basic Statistics
-- ============================================================================
-- Total hadith count
SELECT COUNT(*) AS total_hadiths FROM hadiths;
-- Hadiths by collection with embedding status
SELECT
c.name_english AS collection,
COUNT(h.id) AS total,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) AS embedded,
ROUND(100.0 * SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) / COUNT(h.id), 2) AS pct_embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY total DESC;
-- ============================================================================
-- 2. Embedding Status Summary
-- ============================================================================
-- Overall embedding status
SELECT
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) AS embedded,
SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) AS not_embedded,
COUNT(*) AS total,
ROUND(100.0 * SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) / COUNT(*), 2) AS pct_complete
FROM hadiths;
-- List hadiths without embeddings (if any)
SELECT
h.id,
c.name_english AS collection,
h.hadith_number,
LEFT(h.arabic_text, 100) AS arabic_preview
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
WHERE NOT h.embedding_generated
ORDER BY h.id
LIMIT 20;
-- ============================================================================
-- 3. Text Quality Checks
-- ============================================================================
-- Hadiths with empty or null texts
SELECT
'Empty Arabic' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0
UNION ALL
SELECT
'Empty English' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE english_text IS NULL OR LENGTH(TRIM(english_text)) = 0
UNION ALL
SELECT
'Empty Both' AS issue,
COUNT(*) AS count
FROM hadiths
WHERE (arabic_text IS NULL OR LENGTH(TRIM(arabic_text)) = 0)
AND (english_text IS NULL OR LENGTH(TRIM(english_text)) = 0);
-- ============================================================================
-- 4. Grade Distribution
-- ============================================================================
SELECT
COALESCE(grade, 'Unknown') AS grade,
COUNT(*) AS count,
ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) AS percentage
FROM hadiths
GROUP BY grade
ORDER BY count DESC;
-- ============================================================================
-- 5. Arabic Normalization Verification
-- ============================================================================
-- Check that normalized column is populated
SELECT
COUNT(*) AS total,
SUM(CASE WHEN arabic_normalized IS NOT NULL AND LENGTH(arabic_normalized) > 0 THEN 1 ELSE 0 END) AS normalized,
SUM(CASE WHEN arabic_normalized IS NULL OR LENGTH(arabic_normalized) = 0 THEN 1 ELSE 0 END) AS not_normalized
FROM hadiths
WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0;
-- Sample comparison of original vs normalized
SELECT
id,
LEFT(arabic_text, 100) AS original,
LEFT(arabic_normalized, 100) AS normalized
FROM hadiths
WHERE arabic_text IS NOT NULL
LIMIT 5;
-- ============================================================================
-- 6. Metadata Completeness
-- ============================================================================
-- Check source_metadata JSON completeness
SELECT
COUNT(*) AS total,
SUM(CASE WHEN source_metadata IS NOT NULL THEN 1 ELSE 0 END) AS has_metadata,
SUM(CASE WHEN source_metadata ? 'api_source' THEN 1 ELSE 0 END) AS has_api_source,
SUM(CASE WHEN source_metadata ? 'ingested_at' THEN 1 ELSE 0 END) AS has_ingested_at
FROM hadiths;
-- ============================================================================
-- 7. ID Range Check (for Qdrant comparison)
-- ============================================================================
-- Get ID range
SELECT
MIN(id) AS min_id,
MAX(id) AS max_id,
COUNT(*) AS total_ids,
MAX(id) - MIN(id) + 1 AS expected_if_sequential,
COUNT(*) = (MAX(id) - MIN(id) + 1) AS is_sequential
FROM hadiths;
-- Find gaps in IDs (if any)
WITH id_series AS (
SELECT generate_series(
(SELECT MIN(id) FROM hadiths),
(SELECT MAX(id) FROM hadiths)
) AS expected_id
)
SELECT expected_id AS missing_id
FROM id_series
WHERE expected_id NOT IN (SELECT id FROM hadiths)
ORDER BY expected_id
LIMIT 50;
-- ============================================================================
-- 8. Sample Data for Manual Verification
-- ============================================================================
-- Sample 10 hadiths with all fields
SELECT
h.id,
c.name_english AS collection,
b.name_english AS book,
h.hadith_number,
h.grade,
LENGTH(h.arabic_text) AS arabic_len,
LENGTH(h.english_text) AS english_len,
LENGTH(h.urdu_text) AS urdu_len,
h.embedding_generated,
h.created_at
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
LEFT JOIN books b ON h.book_id = b.id
ORDER BY RANDOM()
LIMIT 10;
-- ============================================================================
-- 9. NER/RE Preparation Status
-- ============================================================================
SELECT
SUM(CASE WHEN entities_extracted THEN 1 ELSE 0 END) AS entities_extracted,
SUM(CASE WHEN relations_extracted THEN 1 ELSE 0 END) AS relations_extracted,
COUNT(*) AS total
FROM hadiths;
-- ============================================================================
-- 10. Quick Health Check Query (run this first)
-- ============================================================================
SELECT
'Database Health Check' AS check_type,
(SELECT COUNT(*) FROM hadiths) AS total_hadiths,
(SELECT COUNT(*) FROM collections) AS total_collections,
(SELECT COUNT(*) FROM books) AS total_books,
(SELECT SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) FROM hadiths) AS embedded_count,
(SELECT COUNT(*) FROM hadiths WHERE arabic_text IS NOT NULL AND LENGTH(arabic_text) > 0) AS has_arabic,
(SELECT COUNT(*) FROM hadiths WHERE english_text IS NOT NULL AND LENGTH(english_text) > 0) AS has_english;

View File

@@ -0,0 +1,33 @@
{
"total_hadiths_db": 41349,
"total_embeddings_qdrant": 41268,
"embeddings_with_payloads": 0,
"missing_embeddings": 81,
"embedding_dimension": 1024,
"collection_exists": true,
"collection_status": "green",
"sample_ids_missing": [
"09895d61-26cf-444a-bc7b-163a765dd37a",
"0990e5a0-7956-440c-9ee7-26dd0d5ecf8f",
"09926a1f-925a-49c8-9ff1-6870dc78bb2b",
"09951573-3a3e-4e4f-8fd9-268b9ba024eb",
"09fe7958-5af2-47f6-8d74-4bc10816a9df",
"09feef6b-c0ef-44a2-b9e8-bb40b13d7682",
"0a0b2754-6be4-4831-9c4b-1d2c420523f5",
"0a110e84-0ac3-4c50-8f04-8f8e8de876ce",
"0a711aef-8ae6-4dcd-9800-4a140356827f",
"0a71da11-bc62-4dca-b3a6-3d56b59dbb6b",
"0a7baa50-58f3-4c96-8103-c4e6c1fdbac6",
"0a7d0d2c-98c2-48b5-be84-a6a46f6151c2",
"0a80949a-0bce-40ea-8b26-1a65ae72c39b",
"0a82ae08-1d1f-49b8-97d2-2985c47fd79a",
"0a8374e0-0a33-41cd-a9d6-8d54a7eef227",
"0a843db5-23bc-4919-95cd-c88475d43ad9",
"0ad5aa90-6bb7-438f-967a-41786a4e7b9b",
"0ad626f3-0865-45c0-9ab8-14323351b278",
"0ad671b9-cf0f-4f46-8a2d-e13ec8fd19f7",
"0ad6de6b-763b-4874-ab3f-e0f6263749eb"
],
"verification_time_seconds": 9.182509422302246,
"timestamp": "2025-11-28T10:03:44.365382"
}

View File

@@ -0,0 +1,387 @@
#!/usr/bin/env python3
"""
Step 6.1: Verify Embeddings in Qdrant
=====================================
Validates that all hadiths have embeddings stored in Qdrant vector database.
Author: Hadith Scholar AI Project
Date: 2025
"""
import os
import sys
import json
import time
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, asdict
if sys.platform == 'win32':
os.environ['PYTHONIOENCODING'] = 'utf-8'
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
if hasattr(sys.stderr, 'reconfigure'):
sys.stderr.reconfigure(encoding='utf-8')
import psycopg2
from psycopg2.extras import RealDictCursor
import httpx
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.panel import Panel
# Configuration
POSTGRES_HOST = os.getenv("POSTGRES_HOST", "pg.betelgeusebytes.io")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT", "5432"))
POSTGRES_DB = os.getenv("POSTGRES_DB", "hadith_db")
POSTGRES_USER = os.getenv("POSTGRES_USER", "hadith_ingest")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD", "hadith_ingest")
# TEI_URL = "https://embeddings.betelgeusebytes.io"
# QDRANT_URL = "https://vector.betelgeusebytes.io"
QDRANT_HOST = os.getenv("QDRANT_HOST", "https://vector.betelgeusebytes.io")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "443"))
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "hadith_embeddings")
# For external access
QDRANT_EXTERNAL = os.getenv("QDRANT_EXTERNAL", "qdrant.betelgeusebytes.io")
console = Console()
@dataclass
class VerificationResult:
"""Results from embedding verification."""
total_hadiths_db: int
total_embeddings_qdrant: int
embeddings_with_payloads: int
missing_embeddings: int
embedding_dimension: int
collection_exists: bool
collection_status: str
sample_ids_missing: List[int]
verification_time_seconds: float
timestamp: str
def get_db_connection():
"""Create PostgreSQL connection."""
return psycopg2.connect(
host=POSTGRES_HOST,
port=POSTGRES_PORT,
database=POSTGRES_DB,
user=POSTGRES_USER,
password=POSTGRES_PASSWORD,
sslmode='require'
)
async def get_qdrant_collection_info(client: httpx.AsyncClient) -> Dict:
"""Get Qdrant collection information."""
try:
response = await client.get(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}"
)
response.raise_for_status()
return response.json()
except httpx.HTTPError as e:
console.print(f"[red]Error connecting to Qdrant: {e}[/red]")
return {}
async def count_qdrant_points(client: httpx.AsyncClient) -> int:
"""Count total points in Qdrant collection."""
try:
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/count",
json={"exact": True}
)
response.raise_for_status()
return response.json().get("result", {}).get("count", 0)
except httpx.HTTPError as e:
console.print(f"[red]Error counting Qdrant points: {e}[/red]")
return 0
async def get_qdrant_points_sample(
client: httpx.AsyncClient,
offset: int = 0,
limit: int = 100
) -> List[Dict]:
"""Get a sample of points from Qdrant."""
try:
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
json={
"limit": limit,
"offset": offset,
"with_payload": True,
"with_vector": False
}
)
response.raise_for_status()
return response.json().get("result", {}).get("points", [])
except httpx.HTTPError as e:
console.print(f"[red]Error fetching Qdrant points: {e}[/red]")
return []
async def get_all_qdrant_ids(client: httpx.AsyncClient) -> set:
"""Get all point IDs from Qdrant (paginated)."""
all_ids = set()
offset = None
batch_size = 1000
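    # Scroll through the collection page by page; Qdrant returns next_page_offset until the collection is exhausted.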
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
console=console
) as progress:
task = progress.add_task("Fetching Qdrant IDs...", total=None)
while True:
try:
payload = {
"limit": batch_size,
"with_payload": False,
"with_vector": False
}
if offset is not None:
payload["offset"] = offset
response = await client.post(
f"{QDRANT_HOST}:{QDRANT_PORT}/collections/{QDRANT_COLLECTION}/points/scroll",
json=payload,
timeout=60.0
)
response.raise_for_status()
result = response.json().get("result", {})
points = result.get("points", [])
if not points:
break
for point in points:
all_ids.add(point["id"])
offset = result.get("next_page_offset")
progress.update(task, description=f"Fetched {len(all_ids)} IDs...")
# console.print(f" Fetched IDs: [green]{offset}[/green]")
if offset is None:
break
except httpx.HTTPError as e:
console.print(f"[red]Error during ID fetch: {e}[/red]")
break
return all_ids
def get_all_hadith_ids_from_db() -> set:
"""Get all hadith IDs from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor() as cur:
cur.execute("SELECT id FROM hadiths ORDER BY id")
return {row[0] for row in cur.fetchall()}
finally:
conn.close()
def get_hadith_count_from_db() -> int:
"""Get total hadith count from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM hadiths")
return cur.fetchone()[0]
finally:
conn.close()
def get_embedding_stats_from_db() -> Dict:
"""Get embedding generation stats from PostgreSQL."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN embedding_generated THEN 1 ELSE 0 END) as embedded,
SUM(CASE WHEN NOT embedding_generated THEN 1 ELSE 0 END) as not_embedded
FROM hadiths
""")
return dict(cur.fetchone())
finally:
conn.close()
def get_collection_stats_by_source() -> List[Dict]:
"""Get hadith counts by collection/source."""
conn = get_db_connection()
try:
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cur.execute("""
SELECT
c.name_english as collection,
COUNT(h.id) as count,
SUM(CASE WHEN h.embedding_generated THEN 1 ELSE 0 END) as embedded
FROM hadiths h
JOIN collections c ON h.collection_id = c.id
GROUP BY c.id, c.name_english
ORDER BY count DESC
""")
return [dict(row) for row in cur.fetchall()]
finally:
conn.close()
async def verify_embeddings() -> VerificationResult:
"""Main verification function."""
start_time = time.time()
console.print(Panel.fit(
"[bold blue]Hadith Embeddings Verification[/bold blue]\n"
f"Database: {POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}\n"
f"Qdrant: {QDRANT_HOST}:{QDRANT_PORT}/{QDRANT_COLLECTION}",
title="Step 6.1"
))
# Step 1: Get PostgreSQL stats
console.print("\n[yellow]1. Checking PostgreSQL database...[/yellow]")
db_stats = get_embedding_stats_from_db()
total_hadiths = db_stats['total']
console.print(f" Total hadiths: [green]{total_hadiths:,}[/green]")
console.print(f" Marked as embedded: [green]{db_stats['embedded']:,}[/green]")
# Step 2: Get collection breakdown
console.print("\n[yellow]2. Collection breakdown:[/yellow]")
collection_stats = get_collection_stats_by_source()
table = Table(title="Hadiths by Collection")
table.add_column("Collection", style="cyan")
table.add_column("Total", justify="right")
table.add_column("Embedded", justify="right", style="green")
for stat in collection_stats:
table.add_row(
stat['collection'],
f"{stat['count']:,}",
f"{stat['embedded']:,}"
)
console.print(table)
# Step 3: Check Qdrant collection
console.print("\n[yellow]3. Checking Qdrant collection...[/yellow]")
async with httpx.AsyncClient(timeout=30.0) as client:
collection_info = await get_qdrant_collection_info(client)
if not collection_info:
return VerificationResult(
total_hadiths_db=total_hadiths,
total_embeddings_qdrant=0,
embeddings_with_payloads=0,
missing_embeddings=total_hadiths,
embedding_dimension=0,
collection_exists=False,
collection_status="NOT_FOUND",
sample_ids_missing=[],
verification_time_seconds=time.time() - start_time,
timestamp=datetime.now().isoformat()
)
result = collection_info.get("result", {})
status = result.get("status", "unknown")
vectors_config = result.get("config", {}).get("params", {}).get("vectors", {})
embedding_dim = vectors_config.get("size", 0)
console.print(f" Collection status: [green]{status}[/green]")
console.print(f" Embedding dimension: [green]{embedding_dim}[/green]")
# Step 4: Count Qdrant points
console.print("\n[yellow]4. Counting Qdrant embeddings...[/yellow]")
qdrant_count = await count_qdrant_points(client)
console.print(f" Total embeddings: [green]{qdrant_count:,}[/green]")
# Step 5: Find missing embeddings
console.print("\n[yellow]5. Identifying missing embeddings...[/yellow]")
db_ids = get_all_hadith_ids_from_db()
console.print(f" DB Ids: [green]{len(db_ids)}[/green]")
qdrant_ids = await get_all_qdrant_ids(client)
console.print(f" DB Ids: [green]{len(qdrant_ids)}[/green]")
missing_ids = db_ids - qdrant_ids
extra_ids = qdrant_ids - db_ids
console.print(f" IDs in DB: [blue]{len(db_ids):,}[/blue]")
console.print(f" IDs in Qdrant: [blue]{len(qdrant_ids):,}[/blue]")
console.print(f" Missing embeddings: [{'red' if missing_ids else 'green'}]{len(missing_ids):,}[/{'red' if missing_ids else 'green'}]")
if extra_ids:
console.print(f" Extra IDs in Qdrant (orphaned): [yellow]{len(extra_ids):,}[/yellow]")
# Get sample of missing IDs
sample_missing = sorted(list(missing_ids))[:20] if missing_ids else []
# Step 6: Verify sample payload integrity
console.print("\n[yellow]6. Verifying payload integrity...[/yellow]")
sample_points = await get_qdrant_points_sample(client, limit=100)
payloads_with_data = sum(
1 for p in sample_points
if p.get("payload") and p["payload"].get("hadith_id")
)
console.print(f" Sample size: {len(sample_points)}")
console.print(f" With valid payloads: [green]{payloads_with_data}[/green]")
verification_time = time.time() - start_time
# Summary
console.print("\n" + "="*50)
console.print("[bold]VERIFICATION SUMMARY[/bold]")
console.print("="*50)
if len(missing_ids) == 0:
console.print("[bold green]✓ ALL EMBEDDINGS VERIFIED![/bold green]")
else:
console.print(f"[bold red]✗ {len(missing_ids):,} EMBEDDINGS MISSING[/bold red]")
if sample_missing:
console.print(f" Sample missing IDs: {sample_missing[:10]}")
console.print(f"\nVerification completed in {verification_time:.2f} seconds")
return VerificationResult(
total_hadiths_db=total_hadiths,
total_embeddings_qdrant=qdrant_count,
embeddings_with_payloads=payloads_with_data,
missing_embeddings=len(missing_ids),
embedding_dimension=embedding_dim,
collection_exists=True,
collection_status=status,
sample_ids_missing=sample_missing,
verification_time_seconds=verification_time,
timestamp=datetime.now().isoformat()
)
async def main():
"""Main entry point."""
result = await verify_embeddings()
# Save results to JSON
output_file = "verification_results.json"
with open(output_file, 'w') as f:
json.dump(asdict(result), f, indent=2)
console.print(f"\n[dim]Results saved to {output_file}[/dim]")
# Exit with error code if missing embeddings
if result.missing_embeddings > 0:
sys.exit(1)
if __name__ == "__main__":
asyncio.run(main())
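# The non-zero exit code above lets a CI job or Argo workflow step fail automatically when embeddings are missing.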

View File