add generate embeddings
parent cf02b918b8
commit f46297255d
Binary file not shown.
@@ -4,28 +4,127 @@ metadata:
   generateName: generate-embeddings-
   namespace: ml
 spec:
-  entrypoint: generate
+  entrypoint: embedding-pipeline
   serviceAccountName: argo-workflow

   arguments:
     parameters:
       - name: batch-size
         value: "32"
+      - name: db_password
+        value: "YOUR_PASSWORD_HERE" # UPDATE THIS

   templates:
-    - name: generate
+    - name: embedding-pipeline
       steps:
         - - name: generate-embeddings
             template: generate-job

     - name: generate-job
       container:
-        image: hadith-ingestion:latest
-        command: [python, /app/src/embeddings/generator.py]
-        args: ["--batch-size={{workflow.parameters.batch-size}}"]
-        env:
-          - name: DATABASE_HOST
-            value: "pg.betelgeusebytes.io"
-          - name: DATABASE_PASSWORD
-            valueFrom:
-              secretKeyRef:
-                name: hadith-db-secret
-                key: password
+        image: python:3.11-slim
+        command: [python, -u, -c]
+        args:
+          - |
+            import subprocess, sys
+
+            # Install deps first: python:3.11-slim ships without requests/psycopg2,
+            # so importing them before the pip install would fail
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "psycopg2-binary", "requests", "urllib3"])
+
+            import requests, psycopg2, time, urllib3
+            from datetime import datetime
+
+            # Disable SSL warnings
+            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+            # Config - EXTERNAL URLs with SSL verification disabled
+            TEI_URL = "https://embeddings.betelgeusebytes.io"
+            QDRANT_URL = "https://vector.betelgeusebytes.io"
+            DB_CONFIG = {
+                'host': 'pg.betelgeusebytes.io',
+                'port': 5432,
+                'dbname': 'hadith_db',
+                'user': 'hadith_ingest',
+                'password': '{{workflow.parameters.db_password}}'
+            }
+
+            BATCH_SIZE = 32
+            COLLECTION = "hadith_embeddings"
+            VERIFY_SSL = False  # Important: ignore self-signed SSL certificates
+
+            print(f"Started: {datetime.now()}", flush=True)
+            print(f"SSL Verification: {VERIFY_SSL}", flush=True)
+
+            conn = psycopg2.connect(**DB_CONFIG)
+            cur = conn.cursor()
+
+            cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
+            total = cur.fetchone()[0]
+            print(f"Total to process: {total:,}", flush=True)
+
+            processed = 0
+            failed = 0
+            offset = 0
+
+            while offset < total:
+                cur.execute("""
+                    SELECT id, arabic_text, english_text, collection_id, hadith_number
+                    FROM hadiths WHERE embedding_generated = FALSE
+                    ORDER BY id LIMIT %s OFFSET %s
+                """, (BATCH_SIZE, offset))
+
+                hadiths = cur.fetchall()
+                if not hadiths:
+                    break
+
+                try:
+                    # Get embeddings (with verify=False)
+                    texts = [" ".join(filter(None, [h[1], h[2]])) for h in hadiths]
+                    resp = requests.post(
+                        f"{TEI_URL}/embed",
+                        json={"inputs": texts},
+                        timeout=60,
+                        verify=VERIFY_SSL
+                    )
+                    resp.raise_for_status()
+                    embeddings = resp.json()
+
+                    # Upload to Qdrant (with verify=False); str() keeps the ids
+                    # JSON-serializable if the driver hands back UUID objects
+                    points = [{
+                        "id": str(h[0]),
+                        "vector": embeddings[i],
+                        "payload": {"collection_id": str(h[3]), "hadith_number": h[4]}
+                    } for i, h in enumerate(hadiths)]
+
+                    resp = requests.put(
+                        f"{QDRANT_URL}/collections/{COLLECTION}/points",
+                        json={"points": points},
+                        timeout=30,
+                        verify=VERIFY_SSL
+                    )
+                    resp.raise_for_status()
+
+                    # Update DB
+                    cur.execute("UPDATE hadiths SET embedding_generated = TRUE WHERE id = ANY(%s)", ([h[0] for h in hadiths],))
+                    conn.commit()
+
+                    processed += len(hadiths)
+                    if processed % 320 == 0:  # Every 10 batches
+                        pct = 100 * processed / total
+                        print(f"Progress: {processed:,}/{total:,} ({pct:.1f}%) | Failed: {failed}", flush=True)
+
+                except Exception as e:
+                    print(f"Error at offset {offset}: {e}", flush=True)
+                    failed += len(hadiths)
+                    # Processed rows drop out of the filtered query, so only
+                    # advance OFFSET past rows that failed and stayed FALSE
+                    offset += BATCH_SIZE
+
+            cur.close()
+            conn.close()
+
+            print(f"Complete: {processed:,} processed, {failed} failed at {datetime.now()}", flush=True)
+
         resources:
-          requests: {cpu: 2, memory: 4Gi}
-          limits: {cpu: 4, memory: 8Gi}
+          requests:
+            memory: "2Gi"
+            cpu: "1"
+          limits:
+            memory: "4Gi"
+            cpu: "2"
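To run this workflow without editing the manifest, the new db_password parameter can be overridden at submit time. A minimal sketch, assuming the manifest is saved as generate-embeddings.yaml and the Argo CLI is available; sourcing the value from the hadith-db-secret referenced by the old manifest is an assumption about where the password lives:

# Submit the workflow, pulling the password out of the existing secret (assumed)
argo submit -n ml generate-embeddings.yaml \
  -p batch-size=32 \
  -p db_password="$(kubectl -n ml get secret hadith-db-secret -o jsonpath='{.data.password}' | base64 -d)" \
  --watch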
File diff suppressed because it is too large.
@@ -0,0 +1,98 @@
# Save as benchmark_embeddings.py
import requests
import time
import statistics
from typing import Dict, List

TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"

def benchmark_tei(num_requests: int = 100) -> Dict:
    """Benchmark TEI embedding generation"""
    test_text = "This is a test hadith about prayer and fasting"
    times = []

    print(f"Benchmarking TEI ({num_requests} requests)...")
    for i in range(num_requests):
        start = time.time()
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": test_text}
        )
        times.append(time.time() - start)

        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_requests}")

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

def benchmark_qdrant(num_queries: int = 100) -> Dict:
    """Benchmark Qdrant search"""
    # Get a sample embedding first
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": "test query"}
    )
    query_vector = response.json()[0]

    times = []

    print(f"\nBenchmarking Qdrant ({num_queries} searches)...")
    for i in range(num_queries):
        start = time.time()
        response = requests.post(
            f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
            json={
                "vector": query_vector,
                "limit": 10
            }
        )
        times.append(time.time() - start)

        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_queries}")

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

# Run benchmarks
print("=== PERFORMANCE BENCHMARK ===\n")

tei_stats = benchmark_tei(100)
qdrant_stats = benchmark_qdrant(100)

print("\n" + "="*80)
print("RESULTS")
print("="*80)

print("\nTEI Embedding Generation (per request):")
print(f"  Mean:   {tei_stats['mean']*1000:.2f}ms")
print(f"  Median: {tei_stats['median']*1000:.2f}ms")
print(f"  Min:    {tei_stats['min']*1000:.2f}ms")
print(f"  Max:    {tei_stats['max']*1000:.2f}ms")
print(f"  StdDev: {tei_stats['stdev']*1000:.2f}ms")

print("\nQdrant Vector Search (per query):")
print(f"  Mean:   {qdrant_stats['mean']*1000:.2f}ms")
print(f"  Median: {qdrant_stats['median']*1000:.2f}ms")
print(f"  Min:    {qdrant_stats['min']*1000:.2f}ms")
print(f"  Max:    {qdrant_stats['max']*1000:.2f}ms")
print(f"  StdDev: {qdrant_stats['stdev']*1000:.2f}ms")

# Expected performance targets
print("\n" + "="*80)
print("PERFORMANCE TARGETS")
print("="*80)
print("TEI Embedding: < 50ms (good), < 100ms (acceptable)")
print("Qdrant Search: < 20ms (good), < 50ms (acceptable)")
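The benchmark targets cluster-internal Service DNS names (tei.ml.svc.cluster.local, qdrant.vector.svc.cluster.local), so it has to run inside the cluster. A minimal sketch of one way to do that with a throwaway pod; the pod name is arbitrary and the single pip dependency follows from the imports above:

# Stream the script into a temporary pod on stdin; 'python -' reads it from there
kubectl -n ml run embed-bench --rm -i --restart=Never --image=python:3.11-slim -- \
  bash -c "pip install -q requests && python -" < benchmark_embeddings.py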
@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Generate embeddings for all hadiths and store in Qdrant
Updated with SSL verification disabled
"""
import requests
import psycopg2

import time
import urllib3
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
TEI_URL = "https://embeddings.betelgeusebytes.io"
QDRANT_URL = "https://vector.betelgeusebytes.io"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'hadith_ingest'  # UPDATE THIS
}

BATCH_SIZE = 32          # Process 32 hadiths at a time
MAX_TEXT_LENGTH = 1500   # Truncate individual texts to avoid oversized requests
COLLECTION_NAME = "hadith_embeddings"
VERIFY_SSL = False       # Ignore SSL certificate verification

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def get_single_embedding(text: str, hadith_id: str, max_retries: int = 3) -> Optional[List[float]]:
    """Get embedding for a single text with retries and length reduction"""
    current_text = text

    for attempt in range(max_retries):
        try:
            response = requests.post(
                f"{TEI_URL}/embed",
                json={"inputs": [current_text]},
                timeout=60,
                verify=VERIFY_SSL
            )
            response.raise_for_status()
            return response.json()[0]
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 413:
                # Text still too large, reduce by 50%
                new_length = len(current_text) // 2
                logger.warning(f"Hadith {hadith_id}: Text too large ({len(current_text)} chars), reducing to {new_length}")
                current_text = current_text[:new_length] + "..."
                if new_length < 100:  # Don't go below 100 chars
                    logger.error(f"Hadith {hadith_id}: Cannot reduce text further")
                    return None
            else:
                logger.error(f"Hadith {hadith_id}: HTTP error {e.response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Hadith {hadith_id}: Error getting embedding: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)
            else:
                return None

    return None


def get_embeddings_batch(texts: List[str], hadith_ids: List[str]) -> List[Optional[List[float]]]:
    """Get embeddings for a batch of texts, fall back to individual if needed"""
    try:
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": texts},
            timeout=60,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 413 and len(texts) > 1:
            # Batch too large, try individually
            logger.warning(f"Batch too large, processing {len(texts)} texts individually...")
            embeddings = []
            for text, hid in zip(texts, hadith_ids):
                embedding = get_single_embedding(text, hid)
                embeddings.append(embedding)
                time.sleep(0.1)
            return embeddings
        else:
            logger.error(f"Error getting embeddings: {e}")
            return [None] * len(texts)
    except Exception as e:
        logger.error(f"Error getting embeddings: {e}")
        return [None] * len(texts)


def upsert_to_qdrant(points: List[Dict]) -> bool:
    """Upsert points to Qdrant"""
    if not points:  # Skip if no valid points
        return True

    try:
        response = requests.put(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points",
            json={"points": points},
            timeout=30,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return True
    except Exception as e:
        logger.error(f"Error upserting to Qdrant: {e}")
        return False


def mark_embeddings_generated(conn, hadith_ids: List[str], failed_ids: List[str] = None) -> bool:
    """Mark hadiths as having embeddings generated"""
    try:
        cur = conn.cursor()

        # Mark successful ones
        if hadith_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (hadith_ids,))

        # Mark failed ones (so we can skip them in future runs)
        if failed_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (failed_ids,))
            logger.warning(f"Marked {len(failed_ids)} failed hadiths as processed to skip them")

        conn.commit()
        cur.close()
        return True
    except Exception as e:
        logger.error(f"Error updating database: {e}")
        conn.rollback()
        return False


def fetch_hadiths_batch(cur, offset: int, limit: int) -> List[Tuple]:
    """Fetch a batch of hadiths without embeddings"""
    cur.execute("""
        SELECT id, arabic_text, english_text, urdu_text,
               collection_id, hadith_number
        FROM hadiths
        WHERE embedding_generated = FALSE
        ORDER BY id
        LIMIT %s OFFSET %s
    """, (limit, offset))
    return cur.fetchall()


def create_combined_text(hadith: Tuple, max_length: int = MAX_TEXT_LENGTH) -> str:
    """Create combined text for embedding, truncate if needed"""
    hadith_id, arabic, english, urdu, coll_id, num = hadith

    parts = []
    if arabic:
        parts.append(arabic)
    if english:
        parts.append(english)

    combined = " ".join(parts) if parts else "No text available"

    if len(combined) > max_length:
        combined = combined[:max_length] + "..."

    return combined


def main():
    start_time = datetime.now()
    logger.info("=" * 80)
    logger.info("HADITH EMBEDDING GENERATION")
    logger.info(f"Started at: {start_time}")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max text length: {MAX_TEXT_LENGTH}")
    logger.info(f"SSL Verification: {VERIFY_SSL}")
    logger.info("=" * 80)

    logger.info("Connecting to database...")
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
    total_hadiths = cur.fetchone()[0]
    logger.info(f"Total hadiths to process: {total_hadiths:,}")

    if total_hadiths == 0:
        logger.info("No hadiths to process!")
        return

    estimated_time_mins = (total_hadiths / BATCH_SIZE) * 3 / 60  # assumes ~3s per batch
    logger.info(f"Estimated time: {estimated_time_mins:.1f} minutes")
    logger.info("=" * 80)

    offset = 0
    processed = 0
    failed = 0
    skipped = 0
    last_report_pct = 0

    while offset < total_hadiths:
        batch_start = time.time()

        hadiths = fetch_hadiths_batch(cur, offset, BATCH_SIZE)

        if not hadiths:
            break

        texts = [create_combined_text(h) for h in hadiths]
        hadith_ids = [str(h[0]) for h in hadiths]

        batch_marked = False  # True once this batch's rows are flagged in the DB
        try:
            embeddings = get_embeddings_batch(texts, hadith_ids)

            # Separate successful and failed embeddings
            successful_points = []
            successful_ids = []
            failed_ids = []

            for i, (hadith_id, embedding) in enumerate(zip(hadith_ids, embeddings)):
                if embedding is not None:
                    hadith = hadiths[i]
                    successful_points.append({
                        "id": hadith_id,
                        "vector": embedding,
                        "payload": {
                            "collection_id": str(hadith[4]),
                            "hadith_number": hadith[5],
                            "has_arabic": bool(hadith[1]),
                            "has_english": bool(hadith[2]),
                            "has_urdu": bool(hadith[3])
                        }
                    })
                    successful_ids.append(hadith_id)
                else:
                    failed_ids.append(hadith_id)
                    skipped += 1

            # Process successful ones
            if successful_points:
                if upsert_to_qdrant(successful_points):
                    if mark_embeddings_generated(conn, successful_ids, failed_ids):
                        processed += len(successful_ids)
                        failed += len(failed_ids)
                        batch_marked = True
                    else:
                        logger.error(f"Failed to update database for batch at offset {offset}")
                        failed += len(hadiths)
                else:
                    logger.error(f"Failed to upsert to Qdrant for batch at offset {offset}")
                    failed += len(hadiths)
            else:
                # All failed in this batch
                batch_marked = mark_embeddings_generated(conn, [], failed_ids)
                failed += len(failed_ids)

            progress_pct = ((processed + failed) / total_hadiths) * 100
            if progress_pct - last_report_pct >= 2:
                batch_time = time.time() - batch_start
                elapsed = (datetime.now() - start_time).total_seconds() / 60
                rate = (processed + failed) / elapsed if elapsed > 0 else 0
                remaining = (total_hadiths - processed - failed) / rate if rate > 0 else 0

                logger.info(
                    f"Progress: {processed:,}/{total_hadiths:,} ({progress_pct:.1f}%) | "
                    f"Failed: {failed} | Skipped: {skipped} | Rate: {rate:.0f}/min | "
                    f"ETA: {remaining:.1f}min | Batch: {batch_time:.2f}s"
                )
                last_report_pct = progress_pct

            time.sleep(0.2)

        except Exception as e:
            logger.error(f"Error processing batch at offset {offset}: {e}")
            failed += len(hadiths)

        # Rows marked embedding_generated = TRUE drop out of the filtered query,
        # so only advance OFFSET past batches that were left unmarked; advancing
        # unconditionally would silently skip unprocessed rows.
        if not batch_marked:
            offset += BATCH_SIZE

    cur.close()
    conn.close()

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds() / 60

    logger.info("=" * 80)
    logger.info("EMBEDDING GENERATION COMPLETE")
    logger.info(f"Started: {start_time}")
    logger.info(f"Finished: {end_time}")
    logger.info(f"Duration: {duration:.1f} minutes")
    logger.info(f"Total hadiths: {total_hadiths:,}")
    logger.info(f"Successfully processed: {processed:,}")
    logger.info(f"Failed/Skipped: {failed} ({skipped} individual embedding failures)")
    logger.info(f"Success rate: {100 * processed / total_hadiths:.2f}%")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
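If this script is driven by the Argo workflow instead of run by hand, the hardcoded password can come from the hadith-db-secret that the original manifest read via secretKeyRef. A minimal sketch of creating that secret; the secret name and the password key match the manifest, the literal value is a placeholder:

# Create the secret the workflow's secretKeyRef points at (value is a placeholder)
kubectl -n ml create secret generic hadith-db-secret \
  --from-literal=password='REPLACE_WITH_REAL_PASSWORD'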
@@ -0,0 +1,29 @@
#!/bin/bash

echo "=== EMBEDDING GENERATION MONITOR ==="
echo ""

while true; do
    clear
    echo "=== EMBEDDING GENERATION PROGRESS ==="
    date
    echo ""

    # Database stats
    echo "📊 Database Progress:"
    PGPASSWORD=hadith_ingest psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
        SELECT
            'Done: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
            ' | Remaining: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE) ||
            ' | Progress: ' || ROUND(100.0 * COUNT(*) FILTER (WHERE embedding_generated = TRUE) / COUNT(*), 2) || '%'
        FROM hadiths;" 2>/dev/null || echo "  (Database connection failed)"

    echo ""
    echo "🔢 Qdrant Collection:"
    QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings 2>/dev/null | jq -r '.result.points_count // "N/A"')
    echo "  Points: $QDRANT_COUNT"

    echo ""
    echo "⏰ Next update in 5 minutes... (Ctrl+C to stop)"
    sleep 300
done
@@ -0,0 +1,69 @@
#!/bin/bash
set -e

echo "=== EMBEDDING PIPELINE SETUP ==="

# 1. Check TEI service (with set -e a separate $? check is never reached,
#    so test the command directly)
echo -e "\n1. Checking TEI service..."
if ! kubectl -n ml get pods -l app=tei; then
    echo "❌ TEI pods not found!"
    exit 1
fi

# 2. Test TEI endpoint
echo -e "\n2. Testing TEI endpoint..."
TEI_TEST=$(curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
    -H "Content-Type: application/json" \
    -d '{"inputs": "test"}' | jq -r 'type')

if [ "$TEI_TEST" != "array" ]; then
    echo "❌ TEI not responding correctly!"
    echo "Response: $TEI_TEST"
    exit 1
fi
echo "✅ TEI is working"

# 3. Check Qdrant
echo -e "\n3. Checking Qdrant..."
QDRANT_STATUS=$(curl -s -k https://vector.betelgeusebytes.io/collections | jq -r '.status')

if [ "$QDRANT_STATUS" != "ok" ]; then
    echo "❌ Qdrant not responding!"
    exit 1
fi
echo "✅ Qdrant is working"

# 4. Check if collection exists (a missing collection has no .result object,
#    so look at .result.status rather than the top-level .status)
echo -e "\n4. Checking hadith_embeddings collection..."
COLLECTION_EXISTS=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.status // "missing"')

if [ "$COLLECTION_EXISTS" == "missing" ]; then
    echo "📝 Creating hadith_embeddings collection..."
    curl -k -X PUT https://vector.betelgeusebytes.io/collections/hadith_embeddings \
        -H "Content-Type: application/json" \
        -d '{
            "vectors": {
                "size": 1024,
                "distance": "Cosine"
            },
            "optimizers_config": {
                "indexing_threshold": 10000
            }
        }'
    echo -e "\n✅ Collection created"
else
    echo "✅ Collection exists"
    POINT_COUNT=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
    echo "  Current points: $POINT_COUNT"
fi

# 5. Check database
echo -e "\n5. Checking database..."
HADITH_COUNT=$(psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "SELECT COUNT(*) FROM hadiths;")
echo "  Total hadiths: $HADITH_COUNT"

echo -e "\n=== SETUP COMPLETE ✅ ==="
echo -e "\nReady to generate embeddings!"
@@ -0,0 +1,93 @@
# Save as test_semantic_search.py
import requests
import psycopg2
from typing import List, Dict

# Configuration
TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'your_password'  # Update this
}

def get_embedding(text: str) -> List[float]:
    """Get embedding from TEI service"""
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": text}
    )
    return response.json()[0]

def search_similar_hadiths(query: str, limit: int = 5) -> List[Dict]:
    """Search for similar hadiths using semantic search"""
    # Get query embedding
    query_embedding = get_embedding(query)

    # Search in Qdrant
    response = requests.post(
        f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
        json={
            "vector": query_embedding,
            "limit": limit,
            "with_payload": True
        }
    )

    results = response.json()['result']

    # Get full hadith details from database; Qdrant returns point ids as
    # strings, so cast them back to uuid for the lookup
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    hadith_ids = [r['id'] for r in results]
    cur.execute("""
        SELECT h.id, h.hadith_number, c.name as collection,
               h.arabic_text, h.english_text, h.grade
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        WHERE h.id = ANY(%s::uuid[])
    """, (hadith_ids,))

    # Key by str(id) so the string ids from Qdrant match on lookup
    hadiths = {str(row[0]): dict(zip(['id', 'number', 'collection', 'arabic', 'english', 'grade'], row))
               for row in cur.fetchall()}

    cur.close()
    conn.close()

    # Combine results
    return [
        {
            **hadiths[r['id']],
            'similarity_score': r['score']
        }
        for r in results if r['id'] in hadiths
    ]

# Test queries
test_queries = [
    "prayer times and importance",
    "fasting in Ramadan",
    "charity and helping the poor",
    "truthfulness and honesty",
    "parents and their rights"
]

print("=== SEMANTIC SEARCH TEST ===\n")

for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 80)

    results = search_similar_hadiths(query, limit=3)

    for i, hadith in enumerate(results, 1):
        print(f"\n{i}. [{hadith['collection']} #{hadith['number']}] (Score: {hadith['similarity_score']:.4f})")
        print(f"   Grade: {hadith['grade']}")
        print(f"   English: {hadith['english'][:200]}...")
        print(f"   Arabic: {hadith['arabic'][:100]}...")

print("\n=== TEST COMPLETE ===")
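The search test also uses the in-cluster DNS names. To run it from a workstation instead, the two Services can be port-forwarded; the Service names and namespaces are read off the URLs above, but the TEI Service port is an assumption (80, since TEI_URL carries no port):

kubectl -n ml port-forward svc/tei 8080:80 &
kubectl -n vector port-forward svc/qdrant 6333:6333 &
# then point the script at the tunnels:
#   TEI_URL = "http://localhost:8080"
#   QDRANT_URL = "http://localhost:6333"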
@@ -0,0 +1,27 @@
#!/bin/bash

echo "=== EMBEDDING VERIFICATION ==="
echo ""

# 1. Database check
echo "1. Checking database..."
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
    SELECT
        'Total hadiths: ' || COUNT(*) ||
        ' | With embeddings: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
        ' | Missing: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE)
    FROM hadiths;"

# 2. Qdrant check
echo ""
echo "2. Checking Qdrant..."
QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
echo "Qdrant vectors: $QDRANT_COUNT"

# 3. Collection info
echo ""
echo "3. Collection details..."
curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq '.result | {points_count, vectors_count, status, optimizer_status}'

echo ""
echo "=== VERIFICATION COMPLETE ==="
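Beyond the counts, a quick end-to-end smoke test can embed a query and search with it in one pipeline, using the same external hosts and -k flag as the scripts above; jq reshapes the TEI response (an array of vectors) into a Qdrant search body:

# Embed a query with TEI, then feed the vector straight into a Qdrant search
curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": "fasting in Ramadan"}' \
| jq '{vector: .[0], limit: 3}' \
| curl -k -s -X POST https://vector.betelgeusebytes.io/collections/hadith_embeddings/points/search \
  -H "Content-Type: application/json" -d @- \
| jq '.result'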