add generate embeddings

salahangal 2025-11-24 11:17:57 +01:00
parent cf02b918b8
commit f46297255d
9 changed files with 4181 additions and 17 deletions

Binary file not shown.

View File

@@ -4,28 +4,127 @@ metadata:
   generateName: generate-embeddings-
   namespace: ml
 spec:
-  entrypoint: generate
+  entrypoint: embedding-pipeline
   serviceAccountName: argo-workflow
   arguments:
     parameters:
-    - name: batch-size
-      value: "32"
+    - name: db_password
+      value: "YOUR_PASSWORD_HERE" # UPDATE THIS
   templates:
-  - name: generate
+  - name: embedding-pipeline
+    steps:
+    - - name: generate-embeddings
+        template: generate-job
+  - name: generate-job
     container:
-      image: hadith-ingestion:latest
-      command: [python, /app/src/embeddings/generator.py]
-      args: ["--batch-size={{workflow.parameters.batch-size}}"]
-      env:
-      - name: DATABASE_HOST
-        value: "pg.betelgeusebytes.io"
-      - name: DATABASE_PASSWORD
-        valueFrom:
-          secretKeyRef:
-            name: hadith-db-secret
-            key: password
+      image: python:3.11-slim
+      command: [python, -u, -c]
+      args:
+      - |
+        import requests, psycopg2, time, sys, urllib3
+        from datetime import datetime
+
+        # Disable SSL warnings
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+        # Install deps
+        import subprocess
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "psycopg2-binary", "requests", "urllib3"])
+
+        # Config - EXTERNAL URLs with SSL verification disabled
+        TEI_URL = "https://embeddings.betelgeusebytes.io"
+        QDRANT_URL = "https://vector.betelgeusebytes.io"
+        DB_CONFIG = {
+            'host': 'pg.betelgeusebytes.io',
+            'port': 5432,
+            'dbname': 'hadith_db',
+            'user': 'hadith_ingest',
+            'password': '{{workflow.parameters.db_password}}'
+        }
+        BATCH_SIZE = 32
+        COLLECTION = "hadith_embeddings"
+        VERIFY_SSL = False  # Important: ignore self-signed certificates
+
+        print(f"Started: {datetime.now()}", flush=True)
+        print(f"SSL Verification: {VERIFY_SSL}", flush=True)
+
+        conn = psycopg2.connect(**DB_CONFIG)
+        cur = conn.cursor()
+        cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
+        total = cur.fetchone()[0]
+        print(f"Total to process: {total:,}", flush=True)
+
+        processed = 0
+        failed = 0
+        # Marked rows drop out of the WHERE clause below, so the offset
+        # advances only past failed (still unmarked) batches; advancing it
+        # after successful batches would skip unprocessed rows.
+        offset = 0
+        while True:
+            cur.execute("""
+                SELECT id, arabic_text, english_text, collection_id, hadith_number
+                FROM hadiths WHERE embedding_generated = FALSE
+                ORDER BY id LIMIT %s OFFSET %s
+            """, (BATCH_SIZE, offset))
+            hadiths = cur.fetchall()
+            if not hadiths:
+                break
+            try:
+                # Get embeddings (with verify=False)
+                texts = [" ".join(filter(None, [h[1], h[2]])) for h in hadiths]
+                resp = requests.post(
+                    f"{TEI_URL}/embed",
+                    json={"inputs": texts},
+                    timeout=60,
+                    verify=VERIFY_SSL
+                )
+                resp.raise_for_status()
+                embeddings = resp.json()
+                # Upload to Qdrant; str() so UUID ids/values serialize as JSON
+                points = [{
+                    "id": str(h[0]),
+                    "vector": embeddings[i],
+                    "payload": {"collection_id": str(h[3]), "hadith_number": h[4]}
+                } for i, h in enumerate(hadiths)]
+                resp = requests.put(
+                    f"{QDRANT_URL}/collections/{COLLECTION}/points",
+                    json={"points": points},
+                    timeout=30,
+                    verify=VERIFY_SSL
+                )
+                resp.raise_for_status()
+                # Update DB
+                cur.execute("UPDATE hadiths SET embedding_generated = TRUE WHERE id = ANY(%s)", ([h[0] for h in hadiths],))
+                conn.commit()
+                processed += len(hadiths)
+                if processed % 320 == 0:  # Every 10 batches
+                    pct = 100 * processed / total
+                    print(f"Progress: {processed:,}/{total:,} ({pct:.1f}%) | Failed: {failed}", flush=True)
+            except Exception as e:
+                print(f"Error at offset {offset}: {e}", flush=True)
+                conn.rollback()
+                failed += len(hadiths)
+                offset += BATCH_SIZE  # skip past the failed batch instead of retrying it forever
+
+        cur.close()
+        conn.close()
+        print(f"Complete: {processed:,} processed, {failed} failed at {datetime.now()}", flush=True)
       resources:
-        requests: {cpu: 2, memory: 4Gi}
-        limits: {cpu: 4, memory: 8Gi}
+        requests:
+          memory: "2Gi"
+          cpu: "1"
+        limits:
+          memory: "4Gi"
+          cpu: "2"
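
A quick way to run this workflow, assuming the manifest is saved as generate-embeddings-workflow.yaml and the Argo CLI is installed (both the filename and the CLI are assumptions, not part of this commit):

# Submit with the real password and watch it run (filename is hypothetical)
argo submit -n ml generate-embeddings-workflow.yaml \
  -p db_password='the-real-password' --watch
# Follow the pod logs of the most recent workflow
argo logs -n ml @latest -f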

File diff suppressed because it is too large.

View File

@@ -0,0 +1,98 @@
# Save as benchmark_embeddings.py
import requests
import time
import statistics
from typing import Dict  # Dict is what the annotations below actually use

TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"

def benchmark_tei(num_requests: int = 100) -> Dict:
    """Benchmark TEI embedding generation"""
    test_text = "This is a test hadith about prayer and fasting"
    times = []
    print(f"Benchmarking TEI ({num_requests} requests)...")
    for i in range(num_requests):
        start = time.time()
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": test_text}
        )
        times.append(time.time() - start)
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_requests}")
    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

def benchmark_qdrant(num_queries: int = 100) -> Dict:
    """Benchmark Qdrant search"""
    # Get a sample embedding first
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": "test query"}
    )
    query_vector = response.json()[0]
    times = []
    print(f"\nBenchmarking Qdrant ({num_queries} searches)...")
    for i in range(num_queries):
        start = time.time()
        response = requests.post(
            f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
            json={
                "vector": query_vector,
                "limit": 10
            }
        )
        times.append(time.time() - start)
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_queries}")
    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

# Run benchmarks
print("=== PERFORMANCE BENCHMARK ===\n")
tei_stats = benchmark_tei(100)
qdrant_stats = benchmark_qdrant(100)

print("\n" + "="*80)
print("RESULTS")
print("="*80)
print("\nTEI Embedding Generation (per request):")
print(f"  Mean:   {tei_stats['mean']*1000:.2f}ms")
print(f"  Median: {tei_stats['median']*1000:.2f}ms")
print(f"  Min:    {tei_stats['min']*1000:.2f}ms")
print(f"  Max:    {tei_stats['max']*1000:.2f}ms")
print(f"  StdDev: {tei_stats['stdev']*1000:.2f}ms")
print("\nQdrant Vector Search (per query):")
print(f"  Mean:   {qdrant_stats['mean']*1000:.2f}ms")
print(f"  Median: {qdrant_stats['median']*1000:.2f}ms")
print(f"  Min:    {qdrant_stats['min']*1000:.2f}ms")
print(f"  Max:    {qdrant_stats['max']*1000:.2f}ms")
print(f"  StdDev: {qdrant_stats['stdev']*1000:.2f}ms")

# Expected performance targets
print("\n" + "="*80)
print("PERFORMANCE TARGETS")
print("="*80)
print("TEI Embedding: < 50ms (good), < 100ms (acceptable)")
print("Qdrant Search: < 20ms (good), < 50ms (acceptable)")

View File

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Generate embeddings for all hadiths and store them in Qdrant.
SSL verification is disabled for the self-signed external endpoints.
"""
import requests
import psycopg2
import time
import urllib3
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
TEI_URL = "https://embeddings.betelgeusebytes.io"
QDRANT_URL = "https://vector.betelgeusebytes.io"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'hadith_ingest'  # UPDATE THIS
}
BATCH_SIZE = 32         # Process 32 hadiths at a time
MAX_TEXT_LENGTH = 1500  # Truncate individual texts to avoid oversized requests
COLLECTION_NAME = "hadith_embeddings"
VERIFY_SSL = False      # Ignore SSL certificate verification

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def get_single_embedding(text: str, hadith_id: str, max_retries: int = 3) -> Optional[List[float]]:
    """Get embedding for a single text with retries and length reduction"""
    current_text = text
    for attempt in range(max_retries):
        try:
            response = requests.post(
                f"{TEI_URL}/embed",
                json={"inputs": [current_text]},
                timeout=60,
                verify=VERIFY_SSL
            )
            response.raise_for_status()
            return response.json()[0]
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 413:
                # Text still too large, reduce by 50%
                new_length = len(current_text) // 2
                logger.warning(f"Hadith {hadith_id}: Text too large ({len(current_text)} chars), reducing to {new_length}")
                current_text = current_text[:new_length] + "..."
                if new_length < 100:  # Don't go below 100 chars
                    logger.error(f"Hadith {hadith_id}: Cannot reduce text further")
                    return None
            else:
                logger.error(f"Hadith {hadith_id}: HTTP error {e.response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Hadith {hadith_id}: Error getting embedding: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)
            else:
                return None
    return None


def get_embeddings_batch(texts: List[str], hadith_ids: List[str]) -> List[Optional[List[float]]]:
    """Get embeddings for a batch of texts, fall back to individual requests if needed"""
    try:
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": texts},
            timeout=60,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 413 and len(texts) > 1:
            # Batch too large, try individually
            logger.warning(f"Batch too large, processing {len(texts)} texts individually...")
            embeddings = []
            for text, hid in zip(texts, hadith_ids):
                embedding = get_single_embedding(text, hid)
                embeddings.append(embedding)
                time.sleep(0.1)
            return embeddings
        else:
            logger.error(f"Error getting embeddings: {e}")
            return [None] * len(texts)
    except Exception as e:
        logger.error(f"Error getting embeddings: {e}")
        return [None] * len(texts)


def upsert_to_qdrant(points: List[Dict]) -> bool:
    """Upsert points to Qdrant"""
    if not points:  # Skip if no valid points
        return True
    try:
        response = requests.put(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points",
            json={"points": points},
            timeout=30,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return True
    except Exception as e:
        logger.error(f"Error upserting to Qdrant: {e}")
        return False


def mark_embeddings_generated(conn, hadith_ids: List[str], failed_ids: Optional[List[str]] = None) -> bool:
    """Mark hadiths as having embeddings generated"""
    try:
        cur = conn.cursor()
        # Mark successful ones
        if hadith_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (hadith_ids,))
        # Mark failed ones too (so we can skip them in future runs)
        if failed_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (failed_ids,))
            logger.warning(f"Marked {len(failed_ids)} failed hadiths as processed to skip them")
        conn.commit()
        cur.close()
        return True
    except Exception as e:
        logger.error(f"Error updating database: {e}")
        conn.rollback()
        return False


def fetch_hadiths_batch(cur, offset: int, limit: int) -> List[Tuple]:
    """Fetch a batch of hadiths without embeddings"""
    cur.execute("""
        SELECT id, arabic_text, english_text, urdu_text,
               collection_id, hadith_number
        FROM hadiths
        WHERE embedding_generated = FALSE
        ORDER BY id
        LIMIT %s OFFSET %s
    """, (limit, offset))
    return cur.fetchall()


def create_combined_text(hadith: Tuple, max_length: int = MAX_TEXT_LENGTH) -> str:
    """Create combined text for embedding, truncating if needed"""
    _id, arabic, english, urdu, coll_id, num = hadith
    parts = []
    if arabic:
        parts.append(arabic)
    if english:
        parts.append(english)
    combined = " ".join(parts) if parts else "No text available"
    if len(combined) > max_length:
        combined = combined[:max_length] + "..."
    return combined


def main():
    start_time = datetime.now()
    logger.info("=" * 80)
    logger.info("HADITH EMBEDDING GENERATION")
    logger.info(f"Started at: {start_time}")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max text length: {MAX_TEXT_LENGTH}")
    logger.info(f"SSL Verification: {VERIFY_SSL}")
    logger.info("=" * 80)

    logger.info("Connecting to database...")
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
    total_hadiths = cur.fetchone()[0]
    logger.info(f"Total hadiths to process: {total_hadiths:,}")
    if total_hadiths == 0:
        logger.info("No hadiths to process!")
        return

    estimated_time_mins = (total_hadiths / BATCH_SIZE) * 3 / 60  # assume ~3s per batch
    logger.info(f"Estimated time: {estimated_time_mins:.1f} minutes")
    logger.info("=" * 80)

    # Every marked batch drops out of the embedding_generated = FALSE filter,
    # so the offset advances only past batches that could NOT be marked;
    # advancing it unconditionally would skip unprocessed rows.
    offset = 0
    processed = 0
    failed = 0
    skipped = 0
    last_report_pct = 0
    while True:
        batch_start = time.time()
        hadiths = fetch_hadiths_batch(cur, offset, BATCH_SIZE)
        if not hadiths:
            break
        texts = [create_combined_text(h) for h in hadiths]
        hadith_ids = [str(h[0]) for h in hadiths]
        batch_marked = False
        try:
            embeddings = get_embeddings_batch(texts, hadith_ids)
            # Separate successful and failed embeddings
            successful_points = []
            successful_ids = []
            failed_ids = []
            for i, (hadith_id, embedding) in enumerate(zip(hadith_ids, embeddings)):
                if embedding is not None:
                    hadith = hadiths[i]
                    successful_points.append({
                        "id": hadith_id,
                        "vector": embedding,
                        "payload": {
                            "collection_id": str(hadith[4]),
                            "hadith_number": hadith[5],
                            "has_arabic": bool(hadith[1]),
                            "has_english": bool(hadith[2]),
                            "has_urdu": bool(hadith[3])
                        }
                    })
                    successful_ids.append(hadith_id)
                else:
                    failed_ids.append(hadith_id)
                    skipped += 1
            # Process successful ones
            if successful_points:
                if upsert_to_qdrant(successful_points):
                    if mark_embeddings_generated(conn, successful_ids, failed_ids):
                        processed += len(successful_ids)
                        failed += len(failed_ids)
                        batch_marked = True
                    else:
                        logger.error(f"Failed to update database for batch at offset {offset}")
                        failed += len(hadiths)
                else:
                    logger.error(f"Failed to upsert to Qdrant for batch at offset {offset}")
                    failed += len(hadiths)
            else:
                # All embeddings failed in this batch
                batch_marked = mark_embeddings_generated(conn, [], failed_ids)
                failed += len(failed_ids)
            progress_pct = ((processed + failed) / total_hadiths) * 100
            if progress_pct - last_report_pct >= 2:
                batch_time = time.time() - batch_start
                elapsed = (datetime.now() - start_time).total_seconds() / 60
                rate = (processed + failed) / elapsed if elapsed > 0 else 0
                remaining = (total_hadiths - processed - failed) / rate if rate > 0 else 0
                logger.info(
                    f"Progress: {processed:,}/{total_hadiths:,} ({progress_pct:.1f}%) | "
                    f"Failed: {failed} | Skipped: {skipped} | Rate: {rate:.0f}/min | "
                    f"ETA: {remaining:.1f}min | Batch: {batch_time:.2f}s"
                )
                last_report_pct = progress_pct
            time.sleep(0.2)
        except Exception as e:
            logger.error(f"Error processing batch at offset {offset}: {e}")
            failed += len(hadiths)
        if not batch_marked:
            offset += BATCH_SIZE  # step past the unmarked batch instead of refetching it

    cur.close()
    conn.close()
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds() / 60
    logger.info("=" * 80)
    logger.info("EMBEDDING GENERATION COMPLETE")
    logger.info(f"Started: {start_time}")
    logger.info(f"Finished: {end_time}")
    logger.info(f"Duration: {duration:.1f} minutes")
    logger.info(f"Total hadiths: {total_hadiths:,}")
    logger.info(f"Successfully processed: {processed:,}")
    logger.info(f"Failed/Skipped: {failed} ({skipped} too long)")
    logger.info(f"Success rate: {100 * processed / total_hadiths:.2f}%")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
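
Before kicking off a multi-hour run, it is worth confirming that the model served by TEI produces vectors of the same size as the Qdrant collection (created with size 1024 by the setup script below). A minimal sanity check, assuming the external endpoints are reachable:

# Should print 1024; a mismatch would make every upsert fail
curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": ["dimension check"]}' | jq '.[0] | length'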

View File

@@ -0,0 +1,29 @@
#!/bin/bash
echo "=== EMBEDDING GENERATION MONITOR ==="
echo ""
while true; do
    clear
    echo "=== EMBEDDING GENERATION PROGRESS ==="
    date
    echo ""
    # Database stats
    echo "📊 Database Progress:"
    PGPASSWORD=hadith_ingest psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
    SELECT
        'Done: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
        ' | Remaining: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE) ||
        ' | Progress: ' || ROUND(100.0 * COUNT(*) FILTER (WHERE embedding_generated = TRUE) / COUNT(*), 2) || '%'
    FROM hadiths;" 2>/dev/null || echo "  (Database connection failed)"
    echo ""
    echo "🔢 Qdrant Collection:"
    QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings 2>/dev/null | jq -r '.result.points_count // "N/A"')
    echo "  Points: $QDRANT_COUNT"
    echo ""
    echo "⏰ Next update in 5 minutes... (Ctrl+C to stop)"
    sleep 300
done
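
Since the monitor runs until interrupted, it helps to keep it alive across SSH disconnects. A sketch assuming tmux is available and the script above is saved as monitor_embeddings.sh (both assumptions, not part of this commit):

# Run the monitor in a detached tmux session; reattach at any time
tmux new-session -d -s embed-monitor ./monitor_embeddings.sh
tmux attach -t embed-monitor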

View File

@@ -0,0 +1,69 @@
#!/bin/bash
set -e
echo "=== EMBEDDING PIPELINE SETUP ==="

# 1. Check TEI service ('if !' keeps set -e from exiting before we can report)
echo -e "\n1. Checking TEI service..."
if ! kubectl -n ml get pods -l app=tei; then
    echo "❌ TEI pods not found!"
    exit 1
fi

# 2. Test TEI endpoint
echo -e "\n2. Testing TEI endpoint..."
TEI_TEST=$(curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
    -H "Content-Type: application/json" \
    -d '{"inputs": "test"}' | jq -r 'type')
if [ "$TEI_TEST" != "array" ]; then
    echo "❌ TEI not responding correctly!"
    echo "Response: $TEI_TEST"
    exit 1
fi
echo "✅ TEI is working"

# 3. Check Qdrant
echo -e "\n3. Checking Qdrant..."
QDRANT_STATUS=$(curl -s -k https://vector.betelgeusebytes.io/collections | jq -r '.status')
if [ "$QDRANT_STATUS" != "ok" ]; then
    echo "❌ Qdrant not responding!"
    exit 1
fi
echo "✅ Qdrant is working"

# 4. Check if collection exists (.result is absent when the collection is missing)
echo -e "\n4. Checking hadith_embeddings collection..."
COLLECTION_EXISTS=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.status // "missing"')
if [ "$COLLECTION_EXISTS" == "missing" ]; then
    echo "📝 Creating hadith_embeddings collection..."
    curl -k -X PUT https://vector.betelgeusebytes.io/collections/hadith_embeddings \
        -H "Content-Type: application/json" \
        -d '{
            "vectors": {
                "size": 1024,
                "distance": "Cosine"
            },
            "optimizers_config": {
                "indexing_threshold": 10000
            }
        }'
    echo -e "\n✅ Collection created"
else
    echo "✅ Collection exists"
    POINT_COUNT=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
    echo "   Current points: $POINT_COUNT"
fi

# 5. Check database
echo -e "\n5. Checking database..."
HADITH_COUNT=$(psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "SELECT COUNT(*) FROM hadiths;")
echo "   Total hadiths: $HADITH_COUNT"

echo -e "\n=== SETUP COMPLETE ✅ ==="
echo -e "\nReady to generate embeddings!"

View File

@@ -0,0 +1,93 @@
# Save as test_semantic_search.py
import requests
import psycopg2
from typing import List, Dict

# Configuration
TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'your_password'  # Update this
}

def get_embedding(text: str) -> List[float]:
    """Get embedding from TEI service"""
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": text}
    )
    return response.json()[0]

def search_similar_hadiths(query: str, limit: int = 5) -> List[Dict]:
    """Search for similar hadiths using semantic search"""
    # Get query embedding
    query_embedding = get_embedding(query)
    # Search in Qdrant
    response = requests.post(
        f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
        json={
            "vector": query_embedding,
            "limit": limit,
            "with_payload": True
        }
    )
    results = response.json()['result']
    # Get full hadith details from the database
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
    hadith_ids = [r['id'] for r in results]
    # Qdrant returns ids as strings; cast to uuid[] so Postgres can compare them
    cur.execute("""
        SELECT h.id, h.hadith_number, c.name as collection,
               h.arabic_text, h.english_text, h.grade
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        WHERE h.id = ANY(%s::uuid[])
    """, (hadith_ids,))
    # Key by str(uuid) so the lookup below matches Qdrant's string ids
    hadiths = {str(row[0]): dict(zip(['id', 'number', 'collection', 'arabic', 'english', 'grade'], row))
               for row in cur.fetchall()}
    cur.close()
    conn.close()
    # Combine results
    return [
        {
            **hadiths[r['id']],
            'similarity_score': r['score']
        }
        for r in results if r['id'] in hadiths
    ]

# Test queries
test_queries = [
    "prayer times and importance",
    "fasting in Ramadan",
    "charity and helping the poor",
    "truthfulness and honesty",
    "parents and their rights"
]
print("=== SEMANTIC SEARCH TEST ===\n")
for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 80)
    results = search_similar_hadiths(query, limit=3)
    for i, hadith in enumerate(results, 1):
        print(f"\n{i}. [{hadith['collection']} #{hadith['number']}] (Score: {hadith['similarity_score']:.4f})")
        print(f"   Grade: {hadith['grade']}")
        print(f"   English: {hadith['english'][:200]}...")
        print(f"   Arabic: {hadith['arabic'][:100]}...")
print("\n=== TEST COMPLETE ===")

View File

@@ -0,0 +1,27 @@
#!/bin/bash
echo "=== EMBEDDING VERIFICATION ==="
echo ""
# 1. Database check
echo "1. Checking database..."
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
SELECT
    'Total hadiths: ' || COUNT(*) ||
    ' | With embeddings: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
    ' | Missing: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE)
FROM hadiths;"
# 2. Qdrant check
echo ""
echo "2. Checking Qdrant..."
QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
echo "Qdrant vectors: $QDRANT_COUNT"
# 3. Collection info
echo ""
echo "3. Collection details..."
curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq '.result | {points_count, vectors_count, status, optimizer_status}'
echo ""
echo "=== VERIFICATION COMPLETE ==="