add generate embeddings

salahangal 2025-11-24 11:17:57 +01:00
parent cf02b918b8
commit f46297255d
9 changed files with 4181 additions and 17 deletions

Binary file not shown.

View File

@@ -4,28 +4,127 @@ metadata:
   generateName: generate-embeddings-
   namespace: ml
 spec:
-  entrypoint: generate
+  entrypoint: embedding-pipeline
   serviceAccountName: argo-workflow
   arguments:
     parameters:
-    - name: batch-size
-      value: "32"
+    - name: db_password
+      value: "YOUR_PASSWORD_HERE" # UPDATE THIS
   templates:
-  - name: generate
+  - name: embedding-pipeline
+    steps:
+    - - name: generate-embeddings
+        template: generate-job
+  - name: generate-job
     container:
-      image: hadith-ingestion:latest
-      command: [python, /app/src/embeddings/generator.py]
-      args: ["--batch-size={{workflow.parameters.batch-size}}"]
-      env:
-      - name: DATABASE_HOST
-        value: "pg.betelgeusebytes.io"
-      - name: DATABASE_PASSWORD
-        valueFrom:
-          secretKeyRef:
-            name: hadith-db-secret
-            key: password
+      image: python:3.11-slim
+      command: [python, -u, -c]
+      args:
+      - |
+        import requests, psycopg2, time, sys, urllib3
+        from datetime import datetime
+
+        # Disable SSL warnings
+        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+        # Install deps
+        import subprocess
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "psycopg2-binary", "requests", "urllib3"])
+
+        # Config - EXTERNAL URLs with SSL verification disabled
+        TEI_URL = "https://embeddings.betelgeusebytes.io"
+        QDRANT_URL = "https://vector.betelgeusebytes.io"
+        DB_CONFIG = {
+            'host': 'pg.betelgeusebytes.io',
+            'port': 5432,
+            'dbname': 'hadith_db',
+            'user': 'hadith_ingest',
+            'password': '{{workflow.parameters.db_password}}'
+        }
+        BATCH_SIZE = 32
+        COLLECTION = "hadith_embeddings"
+        VERIFY_SSL = False  # Important: ignore self-signed certificates
+
+        print(f"Started: {datetime.now()}", flush=True)
+        print(f"SSL Verification: {VERIFY_SSL}", flush=True)
+
+        conn = psycopg2.connect(**DB_CONFIG)
+        cur = conn.cursor()
+        cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
+        total = cur.fetchone()[0]
+        print(f"Total to process: {total:,}", flush=True)
+
+        processed = 0
+        failed = 0
+        # Marked rows drop out of the WHERE clause below, so the offset
+        # advances only past failed (still unmarked) batches; advancing it
+        # after successful batches would skip unprocessed rows.
+        offset = 0
+        while True:
+            cur.execute("""
+                SELECT id, arabic_text, english_text, collection_id, hadith_number
+                FROM hadiths WHERE embedding_generated = FALSE
+                ORDER BY id LIMIT %s OFFSET %s
+            """, (BATCH_SIZE, offset))
+            hadiths = cur.fetchall()
+            if not hadiths:
+                break
+            try:
+                # Get embeddings (with verify=False)
+                texts = [" ".join(filter(None, [h[1], h[2]])) for h in hadiths]
+                resp = requests.post(
+                    f"{TEI_URL}/embed",
+                    json={"inputs": texts},
+                    timeout=60,
+                    verify=VERIFY_SSL
+                )
+                resp.raise_for_status()
+                embeddings = resp.json()
+                # Upload to Qdrant; str() so UUID ids/values serialize as JSON
+                points = [{
+                    "id": str(h[0]),
+                    "vector": embeddings[i],
+                    "payload": {"collection_id": str(h[3]), "hadith_number": h[4]}
+                } for i, h in enumerate(hadiths)]
+                resp = requests.put(
+                    f"{QDRANT_URL}/collections/{COLLECTION}/points",
+                    json={"points": points},
+                    timeout=30,
+                    verify=VERIFY_SSL
+                )
+                resp.raise_for_status()
+                # Update DB
+                cur.execute("UPDATE hadiths SET embedding_generated = TRUE WHERE id = ANY(%s)", ([h[0] for h in hadiths],))
+                conn.commit()
+                processed += len(hadiths)
+                if processed % 320 == 0:  # Every 10 batches
+                    pct = 100 * processed / total
+                    print(f"Progress: {processed:,}/{total:,} ({pct:.1f}%) | Failed: {failed}", flush=True)
+            except Exception as e:
+                print(f"Error at offset {offset}: {e}", flush=True)
+                conn.rollback()
+                failed += len(hadiths)
+                offset += BATCH_SIZE  # skip past the failed batch instead of retrying it forever
+
+        cur.close()
+        conn.close()
+        print(f"Complete: {processed:,} processed, {failed} failed at {datetime.now()}", flush=True)
       resources:
-        requests: {cpu: 2, memory: 4Gi}
-        limits: {cpu: 4, memory: 8Gi}
+        requests:
+          memory: "2Gi"
+          cpu: "1"
+        limits:
+          memory: "4Gi"
+          cpu: "2"
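
A quick way to run this workflow, assuming the manifest is saved as generate-embeddings-workflow.yaml and the Argo CLI is installed (both the filename and the CLI are assumptions, not part of this commit):

# Submit with the real password and watch it run (filename is hypothetical)
argo submit -n ml generate-embeddings-workflow.yaml \
  -p db_password='the-real-password' --watch
# Follow the pod logs of the most recent workflow
argo logs -n ml @latest -f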

File diff suppressed because it is too large.

View File

@@ -0,0 +1,98 @@
# Save as benchmark_embeddings.py
import requests
import time
import statistics
from typing import Dict  # Dict is what the annotations below actually use

TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"

def benchmark_tei(num_requests: int = 100) -> Dict:
    """Benchmark TEI embedding generation"""
    test_text = "This is a test hadith about prayer and fasting"
    times = []
    print(f"Benchmarking TEI ({num_requests} requests)...")
    for i in range(num_requests):
        start = time.time()
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": test_text}
        )
        times.append(time.time() - start)
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_requests}")
    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

def benchmark_qdrant(num_queries: int = 100) -> Dict:
    """Benchmark Qdrant search"""
    # Get a sample embedding first
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": "test query"}
    )
    query_vector = response.json()[0]
    times = []
    print(f"\nBenchmarking Qdrant ({num_queries} searches)...")
    for i in range(num_queries):
        start = time.time()
        response = requests.post(
            f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
            json={
                "vector": query_vector,
                "limit": 10
            }
        )
        times.append(time.time() - start)
        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_queries}")
    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

# Run benchmarks
print("=== PERFORMANCE BENCHMARK ===\n")
tei_stats = benchmark_tei(100)
qdrant_stats = benchmark_qdrant(100)

print("\n" + "="*80)
print("RESULTS")
print("="*80)
print("\nTEI Embedding Generation (per request):")
print(f"  Mean:   {tei_stats['mean']*1000:.2f}ms")
print(f"  Median: {tei_stats['median']*1000:.2f}ms")
print(f"  Min:    {tei_stats['min']*1000:.2f}ms")
print(f"  Max:    {tei_stats['max']*1000:.2f}ms")
print(f"  StdDev: {tei_stats['stdev']*1000:.2f}ms")
print("\nQdrant Vector Search (per query):")
print(f"  Mean:   {qdrant_stats['mean']*1000:.2f}ms")
print(f"  Median: {qdrant_stats['median']*1000:.2f}ms")
print(f"  Min:    {qdrant_stats['min']*1000:.2f}ms")
print(f"  Max:    {qdrant_stats['max']*1000:.2f}ms")
print(f"  StdDev: {qdrant_stats['stdev']*1000:.2f}ms")

# Expected performance targets
print("\n" + "="*80)
print("PERFORMANCE TARGETS")
print("="*80)
print("TEI Embedding: < 50ms (good), < 100ms (acceptable)")
print("Qdrant Search: < 20ms (good), < 50ms (acceptable)")

View File

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Generate embeddings for all hadiths and store them in Qdrant.
SSL verification is disabled for the self-signed external endpoints.
"""
import requests
import psycopg2
import time
import urllib3
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
TEI_URL = "https://embeddings.betelgeusebytes.io"
QDRANT_URL = "https://vector.betelgeusebytes.io"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'hadith_ingest'  # UPDATE THIS
}
BATCH_SIZE = 32         # Process 32 hadiths at a time
MAX_TEXT_LENGTH = 1500  # Truncate individual texts to avoid oversized requests
COLLECTION_NAME = "hadith_embeddings"
VERIFY_SSL = False      # Ignore SSL certificate verification

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def get_single_embedding(text: str, hadith_id: str, max_retries: int = 3) -> Optional[List[float]]:
    """Get embedding for a single text with retries and length reduction"""
    current_text = text
    for attempt in range(max_retries):
        try:
            response = requests.post(
                f"{TEI_URL}/embed",
                json={"inputs": [current_text]},
                timeout=60,
                verify=VERIFY_SSL
            )
            response.raise_for_status()
            return response.json()[0]
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 413:
                # Text still too large, reduce by 50%
                new_length = len(current_text) // 2
                logger.warning(f"Hadith {hadith_id}: Text too large ({len(current_text)} chars), reducing to {new_length}")
                current_text = current_text[:new_length] + "..."
                if new_length < 100:  # Don't go below 100 chars
                    logger.error(f"Hadith {hadith_id}: Cannot reduce text further")
                    return None
            else:
                logger.error(f"Hadith {hadith_id}: HTTP error {e.response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Hadith {hadith_id}: Error getting embedding: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)
            else:
                return None
    return None


def get_embeddings_batch(texts: List[str], hadith_ids: List[str]) -> List[Optional[List[float]]]:
    """Get embeddings for a batch of texts, fall back to individual requests if needed"""
    try:
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": texts},
            timeout=60,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 413 and len(texts) > 1:
            # Batch too large, try individually
            logger.warning(f"Batch too large, processing {len(texts)} texts individually...")
            embeddings = []
            for text, hid in zip(texts, hadith_ids):
                embedding = get_single_embedding(text, hid)
                embeddings.append(embedding)
                time.sleep(0.1)
            return embeddings
        else:
            logger.error(f"Error getting embeddings: {e}")
            return [None] * len(texts)
    except Exception as e:
        logger.error(f"Error getting embeddings: {e}")
        return [None] * len(texts)


def upsert_to_qdrant(points: List[Dict]) -> bool:
    """Upsert points to Qdrant"""
    if not points:  # Skip if no valid points
        return True
    try:
        response = requests.put(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points",
            json={"points": points},
            timeout=30,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return True
    except Exception as e:
        logger.error(f"Error upserting to Qdrant: {e}")
        return False


def mark_embeddings_generated(conn, hadith_ids: List[str], failed_ids: Optional[List[str]] = None) -> bool:
    """Mark hadiths as having embeddings generated"""
    try:
        cur = conn.cursor()
        # Mark successful ones
        if hadith_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (hadith_ids,))
        # Mark failed ones too (so we can skip them in future runs)
        if failed_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (failed_ids,))
            logger.warning(f"Marked {len(failed_ids)} failed hadiths as processed to skip them")
        conn.commit()
        cur.close()
        return True
    except Exception as e:
        logger.error(f"Error updating database: {e}")
        conn.rollback()
        return False


def fetch_hadiths_batch(cur, offset: int, limit: int) -> List[Tuple]:
    """Fetch a batch of hadiths without embeddings"""
    cur.execute("""
        SELECT id, arabic_text, english_text, urdu_text,
               collection_id, hadith_number
        FROM hadiths
        WHERE embedding_generated = FALSE
        ORDER BY id
        LIMIT %s OFFSET %s
    """, (limit, offset))
    return cur.fetchall()


def create_combined_text(hadith: Tuple, max_length: int = MAX_TEXT_LENGTH) -> str:
    """Create combined text for embedding, truncating if needed"""
    _id, arabic, english, urdu, coll_id, num = hadith
    parts = []
    if arabic:
        parts.append(arabic)
    if english:
        parts.append(english)
    combined = " ".join(parts) if parts else "No text available"
    if len(combined) > max_length:
        combined = combined[:max_length] + "..."
    return combined


def main():
    start_time = datetime.now()
    logger.info("=" * 80)
    logger.info("HADITH EMBEDDING GENERATION")
    logger.info(f"Started at: {start_time}")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max text length: {MAX_TEXT_LENGTH}")
    logger.info(f"SSL Verification: {VERIFY_SSL}")
    logger.info("=" * 80)

    logger.info("Connecting to database...")
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
    cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
    total_hadiths = cur.fetchone()[0]
    logger.info(f"Total hadiths to process: {total_hadiths:,}")
    if total_hadiths == 0:
        logger.info("No hadiths to process!")
        return

    estimated_time_mins = (total_hadiths / BATCH_SIZE) * 3 / 60  # assume ~3s per batch
    logger.info(f"Estimated time: {estimated_time_mins:.1f} minutes")
    logger.info("=" * 80)

    # Every marked batch drops out of the embedding_generated = FALSE filter,
    # so the offset advances only past batches that could NOT be marked;
    # advancing it unconditionally would skip unprocessed rows.
    offset = 0
    processed = 0
    failed = 0
    skipped = 0
    last_report_pct = 0
    while True:
        batch_start = time.time()
        hadiths = fetch_hadiths_batch(cur, offset, BATCH_SIZE)
        if not hadiths:
            break
        texts = [create_combined_text(h) for h in hadiths]
        hadith_ids = [str(h[0]) for h in hadiths]
        batch_marked = False
        try:
            embeddings = get_embeddings_batch(texts, hadith_ids)
            # Separate successful and failed embeddings
            successful_points = []
            successful_ids = []
            failed_ids = []
            for i, (hadith_id, embedding) in enumerate(zip(hadith_ids, embeddings)):
                if embedding is not None:
                    hadith = hadiths[i]
                    successful_points.append({
                        "id": hadith_id,
                        "vector": embedding,
                        "payload": {
                            "collection_id": str(hadith[4]),
                            "hadith_number": hadith[5],
                            "has_arabic": bool(hadith[1]),
                            "has_english": bool(hadith[2]),
                            "has_urdu": bool(hadith[3])
                        }
                    })
                    successful_ids.append(hadith_id)
                else:
                    failed_ids.append(hadith_id)
                    skipped += 1
            # Process successful ones
            if successful_points:
                if upsert_to_qdrant(successful_points):
                    if mark_embeddings_generated(conn, successful_ids, failed_ids):
                        processed += len(successful_ids)
                        failed += len(failed_ids)
                        batch_marked = True
                    else:
                        logger.error(f"Failed to update database for batch at offset {offset}")
                        failed += len(hadiths)
                else:
                    logger.error(f"Failed to upsert to Qdrant for batch at offset {offset}")
                    failed += len(hadiths)
            else:
                # All embeddings failed in this batch
                batch_marked = mark_embeddings_generated(conn, [], failed_ids)
                failed += len(failed_ids)
            progress_pct = ((processed + failed) / total_hadiths) * 100
            if progress_pct - last_report_pct >= 2:
                batch_time = time.time() - batch_start
                elapsed = (datetime.now() - start_time).total_seconds() / 60
                rate = (processed + failed) / elapsed if elapsed > 0 else 0
                remaining = (total_hadiths - processed - failed) / rate if rate > 0 else 0
                logger.info(
                    f"Progress: {processed:,}/{total_hadiths:,} ({progress_pct:.1f}%) | "
                    f"Failed: {failed} | Skipped: {skipped} | Rate: {rate:.0f}/min | "
                    f"ETA: {remaining:.1f}min | Batch: {batch_time:.2f}s"
                )
                last_report_pct = progress_pct
            time.sleep(0.2)
        except Exception as e:
            logger.error(f"Error processing batch at offset {offset}: {e}")
            failed += len(hadiths)
        if not batch_marked:
            offset += BATCH_SIZE  # step past the unmarked batch instead of refetching it

    cur.close()
    conn.close()
    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds() / 60
    logger.info("=" * 80)
    logger.info("EMBEDDING GENERATION COMPLETE")
    logger.info(f"Started: {start_time}")
    logger.info(f"Finished: {end_time}")
    logger.info(f"Duration: {duration:.1f} minutes")
    logger.info(f"Total hadiths: {total_hadiths:,}")
    logger.info(f"Successfully processed: {processed:,}")
    logger.info(f"Failed/Skipped: {failed} ({skipped} too long)")
    logger.info(f"Success rate: {100 * processed / total_hadiths:.2f}%")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
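
Before kicking off a multi-hour run, it is worth confirming that the model served by TEI produces vectors of the same size as the Qdrant collection (created with size 1024 by the setup script below). A minimal sanity check, assuming the external endpoints are reachable:

# Should print 1024; a mismatch would make every upsert fail
curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": ["dimension check"]}' | jq '.[0] | length'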

View File

@@ -0,0 +1,29 @@
#!/bin/bash
echo "=== EMBEDDING GENERATION MONITOR ==="
echo ""
while true; do
    clear
    echo "=== EMBEDDING GENERATION PROGRESS ==="
    date
    echo ""
    # Database stats
    echo "📊 Database Progress:"
    PGPASSWORD=hadith_ingest psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
    SELECT
        'Done: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
        ' | Remaining: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE) ||
        ' | Progress: ' || ROUND(100.0 * COUNT(*) FILTER (WHERE embedding_generated = TRUE) / COUNT(*), 2) || '%'
    FROM hadiths;" 2>/dev/null || echo "  (Database connection failed)"
    echo ""
    echo "🔢 Qdrant Collection:"
    QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings 2>/dev/null | jq -r '.result.points_count // "N/A"')
    echo "  Points: $QDRANT_COUNT"
    echo ""
    echo "⏰ Next update in 5 minutes... (Ctrl+C to stop)"
    sleep 300
done
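
Since the monitor runs until interrupted, it helps to keep it alive across SSH disconnects. A sketch assuming tmux is available and the script above is saved as monitor_embeddings.sh (both assumptions, not part of this commit):

# Run the monitor in a detached tmux session; reattach at any time
tmux new-session -d -s embed-monitor ./monitor_embeddings.sh
tmux attach -t embed-monitor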

View File

@@ -0,0 +1,69 @@
#!/bin/bash
set -e
echo "=== EMBEDDING PIPELINE SETUP ==="

# 1. Check TEI service ('if !' keeps set -e from exiting before we can report)
echo -e "\n1. Checking TEI service..."
if ! kubectl -n ml get pods -l app=tei; then
    echo "❌ TEI pods not found!"
    exit 1
fi

# 2. Test TEI endpoint
echo -e "\n2. Testing TEI endpoint..."
TEI_TEST=$(curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
    -H "Content-Type: application/json" \
    -d '{"inputs": "test"}' | jq -r 'type')
if [ "$TEI_TEST" != "array" ]; then
    echo "❌ TEI not responding correctly!"
    echo "Response: $TEI_TEST"
    exit 1
fi
echo "✅ TEI is working"

# 3. Check Qdrant
echo -e "\n3. Checking Qdrant..."
QDRANT_STATUS=$(curl -s -k https://vector.betelgeusebytes.io/collections | jq -r '.status')
if [ "$QDRANT_STATUS" != "ok" ]; then
    echo "❌ Qdrant not responding!"
    exit 1
fi
echo "✅ Qdrant is working"

# 4. Check if collection exists (.result is absent when the collection is missing)
echo -e "\n4. Checking hadith_embeddings collection..."
COLLECTION_EXISTS=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.status // "missing"')
if [ "$COLLECTION_EXISTS" == "missing" ]; then
    echo "📝 Creating hadith_embeddings collection..."
    curl -k -X PUT https://vector.betelgeusebytes.io/collections/hadith_embeddings \
        -H "Content-Type: application/json" \
        -d '{
            "vectors": {
                "size": 1024,
                "distance": "Cosine"
            },
            "optimizers_config": {
                "indexing_threshold": 10000
            }
        }'
    echo -e "\n✅ Collection created"
else
    echo "✅ Collection exists"
    POINT_COUNT=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
    echo "   Current points: $POINT_COUNT"
fi

# 5. Check database
echo -e "\n5. Checking database..."
HADITH_COUNT=$(psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "SELECT COUNT(*) FROM hadiths;")
echo "   Total hadiths: $HADITH_COUNT"

echo -e "\n=== SETUP COMPLETE ✅ ==="
echo -e "\nReady to generate embeddings!"

View File

@@ -0,0 +1,93 @@
# Save as test_semantic_search.py
import requests
import psycopg2
from typing import List, Dict

# Configuration
TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'your_password'  # Update this
}

def get_embedding(text: str) -> List[float]:
    """Get embedding from TEI service"""
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": text}
    )
    return response.json()[0]

def search_similar_hadiths(query: str, limit: int = 5) -> List[Dict]:
    """Search for similar hadiths using semantic search"""
    # Get query embedding
    query_embedding = get_embedding(query)
    # Search in Qdrant
    response = requests.post(
        f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
        json={
            "vector": query_embedding,
            "limit": limit,
            "with_payload": True
        }
    )
    results = response.json()['result']
    # Get full hadith details from the database
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
    hadith_ids = [r['id'] for r in results]
    # Qdrant returns ids as strings; cast to uuid[] so Postgres can compare them
    cur.execute("""
        SELECT h.id, h.hadith_number, c.name as collection,
               h.arabic_text, h.english_text, h.grade
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        WHERE h.id = ANY(%s::uuid[])
    """, (hadith_ids,))
    # Key by str(uuid) so the lookup below matches Qdrant's string ids
    hadiths = {str(row[0]): dict(zip(['id', 'number', 'collection', 'arabic', 'english', 'grade'], row))
               for row in cur.fetchall()}
    cur.close()
    conn.close()
    # Combine results
    return [
        {
            **hadiths[r['id']],
            'similarity_score': r['score']
        }
        for r in results if r['id'] in hadiths
    ]

# Test queries
test_queries = [
    "prayer times and importance",
    "fasting in Ramadan",
    "charity and helping the poor",
    "truthfulness and honesty",
    "parents and their rights"
]
print("=== SEMANTIC SEARCH TEST ===\n")
for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 80)
    results = search_similar_hadiths(query, limit=3)
    for i, hadith in enumerate(results, 1):
        print(f"\n{i}. [{hadith['collection']} #{hadith['number']}] (Score: {hadith['similarity_score']:.4f})")
        print(f"   Grade: {hadith['grade']}")
        print(f"   English: {hadith['english'][:200]}...")
        print(f"   Arabic: {hadith['arabic'][:100]}...")
print("\n=== TEST COMPLETE ===")

View File

@@ -0,0 +1,27 @@
#!/bin/bash
echo "=== EMBEDDING VERIFICATION ==="
echo ""
# 1. Database check
echo "1. Checking database..."
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
SELECT
    'Total hadiths: ' || COUNT(*) ||
    ' | With embeddings: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
    ' | Missing: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE)
FROM hadiths;"
# 2. Qdrant check
echo ""
echo "2. Checking Qdrant..."
QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
echo "Qdrant vectors: $QDRANT_COUNT"
# 3. Collection info
echo ""
echo "3. Collection details..."
curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq '.result | {points_count, vectors_count, status, optimizer_status}'
echo ""
echo "=== VERIFICATION COMPLETE ==="