add generate embeddings
parent cf02b918b8
commit f46297255d
Binary file not shown.
@@ -4,28 +4,127 @@ metadata:
   generateName: generate-embeddings-
   namespace: ml
 spec:
-  entrypoint: generate
+  entrypoint: embedding-pipeline
   serviceAccountName: argo-workflow

   arguments:
     parameters:
       - name: batch-size
         value: "32"
+      - name: db_password
+        value: "YOUR_PASSWORD_HERE" # UPDATE THIS

   templates:
-    - name: generate
+    - name: embedding-pipeline
       steps:
         - - name: generate-embeddings
             template: generate-job

     - name: generate-job
       container:
-        image: hadith-ingestion:latest
-        command: [python, /app/src/embeddings/generator.py]
-        args: ["--batch-size={{workflow.parameters.batch-size}}"]
-        env:
-          - name: DATABASE_HOST
-            value: "pg.betelgeusebytes.io"
-          - name: DATABASE_PASSWORD
-            valueFrom:
-              secretKeyRef:
-                name: hadith-db-secret
-                key: password
+        image: python:3.11-slim
+        command: [python, -u, -c]
+        args:
+          - |
+            import subprocess, sys
+
+            # Install deps first: python:3.11-slim ships without requests/psycopg2,
+            # so importing them before the pip install would fail
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "psycopg2-binary", "requests", "urllib3"])
+
+            import requests, psycopg2, time, urllib3
+            from datetime import datetime
+
+            # Disable SSL warnings
+            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+            # Config - EXTERNAL URLs with SSL verification disabled
+            TEI_URL = "https://embeddings.betelgeusebytes.io"
+            QDRANT_URL = "https://vector.betelgeusebytes.io"
+            DB_CONFIG = {
+                'host': 'pg.betelgeusebytes.io',
+                'port': 5432,
+                'dbname': 'hadith_db',
+                'user': 'hadith_ingest',
+                'password': '{{workflow.parameters.db_password}}'
+            }
+
+            BATCH_SIZE = 32
+            COLLECTION = "hadith_embeddings"
+            VERIFY_SSL = False  # Important: ignore self-signed SSL certificates
+
+            print(f"Started: {datetime.now()}", flush=True)
+            print(f"SSL Verification: {VERIFY_SSL}", flush=True)
+
+            conn = psycopg2.connect(**DB_CONFIG)
+            cur = conn.cursor()
+
+            cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
+            total = cur.fetchone()[0]
+            print(f"Total to process: {total:,}", flush=True)
+
+            processed = 0
+            failed = 0
+            offset = 0
+
+            while offset < total:
+                cur.execute("""
+                    SELECT id, arabic_text, english_text, collection_id, hadith_number
+                    FROM hadiths WHERE embedding_generated = FALSE
+                    ORDER BY id LIMIT %s OFFSET %s
+                """, (BATCH_SIZE, offset))
+
+                hadiths = cur.fetchall()
+                if not hadiths:
+                    break
+
+                try:
+                    # Get embeddings (with verify=False)
+                    texts = [" ".join(filter(None, [h[1], h[2]])) for h in hadiths]
+                    resp = requests.post(
+                        f"{TEI_URL}/embed",
+                        json={"inputs": texts},
+                        timeout=60,
+                        verify=VERIFY_SSL
+                    )
+                    resp.raise_for_status()
+                    embeddings = resp.json()
+
+                    # Upload to Qdrant (with verify=False); str() keeps the ids
+                    # JSON-serializable if the driver hands back UUID objects
+                    points = [{
+                        "id": str(h[0]),
+                        "vector": embeddings[i],
+                        "payload": {"collection_id": str(h[3]), "hadith_number": h[4]}
+                    } for i, h in enumerate(hadiths)]
+
+                    resp = requests.put(
+                        f"{QDRANT_URL}/collections/{COLLECTION}/points",
+                        json={"points": points},
+                        timeout=30,
+                        verify=VERIFY_SSL
+                    )
+                    resp.raise_for_status()
+
+                    # Update DB
+                    cur.execute("UPDATE hadiths SET embedding_generated = TRUE WHERE id = ANY(%s)", ([h[0] for h in hadiths],))
+                    conn.commit()
+
+                    processed += len(hadiths)
+                    if processed % 320 == 0:  # Every 10 batches
+                        pct = 100 * processed / total
+                        print(f"Progress: {processed:,}/{total:,} ({pct:.1f}%) | Failed: {failed}", flush=True)
+
+                except Exception as e:
+                    print(f"Error at offset {offset}: {e}", flush=True)
+                    failed += len(hadiths)
+                    # Processed rows drop out of the filtered query, so only
+                    # advance OFFSET past rows that failed and stayed FALSE
+                    offset += BATCH_SIZE
+
+            cur.close()
+            conn.close()
+
+            print(f"Complete: {processed:,} processed, {failed} failed at {datetime.now()}", flush=True)
+
         resources:
-          requests: {cpu: 2, memory: 4Gi}
-          limits: {cpu: 4, memory: 8Gi}
+          requests:
+            memory: "2Gi"
+            cpu: "1"
+          limits:
+            memory: "4Gi"
+            cpu: "2"
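To run this workflow without editing the manifest, the new db_password parameter can be overridden at submit time. A minimal sketch, assuming the manifest is saved as generate-embeddings.yaml and the Argo CLI is available; sourcing the value from the hadith-db-secret referenced by the old manifest is an assumption about where the password lives:

# Submit the workflow, pulling the password out of the existing secret (assumed)
argo submit -n ml generate-embeddings.yaml \
  -p batch-size=32 \
  -p db_password="$(kubectl -n ml get secret hadith-db-secret -o jsonpath='{.data.password}' | base64 -d)" \
  --watch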
File diff suppressed because it is too large.
@@ -0,0 +1,98 @@
# Save as benchmark_embeddings.py
import requests
import time
import statistics
from typing import Dict, List

TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"

def benchmark_tei(num_requests: int = 100) -> Dict:
    """Benchmark TEI embedding generation"""
    test_text = "This is a test hadith about prayer and fasting"
    times = []

    print(f"Benchmarking TEI ({num_requests} requests)...")
    for i in range(num_requests):
        start = time.time()
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": test_text}
        )
        times.append(time.time() - start)

        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_requests}")

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

def benchmark_qdrant(num_queries: int = 100) -> Dict:
    """Benchmark Qdrant search"""
    # Get a sample embedding first
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": "test query"}
    )
    query_vector = response.json()[0]

    times = []

    print(f"\nBenchmarking Qdrant ({num_queries} searches)...")
    for i in range(num_queries):
        start = time.time()
        response = requests.post(
            f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
            json={
                "vector": query_vector,
                "limit": 10
            }
        )
        times.append(time.time() - start)

        if (i + 1) % 10 == 0:
            print(f"  Progress: {i + 1}/{num_queries}")

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        'min': min(times),
        'max': max(times),
        'stdev': statistics.stdev(times) if len(times) > 1 else 0
    }

# Run benchmarks
print("=== PERFORMANCE BENCHMARK ===\n")

tei_stats = benchmark_tei(100)
qdrant_stats = benchmark_qdrant(100)

print("\n" + "="*80)
print("RESULTS")
print("="*80)

print("\nTEI Embedding Generation (per request):")
print(f"  Mean:   {tei_stats['mean']*1000:.2f}ms")
print(f"  Median: {tei_stats['median']*1000:.2f}ms")
print(f"  Min:    {tei_stats['min']*1000:.2f}ms")
print(f"  Max:    {tei_stats['max']*1000:.2f}ms")
print(f"  StdDev: {tei_stats['stdev']*1000:.2f}ms")

print("\nQdrant Vector Search (per query):")
print(f"  Mean:   {qdrant_stats['mean']*1000:.2f}ms")
print(f"  Median: {qdrant_stats['median']*1000:.2f}ms")
print(f"  Min:    {qdrant_stats['min']*1000:.2f}ms")
print(f"  Max:    {qdrant_stats['max']*1000:.2f}ms")
print(f"  StdDev: {qdrant_stats['stdev']*1000:.2f}ms")

# Expected performance targets
print("\n" + "="*80)
print("PERFORMANCE TARGETS")
print("="*80)
print("TEI Embedding: < 50ms (good), < 100ms (acceptable)")
print("Qdrant Search: < 20ms (good), < 50ms (acceptable)")
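The benchmark targets cluster-internal Service DNS names (tei.ml.svc.cluster.local, qdrant.vector.svc.cluster.local), so it has to run inside the cluster. A minimal sketch of one way to do that with a throwaway pod; the pod name is arbitrary and the single pip dependency follows from the imports above:

# Stream the script into a temporary pod on stdin; 'python -' reads it from there
kubectl -n ml run embed-bench --rm -i --restart=Never --image=python:3.11-slim -- \
  bash -c "pip install -q requests && python -" < benchmark_embeddings.py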
@@ -0,0 +1,324 @@
#!/usr/bin/env python3
"""
Generate embeddings for all hadiths and store in Qdrant
Updated with SSL verification disabled
"""
import requests
import psycopg2

import time
import urllib3
from typing import List, Dict, Tuple, Optional
import logging
from datetime import datetime

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
TEI_URL = "https://embeddings.betelgeusebytes.io"
QDRANT_URL = "https://vector.betelgeusebytes.io"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'hadith_ingest'  # UPDATE THIS
}

BATCH_SIZE = 32          # Process 32 hadiths at a time
MAX_TEXT_LENGTH = 1500   # Truncate individual texts to avoid oversized requests
COLLECTION_NAME = "hadith_embeddings"
VERIFY_SSL = False       # Ignore SSL certificate verification

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def get_single_embedding(text: str, hadith_id: str, max_retries: int = 3) -> Optional[List[float]]:
    """Get embedding for a single text with retries and length reduction"""
    current_text = text

    for attempt in range(max_retries):
        try:
            response = requests.post(
                f"{TEI_URL}/embed",
                json={"inputs": [current_text]},
                timeout=60,
                verify=VERIFY_SSL
            )
            response.raise_for_status()
            return response.json()[0]
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 413:
                # Text still too large, reduce by 50%
                new_length = len(current_text) // 2
                logger.warning(f"Hadith {hadith_id}: Text too large ({len(current_text)} chars), reducing to {new_length}")
                current_text = current_text[:new_length] + "..."
                if new_length < 100:  # Don't go below 100 chars
                    logger.error(f"Hadith {hadith_id}: Cannot reduce text further")
                    return None
            else:
                logger.error(f"Hadith {hadith_id}: HTTP error {e.response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Hadith {hadith_id}: Error getting embedding: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)
            else:
                return None

    return None


def get_embeddings_batch(texts: List[str], hadith_ids: List[str]) -> List[Optional[List[float]]]:
    """Get embeddings for a batch of texts, fall back to individual if needed"""
    try:
        response = requests.post(
            f"{TEI_URL}/embed",
            json={"inputs": texts},
            timeout=60,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 413 and len(texts) > 1:
            # Batch too large, try individually
            logger.warning(f"Batch too large, processing {len(texts)} texts individually...")
            embeddings = []
            for text, hid in zip(texts, hadith_ids):
                embedding = get_single_embedding(text, hid)
                embeddings.append(embedding)
                time.sleep(0.1)
            return embeddings
        else:
            logger.error(f"Error getting embeddings: {e}")
            return [None] * len(texts)
    except Exception as e:
        logger.error(f"Error getting embeddings: {e}")
        return [None] * len(texts)


def upsert_to_qdrant(points: List[Dict]) -> bool:
    """Upsert points to Qdrant"""
    if not points:  # Skip if no valid points
        return True

    try:
        response = requests.put(
            f"{QDRANT_URL}/collections/{COLLECTION_NAME}/points",
            json={"points": points},
            timeout=30,
            verify=VERIFY_SSL
        )
        response.raise_for_status()
        return True
    except Exception as e:
        logger.error(f"Error upserting to Qdrant: {e}")
        return False


def mark_embeddings_generated(conn, hadith_ids: List[str], failed_ids: List[str] = None) -> bool:
    """Mark hadiths as having embeddings generated"""
    try:
        cur = conn.cursor()

        # Mark successful ones
        if hadith_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (hadith_ids,))

        # Mark failed ones (so we can skip them in future runs)
        if failed_ids:
            cur.execute("""
                UPDATE hadiths
                SET embedding_generated = TRUE,
                    updated_at = CURRENT_TIMESTAMP
                WHERE id IN (
                    SELECT UNNEST(%s::text[])::uuid
                )
            """, (failed_ids,))
            logger.warning(f"Marked {len(failed_ids)} failed hadiths as processed to skip them")

        conn.commit()
        cur.close()
        return True
    except Exception as e:
        logger.error(f"Error updating database: {e}")
        conn.rollback()
        return False


def fetch_hadiths_batch(cur, offset: int, limit: int) -> List[Tuple]:
    """Fetch a batch of hadiths without embeddings"""
    cur.execute("""
        SELECT id, arabic_text, english_text, urdu_text,
               collection_id, hadith_number
        FROM hadiths
        WHERE embedding_generated = FALSE
        ORDER BY id
        LIMIT %s OFFSET %s
    """, (limit, offset))
    return cur.fetchall()


def create_combined_text(hadith: Tuple, max_length: int = MAX_TEXT_LENGTH) -> str:
    """Create combined text for embedding, truncate if needed"""
    hadith_id, arabic, english, urdu, coll_id, num = hadith

    parts = []
    if arabic:
        parts.append(arabic)
    if english:
        parts.append(english)

    combined = " ".join(parts) if parts else "No text available"

    if len(combined) > max_length:
        combined = combined[:max_length] + "..."

    return combined


def main():
    start_time = datetime.now()
    logger.info("=" * 80)
    logger.info("HADITH EMBEDDING GENERATION")
    logger.info(f"Started at: {start_time}")
    logger.info(f"Batch size: {BATCH_SIZE}")
    logger.info(f"Max text length: {MAX_TEXT_LENGTH}")
    logger.info(f"SSL Verification: {VERIFY_SSL}")
    logger.info("=" * 80)

    logger.info("Connecting to database...")
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    cur.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
    total_hadiths = cur.fetchone()[0]
    logger.info(f"Total hadiths to process: {total_hadiths:,}")

    if total_hadiths == 0:
        logger.info("No hadiths to process!")
        return

    estimated_time_mins = (total_hadiths / BATCH_SIZE) * 3 / 60  # assumes ~3s per batch
    logger.info(f"Estimated time: {estimated_time_mins:.1f} minutes")
    logger.info("=" * 80)

    offset = 0
    processed = 0
    failed = 0
    skipped = 0
    last_report_pct = 0

    while offset < total_hadiths:
        batch_start = time.time()

        hadiths = fetch_hadiths_batch(cur, offset, BATCH_SIZE)

        if not hadiths:
            break

        texts = [create_combined_text(h) for h in hadiths]
        hadith_ids = [str(h[0]) for h in hadiths]

        batch_marked = False  # True once this batch's rows are flagged in the DB
        try:
            embeddings = get_embeddings_batch(texts, hadith_ids)

            # Separate successful and failed embeddings
            successful_points = []
            successful_ids = []
            failed_ids = []

            for i, (hadith_id, embedding) in enumerate(zip(hadith_ids, embeddings)):
                if embedding is not None:
                    hadith = hadiths[i]
                    successful_points.append({
                        "id": hadith_id,
                        "vector": embedding,
                        "payload": {
                            "collection_id": str(hadith[4]),
                            "hadith_number": hadith[5],
                            "has_arabic": bool(hadith[1]),
                            "has_english": bool(hadith[2]),
                            "has_urdu": bool(hadith[3])
                        }
                    })
                    successful_ids.append(hadith_id)
                else:
                    failed_ids.append(hadith_id)
                    skipped += 1

            # Process successful ones
            if successful_points:
                if upsert_to_qdrant(successful_points):
                    if mark_embeddings_generated(conn, successful_ids, failed_ids):
                        processed += len(successful_ids)
                        failed += len(failed_ids)
                        batch_marked = True
                    else:
                        logger.error(f"Failed to update database for batch at offset {offset}")
                        failed += len(hadiths)
                else:
                    logger.error(f"Failed to upsert to Qdrant for batch at offset {offset}")
                    failed += len(hadiths)
            else:
                # All failed in this batch
                batch_marked = mark_embeddings_generated(conn, [], failed_ids)
                failed += len(failed_ids)

            progress_pct = ((processed + failed) / total_hadiths) * 100
            if progress_pct - last_report_pct >= 2:
                batch_time = time.time() - batch_start
                elapsed = (datetime.now() - start_time).total_seconds() / 60
                rate = (processed + failed) / elapsed if elapsed > 0 else 0
                remaining = (total_hadiths - processed - failed) / rate if rate > 0 else 0

                logger.info(
                    f"Progress: {processed:,}/{total_hadiths:,} ({progress_pct:.1f}%) | "
                    f"Failed: {failed} | Skipped: {skipped} | Rate: {rate:.0f}/min | "
                    f"ETA: {remaining:.1f}min | Batch: {batch_time:.2f}s"
                )
                last_report_pct = progress_pct

            time.sleep(0.2)

        except Exception as e:
            logger.error(f"Error processing batch at offset {offset}: {e}")
            failed += len(hadiths)

        # Rows marked embedding_generated = TRUE drop out of the filtered query,
        # so only advance OFFSET past batches that were left unmarked; advancing
        # unconditionally would silently skip unprocessed rows.
        if not batch_marked:
            offset += BATCH_SIZE

    cur.close()
    conn.close()

    end_time = datetime.now()
    duration = (end_time - start_time).total_seconds() / 60

    logger.info("=" * 80)
    logger.info("EMBEDDING GENERATION COMPLETE")
    logger.info(f"Started: {start_time}")
    logger.info(f"Finished: {end_time}")
    logger.info(f"Duration: {duration:.1f} minutes")
    logger.info(f"Total hadiths: {total_hadiths:,}")
    logger.info(f"Successfully processed: {processed:,}")
    logger.info(f"Failed/Skipped: {failed} ({skipped} individual embedding failures)")
    logger.info(f"Success rate: {100 * processed / total_hadiths:.2f}%")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
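If this script is driven by the Argo workflow instead of run by hand, the hardcoded password can come from the hadith-db-secret that the original manifest read via secretKeyRef. A minimal sketch of creating that secret; the secret name and the password key match the manifest, the literal value is a placeholder:

# Create the secret the workflow's secretKeyRef points at (value is a placeholder)
kubectl -n ml create secret generic hadith-db-secret \
  --from-literal=password='REPLACE_WITH_REAL_PASSWORD'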
@@ -0,0 +1,29 @@
#!/bin/bash

echo "=== EMBEDDING GENERATION MONITOR ==="
echo ""

while true; do
    clear
    echo "=== EMBEDDING GENERATION PROGRESS ==="
    date
    echo ""

    # Database stats
    echo "📊 Database Progress:"
    PGPASSWORD=hadith_ingest psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
        SELECT
            'Done: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
            ' | Remaining: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE) ||
            ' | Progress: ' || ROUND(100.0 * COUNT(*) FILTER (WHERE embedding_generated = TRUE) / COUNT(*), 2) || '%'
        FROM hadiths;" 2>/dev/null || echo "  (Database connection failed)"

    echo ""
    echo "🔢 Qdrant Collection:"
    QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings 2>/dev/null | jq -r '.result.points_count // "N/A"')
    echo "  Points: $QDRANT_COUNT"

    echo ""
    echo "⏰ Next update in 5 minutes... (Ctrl+C to stop)"
    sleep 300
done
@@ -0,0 +1,69 @@
#!/bin/bash
set -e

echo "=== EMBEDDING PIPELINE SETUP ==="

# 1. Check TEI service (with set -e a separate $? check is never reached,
#    so test the command directly)
echo -e "\n1. Checking TEI service..."
if ! kubectl -n ml get pods -l app=tei; then
    echo "❌ TEI pods not found!"
    exit 1
fi

# 2. Test TEI endpoint
echo -e "\n2. Testing TEI endpoint..."
TEI_TEST=$(curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
    -H "Content-Type: application/json" \
    -d '{"inputs": "test"}' | jq -r 'type')

if [ "$TEI_TEST" != "array" ]; then
    echo "❌ TEI not responding correctly!"
    echo "Response: $TEI_TEST"
    exit 1
fi
echo "✅ TEI is working"

# 3. Check Qdrant
echo -e "\n3. Checking Qdrant..."
QDRANT_STATUS=$(curl -s -k https://vector.betelgeusebytes.io/collections | jq -r '.status')

if [ "$QDRANT_STATUS" != "ok" ]; then
    echo "❌ Qdrant not responding!"
    exit 1
fi
echo "✅ Qdrant is working"

# 4. Check if collection exists (a missing collection has no .result object,
#    so look at .result.status rather than the top-level .status)
echo -e "\n4. Checking hadith_embeddings collection..."
COLLECTION_EXISTS=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.status // "missing"')

if [ "$COLLECTION_EXISTS" == "missing" ]; then
    echo "📝 Creating hadith_embeddings collection..."
    curl -k -X PUT https://vector.betelgeusebytes.io/collections/hadith_embeddings \
        -H "Content-Type: application/json" \
        -d '{
            "vectors": {
                "size": 1024,
                "distance": "Cosine"
            },
            "optimizers_config": {
                "indexing_threshold": 10000
            }
        }'
    echo -e "\n✅ Collection created"
else
    echo "✅ Collection exists"
    POINT_COUNT=$(curl -s -k https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
    echo "  Current points: $POINT_COUNT"
fi

# 5. Check database
echo -e "\n5. Checking database..."
HADITH_COUNT=$(psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "SELECT COUNT(*) FROM hadiths;")
echo "  Total hadiths: $HADITH_COUNT"

echo -e "\n=== SETUP COMPLETE ✅ ==="
echo -e "\nReady to generate embeddings!"
@@ -0,0 +1,93 @@
# Save as test_semantic_search.py
import requests
import psycopg2
from typing import List, Dict

# Configuration
TEI_URL = "http://tei.ml.svc.cluster.local"
QDRANT_URL = "http://qdrant.vector.svc.cluster.local:6333"
DB_CONFIG = {
    'host': 'pg.betelgeusebytes.io',
    'port': 5432,
    'dbname': 'hadith_db',
    'user': 'hadith_ingest',
    'password': 'your_password'  # Update this
}

def get_embedding(text: str) -> List[float]:
    """Get embedding from TEI service"""
    response = requests.post(
        f"{TEI_URL}/embed",
        json={"inputs": text}
    )
    return response.json()[0]

def search_similar_hadiths(query: str, limit: int = 5) -> List[Dict]:
    """Search for similar hadiths using semantic search"""
    # Get query embedding
    query_embedding = get_embedding(query)

    # Search in Qdrant
    response = requests.post(
        f"{QDRANT_URL}/collections/hadith_embeddings/points/search",
        json={
            "vector": query_embedding,
            "limit": limit,
            "with_payload": True
        }
    )

    results = response.json()['result']

    # Get full hadith details from database; Qdrant returns point ids as
    # strings, so cast them back to uuid for the lookup
    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()

    hadith_ids = [r['id'] for r in results]
    cur.execute("""
        SELECT h.id, h.hadith_number, c.name as collection,
               h.arabic_text, h.english_text, h.grade
        FROM hadiths h
        JOIN collections c ON h.collection_id = c.id
        WHERE h.id = ANY(%s::uuid[])
    """, (hadith_ids,))

    # Key by str(id) so the string ids from Qdrant match on lookup
    hadiths = {str(row[0]): dict(zip(['id', 'number', 'collection', 'arabic', 'english', 'grade'], row))
               for row in cur.fetchall()}

    cur.close()
    conn.close()

    # Combine results
    return [
        {
            **hadiths[r['id']],
            'similarity_score': r['score']
        }
        for r in results if r['id'] in hadiths
    ]

# Test queries
test_queries = [
    "prayer times and importance",
    "fasting in Ramadan",
    "charity and helping the poor",
    "truthfulness and honesty",
    "parents and their rights"
]

print("=== SEMANTIC SEARCH TEST ===\n")

for query in test_queries:
    print(f"\nQuery: '{query}'")
    print("-" * 80)

    results = search_similar_hadiths(query, limit=3)

    for i, hadith in enumerate(results, 1):
        print(f"\n{i}. [{hadith['collection']} #{hadith['number']}] (Score: {hadith['similarity_score']:.4f})")
        print(f"   Grade: {hadith['grade']}")
        print(f"   English: {hadith['english'][:200]}...")
        print(f"   Arabic: {hadith['arabic'][:100]}...")

print("\n=== TEST COMPLETE ===")
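The search test also uses the in-cluster DNS names. To run it from a workstation instead, the two Services can be port-forwarded; the Service names and namespaces are read off the URLs above, but the TEI Service port is an assumption (80, since TEI_URL carries no port):

kubectl -n ml port-forward svc/tei 8080:80 &
kubectl -n vector port-forward svc/qdrant 6333:6333 &
# then point the script at the tunnels:
#   TEI_URL = "http://localhost:8080"
#   QDRANT_URL = "http://localhost:6333"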
@@ -0,0 +1,27 @@
#!/bin/bash

echo "=== EMBEDDING VERIFICATION ==="
echo ""

# 1. Database check
echo "1. Checking database..."
psql -h pg.betelgeusebytes.io -U hadith_ingest -d hadith_db -t -c "
    SELECT
        'Total hadiths: ' || COUNT(*) ||
        ' | With embeddings: ' || COUNT(*) FILTER (WHERE embedding_generated = TRUE) ||
        ' | Missing: ' || COUNT(*) FILTER (WHERE embedding_generated = FALSE)
    FROM hadiths;"

# 2. Qdrant check
echo ""
echo "2. Checking Qdrant..."
QDRANT_COUNT=$(curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq -r '.result.points_count')
echo "Qdrant vectors: $QDRANT_COUNT"

# 3. Collection info
echo ""
echo "3. Collection details..."
curl -k -s https://vector.betelgeusebytes.io/collections/hadith_embeddings | jq '.result | {points_count, vectors_count, status, optimizer_status}'

echo ""
echo "=== VERIFICATION COMPLETE ==="
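Beyond the counts, a quick end-to-end smoke test can embed a query and search with it in one pipeline, using the same external hosts and -k flag as the scripts above; jq reshapes the TEI response (an array of vectors) into a Qdrant search body:

# Embed a query with TEI, then feed the vector straight into a Qdrant search
curl -k -s -X POST https://embeddings.betelgeusebytes.io/embed \
  -H "Content-Type: application/json" \
  -d '{"inputs": "fasting in Ramadan"}' \
| jq '{vector: .[0], limit: 3}' \
| curl -k -s -X POST https://vector.betelgeusebytes.io/collections/hadith_embeddings/points/search \
  -H "Content-Type: application/json" -d @- \
| jq '.result'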