update ingestion muslim
This commit is contained in:
parent
7fdcb1417d
commit
4a14036b01
Binary file not shown.
|
|
@ -0,0 +1,31 @@
|
|||
# Argo Workflow that runs the embedding generator script in a single container.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  generateName: generate-embeddings-
  namespace: ml
spec:
  entrypoint: generate
  serviceAccountName: argo-workflow

  # Workflow-level parameters; batch-size is forwarded to the script below.
  arguments:
    parameters:
      - name: batch-size
        value: "32"

  templates:
    - name: generate
      container:
        image: hadith-ingestion:latest
        command:
          - python
          - /app/src/embeddings/generator.py
        args:
          - "--batch-size={{workflow.parameters.batch-size}}"
        env:
          - name: DATABASE_HOST
            value: "pg.betelgeusebytes.io"
          # The password is read from the hadith-db-secret Secret, never inlined.
          - name: DATABASE_PASSWORD
            valueFrom:
              secretKeyRef:
                name: hadith-db-secret
                key: password
        resources:
          requests:
            cpu: 2
            memory: 4Gi
          limits:
            cpu: 4
            memory: 8Gi
|
||||
|
|
@ -59,8 +59,8 @@ spec:
|
|||
|
||||
container:
|
||||
image: hadith-ingestion:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: [python, /app/src/main.py]
|
||||
imagePullPolicy: Always
|
||||
command: [python, /app/src/main_hadithapi.py]
|
||||
args:
|
||||
- "{{inputs.parameters.collection}}"
|
||||
- "--limit={{inputs.parameters.limit}}"
|
||||
|
|
@ -122,8 +122,8 @@ spec:
|
|||
|
||||
container:
|
||||
image: hadith-embeddings:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
command: [python, /app/generate_embeddings.py]
|
||||
imagePullPolicy: Always
|
||||
command: [python, /app/src/embeddings/generator.py]
|
||||
args:
|
||||
- "--collection={{inputs.parameters.collection}}"
|
||||
- "--batch-size=32"
|
||||
|
|
@ -161,7 +161,7 @@ spec:
|
|||
|
||||
container:
|
||||
image: hadith-qdrant-indexer:latest
|
||||
imagePullPolicy: IfNotPresent
|
||||
imagePullPolicy: Always
|
||||
command: [python, /app/index_qdrant.py]
|
||||
args:
|
||||
- "--collection={{inputs.parameters.collection}}"
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do
|
||||
find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do
|
||||
echo "=== $file ===" >> combined.txt
|
||||
cat "$file" >> combined.txt
|
||||
echo "" >> combined.txt
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -40,3 +40,9 @@ pytest==7.4.3
|
|||
pytest-asyncio==0.21.1
|
||||
pytest-cov==4.1.0
|
||||
faker==21.0.0
|
||||
|
||||
|
||||
httpx==0.25.2
|
||||
qdrant-client==1.7.0
|
||||
tqdm==4.66.1
|
||||
asyncpg==0.29.0
|
||||
|
|
@ -8,14 +8,14 @@ echo "=== Starting Full HadithAPI Ingestion ==="
|
|||
# Book slug to collection abbreviation mapping
|
||||
# Books to ingest (in order)
|
||||
BOOKS=(
|
||||
"sahih-bukhari"
|
||||
# "sahih-bukhari"
|
||||
"sahih-muslim"
|
||||
"abu-dawood"
|
||||
"al-tirmidhi"
|
||||
"ibn-e-majah"
|
||||
"sunan-nasai"
|
||||
"musnad-ahmad"
|
||||
"al-silsila-sahiha"
|
||||
# "abu-dawood"
|
||||
# "al-tirmidhi"
|
||||
# "ibn-e-majah"
|
||||
# "sunan-nasai"
|
||||
# "musnad-ahmad"
|
||||
# "al-silsila-sahiha"
|
||||
)
|
||||
|
||||
for BOOK in "${BOOKS[@]}"; do
|
||||
|
|
@ -23,7 +23,7 @@ for BOOK in "${BOOKS[@]}"; do
|
|||
echo "Ingesting: $BOOK"
|
||||
echo "========================================="
|
||||
|
||||
argo submit -n argo argo/workflows/ingest-hadithapi.yaml \
|
||||
argo submit -n ml argo/workflows/ingest-hadithapi.yaml \
|
||||
--parameter book-slug=$BOOK \
|
||||
--parameter limit=0 \
|
||||
--wait \
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -264,7 +264,11 @@ class HadithAPIClient(BaseAPIClient):
|
|||
|
||||
# Process each chapter
|
||||
for chapter in chapters:
|
||||
chapter_id = chapter.get('id')
|
||||
# logger.warning("Processing chapter", chapter=chapter)
|
||||
if book_slug == 'sahih-muslim':
|
||||
chapter_id = chapter.get('chapterNumber')
|
||||
else:
|
||||
chapter_id = chapter.get('id')
|
||||
chapter_number = chapter.get('chapterNumber')
|
||||
|
||||
logger.info(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,148 @@
|
|||
# Update: src/embeddings/generator.py
|
||||
"""
|
||||
Embedding generation service for hadith texts
|
||||
"""
|
||||
import asyncio
|
||||
import httpx
|
||||
from typing import List, Tuple, Optional
|
||||
import psycopg2
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Distance, VectorParams, PointStruct
|
||||
import structlog
|
||||
from tqdm import tqdm
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from config.settings import settings
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class EmbeddingGenerator:
    """Generate embeddings for hadith rows and store them in Qdrant.

    Rows are read from the ``hadiths`` table, embedded through the
    ``/embed`` endpoint at ``tei_url``, upserted into a Qdrant collection
    and then flagged with ``embedding_generated = TRUE``.
    """

    def __init__(self, database_url: str, tei_url: str, qdrant_url: str, batch_size: int = 32):
        """Store connection settings and create the HTTP and Qdrant clients.

        Args:
            database_url: Connection string handed to ``psycopg2.connect``.
            tei_url: Base URL of the embedding service (``/embed`` is appended).
            qdrant_url: URL passed to ``QdrantClient``.
            batch_size: Number of rows embedded per request.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero step makes the batching range() in generate_all raise and a
        # negative one yields no batches at all, so reject both up front,
        # before any client is created.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.database_url = database_url
        self.tei_url = tei_url
        self.qdrant_url = qdrant_url
        self.batch_size = batch_size
        self.http_client = httpx.AsyncClient(timeout=60.0)
        self.qdrant = QdrantClient(url=qdrant_url)

    async def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using TEI.

        Posts ``{"inputs": texts}`` to ``{tei_url}/embed`` and returns the
        decoded JSON body. HTTP error statuses raise via ``raise_for_status``.
        """
        response = await self.http_client.post(
            f"{self.tei_url}/embed",
            json={"inputs": texts},
        )
        response.raise_for_status()
        return response.json()

    def create_collection(self, name: str = "hadith_embeddings"):
        """Create the Qdrant collection ``name`` unless a lookup for it succeeds.

        New collections use 1024-dimensional vectors with cosine distance.
        """
        try:
            self.qdrant.get_collection(name)
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # are no longer swallowed. Any other lookup failure is still
            # treated as "collection missing"; if the failure was something
            # else (e.g. connectivity), create_collection raises in turn.
            self.qdrant.create_collection(
                collection_name=name,
                vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
            )

    async def process_batch(self, conn, hadiths: List[Tuple], collection: str):
        """Process batch: generate embeddings, store them, mark rows as done.

        Args:
            conn: Open database connection used for the UPDATE and commit.
            hadiths: Rows shaped like the SELECT in ``generate_all``:
                ``(id, arabic_text, english_text, urdu_text, collection_id)``.
            collection: Qdrant collection that receives the points.

        Returns:
            Number of points upserted.

        Raises:
            ValueError: If the embedding service returns a different number
                of vectors than rows were sent.
        """
        # arabic + english; ``or ''`` keeps a NULL column from being embedded
        # as the literal string "None".
        texts = [f"{h[1] or ''} {h[2] or ''}" for h in hadiths]
        embeddings = await self.generate_embeddings_batch(texts)

        # zip() would silently drop rows on a short response while the UPDATE
        # below still marked every row as embedded.
        if len(embeddings) != len(hadiths):
            raise ValueError(
                f"expected {len(hadiths)} embeddings, got {len(embeddings)}"
            )

        points = [
            PointStruct(
                id=str(h[0]),
                vector=emb,
                payload={"hadith_id": str(h[0]), "collection_id": str(h[4])},
            )
            for h, emb in zip(hadiths, embeddings)
        ]

        self.qdrant.upsert(collection_name=collection, points=points)

        # Mark completed only after the upsert succeeded.
        # NOTE(review): ids are sent as strings; if hadiths.id is a uuid column
        # the ANY(%s) comparison may need a ::uuid[] cast — confirm.
        ids = [str(h[0]) for h in hadiths]
        with conn.cursor() as cursor:
            cursor.execute(
                "UPDATE hadiths SET embedding_generated = TRUE, embedding_version = 'v1' WHERE id = ANY(%s)",
                (ids,),
            )
        conn.commit()

        return len(points)

    async def generate_all(self, collection: str = "hadith_embeddings"):
        """Generate embeddings for all hadiths not yet flagged as embedded.

        Rows are fetched in pages of up to 1000 and processed in chunks of
        ``batch_size``. A failing batch is logged and rolled back; its rows
        stay pending and are picked up again by the next page query.

        Raises:
            RuntimeError: If an entire page is processed without a single
                successful batch. Without this stop the same failing rows
                would be re-selected forever.
        """
        self.create_collection(collection)
        conn = psycopg2.connect(self.database_url)
        try:
            with conn.cursor() as cursor:
                cursor.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE")
                total = cursor.fetchone()[0]

            if total == 0:
                print("All hadiths already have embeddings!")
                return

            print(f"Generating embeddings for {total} hadiths...")
            processed = 0

            with tqdm(total=total) as pbar:
                while True:
                    with conn.cursor() as cursor:
                        cursor.execute("""
                            SELECT id, arabic_text, english_text, urdu_text, collection_id
                            FROM hadiths
                            WHERE embedding_generated = FALSE
                            LIMIT 1000
                        """)
                        hadiths = cursor.fetchall()

                    if not hadiths:
                        break

                    made_progress = False
                    for start in range(0, len(hadiths), self.batch_size):
                        batch = hadiths[start:start + self.batch_size]
                        try:
                            count = await self.process_batch(conn, batch, collection)
                        except Exception as e:
                            logger.error("batch_failed", error=str(e))
                            # Clear any aborted transaction so the next
                            # statement on this connection can run.
                            conn.rollback()
                            continue
                        processed += count
                        pbar.update(count)
                        made_progress = True

                    if not made_progress:
                        raise RuntimeError(
                            f"no batch succeeded in the last pass; at least "
                            f"{len(hadiths)} hadiths still lack embeddings"
                        )
        finally:
            # Runs on the early return and on errors too, so the connection
            # is never leaked.
            conn.close()

        print(f"\nCompleted! Generated {processed} embeddings.")

    async def close(self):
        """Close the underlying HTTP client."""
        await self.http_client.aclose()
|
||||
|
||||
|
||||
async def main():
    """Parse CLI arguments, run embedding generation, and return an exit code.

    Returns:
        0 when ``generate_all`` finishes, 1 when it raises. An invalid
        ``--batch-size`` exits through ``parser.error`` before any work starts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch-size", type=int, default=32)
    args = parser.parse_args()

    # A zero or negative batch size would only fail later inside the batching
    # loop (or process nothing at all); reject it here with a clear message.
    if args.batch_size < 1:
        parser.error("--batch-size must be a positive integer")

    gen = EmbeddingGenerator(
        database_url=settings.DATABASE_URL,
        tei_url="http://tei.ml.svc.cluster.local",
        qdrant_url="http://qdrant.vector.svc.cluster.local:6333",
        batch_size=args.batch_size,
    )

    try:
        await gen.generate_all()
        return 0
    except Exception as e:
        # Top-level boundary: log the failure and turn it into an exit code.
        logger.error("generation_failed", error=str(e))
        return 1
    finally:
        await gen.close()
|
||||
|
||||
|
||||
# Script entry point: run the async main() and use its result as the process
# exit status.
if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
|
||||
|
|
@ -2,10 +2,16 @@
|
|||
Main ingestion script for fetching hadiths from HadithAPI.com
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
from typing import Optional, Dict, Any
|
||||
from uuid import UUID
|
||||
import structlog
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
||||
if str(PROJECT_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
|
||||
from config.settings import settings
|
||||
from src.api_clients.hadithapi_client import HadithAPIClient
|
||||
from src.database.repository import HadithRepository
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ echo "Building Docker image..."
|
|||
# 3. Submit test workflow (10 hadiths)
|
||||
echo "Submitting test workflow..."
|
||||
argo submit -n ml argo/workflows/ingest-hadithapi.yaml \
|
||||
--parameter book-slug=sahih-bukhari \
|
||||
--parameter book-slug=sahih-muslim \
|
||||
--parameter limit=10 \
|
||||
--wait \
|
||||
--log
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ python src/main_hadithapi.py --list-books
|
|||
|
||||
# 4. Test ingestion (limited to 10 hadiths)
|
||||
echo -e "\nRunning test ingestion (10 hadiths from Sahih Bukhari)..."
|
||||
python src/main_hadithapi.py --book-slug sahih-bukhari --limit 10
|
||||
python src/main_hadithapi.py --book-slug sahih-muslim --limit 10
|
||||
|
||||
# 5. Verify data
|
||||
echo -e "\nVerifying ingested data..."
|
||||
|
|
|
|||
Loading…
Reference in New Issue