diff --git a/.DS_Store b/.DS_Store index e3d3557..266b761 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/hadith-ingestion/.DS_Store b/hadith-ingestion/.DS_Store index c499c23..e2dd572 100644 Binary files a/hadith-ingestion/.DS_Store and b/hadith-ingestion/.DS_Store differ diff --git a/hadith-ingestion/argo/workflows/generate-embeddings.yaml b/hadith-ingestion/argo/workflows/generate-embeddings.yaml new file mode 100644 index 0000000..a5a4836 --- /dev/null +++ b/hadith-ingestion/argo/workflows/generate-embeddings.yaml @@ -0,0 +1,31 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: generate-embeddings- + namespace: ml +spec: + entrypoint: generate + serviceAccountName: argo-workflow + + arguments: + parameters: + - name: batch-size + value: "32" + + templates: + - name: generate + container: + image: hadith-ingestion:latest + command: [python, /app/src/embeddings/generator.py] + args: ["--batch-size={{workflow.parameters.batch-size}}"] + env: + - name: DATABASE_HOST + value: "pg.betelgeusebytes.io" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + resources: + requests: {cpu: 2, memory: 4Gi} + limits: {cpu: 4, memory: 8Gi} \ No newline at end of file diff --git a/hadith-ingestion/argo/workflows/ingest-collection.yaml b/hadith-ingestion/argo/workflows/ingest-collection.yaml index 90927f9..088b20a 100644 --- a/hadith-ingestion/argo/workflows/ingest-collection.yaml +++ b/hadith-ingestion/argo/workflows/ingest-collection.yaml @@ -59,8 +59,8 @@ spec: container: image: hadith-ingestion:latest - imagePullPolicy: IfNotPresent - command: [python, /app/src/main.py] + imagePullPolicy: Always + command: [python, /app/src/main_hadithapi.py] args: - "{{inputs.parameters.collection}}" - "--limit={{inputs.parameters.limit}}" @@ -122,8 +122,8 @@ spec: container: image: hadith-embeddings:latest - imagePullPolicy: IfNotPresent - command: [python, /app/generate_embeddings.py] + imagePullPolicy: Always + command: [python, /app/src/embeddings/generator.py] args: - "--collection={{inputs.parameters.collection}}" - "--batch-size=32" @@ -161,7 +161,7 @@ spec: container: image: hadith-qdrant-indexer:latest - imagePullPolicy: IfNotPresent + imagePullPolicy: Always command: [python, /app/index_qdrant.py] args: - "--collection={{inputs.parameters.collection}}" diff --git a/hadith-ingestion/combine.sh b/hadith-ingestion/combine.sh index fd9fa32..a414562 100755 --- a/hadith-ingestion/combine.sh +++ b/hadith-ingestion/combine.sh @@ -1,4 +1,4 @@ -find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do +find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do echo "=== $file ===" >> combined.txt cat "$file" >> combined.txt echo "" >> combined.txt diff --git a/hadith-ingestion/combined.txt b/hadith-ingestion/combined.txt index f9b621c..7018741 100644 --- a/hadith-ingestion/combined.txt +++ b/hadith-ingestion/combined.txt @@ -6,6 +6,80 @@ set -e echo "=== Starting Full HadithAPI Ingestion ===" +# Book slug to collection abbreviation mapping +# Books to ingest (in order) +BOOKS=( + # "sahih-bukhari" + "sahih-muslim" + "abu-dawood" + "al-tirmidhi" + "ibn-e-majah" + "sunan-nasai" + "musnad-ahmad" + "al-silsila-sahiha" +) + +for BOOK in "${BOOKS[@]}"; do + echo -e "\n=========================================" + echo "Ingesting: $BOOK" + echo "=========================================" + + argo submit -n ml argo/workflows/ingest-hadithapi.yaml \ + --parameter book-slug=$BOOK \ + --parameter limit=0 \ + --wait \ + --log + + echo "$BOOK completed!" + + # Optional: add delay between books + sleep 10 +done + +echo -e "\n=== All Books Ingestion Complete ===" + +# Print summary +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT + c.name_english, + c.abbreviation, + COUNT(h.id) as hadith_count, + COUNT(DISTINCT b.id) as chapter_count +FROM collections c +LEFT JOIN hadiths h ON c.id = h.collection_id +LEFT JOIN books b ON h.book_id = b.id +GROUP BY c.name_english, c.abbreviation +ORDER BY hadith_count DESC; +" +=== ./create-secrets.sh === +#!/bin/bash +# create-secrets.sh + +# Database secret +kubectl -n ml create secret generic hadith-db-secret \ + --from-literal=password='hadith_ingest' \ + --dry-run=client -o yaml | kubectl apply -f - + +# HadithAPI secret (already public, but for consistency) +kubectl -n ml create secret generic hadithapi-secret \ + --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK' \ + --dry-run=client -o yaml | kubectl apply -f - + +# MinIO secret +kubectl -n ml create secret generic minio-secret \ + --from-literal=access-key='minioadmin' \ + --from-literal=secret-key='minioadmin' \ + --dry-run=client -o yaml | kubectl apply -f - + +echo "Secrets created successfully" +=== ./full-ingestion.sh === +#!/bin/bash +# run-full-ingestion.sh + +set -e + +echo "=== Starting Full HadithAPI Ingestion ===" + # Books to ingest (in order) BOOKS=( "sahih-bukhari" @@ -48,27 +122,6 @@ LEFT JOIN books b ON h.book_id = b.id GROUP BY c.name_english, c.abbreviation ORDER BY hadith_count DESC; " -=== ./create-secrets.sh === -#!/bin/bash -# create-secrets.sh - -# Database secret -kubectl -n ml create secret generic hadith-db-secret \ - --from-literal=password='hadith_ingest' \ - --dry-run=client -o yaml | kubectl apply -f - - -# HadithAPI secret (already public, but for consistency) -kubectl -n ml create secret generic hadithapi-secret \ - --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK' \ - --dry-run=client -o yaml | kubectl apply -f - - -# MinIO secret -kubectl -n ml create secret generic minio-secret \ - --from-literal=access-key='minioadmin' \ - --from-literal=secret-key='minioadmin' \ - --dry-run=client -o yaml | kubectl apply -f - - -echo "Secrets created successfully" === ./requirements.txt === # Core dependencies python-dotenv==1.0.0 @@ -127,7 +180,8 @@ class Settings(BaseSettings): """Application settings loaded from environment variables""" # Database - DATABASE_HOST: str = "postgres.db.svc.cluster.local" + # DATABASE_HOST: str = "postgres.db.svc.cluster.local" + DATABASE_HOST: str = "pg.betelgeusebytes.io" DATABASE_PORT: int = 5432 DATABASE_NAME: str = "hadith_db" DATABASE_USER: str = "hadith_ingest" @@ -146,7 +200,7 @@ class Settings(BaseSettings): f"postgresql+asyncpg://{self.DATABASE_USER}:{self.DATABASE_PASSWORD}" f"@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_NAME}" ) - + # MinIO / S3 MINIO_ENDPOINT: str = "minio.storage.svc.cluster.local:9000" MINIO_ACCESS_KEY: str = "minioadmin" @@ -154,9 +208,6 @@ class Settings(BaseSettings): MINIO_BUCKET_RAW: str = "hadith-raw-data" MINIO_BUCKET_PROCESSED: str = "hadith-processed" MINIO_SECURE: bool = False - - # APIs - SUNNAH_API_KEY: Optional[str] = None SUNNAH_BASE_URL: str = "https://api.sunnah.com/v1" HADITH_ONE_API_KEY: Optional[str] = "$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK" # HadithAPI.com @@ -275,282 +326,116 @@ GROUP BY c.name_english; " echo -e "\n=== Test Complete ===" -=== ./README.md === -# šŸš€ HadithAPI.com Deployment - Quick Start +=== ./test_mainhadithapi.py === +#!/usr/bin/env python3 +""" +Test script for main_hadithapi.py +""" +import sys +import os +sys.path.insert(0, '.') -## What You Got +from src.main_hadithapi import HadithAPIIngestionService -Three comprehensive guides: -1. **PHASE_2_IMPLEMENTATION_GUIDE.md** - Original guide with PostgreSQL schema -2. **HADITHAPI_INTEGRATION_GUIDE.md** - Complete HadithAPI.com implementation -3. **This summary** - Quick deployment steps +def test_main_hadithapi(): + """Test the main HadithAPI ingestion service""" + print("=== Testing HadithAPI Ingestion Service ===\n") + + try: + # Initialize the service + print("1. Initializing HadithAPIIngestionService...") + service = HadithAPIIngestionService() + print("āœ“ Service initialized successfully\n") + + # Test 1: List available books + print("2. Testing book synchronization...") + book_mapping = service.sync_books_from_api() + print(f"āœ“ Found {len(book_mapping)} mapped books") + for book_slug, info in list(book_mapping.items())[:3]: # Show first 3 + print(f" - {book_slug}: {info['book_name']} ({info['hadiths_count']} hadiths)") + print() + + # Test 2: Test ingestion with limit + print("3. Testing limited ingestion (10 hadiths from Sahih Bukhari)...") + stats = service.ingest_collection( + book_slug='sahih-bukhari', + limit=10 + ) + print(f"āœ“ Ingestion completed with stats:") + print(f" Processed: {stats['processed']}") + print(f" Failed: {stats['failed']}") + print(f" Skipped: {stats['skipped']}\n") + + # Test 3: List books functionality + print("4. Testing book listing...") + print("\n=== Available Books ===\n") + for book_slug, info in book_mapping.items(): + print(f"Book Slug: {book_slug}") + print(f" Name: {info['book_name']}") + print(f" Hadiths: {info['hadiths_count']}") + print(f" Chapters: {info['chapters_count']}") + print() + + # Clean up + service.close() + print("=== All Tests Passed! ===") + return True + + except Exception as e: + print(f"āœ— Test failed with error: {e}") + import traceback + traceback.print_exc() + return False -## šŸ“¦ Complete Package Structure +def test_command_line_args(): + """Test command line argument parsing""" + print("=== Testing Command Line Arguments ===\n") + + # We'll simulate command line arguments + import argparse + from src.main_hadithapi import main + + # Test --list-books argument + print("1. Testing --list-books argument...") + original_argv = sys.argv.copy() + + try: + sys.argv = ['main_hadithapi.py', '--list-books'] + # We won't actually run main() as it would exit, but we can check the parsing + parser = argparse.ArgumentParser(description="Ingest hadiths from HadithAPI.com") + parser.add_argument("--book-slug", help="Book slug (e.g., sahih-bukhari)") + parser.add_argument("--limit", type=int, help="Limit number of hadiths to ingest") + parser.add_argument("--list-books", action="store_true", help="List available books and exit") + + args = parser.parse_args(['--list-books']) + print(f"āœ“ Argument parsing successful: list_books={args.list_books}") + + # Test book-slug argument + args = parser.parse_args(['--book-slug', 'sahih-bukhari', '--limit', '5']) + print(f"āœ“ Argument parsing successful: book_slug={args.book_slug}, limit={args.limit}") + + print("āœ“ Command line argument parsing works correctly\n") + return True + + except Exception as e: + print(f"āœ— Argument parsing failed: {e}") + return False + finally: + sys.argv = original_argv -The HadithAPI guide includes everything you need: - -### Production-Ready Code -āœ… **hadithapi_client.py** - Full API client with pagination and rate limiting -āœ… **main_hadithapi.py** - Complete ingestion service -āœ… **settings.py** - Configuration with your API key -āœ… **Dockerfile** - Container image -āœ… **Argo Workflows** - Kubernetes automation -āœ… **Test scripts** - Validation and troubleshooting - -### Key Features -- āœ… Automatic pagination handling -- āœ… Rate limiting (30 req/min) -- āœ… Error handling and retries -- āœ… Progress tracking -- āœ… Structured logging -- āœ… Multi-language support (Arabic, English, Urdu) - -## šŸŽÆ 5-Minute Quick Start - -### 1. Database Setup (2 min) -```bash -# Use schema from PHASE_2_IMPLEMENTATION_GUIDE.md Section 1 -kubectl -n db exec -it postgres-0 -- psql -U app -d gitea - -# Copy all SQL from Section 1.2 through 1.6 -# This creates hadith_db with complete schema -``` - -### 2. Create Project Structure (1 min) -```bash -mkdir -p hadith-ingestion/{config,src/{api_clients,processors,database,utils},argo/workflows} -cd hadith-ingestion/ - -# Copy code from HADITHAPI_INTEGRATION_GUIDE.md: -# - Section 2.1 → src/api_clients/hadithapi_client.py -# - Section 4.1 → src/main_hadithapi.py -# - Section 5.1 → config/settings.py -# - Section 6.1 → Dockerfile -# - Section 6.4 → argo/workflows/ingest-hadithapi.yaml - -# Also copy from PHASE_2_IMPLEMENTATION_GUIDE.md: -# - Section 3.4 → src/api_clients/base_client.py -# - Section 3.6 → src/processors/text_cleaner.py -# - Section 3.7 → src/database/repository.py -``` - -### 3. Build & Deploy (2 min) -```bash -# Build image -docker build -t hadith-ingestion:latest . - -# Create secrets -kubectl -n argo create secret generic hadith-db-secret \ - --from-literal=password='YOUR_PASSWORD' - -kubectl -n argo create secret generic hadithapi-secret \ - --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK' - -# Test with 10 hadiths -argo submit -n argo argo/workflows/ingest-hadithapi.yaml \ - --parameter book-slug=sahih-bukhari \ - --parameter limit=10 \ - --watch -``` - -## šŸ“Š Expected Results - -### Available Collections -| Book | Hadiths | Time | -|------|---------|------| -| Sahih Bukhari | ~7,500 | 2-3h | -| Sahih Muslim | ~7,000 | 2-3h | -| Sunan Abu Dawood | ~5,000 | 1-2h | -| Jami` at-Tirmidhi | ~4,000 | 1-2h | -| Sunan an-Nasa'i | ~5,700 | 2h | -| Sunan Ibn Majah | ~4,300 | 1-2h | -| **TOTAL** | **~33,500** | **10-15h** | - -## šŸ”§ Key Differences from Sunnah.com - -| Feature | HadithAPI.com | Sunnah.com | -|---------|---------------|------------| -| **API Key** | āœ… Public (provided) | āŒ Requires PR | -| **Rate Limit** | Unknown (using 30/min) | 100/min | -| **Coverage** | 6 major books | 10+ books | -| **Languages** | Arabic, English, Urdu | Arabic, English | -| **Cost** | āœ… Free | Free | -| **Stability** | Good | Excellent | - -## šŸ“ Complete File Checklist - -Create these files from the guides: - -``` -hadith-ingestion/ -ā”œā”€ā”€ Dockerfile āœ“ Section 6.1 -ā”œā”€ā”€ requirements.txt āœ“ Phase 2 Section 3.2 -ā”œā”€ā”€ .env āœ“ Section 5.2 -ā”œā”€ā”€ build-hadithapi-ingestion.sh āœ“ Section 6.2 -ā”œā”€ā”€ create-secrets.sh āœ“ Section 6.3 -ā”œā”€ā”€ test-hadithapi-local.sh āœ“ Section 7.1 -ā”œā”€ā”€ test-hadithapi-k8s.sh āœ“ Section 7.2 -ā”œā”€ā”€ run-full-ingestion.sh āœ“ Section 7.3 -ā”œā”€ā”€ config/ -│ ā”œā”€ā”€ __init__.py (empty file) -│ └── settings.py āœ“ Section 5.1 -ā”œā”€ā”€ src/ -│ ā”œā”€ā”€ __init__.py (empty file) -│ ā”œā”€ā”€ main_hadithapi.py āœ“ Section 4.1 -│ ā”œā”€ā”€ api_clients/ -│ │ ā”œā”€ā”€ __init__.py (empty file) -│ │ ā”œā”€ā”€ base_client.py āœ“ Phase 2 Sec 3.4 -│ │ └── hadithapi_client.py āœ“ Section 2.1 -│ ā”œā”€ā”€ processors/ -│ │ ā”œā”€ā”€ __init__.py (empty file) -│ │ └── text_cleaner.py āœ“ Phase 2 Sec 3.6 -│ ā”œā”€ā”€ database/ -│ │ ā”œā”€ā”€ __init__.py (empty file) -│ │ ā”œā”€ā”€ connection.py (optional) -│ │ └── repository.py āœ“ Phase 2 Sec 3.7 -│ └── utils/ -│ ā”œā”€ā”€ __init__.py (empty file) -│ └── logger.py (optional) -└── argo/ - └── workflows/ - └── ingest-hadithapi.yaml āœ“ Section 6.4 -``` - -## šŸŽ¬ Step-by-Step Execution - -### Day 1: Setup & Test (2-3 hours) -```bash -# 1. Create database schema -# 2. Set up project structure -# 3. Build Docker image -# 4. Create secrets -# 5. Run test with 10 hadiths -# 6. Verify data -``` - -### Day 2: Ingest Major Collections (10-15 hours) -```bash -# Ingest all 6 major collections sequentially -./run-full-ingestion.sh - -# Or manually one by one: -argo submit ... --parameter book-slug=sahih-bukhari -argo submit ... --parameter book-slug=sahih-muslim -# etc... -``` - -### Day 3: Validation & Next Steps -```bash -# 1. Verify data quality -# 2. Check statistics -# 3. Proceed to Phase 3 (ML model development) -``` - -## āœ… Verification Checklist - -After ingestion completes: - -```bash -# 1. Check total hadiths -kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " -SELECT COUNT(*) FROM hadiths; -" -# Expected: ~33,500 - -# 2. Check per collection -kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " -SELECT - c.name_english, - COUNT(h.id) as count -FROM collections c -LEFT JOIN hadiths h ON c.id = h.collection_id -WHERE c.abbreviation IN ('bukhari', 'muslim', 'abudawud', 'tirmidhi', 'nasai', 'ibnmajah') -GROUP BY c.name_english; -" - -# 3. Check for errors -kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " -SELECT * FROM ingestion_jobs -WHERE status = 'failed' -ORDER BY created_at DESC; -" -``` - -## šŸ› Common Issues & Solutions - -### Issue: Rate Limiting -``` -Error: 429 Too Many Requests -Solution: Already set to conservative 30/min -If still hitting limits, edit settings.py: - API_RATE_LIMIT = 20 -``` - -### Issue: Connection Timeout -``` -Error: Connection timeout to database -Solution: -1. Check PostgreSQL is running -2. Verify credentials in secrets -3. Test connection manually -``` - -### Issue: Missing Chapters -``` -Warning: chapters_fetch_failed -Solution: Script automatically falls back to fetching all hadiths -This is expected and not critical -``` - -## šŸ“š Documentation References - -All details in the comprehensive guides: - -1. **PHASE_2_IMPLEMENTATION_GUIDE.md** - - PostgreSQL schema (Section 1) - - Base utilities (Section 3) - - Database repository (Section 3.7) - -2. **HADITHAPI_INTEGRATION_GUIDE.md** - - API client (Section 2) - - Main ingestion service (Section 4) - - Deployment (Section 6) - - Testing (Section 7) - -## šŸŽÆ Next Phase - -After Phase 2 completion: -→ **Phase 3: ML Model Development** - - Annotate sample hadiths (Label Studio) - - Train NER model - - Train relation extraction model - - Fine-tune LLM with LoRA - -## šŸ’” Pro Tips - -1. **Start Small**: Test with `--limit 10` first -2. **Monitor Progress**: Use `argo logs -n argo -f` -3. **Check Logs**: Structured JSON logs for easy debugging -4. **Backup Data**: Before major operations -5. **Rate Limiting**: Be conservative to avoid blocks - -## šŸŽ‰ Success Criteria - -Phase 2 is complete when: -- āœ… Database schema created -- āœ… 33,500+ hadiths ingested -- āœ… All 6 collections present -- āœ… No critical errors -- āœ… Data validated -- āœ… Ready for embedding generation - ---- - -**Estimated Total Time: 1-2 days** -**Difficulty: Intermediate** -**Prerequisites: Phase 1 completed (all core services running)** - -Ready to start? Begin with Section 1 of PHASE_2_IMPLEMENTATION_GUIDE.md! +if __name__ == "__main__": + print("Starting tests for main_hadithapi.py...\n") + + # Test command line arguments + if not test_command_line_args(): + sys.exit(1) + + # Test main functionality + if not test_main_hadithapi(): + sys.exit(1) + + print("\nšŸŽ‰ All tests passed successfully!") + sys.exit(0) === ./setup.py === === ./build-and-push.sh === @@ -580,7 +465,8 @@ docker push ${REGISTRY}/${IMAGE_NAME}:${TAG} echo "Done!" === ./.env === # Database -DATABASE_HOST=postgres.db.svc.cluster.local +# DATABASE_HOST=postgres.db.svc.cluster.local +DATABASE_HOST = pg.betelgeusebytes.io DATABASE_PORT=5432 DATABASE_NAME=hadith_db DATABASE_USER=hadith_ingest @@ -624,12 +510,101 @@ docker tag ${IMAGE_NAME}:${TAG} ${IMAGE_NAME}:latest echo "Build complete: ${IMAGE_NAME}:${TAG}" === ./combine.sh === -find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do +find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" ! -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do echo "=== $file ===" >> combined.txt cat "$file" >> combined.txt echo "" >> combined.txt done +=== ./test_hadithapi.py === +#!/usr/bin/env python3 +""" +Quick test script for hadithapi_client.py +""" +import sys +from venv import logger +sys.path.insert(0, '/app') + +from src.api_clients.hadithapi_client import HadithAPIClient +from config.settings import settings + +def test_api_connection(): + """Test basic API connectivity""" + print("=== Testing HadithAPI Client ===\n") + + client = HadithAPIClient() + + # Test 1: Get books + print("Test 1: Fetching available books...") + try: + books = client.get_books() + print(f"āœ“ Success! Found {len(books)} books") + for book in books[:3]: # Show first 3 + print(f" - {book.get('bookName')} ({book.get('bookSlug')})") + print(f" Hadiths: {book.get('hadiths_count')}, Chapters: {book.get('chapters_count')}") + logger.info(f"Fetched {len(books)} books successfully") + + except Exception as e: + print(f"āœ— Failed: {e}") + return False + + # Test 2: Get chapters for Sahih Bukhari + print("\nTest 2: Fetching chapters for Sahih Bukhari...") + try: + chapters = client.get_chapters('sahih-bukhari') + print(f"āœ“ Success! Found {len(chapters)} chapters") + if chapters: + print(f" First chapter: {chapters[0].get('chapterEnglish')}") + except Exception as e: + print(f"āœ— Failed: {e}") + return False + + # Test 3: Fetch first page of hadiths + print("\nTest 3: Fetching first page of hadiths...") + book_id = None + try: + book = client.get_book_by_slug('sahih-bukhari') + if not book: + print("āœ— Failed: Book 'sahih-bukhari' not found") + return False + book_id = book.get('id') + page_data = client.get_hadiths_page('sahih-bukhari', page=1, limit=5) + hadiths = page_data.get('hadiths', []) + print(f"āœ“ Success! Fetched {len(hadiths)} hadiths") + if hadiths: + first = hadiths[0] + print(f" First hadith number: {first.get('hadithNumber')}") + print(f" Arabic text (first 100 chars): {first.get('hadithArabic', '')[:100]}...") + except Exception as e: + print(f"āœ— Failed: {e}") + return False + + if book_id is None: + print("āœ— Failed: Book ID unavailable for iterator test") + return False + + # # Test 4: Test iterator (fetch 3 hadiths) + print("\nTest 4: Testing hadith iterator (3 hadiths)...") + try: + count = 0 + + for hadith in client.iter_all_hadiths_in_book(book_id='sahih-bukhari', book_slug='sahih-bukhari', batch_size=10): + count += 1 + print(f" Hadith #{hadith.get('hadithNumber')} is {hadith.get('englishNarrator')} and is {hadith.get('status')} ") + if count >= 3: + break + print(f"āœ“ Success! Iterator working correctly") + except Exception as e: + print(f"āœ— Failed: {e}") + return False + + client.close() + print("\n=== All Tests Passed! ===") + return True + +if __name__ == "__main__": + success = test_api_connection() + sys.exit(0 if success else 1) === ./test-hadithapi-local.sh === #!/bin/bash # test-hadithapi-local.sh @@ -779,8 +754,8 @@ spec: container: image: hadith-ingestion:latest - imagePullPolicy: IfNotPresent - command: [python, /app/src/main.py] + imagePullPolicy: Always + command: [python, /app/src/main_hadithapi.py] args: - "{{inputs.parameters.collection}}" - "--limit={{inputs.parameters.limit}}" @@ -842,8 +817,8 @@ spec: container: image: hadith-embeddings:latest - imagePullPolicy: IfNotPresent - command: [python, /app/generate_embeddings.py] + imagePullPolicy: Always + command: [python, /app/src/embeddings/generator.py] args: - "--collection={{inputs.parameters.collection}}" - "--batch-size=32" @@ -881,7 +856,7 @@ spec: container: image: hadith-qdrant-indexer:latest - imagePullPolicy: IfNotPresent + imagePullPolicy: Always command: [python, /app/index_qdrant.py] args: - "--collection={{inputs.parameters.collection}}" @@ -956,7 +931,7 @@ spec: container: image: axxs/hadith-ingestion:latest - imagePullPolicy: IfNotPresent + imagePullPolicy: Always command: [python, /app/src/main_hadithapi.py] args: - "--book-slug={{inputs.parameters.book-slug}}" @@ -1054,28 +1029,43 @@ spec: arguments: parameters: - name: book-slug - value: "sunan-abu-dawood" + value: "abu-dawood" - - name: jami-at-tirmidhi template: ingest-book arguments: parameters: - name: book-slug - value: "jami-at-tirmidhi" + value: "al-tirmidhi" - - name: sunan-an-nasai template: ingest-book arguments: parameters: - name: book-slug - value: "sunan-an-nasai" + value: "sunan-nasai" - - name: sunan-ibn-e-majah template: ingest-book arguments: parameters: - name: book-slug - value: "sunan-ibn-e-majah" + value: "ibn-e-majah" + + - - name: musnad-ahmad + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "musnad-ahmad" + + + - - name: al-silsila-sahiha + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "al-silsila-sahiha" # ======================================== # Book ingestion template @@ -1087,7 +1077,7 @@ spec: container: image: axxs/hadith-ingestion:latest - imagePullPolicy: IfNotPresent + imagePullPolicy: Always command: [python, /app/src/main_hadithapi.py] args: - "--book-slug={{inputs.parameters.book-slug}}" @@ -1125,6 +1115,7 @@ spec: Database repository for hadith data operations """ from typing import List, Dict, Any, Optional +import json from uuid import UUID import structlog from sqlalchemy import create_engine, text, select, insert, update @@ -1137,31 +1128,37 @@ logger = structlog.get_logger() class HadithRepository: """Repository for hadith database operations""" - + def __init__(self, database_url: Optional[str] = None): self.database_url = database_url or settings.DATABASE_URL self.engine = create_engine(self.database_url, pool_pre_ping=True) self.SessionLocal = sessionmaker(bind=self.engine) - + + @staticmethod + def _coerce_uuid(value: Any) -> UUID: + if isinstance(value, UUID): + return value + return UUID(str(value)) + def get_session(self) -> Session: """Get database session""" return self.SessionLocal() - + # ===== Collections ===== - + def get_collection_by_abbreviation(self, abbr: str) -> Optional[Dict[str, Any]]: """Get collection by abbreviation""" with self.get_session() as session: query = text(""" - SELECT * FROM collections + SELECT * FROM collections WHERE abbreviation = :abbr """) result = session.execute(query, {"abbr": abbr}).fetchone() - + if result: return dict(result._mapping) return None - + def get_all_collections(self) -> List[Dict[str, Any]]: """Get all collections""" with self.get_session() as session: @@ -1191,34 +1188,35 @@ class HadithRepository: metadata: Optional[Dict] = None ) -> UUID: """Insert or update a book""" + metadata_json = json.dumps(metadata or {}) with self.get_session() as session: query = text(""" INSERT INTO books (collection_id, book_number, name_english, name_arabic, metadata) VALUES (:collection_id, :book_number, :name_english, :name_arabic, :metadata) - ON CONFLICT (collection_id, book_number) - DO UPDATE SET + ON CONFLICT (collection_id, book_number) + DO UPDATE SET name_english = EXCLUDED.name_english, name_arabic = EXCLUDED.name_arabic, metadata = EXCLUDED.metadata RETURNING id """) - + result = session.execute(query, { "collection_id": str(collection_id), "book_number": book_number, "name_english": name_english, "name_arabic": name_arabic, - "metadata": metadata or {} + "metadata": metadata_json }) session.commit() - - return UUID(result.fetchone()[0]) - + + return self._coerce_uuid(result.fetchone()[0]) + def get_book(self, collection_id: UUID, book_number: int) -> Optional[Dict[str, Any]]: """Get book by collection and book number""" with self.get_session() as session: query = text(""" - SELECT * FROM books + SELECT * FROM books WHERE collection_id = :collection_id AND book_number = :book_number """) result = session.execute(query, { @@ -1248,6 +1246,7 @@ class HadithRepository: source_metadata: Optional[Dict] = None ) -> UUID: """Insert or update a hadith""" + with self.get_session() as session: query = text(""" INSERT INTO hadiths ( @@ -1275,7 +1274,8 @@ class HadithRepository: updated_at = NOW() RETURNING id """) - + metadata_json = json.dumps(source_metadata or {}) + result = session.execute(query, { "collection_id": str(collection_id), "book_id": str(book_id) if book_id else None, @@ -1288,12 +1288,12 @@ class HadithRepository: "chapter_name": chapter_name, "source_id": source_id, "source_url": source_url, - "source_metadata": source_metadata or {} + "source_metadata": metadata_json }) session.commit() - - return UUID(result.fetchone()[0]) - + + return self._coerce_uuid(result.fetchone()[0]) + def get_hadiths_without_embeddings( self, limit: int = 100, @@ -1303,8 +1303,8 @@ class HadithRepository: with self.get_session() as session: if collection_id: query = text(""" - SELECT * FROM hadiths - WHERE embedding_generated = FALSE + SELECT * FROM hadiths + WHERE embedding_generated = FALSE AND collection_id = :collection_id ORDER BY created_at ASC LIMIT :limit @@ -1323,22 +1323,26 @@ class HadithRepository: result = session.execute(query, {"limit": limit}).fetchall() return [dict(row._mapping) for row in result] - + def mark_embedding_generated(self, hadith_id: UUID, version: str = "v1"): """Mark hadith as having embedding generated""" with self.get_session() as session: + # Prepare the update query query = text(""" - UPDATE hadiths - SET embedding_generated = TRUE, + UPDATE hadiths + SET embedding_generated = TRUE, embedding_version = :version, updated_at = NOW() WHERE id = :id """) + # Pre-serialize parameters (keeping consistent with other methods that + # serialize payloads/configs before execution) + params = {"id": str(hadith_id), "version": version} session.execute(query, {"id": str(hadith_id), "version": version}) session.commit() - + # ===== Ingestion Jobs ===== - + def create_ingestion_job( self, job_name: str, @@ -1353,15 +1357,16 @@ class HadithRepository: VALUES (:job_name, :job_type, :source_name, :config, 'running', NOW()) RETURNING id """) + # serialize config as JSON for storage result = session.execute(query, { "job_name": job_name, "job_type": job_type, "source_name": source_name, - "config": config or {} + "config": json.dumps(config or {}) }) session.commit() - - return UUID(result.fetchone()[0]) + job_id = result.fetchone()[0] + return job_id if isinstance(job_id, UUID) else UUID(str(job_id)) def update_job_progress( self, @@ -1420,7 +1425,7 @@ class HadithRepository: "error_message": error_message }) session.commit() - + def add_processing_log( self, job_id: UUID, @@ -1434,16 +1439,17 @@ class HadithRepository: INSERT INTO processing_logs (job_id, log_level, message, details) VALUES (:job_id, :level, :message, :details) """) + details_json = json.dumps(details or {}) session.execute(query, { "job_id": str(job_id), "level": level, "message": message, - "details": details or {} + "details": details_json }) session.commit() - + # ===== Statistics ===== - + def get_collection_stats(self, collection_id: UUID) -> Dict[str, Any]: """Get statistics for a collection""" with self.get_session() as session: @@ -1451,7 +1457,7 @@ class HadithRepository: SELECT * FROM get_collection_statistics(:collection_id) """) result = session.execute(query, {"collection_id": str(collection_id)}).fetchone() - + if result: return dict(result._mapping) return {} @@ -1469,9 +1475,9 @@ from typing import Optional, Dict, Any from uuid import UUID import structlog from config.settings import settings -from api_clients.hadithapi_client import HadithAPIClient -from database.repository import HadithRepository -from processors.text_cleaner import ArabicTextProcessor, TextCleaner +from src.api_clients.hadithapi_client import HadithAPIClient +from src.database.repository import HadithRepository +from src.processors.text_cleaner import ArabicTextProcessor, TextCleaner # Configure structured logging structlog.configure( @@ -1497,14 +1503,15 @@ logger = structlog.get_logger() BOOK_SLUG_MAPPING = { 'sahih-bukhari': 'bukhari', 'sahih-muslim': 'muslim', - 'sunan-abu-dawood': 'abudawud', - 'jami-at-tirmidhi': 'tirmidhi', - 'sunan-an-nasai': 'nasai', - 'sunan-ibn-e-majah': 'ibnmajah', + 'abu-dawood': 'abudawud', + 'al-tirmidhi': 'tirmidhi', + 'sunan-nasai': 'nasai', + 'ibn-e-majah': 'ibnmajah', 'muwatta-imam-malik': 'malik', 'musnad-ahmad': 'ahmad', 'sunan-ad-darimi': 'darimi', - 'mishkat-al-masabih': 'mishkat' + 'mishkat': 'mishkat', + 'al-silsila-sahiha': 'al-silsila-sahiha' } @@ -1528,12 +1535,12 @@ class HadithAPIIngestionService: # Get books from API api_books = self.api_client.get_books() - + book_mapping = {} - + # print(BOOK_SLUG_MAPPING) for api_book in api_books: book_slug = api_book.get('bookSlug') - + # print(book_slug) # Map to our collection abbreviation collection_abbr = BOOK_SLUG_MAPPING.get(book_slug) @@ -1544,7 +1551,6 @@ class HadithAPIIngestionService: book_name=api_book.get('bookName') ) continue - # Get or verify collection exists in database collection = self.repo.get_collection_by_abbreviation(collection_abbr) @@ -1555,8 +1561,10 @@ class HadithAPIIngestionService: book_slug=book_slug ) continue - - collection_id = UUID(collection['id']) + + collection_id = collection['id'] + if not isinstance(collection_id, UUID): + collection_id = UUID(str(collection_id)) book_mapping[book_slug] = { 'collection_id': collection_id, 'book_id': api_book.get('id'), @@ -1602,7 +1610,8 @@ class HadithAPIIngestionService: # Get book mapping book_mapping = self.sync_books_from_api() - + logger.info("containing books", book_mapping=book_mapping) + logger.info("Book slugs", book_slug=book_slug) if book_slug not in book_mapping: logger.error( "book_not_mapped", @@ -1613,7 +1622,8 @@ class HadithAPIIngestionService: book_info = book_mapping[book_slug] collection_id = book_info['collection_id'] - book_id = book_info['book_id'] + # book_id = book_info['book_id'] + book_id = book_slug # Create ingestion job job_id = self.repo.create_ingestion_job( @@ -1766,7 +1776,7 @@ class HadithAPIIngestionService: if not arabic_text: raise ValueError("Missing Arabic text") - + # passed logger.warning("Arabic text extracted and validated", hadith_number=hadith_number) # Clean texts arabic_text = self.text_cleaner.clean_text(arabic_text) if english_text: @@ -1780,14 +1790,15 @@ class HadithAPIIngestionService: # Get or create chapter (book in our schema) book_id = None chapter_name = None - + # logger.warning("Processing chapter data####", chapter_data.get('positional_args', {}).get('id')) + # logger.info("Processing chapter data####2####", chapter_data.get('id')) if chapter_data: chapter_id = chapter_data.get('id') chapter_number = chapter_data.get('chapterNumber') chapter_name_en = chapter_data.get('chapterEnglish') chapter_name_ar = chapter_data.get('chapterArabic') chapter_name = chapter_name_en - + # print(chapter_number, chapter_name) if chapter_number: try: chapter_number = int(chapter_number) @@ -1796,7 +1807,8 @@ class HadithAPIIngestionService: # Get or create book (chapter in HadithAPI = book in our schema) existing_book = self.repo.get_book(collection_id, chapter_number) - + # logger.warning("EXISTING BOOK : ", existing_book) + # logger.warning("Fetched or created book", collection_id=collection_id, chapter_number=chapter_number, chapter_name=chapter_name) if not existing_book: book_id = self.repo.upsert_book( collection_id=collection_id, @@ -1806,8 +1818,12 @@ class HadithAPIIngestionService: metadata=chapter_data ) else: - book_id = UUID(existing_book['id']) - + # print(existing_book['id']) + existing_id = existing_book['id'] + if isinstance(existing_id, str): + existing_id = UUID(existing_id) + book_id = existing_id + # Build source metadata source_metadata = { 'api_id': hadith_data.get('id'), @@ -1817,7 +1833,7 @@ class HadithAPIIngestionService: 'chapterId': hadith_data.get('chapterId'), 'chapter': chapter_data } - + # logger.warning("Hadith metadata built", source_metadata) # Store hadith hadith_id = self.repo.upsert_hadith( collection_id=collection_id, @@ -2304,7 +2320,7 @@ class BaseAPIClient: """ Client for HadithAPI.com API """ -from typing import List, Dict, Any, Optional, Generator +from typing import List, Dict, Any, Optional, Generator, Tuple import structlog from .base_client import BaseAPIClient from config.settings import settings @@ -2348,7 +2364,8 @@ class HadithAPIClient(BaseAPIClient): ) raise Exception(f"API Error: {response.get('message')}") - books = response.get('data', []) + books = response.get('books', []) + logger.info( "books_fetched", @@ -2383,7 +2400,8 @@ class HadithAPIClient(BaseAPIClient): ) raise Exception(f"API Error: {response.get('message')}") - chapters = response.get('data', []) + chapters = response.get('chapters', []) + logger.info( "chapters_fetched", @@ -2430,7 +2448,10 @@ class HadithAPIClient(BaseAPIClient): ) response = self.get("hadiths", params=params) - + # logger.debug( + # "fetching_hadiths_page####", + # response=response + # ) if response.get('status') != 200: logger.error( "api_error", @@ -2439,7 +2460,7 @@ class HadithAPIClient(BaseAPIClient): ) raise Exception(f"API Error: {response.get('message')}") - return response.get('data', {}) + return response.get('hadiths', {}) def iter_all_hadiths_in_book( self, @@ -2465,15 +2486,21 @@ class HadithAPIClient(BaseAPIClient): while True: response_data = self.get_hadiths_page( - book_id=book_id, + book_id=book_slug, chapter_id=chapter_id, page=page, limit=batch_size ) - hadiths = response_data.get('hadiths', []) + hadiths = response_data.get('data', []) pagination = response_data.get('pagination', {}) - + # logger.info( + # "book_complete", + # book_slug=book_slug, + # hadiths=hadiths, + # pagination=pagination, + # response = response_data + # ) if not hadiths: logger.info( "book_complete", @@ -2493,12 +2520,12 @@ class HadithAPIClient(BaseAPIClient): "progress", book_slug=book_slug, fetched=total_fetched, - total=pagination.get('total', '?') + total=response_data.get('total', '?') ) # Check if there are more pages - current_page = pagination.get('current_page', page) - last_page = pagination.get('last_page', 1) + current_page = response_data.get('current_page', page) + last_page = response_data.get('last_page', 1) if current_page >= last_page: logger.info( @@ -2516,15 +2543,15 @@ class HadithAPIClient(BaseAPIClient): book_id: int, book_slug: str, batch_size: int = 100 - ) -> Generator[tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]: + ) -> Generator[Tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]: """ Iterator that yields all hadiths in a book, organized by chapter - + Args: book_id: Book ID book_slug: Book slug batch_size: Number of hadiths to fetch per request - + Yields: Tuple of (hadith_dict, chapter_dict or None) """ diff --git a/hadith-ingestion/requirements.txt b/hadith-ingestion/requirements.txt index 211a2c9..ba02b56 100644 --- a/hadith-ingestion/requirements.txt +++ b/hadith-ingestion/requirements.txt @@ -39,4 +39,10 @@ redis==5.0.1 pytest==7.4.3 pytest-asyncio==0.21.1 pytest-cov==4.1.0 -faker==21.0.0 \ No newline at end of file +faker==21.0.0 + + +httpx==0.25.2 +qdrant-client==1.7.0 +tqdm==4.66.1 +asyncpg==0.29.0 \ No newline at end of file diff --git a/hadith-ingestion/run-full-ingestion.sh b/hadith-ingestion/run-full-ingestion.sh index 9f41a10..3aeac43 100755 --- a/hadith-ingestion/run-full-ingestion.sh +++ b/hadith-ingestion/run-full-ingestion.sh @@ -8,14 +8,14 @@ echo "=== Starting Full HadithAPI Ingestion ===" # Book slug to collection abbreviation mapping # Books to ingest (in order) BOOKS=( - "sahih-bukhari" + # "sahih-bukhari" "sahih-muslim" - "abu-dawood" - "al-tirmidhi" - "ibn-e-majah" - "sunan-nasai" - "musnad-ahmad" - "al-silsila-sahiha" + # "abu-dawood" + # "al-tirmidhi" + # "ibn-e-majah" + # "sunan-nasai" + # "musnad-ahmad" + # "al-silsila-sahiha" ) for BOOK in "${BOOKS[@]}"; do @@ -23,7 +23,7 @@ for BOOK in "${BOOKS[@]}"; do echo "Ingesting: $BOOK" echo "=========================================" - argo submit -n argo argo/workflows/ingest-hadithapi.yaml \ + argo submit -n ml argo/workflows/ingest-hadithapi.yaml \ --parameter book-slug=$BOOK \ --parameter limit=0 \ --wait \ diff --git a/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc index a0ad7e7..1bbba2c 100644 Binary files a/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc and b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc differ diff --git a/hadith-ingestion/src/api_clients/hadithapi_client.py b/hadith-ingestion/src/api_clients/hadithapi_client.py index d9e2e5a..bea11ea 100644 --- a/hadith-ingestion/src/api_clients/hadithapi_client.py +++ b/hadith-ingestion/src/api_clients/hadithapi_client.py @@ -264,7 +264,11 @@ class HadithAPIClient(BaseAPIClient): # Process each chapter for chapter in chapters: - chapter_id = chapter.get('id') + # logger.warning("Processing chapter", chapter=chapter) + if book_slug == 'sahih-muslim': + chapter_id = chapter.get('chapterNumber') + else: + chapter_id = chapter.get('id') chapter_number = chapter.get('chapterNumber') logger.info( diff --git a/hadith-ingestion/src/embeddings/generator.py b/hadith-ingestion/src/embeddings/generator.py index e69de29..4a147ae 100644 --- a/hadith-ingestion/src/embeddings/generator.py +++ b/hadith-ingestion/src/embeddings/generator.py @@ -0,0 +1,148 @@ +# Update: src/embeddings/generator.py +""" +Embedding generation service for hadith texts +""" +import asyncio +import httpx +from typing import List, Tuple, Optional +import psycopg2 +from qdrant_client import QdrantClient +from qdrant_client.models import Distance, VectorParams, PointStruct +import structlog +from tqdm import tqdm +import sys +import argparse + +from config.settings import settings + +logger = structlog.get_logger() + + +class EmbeddingGenerator: + def __init__(self, database_url: str, tei_url: str, qdrant_url: str, batch_size: int = 32): + self.database_url = database_url + self.tei_url = tei_url + self.qdrant_url = qdrant_url + self.batch_size = batch_size + self.http_client = httpx.AsyncClient(timeout=60.0) + self.qdrant = QdrantClient(url=qdrant_url) + + async def generate_embeddings_batch(self, texts: List[str]) -> List[List[float]]: + """Generate embeddings using TEI""" + response = await self.http_client.post( + f"{self.tei_url}/embed", + json={"inputs": texts} + ) + response.raise_for_status() + return response.json() + + def create_collection(self, name: str = "hadith_embeddings"): + """Create Qdrant collection""" + try: + self.qdrant.get_collection(name) + except: + self.qdrant.create_collection( + collection_name=name, + vectors_config=VectorParams(size=1024, distance=Distance.COSINE) + ) + + async def process_batch(self, conn, hadiths: List[Tuple], collection: str): + """Process batch: generate embeddings & store""" + texts = [f"{h[1]} {h[2] or ''}" for h in hadiths] # arabic + english + embeddings = await self.generate_embeddings_batch(texts) + + points = [ + PointStruct( + id=str(h[0]), + vector=emb, + payload={"hadith_id": str(h[0]), "collection_id": str(h[4])} + ) + for h, emb in zip(hadiths, embeddings) + ] + + self.qdrant.upsert(collection_name=collection, points=points) + + # Mark completed + cursor = conn.cursor() + ids = [str(h[0]) for h in hadiths] + cursor.execute( + "UPDATE hadiths SET embedding_generated = TRUE, embedding_version = 'v1' WHERE id = ANY(%s)", + (ids,) + ) + conn.commit() + cursor.close() + + return len(points) + + async def generate_all(self, collection: str = "hadith_embeddings"): + """Generate embeddings for all hadiths""" + self.create_collection(collection) + conn = psycopg2.connect(self.database_url) + cursor = conn.cursor() + + cursor.execute("SELECT COUNT(*) FROM hadiths WHERE embedding_generated = FALSE") + total = cursor.fetchone()[0] + cursor.close() + + if total == 0: + print("All hadiths already have embeddings!") + return + + print(f"Generating embeddings for {total} hadiths...") + processed = 0 + + with tqdm(total=total) as pbar: + while True: + cursor = conn.cursor() + cursor.execute(""" + SELECT id, arabic_text, english_text, urdu_text, collection_id + FROM hadiths + WHERE embedding_generated = FALSE + LIMIT 1000 + """) + hadiths = cursor.fetchall() + cursor.close() + + if not hadiths: + break + + for i in range(0, len(hadiths), self.batch_size): + batch = hadiths[i:i+self.batch_size] + try: + count = await self.process_batch(conn, batch, collection) + processed += count + pbar.update(count) + except Exception as e: + logger.error("batch_failed", error=str(e)) + + conn.close() + print(f"\nCompleted! Generated {processed} embeddings.") + + async def close(self): + await self.http_client.aclose() + + +async def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--batch-size", type=int, default=32) + args = parser.parse_args() + + gen = EmbeddingGenerator( + database_url=settings.DATABASE_URL, + tei_url="http://tei.ml.svc.cluster.local", + qdrant_url="http://qdrant.vector.svc.cluster.local:6333", + batch_size=args.batch_size + ) + + try: + await gen.generate_all() + return 0 + except Exception as e: + logger.error("generation_failed", error=str(e)) + return 1 + finally: + await gen.close() + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) \ No newline at end of file diff --git a/hadith-ingestion/src/main_hadithapi.py b/hadith-ingestion/src/main_hadithapi.py index 174dbdd..8e58943 100644 --- a/hadith-ingestion/src/main_hadithapi.py +++ b/hadith-ingestion/src/main_hadithapi.py @@ -2,10 +2,16 @@ Main ingestion script for fetching hadiths from HadithAPI.com """ import sys +from pathlib import Path import argparse from typing import Optional, Dict, Any from uuid import UUID import structlog + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +if str(PROJECT_ROOT) not in sys.path: + sys.path.insert(0, str(PROJECT_ROOT)) + from config.settings import settings from src.api_clients.hadithapi_client import HadithAPIClient from src.database.repository import HadithRepository diff --git a/hadith-ingestion/test-hadithapi-k8s.sh b/hadith-ingestion/test-hadithapi-k8s.sh index 9fa5c30..22c3344 100755 --- a/hadith-ingestion/test-hadithapi-k8s.sh +++ b/hadith-ingestion/test-hadithapi-k8s.sh @@ -19,7 +19,7 @@ echo "Building Docker image..." # 3. Submit test workflow (10 hadiths) echo "Submitting test workflow..." argo submit -n ml argo/workflows/ingest-hadithapi.yaml \ - --parameter book-slug=sahih-bukhari \ + --parameter book-slug=sahih-muslim \ --parameter limit=10 \ --wait \ --log diff --git a/hadith-ingestion/test-hadithapi-local.sh b/hadith-ingestion/test-hadithapi-local.sh index aa01df5..bbf98c3 100755 --- a/hadith-ingestion/test-hadithapi-local.sh +++ b/hadith-ingestion/test-hadithapi-local.sh @@ -19,7 +19,7 @@ python src/main_hadithapi.py --list-books # 4. Test ingestion (limited to 10 hadiths) echo -e "\nRunning test ingestion (10 hadiths from Sahih Bukhari)..." -python src/main_hadithapi.py --book-slug sahih-bukhari --limit 10 +python src/main_hadithapi.py --book-slug sahih-muslim --limit 10 # 5. Verify data echo -e "\nVerifying ingested data..."