From 53cd6e24154c716dbc649f65276577b18c248ef3 Mon Sep 17 00:00:00 2001
From: salahangal
Date: Sun, 16 Nov 2025 13:43:08 +0100
Subject: [PATCH] correct hadithapi iterators

---
 .DS_Store                                     | Bin 0 -> 6148 bytes
 hadith-ingestion/.DS_Store                    | Bin 0 -> 8196 bytes
 hadith-ingestion/combine.sh                   |    5 +
 hadith-ingestion/combined.txt                 | 3202 +++++++++++++++++
 .../__pycache__/__init__.cpython-312.pyc      | Bin 0 -> 197 bytes
 .../__pycache__/__init__.cpython-38.pyc       | Bin 0 -> 191 bytes
 .../__pycache__/settings.cpython-312.pyc      | Bin 0 -> 3556 bytes
 .../__pycache__/settings.cpython-38.pyc       | Bin 0 -> 2757 bytes
 hadith-ingestion/simple-pod.yaml              |   46 +
 hadith-ingestion/src/.DS_Store                | Bin 0 -> 6148 bytes
 .../src/__pycache__/__init__.cpython-312.pyc  | Bin 0 -> 194 bytes
 .../src/__pycache__/__init__.cpython-38.pyc   | Bin 0 -> 188 bytes
 .../__pycache__/__init__.cpython-312.pyc      | Bin 0 -> 206 bytes
 .../__pycache__/__init__.cpython-38.pyc       | Bin 0 -> 200 bytes
 .../__pycache__/base_client.cpython-312.pyc   | Bin 0 -> 5130 bytes
 .../__pycache__/base_client.cpython-38.pyc    | Bin 0 -> 3509 bytes
 .../hadithapi_client.cpython-312.pyc          | Bin 0 -> 9795 bytes
 .../hadithapi_client.cpython-38.pyc           | Bin 0 -> 6573 bytes
 .../src/api_clients/hadithapi_client.py       |   39 +-
 hadith-ingestion/src/main_hadithapi.py        |    3 +-
 hadith-ingestion/test_hadithapi.py            |   88 +
 21 files changed, 3368 insertions(+), 15 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 hadith-ingestion/.DS_Store
 create mode 100755 hadith-ingestion/combine.sh
 create mode 100644 hadith-ingestion/combined.txt
 create mode 100644 hadith-ingestion/config/__pycache__/__init__.cpython-312.pyc
 create mode 100644 hadith-ingestion/config/__pycache__/__init__.cpython-38.pyc
 create mode 100644 hadith-ingestion/config/__pycache__/settings.cpython-312.pyc
 create mode 100644 hadith-ingestion/config/__pycache__/settings.cpython-38.pyc
 create mode 100644 hadith-ingestion/simple-pod.yaml
 create mode 100644 hadith-ingestion/src/.DS_Store
 create mode 100644 hadith-ingestion/src/__pycache__/__init__.cpython-312.pyc
 create mode 100644 hadith-ingestion/src/__pycache__/__init__.cpython-38.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/__init__.cpython-312.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/__init__.cpython-38.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/base_client.cpython-312.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/base_client.cpython-38.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc
 create mode 100644 hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-38.pyc
 create mode 100644 hadith-ingestion/test_hadithapi.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..e3d35577860f825b365f6eb686dbd7683e748ce2
GIT binary patch
literal 6148
[base85-encoded binary payload omitted: macOS Finder metadata, not readable as text]
literal 0
HcmV?d00001

diff --git a/hadith-ingestion/combine.sh b/hadith-ingestion/combine.sh
new file mode 100755
index 0000000..fd9fa32
--- /dev/null
+++ b/hadith-ingestion/combine.sh
@@ -0,0 +1,5 @@
+find . -type f \( -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" \) ! -name "*.xls" ! -name "*.xlsx" ! -name "combined.txt" | while IFS= read -r file; do
+    echo "=== $file ===" >> combined.txt
+    cat "$file" >> combined.txt
+    echo "" >> combined.txt
+done
diff --git a/hadith-ingestion/combined.txt b/hadith-ingestion/combined.txt
new file mode 100644
index 0000000..f9b621c
--- /dev/null
+++ b/hadith-ingestion/combined.txt
@@ -0,0 +1,3202 @@
+=== ./run-full-ingestion.sh ===
+#!/bin/bash
+# run-full-ingestion.sh
+
+set -e
+
+echo "=== Starting Full HadithAPI Ingestion ==="
+
+# Books to ingest (in order)
+BOOKS=(
+    "sahih-bukhari"
+    "sahih-muslim"
+    "sunan-abu-dawood"
+    "jami-at-tirmidhi"
+    "sunan-an-nasai"
+    "sunan-ibn-e-majah"
+)
+
+for BOOK in "${BOOKS[@]}"; do
+    echo -e "\n========================================="
+    echo "Ingesting: $BOOK"
+    echo "========================================="
+
+    argo submit -n argo argo/workflows/ingest-hadithapi.yaml \
+        --parameter book-slug=$BOOK \
+        --parameter limit=0 \
+        --wait \
+        --log
+
+    echo "$BOOK completed!"
+
+    # Optional: add delay between books
+    sleep 10
+done
+
+echo -e "\n=== All Books Ingestion Complete ==="
+
+# Print summary
+kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "
+SELECT
+    c.name_english,
+    c.abbreviation,
+    COUNT(h.id) as hadith_count,
+    COUNT(DISTINCT b.id) as chapter_count
+FROM collections c
+LEFT JOIN hadiths h ON c.id = h.collection_id
+LEFT JOIN books b ON h.book_id = b.id
+GROUP BY c.name_english, c.abbreviation
+ORDER BY hadith_count DESC;
+"
+=== ./create-secrets.sh ===
+#!/bin/bash
+# create-secrets.sh
+
+# Database secret
+kubectl -n ml create secret generic hadith-db-secret \
+    --from-literal=password='hadith_ingest' \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+# HadithAPI secret (already public, but for consistency)
+kubectl -n ml create secret generic hadithapi-secret \
+    --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK' \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+# MinIO secret
+kubectl -n ml create secret generic minio-secret \
+    --from-literal=access-key='minioadmin' \
+    --from-literal=secret-key='minioadmin' \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+echo "Secrets created successfully"
+=== ./requirements.txt ===
+# Core dependencies
+python-dotenv==1.0.0
+pydantic==2.5.0
+pydantic-settings==2.1.0
+
+# HTTP clients
+httpx==0.25.2
+requests==2.31.0
+tenacity==8.2.3
+
+# Database
+psycopg2-binary==2.9.9
+sqlalchemy==2.0.23
+asyncpg==0.29.0
+
+# Data processing
+pandas==2.1.4
+numpy==1.26.2
+pyarabic==0.6.15
+arabic-reshaper==3.0.0
+
+# Validation
+jsonschema==4.20.0
+validators==0.22.0
+
+# Logging & Monitoring
+structlog==23.2.0
+prometheus-client==0.19.0
+
+# Cloud storage
+minio==7.2.0
+boto3==1.34.0
+
+# Task queue (optional)
+celery==5.3.4
+redis==5.0.1
+
+#
Testing +pytest==7.4.3 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +faker==21.0.0 +=== ./config/__init__.py === + +=== ./config/settings.py === +""" +Configuration settings for hadith ingestion service +""" +from pydantic_settings import BaseSettings +from typing import Optional +import os + + +class Settings(BaseSettings): + """Application settings loaded from environment variables""" + + # Database + DATABASE_HOST: str = "postgres.db.svc.cluster.local" + DATABASE_PORT: int = 5432 + DATABASE_NAME: str = "hadith_db" + DATABASE_USER: str = "hadith_ingest" + DATABASE_PASSWORD: str = "hadith_ingest" + + @property + def DATABASE_URL(self) -> str: + return ( + f"postgresql://{self.DATABASE_USER}:{self.DATABASE_PASSWORD}" + f"@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_NAME}" + ) + + @property + def ASYNC_DATABASE_URL(self) -> str: + return ( + f"postgresql+asyncpg://{self.DATABASE_USER}:{self.DATABASE_PASSWORD}" + f"@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_NAME}" + ) + + # MinIO / S3 + MINIO_ENDPOINT: str = "minio.storage.svc.cluster.local:9000" + MINIO_ACCESS_KEY: str = "minioadmin" + MINIO_SECRET_KEY: str = "minioadmin" + MINIO_BUCKET_RAW: str = "hadith-raw-data" + MINIO_BUCKET_PROCESSED: str = "hadith-processed" + MINIO_SECURE: bool = False + + # APIs + SUNNAH_API_KEY: Optional[str] = None + SUNNAH_BASE_URL: str = "https://api.sunnah.com/v1" + HADITH_ONE_API_KEY: Optional[str] = "$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK" + # HadithAPI.com + HADITHAPI_KEY: str = "$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK" + HADITHAPI_BASE_URL: str = "https://hadithapi.com/api" + # Rate limiting + API_RATE_LIMIT: int = 30 # requests per minute + API_MAX_RETRIES: int = 3 + API_RETRY_DELAY: int = 5 # seconds + + # Processing + BATCH_SIZE: int = 100 + MAX_WORKERS: int = 4 + + # TEI Service (for embeddings) + TEI_URL: str = "http://tei.ml.svc.cluster.local" + TEI_TIMEOUT: int = 30 + + # Qdrant + QDRANT_URL: str = "http://qdrant.db.svc.cluster.local:6333" + QDRANT_COLLECTION: str = "hadith_embeddings" + + # Logging + LOG_LEVEL: str = "INFO" + LOG_FORMAT: str = "json" + + # Job tracking + JOB_NAME: Optional[str] = None + JOB_TYPE: str = "api_fetch" + + class Config: + env_file = ".env" + env_file_encoding = "utf-8" + case_sensitive = True + + +# Global settings instance +settings = Settings() +=== ./Dockerfile === +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + postgresql-client \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY config/ /app/config/ +COPY src/ /app/src/ + +# Create non-root user +RUN useradd -m -u 1000 hadith && chown -R hadith:hadith /app +USER hadith + +# Set Python path +ENV PYTHONPATH=/app + +# Default command +CMD ["python", "/app/src/main_hadithapi.py"] +=== ./tests/__init__.py === + +=== ./tests/test_clients.py === + +=== ./test-hadithapi-k8s.sh === +#!/bin/bash +# test-hadithapi-k8s.sh + +set -e + +echo "=== Kubernetes HadithAPI Integration Test ===" + +# 1. Create secrets +echo "Creating secrets..." +#./create-secrets.sh + +# 2. Build and load image (if using local cluster) +echo "Building Docker image..." +#docker build -t hadith-ingestion:latest . + +# If using kind/minikube, load image +# kind load docker-image hadith-ingestion:latest + +# 3. 
Submit test workflow (10 hadiths) +echo "Submitting test workflow..." +argo submit -n ml argo/workflows/ingest-hadithapi.yaml \ + --parameter book-slug=sahih-bukhari \ + --parameter limit=10 \ + --wait \ + --log + +# 4. Check workflow status +echo -e "\nChecking workflow status..." +argo list -n argo + +# 5. Verify data in database +echo -e "\nVerifying data..." +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT + c.name_english, + COUNT(h.id) as hadith_count, + MAX(h.created_at) as last_ingestion +FROM collections c +LEFT JOIN hadiths h ON c.id = h.collection_id +WHERE c.abbreviation = 'bukhari' +GROUP BY c.name_english; +" + +echo -e "\n=== Test Complete ===" +=== ./README.md === +# 🚀 HadithAPI.com Deployment - Quick Start + +## What You Got + +Three comprehensive guides: +1. **PHASE_2_IMPLEMENTATION_GUIDE.md** - Original guide with PostgreSQL schema +2. **HADITHAPI_INTEGRATION_GUIDE.md** - Complete HadithAPI.com implementation +3. **This summary** - Quick deployment steps + +## 📦 Complete Package Structure + +The HadithAPI guide includes everything you need: + +### Production-Ready Code +✅ **hadithapi_client.py** - Full API client with pagination and rate limiting +✅ **main_hadithapi.py** - Complete ingestion service +✅ **settings.py** - Configuration with your API key +✅ **Dockerfile** - Container image +✅ **Argo Workflows** - Kubernetes automation +✅ **Test scripts** - Validation and troubleshooting + +### Key Features +- ✅ Automatic pagination handling +- ✅ Rate limiting (30 req/min) +- ✅ Error handling and retries +- ✅ Progress tracking +- ✅ Structured logging +- ✅ Multi-language support (Arabic, English, Urdu) + +## 🎯 5-Minute Quick Start + +### 1. Database Setup (2 min) +```bash +# Use schema from PHASE_2_IMPLEMENTATION_GUIDE.md Section 1 +kubectl -n db exec -it postgres-0 -- psql -U app -d gitea + +# Copy all SQL from Section 1.2 through 1.6 +# This creates hadith_db with complete schema +``` + +### 2. Create Project Structure (1 min) +```bash +mkdir -p hadith-ingestion/{config,src/{api_clients,processors,database,utils},argo/workflows} +cd hadith-ingestion/ + +# Copy code from HADITHAPI_INTEGRATION_GUIDE.md: +# - Section 2.1 → src/api_clients/hadithapi_client.py +# - Section 4.1 → src/main_hadithapi.py +# - Section 5.1 → config/settings.py +# - Section 6.1 → Dockerfile +# - Section 6.4 → argo/workflows/ingest-hadithapi.yaml + +# Also copy from PHASE_2_IMPLEMENTATION_GUIDE.md: +# - Section 3.4 → src/api_clients/base_client.py +# - Section 3.6 → src/processors/text_cleaner.py +# - Section 3.7 → src/database/repository.py +``` + +### 3. Build & Deploy (2 min) +```bash +# Build image +docker build -t hadith-ingestion:latest . 
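+
+# If the cluster pulls images from a registry rather than the local daemon,
+# tag and push first (registry "axxs" assumed here, matching build-and-push.sh
+# and simple-pod.yaml):
+# docker tag hadith-ingestion:latest axxs/hadith-ingestion:latest
+# docker push axxs/hadith-ingestion:latest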
+ +# Create secrets +kubectl -n argo create secret generic hadith-db-secret \ + --from-literal=password='YOUR_PASSWORD' + +kubectl -n argo create secret generic hadithapi-secret \ + --from-literal=api-key='$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK' + +# Test with 10 hadiths +argo submit -n argo argo/workflows/ingest-hadithapi.yaml \ + --parameter book-slug=sahih-bukhari \ + --parameter limit=10 \ + --watch +``` + +## 📊 Expected Results + +### Available Collections +| Book | Hadiths | Time | +|------|---------|------| +| Sahih Bukhari | ~7,500 | 2-3h | +| Sahih Muslim | ~7,000 | 2-3h | +| Sunan Abu Dawood | ~5,000 | 1-2h | +| Jami` at-Tirmidhi | ~4,000 | 1-2h | +| Sunan an-Nasa'i | ~5,700 | 2h | +| Sunan Ibn Majah | ~4,300 | 1-2h | +| **TOTAL** | **~33,500** | **10-15h** | + +## 🔧 Key Differences from Sunnah.com + +| Feature | HadithAPI.com | Sunnah.com | +|---------|---------------|------------| +| **API Key** | ✅ Public (provided) | ❌ Requires PR | +| **Rate Limit** | Unknown (using 30/min) | 100/min | +| **Coverage** | 6 major books | 10+ books | +| **Languages** | Arabic, English, Urdu | Arabic, English | +| **Cost** | ✅ Free | Free | +| **Stability** | Good | Excellent | + +## 📝 Complete File Checklist + +Create these files from the guides: + +``` +hadith-ingestion/ +├── Dockerfile ✓ Section 6.1 +├── requirements.txt ✓ Phase 2 Section 3.2 +├── .env ✓ Section 5.2 +├── build-hadithapi-ingestion.sh ✓ Section 6.2 +├── create-secrets.sh ✓ Section 6.3 +├── test-hadithapi-local.sh ✓ Section 7.1 +├── test-hadithapi-k8s.sh ✓ Section 7.2 +├── run-full-ingestion.sh ✓ Section 7.3 +├── config/ +│ ├── __init__.py (empty file) +│ └── settings.py ✓ Section 5.1 +├── src/ +│ ├── __init__.py (empty file) +│ ├── main_hadithapi.py ✓ Section 4.1 +│ ├── api_clients/ +│ │ ├── __init__.py (empty file) +│ │ ├── base_client.py ✓ Phase 2 Sec 3.4 +│ │ └── hadithapi_client.py ✓ Section 2.1 +│ ├── processors/ +│ │ ├── __init__.py (empty file) +│ │ └── text_cleaner.py ✓ Phase 2 Sec 3.6 +│ ├── database/ +│ │ ├── __init__.py (empty file) +│ │ ├── connection.py (optional) +│ │ └── repository.py ✓ Phase 2 Sec 3.7 +│ └── utils/ +│ ├── __init__.py (empty file) +│ └── logger.py (optional) +└── argo/ + └── workflows/ + └── ingest-hadithapi.yaml ✓ Section 6.4 +``` + +## 🎬 Step-by-Step Execution + +### Day 1: Setup & Test (2-3 hours) +```bash +# 1. Create database schema +# 2. Set up project structure +# 3. Build Docker image +# 4. Create secrets +# 5. Run test with 10 hadiths +# 6. Verify data +``` + +### Day 2: Ingest Major Collections (10-15 hours) +```bash +# Ingest all 6 major collections sequentially +./run-full-ingestion.sh + +# Or manually one by one: +argo submit ... --parameter book-slug=sahih-bukhari +argo submit ... --parameter book-slug=sahih-muslim +# etc... +``` + +### Day 3: Validation & Next Steps +```bash +# 1. Verify data quality +# 2. Check statistics +# 3. Proceed to Phase 3 (ML model development) +``` + +## ✅ Verification Checklist + +After ingestion completes: + +```bash +# 1. Check total hadiths +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT COUNT(*) FROM hadiths; +" +# Expected: ~33,500 + +# 2. Check per collection +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT + c.name_english, + COUNT(h.id) as count +FROM collections c +LEFT JOIN hadiths h ON c.id = h.collection_id +WHERE c.abbreviation IN ('bukhari', 'muslim', 'abudawud', 'tirmidhi', 'nasai', 'ibnmajah') +GROUP BY c.name_english; +" + +# 3. 
Check for errors +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT * FROM ingestion_jobs +WHERE status = 'failed' +ORDER BY created_at DESC; +" +``` + +## 🐛 Common Issues & Solutions + +### Issue: Rate Limiting +``` +Error: 429 Too Many Requests +Solution: Already set to conservative 30/min +If still hitting limits, edit settings.py: + API_RATE_LIMIT = 20 +``` + +### Issue: Connection Timeout +``` +Error: Connection timeout to database +Solution: +1. Check PostgreSQL is running +2. Verify credentials in secrets +3. Test connection manually +``` + +### Issue: Missing Chapters +``` +Warning: chapters_fetch_failed +Solution: Script automatically falls back to fetching all hadiths +This is expected and not critical +``` + +## 📚 Documentation References + +All details in the comprehensive guides: + +1. **PHASE_2_IMPLEMENTATION_GUIDE.md** + - PostgreSQL schema (Section 1) + - Base utilities (Section 3) + - Database repository (Section 3.7) + +2. **HADITHAPI_INTEGRATION_GUIDE.md** + - API client (Section 2) + - Main ingestion service (Section 4) + - Deployment (Section 6) + - Testing (Section 7) + +## 🎯 Next Phase + +After Phase 2 completion: +→ **Phase 3: ML Model Development** + - Annotate sample hadiths (Label Studio) + - Train NER model + - Train relation extraction model + - Fine-tune LLM with LoRA + +## 💡 Pro Tips + +1. **Start Small**: Test with `--limit 10` first +2. **Monitor Progress**: Use `argo logs -n argo -f` +3. **Check Logs**: Structured JSON logs for easy debugging +4. **Backup Data**: Before major operations +5. **Rate Limiting**: Be conservative to avoid blocks + +## 🎉 Success Criteria + +Phase 2 is complete when: +- ✅ Database schema created +- ✅ 33,500+ hadiths ingested +- ✅ All 6 collections present +- ✅ No critical errors +- ✅ Data validated +- ✅ Ready for embedding generation + +--- + +**Estimated Total Time: 1-2 days** +**Difficulty: Intermediate** +**Prerequisites: Phase 1 completed (all core services running)** + +Ready to start? Begin with Section 1 of PHASE_2_IMPLEMENTATION_GUIDE.md! +=== ./setup.py === + +=== ./build-and-push.sh === +#!/bin/bash +# build-and-push.sh + +set -e + +# Configuration +IMAGE_NAME="hadith-ingestion" +TAG="${1:-latest}" +DOCKER_REGISTRY="axxs" +REGISTRY="${DOCKER_REGISTRY:-}" + +echo "Building Docker image: ${IMAGE_NAME}:${TAG}" + +# Build image +docker build -t ${IMAGE_NAME}:${TAG} -f Dockerfile . + +# Tag for registry +docker tag ${IMAGE_NAME}:${TAG} ${REGISTRY}/${IMAGE_NAME}:${TAG} + +# Push to registry +echo "Pushing to registry: ${REGISTRY}" +docker push ${REGISTRY}/${IMAGE_NAME}:${TAG} + +echo "Done!" +=== ./.env === +# Database +DATABASE_HOST=postgres.db.svc.cluster.local +DATABASE_PORT=5432 +DATABASE_NAME=hadith_db +DATABASE_USER=hadith_ingest +DATABASE_PASSWORD=hadith_ingest + +# HadithAPI.com +HADITHAPI_KEY=$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK + +# MinIO +MINIO_ENDPOINT=minio.storage.svc.cluster.local:9000 +MINIO_ACCESS_KEY=minioadmin +MINIO_SECRET_KEY=minioadmin + +# Services +TEI_URL=http://tei.ml.svc.cluster.local +QDRANT_URL=http://qdrant.vector.svc.cluster.local:6333 + +# Settings +LOG_LEVEL=INFO +API_RATE_LIMIT=30 +BATCH_SIZE=100 +=== ./build-hadithapi-ingestion.sh === +#!/bin/bash +# build-hadithapi-ingestion.sh + +set -e + +IMAGE_NAME="hadith-ingestion" +TAG="v1.0-hadithapi" + +echo "Building Docker image for HadithAPI.com ingestion..." + +# Build image +docker build -t ${IMAGE_NAME}:${TAG} -f Dockerfile . 
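+
+# Optional smoke test before tagging -- prints the CLI usage without touching
+# the database or the API (argparse exits before any connection is opened):
+# docker run --rm ${IMAGE_NAME}:${TAG} python /app/src/main_hadithapi.py --help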
+ +# Tag as latest +docker tag ${IMAGE_NAME}:${TAG} ${IMAGE_NAME}:latest + +# If you have a registry, push +# docker push your-registry/${IMAGE_NAME}:${TAG} + +echo "Build complete: ${IMAGE_NAME}:${TAG}" +=== ./combine.sh === +find . -type f -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" ! -name "*.xls" ! -name "*.xlsx"| while read file; do + echo "=== $file ===" >> combined.txt + cat "$file" >> combined.txt + echo "" >> combined.txt +done + +=== ./test-hadithapi-local.sh === +#!/bin/bash +# test-hadithapi-local.sh + +set -e + +echo "=== HadithAPI.com Integration Test ===" + +# 1. Test API connection +echo "Testing API connection..." +curl -s "https://hadithapi.com/api/books?apiKey=\$2y\$10\$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK" | jq . + +# 2. Test database connection +echo -e "\nTesting database connection..." +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c "SELECT COUNT(*) FROM collections;" + +# 3. List available books +echo -e "\nListing available books..." +python src/main_hadithapi.py --list-books + +# 4. Test ingestion (limited to 10 hadiths) +echo -e "\nRunning test ingestion (10 hadiths from Sahih Bukhari)..." +python src/main_hadithapi.py --book-slug sahih-bukhari --limit 10 + +# 5. Verify data +echo -e "\nVerifying ingested data..." +kubectl -n db exec -it postgres-0 -- psql -U hadith_ingest -d hadith_db -c " +SELECT + c.name_english, + c.abbreviation, + COUNT(h.id) as hadith_count, + COUNT(DISTINCT b.id) as book_count +FROM collections c +LEFT JOIN hadiths h ON c.id = h.collection_id +LEFT JOIN books b ON h.book_id = b.id +WHERE c.abbreviation = 'bukhari' +GROUP BY c.name_english, c.abbreviation; +" + +echo -e "\n=== Test Complete ===" +=== ./simple-pod.yaml === +apiVersion: v1 +kind: Pod +metadata: + name: hadith-ingestion-list-books + namespace: ml +spec: + restartPolicy: Never + containers: + - name: hadith-ingestion + image: axxs/hadith-ingestion:latest + # command: ["python"] + # args: ["/app/src/main_hadithapi.py", "--list-books"] + command: ["sh","-c","sleep infinity"] + env: + - name: DATABASE_HOST + value: "postgres.db.svc.cluster.local" + - name: DATABASE_PORT + value: "5432" + - name: DATABASE_NAME + value: "hadith_db" + - name: DATABASE_USER + value: "hadith_ingest" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + - name: HADITHAPI_KEY + valueFrom: + secretKeyRef: + name: hadithapi-secret + key: api-key + - name: MINIO_ENDPOINT + value: "minio.storage.svc.cluster.local:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret-key + - name: LOG_LEVEL + value: "INFO" +=== ./argo/workflows/ingest-collection.yaml === +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: ingest-hadith-collection- + namespace: argo +spec: + entrypoint: ingest-pipeline + + # Arguments + arguments: + parameters: + - name: collection + value: "bukhari" + - name: limit + value: "0" # 0 means no limit + + # Service account with database access + serviceAccountName: argo-workflow + + # Templates + templates: + + # ======================================== + # Main pipeline + # ======================================== + - name: ingest-pipeline + steps: + - - name: ingest-hadiths + template: ingest + arguments: + parameters: + - name: collection + value: 
"{{workflow.parameters.collection}}" + - name: limit + value: "{{workflow.parameters.limit}}" + + - - name: generate-embeddings + template: generate-embeddings + arguments: + parameters: + - name: collection + value: "{{workflow.parameters.collection}}" + + - - name: index-qdrant + template: index-qdrant + arguments: + parameters: + - name: collection + value: "{{workflow.parameters.collection}}" + + # ======================================== + # Ingestion step + # ======================================== + - name: ingest + inputs: + parameters: + - name: collection + - name: limit + + container: + image: hadith-ingestion:latest + imagePullPolicy: IfNotPresent + command: [python, /app/src/main.py] + args: + - "{{inputs.parameters.collection}}" + - "--limit={{inputs.parameters.limit}}" + + env: + - name: DATABASE_HOST + value: "postgres.db.svc.cluster.local" + - name: DATABASE_PORT + value: "5432" + - name: DATABASE_NAME + value: "hadith_db" + - name: DATABASE_USER + value: "hadith_ingest" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + + - name: MINIO_ENDPOINT + value: "minio.storage.svc.cluster.local:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret-key + + - name: SUNNAH_API_KEY + valueFrom: + secretKeyRef: + name: sunnah-api-secret + key: api-key + + - name: LOG_LEVEL + value: "INFO" + - name: JOB_NAME + value: "{{workflow.name}}" + + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2 + memory: 4Gi + + # ======================================== + # Embedding generation step + # ======================================== + - name: generate-embeddings + inputs: + parameters: + - name: collection + + container: + image: hadith-embeddings:latest + imagePullPolicy: IfNotPresent + command: [python, /app/generate_embeddings.py] + args: + - "--collection={{inputs.parameters.collection}}" + - "--batch-size=32" + + env: + - name: DATABASE_URL + value: "postgresql://hadith_ingest:$(DATABASE_PASSWORD)@postgres.db.svc.cluster.local:5432/hadith_db" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + + - name: TEI_URL + value: "http://tei.ml.svc.cluster.local" + + - name: LOG_LEVEL + value: "INFO" + + resources: + requests: + cpu: 1 + memory: 2Gi + limits: + cpu: 4 + memory: 8Gi + + # ======================================== + # Qdrant indexing step + # ======================================== + - name: index-qdrant + inputs: + parameters: + - name: collection + + container: + image: hadith-qdrant-indexer:latest + imagePullPolicy: IfNotPresent + command: [python, /app/index_qdrant.py] + args: + - "--collection={{inputs.parameters.collection}}" + - "--batch-size=100" + + env: + - name: DATABASE_URL + value: "postgresql://hadith_ingest:$(DATABASE_PASSWORD)@postgres.db.svc.cluster.local:5432/hadith_db" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + + - name: QDRANT_URL + value: "http://qdrant.vector.svc.cluster.local:6333" + - name: QDRANT_COLLECTION + value: "hadith_embeddings" + + - name: LOG_LEVEL + value: "INFO" + + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2 + memory: 4Gi +=== ./argo/workflows/ingest-hadithapi.yaml === +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: ingest-hadithapi- + namespace: ml +spec: + entrypoint: ingest-pipeline + + 
arguments: + parameters: + - name: book-slug + value: "sahih-bukhari" + - name: limit + value: "0" # 0 means no limit + + serviceAccountName: argo-workflow + + templates: + + # ======================================== + # Main pipeline + # ======================================== + - name: ingest-pipeline + steps: + - - name: ingest-hadiths + template: ingest + arguments: + parameters: + - name: book-slug + value: "{{workflow.parameters.book-slug}}" + - name: limit + value: "{{workflow.parameters.limit}}" + + # ======================================== + # Ingestion step + # ======================================== + - name: ingest + inputs: + parameters: + - name: book-slug + - name: limit + + container: + image: axxs/hadith-ingestion:latest + imagePullPolicy: IfNotPresent + command: [python, /app/src/main_hadithapi.py] + args: + - "--book-slug={{inputs.parameters.book-slug}}" + - "--limit={{inputs.parameters.limit}}" + + env: + - name: DATABASE_HOST + value: "postgres.db.svc.cluster.local" + - name: DATABASE_PORT + value: "5432" + - name: DATABASE_NAME + value: "hadith_db" + - name: DATABASE_USER + value: "hadith_ingest" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + + - name: HADITHAPI_KEY + valueFrom: + secretKeyRef: + name: hadithapi-secret + key: api-key + + - name: MINIO_ENDPOINT + value: "minio.storage.svc.cluster.local:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret-key + + - name: LOG_LEVEL + value: "INFO" + - name: JOB_NAME + value: "{{workflow.name}}" + - name: API_RATE_LIMIT + value: "30" + + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2 + memory: 4Gi +--- +# Workflow to ingest ALL books +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: ingest-all-hadithapi- + namespace: ml +spec: + entrypoint: ingest-all-books + + serviceAccountName: argo-workflow + + arguments: + parameters: + - name: limit-per-book + value: "0" # 0 means no limit + + templates: + + # ======================================== + # Main pipeline - sequential processing + # ======================================== + - name: ingest-all-books + steps: + # Process each book sequentially to avoid rate limiting + - - name: sahih-bukhari + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "sahih-bukhari" + + - - name: sahih-muslim + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "sahih-muslim" + + - - name: sunan-abu-dawood + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "sunan-abu-dawood" + + - - name: jami-at-tirmidhi + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "jami-at-tirmidhi" + + - - name: sunan-an-nasai + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "sunan-an-nasai" + + - - name: sunan-ibn-e-majah + template: ingest-book + arguments: + parameters: + - name: book-slug + value: "sunan-ibn-e-majah" + + # ======================================== + # Book ingestion template + # ======================================== + - name: ingest-book + inputs: + parameters: + - name: book-slug + + container: + image: axxs/hadith-ingestion:latest + imagePullPolicy: IfNotPresent + command: [python, /app/src/main_hadithapi.py] + args: + - "--book-slug={{inputs.parameters.book-slug}}" + - 
"--limit={{workflow.parameters.limit-per-book}}" + + env: + - name: DATABASE_HOST + value: "postgres.db.svc.cluster.local" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + - name: HADITHAPI_KEY + valueFrom: + secretKeyRef: + name: hadithapi-secret + key: api-key + - name: LOG_LEVEL + value: "INFO" + + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2 + memory: 4Gi +=== ./src/database/__init__.py === + +=== ./src/database/connection.py === + +=== ./src/database/repository.py === +""" +Database repository for hadith data operations +""" +from typing import List, Dict, Any, Optional +from uuid import UUID +import structlog +from sqlalchemy import create_engine, text, select, insert, update +from sqlalchemy.orm import sessionmaker, Session +from sqlalchemy.exc import IntegrityError +from config.settings import settings + +logger = structlog.get_logger() + + +class HadithRepository: + """Repository for hadith database operations""" + + def __init__(self, database_url: Optional[str] = None): + self.database_url = database_url or settings.DATABASE_URL + self.engine = create_engine(self.database_url, pool_pre_ping=True) + self.SessionLocal = sessionmaker(bind=self.engine) + + def get_session(self) -> Session: + """Get database session""" + return self.SessionLocal() + + # ===== Collections ===== + + def get_collection_by_abbreviation(self, abbr: str) -> Optional[Dict[str, Any]]: + """Get collection by abbreviation""" + with self.get_session() as session: + query = text(""" + SELECT * FROM collections + WHERE abbreviation = :abbr + """) + result = session.execute(query, {"abbr": abbr}).fetchone() + + if result: + return dict(result._mapping) + return None + + def get_all_collections(self) -> List[Dict[str, Any]]: + """Get all collections""" + with self.get_session() as session: + query = text("SELECT * FROM collections ORDER BY name_english") + result = session.execute(query).fetchall() + return [dict(row._mapping) for row in result] + + def update_collection_count(self, collection_id: UUID, count: int): + """Update total hadith count for a collection""" + with self.get_session() as session: + query = text(""" + UPDATE collections + SET total_hadiths = :count, updated_at = NOW() + WHERE id = :id + """) + session.execute(query, {"id": str(collection_id), "count": count}) + session.commit() + + # ===== Books ===== + + def upsert_book( + self, + collection_id: UUID, + book_number: int, + name_english: Optional[str] = None, + name_arabic: Optional[str] = None, + metadata: Optional[Dict] = None + ) -> UUID: + """Insert or update a book""" + with self.get_session() as session: + query = text(""" + INSERT INTO books (collection_id, book_number, name_english, name_arabic, metadata) + VALUES (:collection_id, :book_number, :name_english, :name_arabic, :metadata) + ON CONFLICT (collection_id, book_number) + DO UPDATE SET + name_english = EXCLUDED.name_english, + name_arabic = EXCLUDED.name_arabic, + metadata = EXCLUDED.metadata + RETURNING id + """) + + result = session.execute(query, { + "collection_id": str(collection_id), + "book_number": book_number, + "name_english": name_english, + "name_arabic": name_arabic, + "metadata": metadata or {} + }) + session.commit() + + return UUID(result.fetchone()[0]) + + def get_book(self, collection_id: UUID, book_number: int) -> Optional[Dict[str, Any]]: + """Get book by collection and book number""" + with self.get_session() as session: + query = text(""" + SELECT * FROM books + WHERE collection_id = 
:collection_id AND book_number = :book_number + """) + result = session.execute(query, { + "collection_id": str(collection_id), + "book_number": book_number + }).fetchone() + + if result: + return dict(result._mapping) + return None + + # ===== Hadiths ===== + + def upsert_hadith( + self, + collection_id: UUID, + hadith_number: int, + arabic_text: str, + book_id: Optional[UUID] = None, + english_text: Optional[str] = None, + urdu_text: Optional[str] = None, + grade: Optional[str] = None, + grade_source: Optional[str] = None, + chapter_name: Optional[str] = None, + source_id: Optional[str] = None, + source_url: Optional[str] = None, + source_metadata: Optional[Dict] = None + ) -> UUID: + """Insert or update a hadith""" + with self.get_session() as session: + query = text(""" + INSERT INTO hadiths ( + collection_id, book_id, hadith_number, + arabic_text, english_text, urdu_text, + grade, grade_source, chapter_name, + source_id, source_url, source_metadata + ) + VALUES ( + :collection_id, :book_id, :hadith_number, + :arabic_text, :english_text, :urdu_text, + :grade, :grade_source, :chapter_name, + :source_id, :source_url, :source_metadata + ) + ON CONFLICT (collection_id, book_id, hadith_number) + DO UPDATE SET + arabic_text = EXCLUDED.arabic_text, + english_text = EXCLUDED.english_text, + urdu_text = EXCLUDED.urdu_text, + grade = EXCLUDED.grade, + grade_source = EXCLUDED.grade_source, + chapter_name = EXCLUDED.chapter_name, + source_url = EXCLUDED.source_url, + source_metadata = EXCLUDED.source_metadata, + updated_at = NOW() + RETURNING id + """) + + result = session.execute(query, { + "collection_id": str(collection_id), + "book_id": str(book_id) if book_id else None, + "hadith_number": hadith_number, + "arabic_text": arabic_text, + "english_text": english_text, + "urdu_text": urdu_text, + "grade": grade, + "grade_source": grade_source, + "chapter_name": chapter_name, + "source_id": source_id, + "source_url": source_url, + "source_metadata": source_metadata or {} + }) + session.commit() + + return UUID(result.fetchone()[0]) + + def get_hadiths_without_embeddings( + self, + limit: int = 100, + collection_id: Optional[UUID] = None + ) -> List[Dict[str, Any]]: + """Get hadiths that need embedding generation""" + with self.get_session() as session: + if collection_id: + query = text(""" + SELECT * FROM hadiths + WHERE embedding_generated = FALSE + AND collection_id = :collection_id + ORDER BY created_at ASC + LIMIT :limit + """) + result = session.execute(query, { + "collection_id": str(collection_id), + "limit": limit + }).fetchall() + else: + query = text(""" + SELECT * FROM hadiths + WHERE embedding_generated = FALSE + ORDER BY created_at ASC + LIMIT :limit + """) + result = session.execute(query, {"limit": limit}).fetchall() + + return [dict(row._mapping) for row in result] + + def mark_embedding_generated(self, hadith_id: UUID, version: str = "v1"): + """Mark hadith as having embedding generated""" + with self.get_session() as session: + query = text(""" + UPDATE hadiths + SET embedding_generated = TRUE, + embedding_version = :version, + updated_at = NOW() + WHERE id = :id + """) + session.execute(query, {"id": str(hadith_id), "version": version}) + session.commit() + + # ===== Ingestion Jobs ===== + + def create_ingestion_job( + self, + job_name: str, + job_type: str, + source_name: str, + config: Optional[Dict] = None + ) -> UUID: + """Create a new ingestion job""" + with self.get_session() as session: + query = text(""" + INSERT INTO ingestion_jobs (job_name, job_type, source_name, 
config, status, started_at) + VALUES (:job_name, :job_type, :source_name, :config, 'running', NOW()) + RETURNING id + """) + result = session.execute(query, { + "job_name": job_name, + "job_type": job_type, + "source_name": source_name, + "config": config or {} + }) + session.commit() + + return UUID(result.fetchone()[0]) + + def update_job_progress( + self, + job_id: UUID, + total: Optional[int] = None, + processed: Optional[int] = None, + failed: Optional[int] = None, + skipped: Optional[int] = None + ): + """Update job progress counters""" + with self.get_session() as session: + updates = [] + params = {"job_id": str(job_id)} + + if total is not None: + updates.append("total_records = :total") + params["total"] = total + if processed is not None: + updates.append("processed_records = :processed") + params["processed"] = processed + if failed is not None: + updates.append("failed_records = :failed") + params["failed"] = failed + if skipped is not None: + updates.append("skipped_records = :skipped") + params["skipped"] = skipped + + if updates: + query_str = f""" + UPDATE ingestion_jobs + SET {', '.join(updates)} + WHERE id = :job_id + """ + session.execute(text(query_str), params) + session.commit() + + def complete_job( + self, + job_id: UUID, + status: str = "success", + error_message: Optional[str] = None + ): + """Mark job as completed""" + with self.get_session() as session: + query = text(""" + UPDATE ingestion_jobs + SET status = :status, + completed_at = NOW(), + duration_seconds = EXTRACT(EPOCH FROM (NOW() - started_at)), + error_message = :error_message + WHERE id = :job_id + """) + session.execute(query, { + "job_id": str(job_id), + "status": status, + "error_message": error_message + }) + session.commit() + + def add_processing_log( + self, + job_id: UUID, + level: str, + message: str, + details: Optional[Dict] = None + ): + """Add a processing log entry""" + with self.get_session() as session: + query = text(""" + INSERT INTO processing_logs (job_id, log_level, message, details) + VALUES (:job_id, :level, :message, :details) + """) + session.execute(query, { + "job_id": str(job_id), + "level": level, + "message": message, + "details": details or {} + }) + session.commit() + + # ===== Statistics ===== + + def get_collection_stats(self, collection_id: UUID) -> Dict[str, Any]: + """Get statistics for a collection""" + with self.get_session() as session: + query = text(""" + SELECT * FROM get_collection_statistics(:collection_id) + """) + result = session.execute(query, {"collection_id": str(collection_id)}).fetchone() + + if result: + return dict(result._mapping) + return {} +=== ./src/embeddings/__init__.py === + +=== ./src/embeddings/generator.py === + +=== ./src/main_hadithapi.py === +""" +Main ingestion script for fetching hadiths from HadithAPI.com +""" +import sys +import argparse +from typing import Optional, Dict, Any +from uuid import UUID +import structlog +from config.settings import settings +from api_clients.hadithapi_client import HadithAPIClient +from database.repository import HadithRepository +from processors.text_cleaner import ArabicTextProcessor, TextCleaner + +# Configure structured logging +structlog.configure( + processors=[ + structlog.stdlib.filter_by_level, + structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.JSONRenderer() + ], + wrapper_class=structlog.stdlib.BoundLogger, + 
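    # JSONRenderer emits each log entry as a single JSON object per line,
    # which is the "structured JSON logs" format the README relies on for
    # debugging via `argo logs` and log aggregation.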
context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, +) + +logger = structlog.get_logger() + + +# Book slug to collection abbreviation mapping +BOOK_SLUG_MAPPING = { + 'sahih-bukhari': 'bukhari', + 'sahih-muslim': 'muslim', + 'sunan-abu-dawood': 'abudawud', + 'jami-at-tirmidhi': 'tirmidhi', + 'sunan-an-nasai': 'nasai', + 'sunan-ibn-e-majah': 'ibnmajah', + 'muwatta-imam-malik': 'malik', + 'musnad-ahmad': 'ahmad', + 'sunan-ad-darimi': 'darimi', + 'mishkat-al-masabih': 'mishkat' +} + + +class HadithAPIIngestionService: + """Service for ingesting hadiths from HadithAPI.com""" + + def __init__(self): + self.api_client = HadithAPIClient() + self.repo = HadithRepository() + self.text_processor = ArabicTextProcessor() + self.text_cleaner = TextCleaner() + + def sync_books_from_api(self) -> Dict[str, Any]: + """ + Sync book metadata from API to database + + Returns: + Dictionary mapping book_slug -> collection_id + """ + logger.info("syncing_books_from_api") + + # Get books from API + api_books = self.api_client.get_books() + + book_mapping = {} + + for api_book in api_books: + book_slug = api_book.get('bookSlug') + + # Map to our collection abbreviation + collection_abbr = BOOK_SLUG_MAPPING.get(book_slug) + + if not collection_abbr: + logger.warning( + "unmapped_book", + book_slug=book_slug, + book_name=api_book.get('bookName') + ) + continue + + # Get or verify collection exists in database + collection = self.repo.get_collection_by_abbreviation(collection_abbr) + + if not collection: + logger.warning( + "collection_not_in_db", + abbreviation=collection_abbr, + book_slug=book_slug + ) + continue + + collection_id = UUID(collection['id']) + book_mapping[book_slug] = { + 'collection_id': collection_id, + 'book_id': api_book.get('id'), + 'book_name': api_book.get('bookName'), + 'hadiths_count': api_book.get('hadiths_count'), + 'chapters_count': api_book.get('chapters_count') + } + + logger.info( + "book_mapped", + book_slug=book_slug, + collection_abbr=collection_abbr, + hadiths_count=api_book.get('hadiths_count') + ) + + logger.info( + "books_synced", + total_books=len(book_mapping) + ) + + return book_mapping + + def ingest_collection( + self, + book_slug: str, + limit: Optional[int] = None + ) -> dict: + """ + Ingest entire collection from HadithAPI.com + + Args: + book_slug: Book slug identifier (e.g., 'sahih-bukhari') + limit: Optional limit on number of hadiths to ingest + + Returns: + Statistics dictionary + """ + logger.info( + "ingestion_started", + book_slug=book_slug, + limit=limit + ) + + # Get book mapping + book_mapping = self.sync_books_from_api() + + if book_slug not in book_mapping: + logger.error( + "book_not_mapped", + book_slug=book_slug, + available_books=list(book_mapping.keys()) + ) + raise ValueError(f"Book '{book_slug}' not found or not mapped") + + book_info = book_mapping[book_slug] + collection_id = book_info['collection_id'] + book_id = book_info['book_id'] + + # Create ingestion job + job_id = self.repo.create_ingestion_job( + job_name=f"ingest_{book_slug}", + job_type="api_fetch", + source_name="hadithapi.com", + config={ + "book_slug": book_slug, + "book_id": book_id, + "limit": limit + } + ) + + logger.info( + "job_created", + job_id=str(job_id), + book_slug=book_slug, + expected_count=book_info.get('hadiths_count') + ) + + stats = { + "processed": 0, + "failed": 0, + "skipped": 0 + } + + try: + # Iterate through all hadiths in book + for hadith_data, chapter_data in 
self.api_client.iter_all_hadiths_in_book_with_chapters( + book_id=book_id, + book_slug=book_slug, + batch_size=100 + ): + # Check limit + if limit and stats["processed"] >= limit: + logger.info("limit_reached", limit=limit) + break + + try: + # Process and store hadith + self._process_and_store_hadith( + collection_id=collection_id, + hadith_data=hadith_data, + chapter_data=chapter_data + ) + + stats["processed"] += 1 + + # Update job progress every 100 hadiths + if stats["processed"] % 100 == 0: + self.repo.update_job_progress( + job_id=job_id, + processed=stats["processed"], + failed=stats["failed"], + skipped=stats["skipped"] + ) + + logger.info( + "progress_update", + book_slug=book_slug, + processed=stats["processed"], + failed=stats["failed"], + percentage=round( + (stats["processed"] / int(book_info.get('hadiths_count', 1))) * 100, + 2 + ) if book_info.get('hadiths_count') else 0 + ) + + except Exception as e: + stats["failed"] += 1 + logger.error( + "hadith_processing_failed", + error=str(e), + hadith_number=hadith_data.get("hadithNumber"), + hadith_id=hadith_data.get("id") + ) + + self.repo.add_processing_log( + job_id=job_id, + level="ERROR", + message=f"Failed to process hadith: {str(e)}", + details={"hadith_data": hadith_data} + ) + + # Update final job progress + self.repo.update_job_progress( + job_id=job_id, + total=stats["processed"] + stats["failed"] + stats["skipped"], + processed=stats["processed"], + failed=stats["failed"], + skipped=stats["skipped"] + ) + + # Mark job as complete + self.repo.complete_job(job_id=job_id, status="success") + + # Update collection count + self.repo.update_collection_count( + collection_id=collection_id, + count=stats["processed"] + ) + + logger.info( + "ingestion_completed", + book_slug=book_slug, + stats=stats + ) + + return stats + + except Exception as e: + logger.error( + "ingestion_failed", + book_slug=book_slug, + error=str(e), + exc_info=True + ) + + self.repo.complete_job( + job_id=job_id, + status="failed", + error_message=str(e) + ) + + raise + + def _process_and_store_hadith( + self, + collection_id: UUID, + hadith_data: dict, + chapter_data: Optional[dict] + ): + """Process and store a single hadith""" + + # Extract hadith number + hadith_number = hadith_data.get("hadithNumber") + if not hadith_number: + raise ValueError("Missing hadith number") + + # Convert to integer + try: + hadith_number = int(hadith_number) + except (ValueError, TypeError): + raise ValueError(f"Invalid hadith number: {hadith_number}") + + # Extract text in multiple languages + arabic_text = hadith_data.get("hadithArabic") + english_text = hadith_data.get("hadithEnglish") + urdu_text = hadith_data.get("hadithUrdu") + + if not arabic_text: + raise ValueError("Missing Arabic text") + + # Clean texts + arabic_text = self.text_cleaner.clean_text(arabic_text) + if english_text: + english_text = self.text_cleaner.clean_text(english_text) + if urdu_text: + urdu_text = self.text_cleaner.clean_text(urdu_text) + + # Extract grade/status + grade = hadith_data.get("status") + + # Get or create chapter (book in our schema) + book_id = None + chapter_name = None + + if chapter_data: + chapter_id = chapter_data.get('id') + chapter_number = chapter_data.get('chapterNumber') + chapter_name_en = chapter_data.get('chapterEnglish') + chapter_name_ar = chapter_data.get('chapterArabic') + chapter_name = chapter_name_en + + if chapter_number: + try: + chapter_number = int(chapter_number) + except (ValueError, TypeError): + chapter_number = chapter_id # Fallback to ID + + # Get 
or create book (chapter in HadithAPI = book in our schema) + existing_book = self.repo.get_book(collection_id, chapter_number) + + if not existing_book: + book_id = self.repo.upsert_book( + collection_id=collection_id, + book_number=chapter_number, + name_english=chapter_name_en, + name_arabic=chapter_name_ar, + metadata=chapter_data + ) + else: + book_id = UUID(existing_book['id']) + + # Build source metadata + source_metadata = { + 'api_id': hadith_data.get('id'), + 'englishNarrator': hadith_data.get('englishNarrator'), + 'urduNarrator': hadith_data.get('urduNarrator'), + 'bookSlug': hadith_data.get('bookSlug'), + 'chapterId': hadith_data.get('chapterId'), + 'chapter': chapter_data + } + + # Store hadith + hadith_id = self.repo.upsert_hadith( + collection_id=collection_id, + book_id=book_id, + hadith_number=hadith_number, + arabic_text=arabic_text, + english_text=english_text, + urdu_text=urdu_text, + grade=grade, + grade_source="hadithapi.com", + chapter_name=chapter_name, + source_id=str(hadith_data.get('id', '')), + source_url=f"https://hadithapi.com/hadith/{hadith_data.get('id')}", + source_metadata=source_metadata + ) + + logger.debug( + "hadith_stored", + hadith_id=str(hadith_id), + hadith_number=hadith_number, + chapter_id=chapter_data.get('id') if chapter_data else None + ) + + def ingest_all_books(self, limit_per_book: Optional[int] = None) -> Dict[str, dict]: + """ + Ingest all available books + + Args: + limit_per_book: Optional limit per book + + Returns: + Dictionary of book_slug -> stats + """ + logger.info("ingesting_all_books", limit_per_book=limit_per_book) + + book_mapping = self.sync_books_from_api() + results = {} + + for book_slug in book_mapping.keys(): + logger.info("starting_book", book_slug=book_slug) + + try: + stats = self.ingest_collection( + book_slug=book_slug, + limit=limit_per_book + ) + results[book_slug] = {"status": "success", "stats": stats} + + except Exception as e: + logger.error( + "book_ingestion_failed", + book_slug=book_slug, + error=str(e) + ) + results[book_slug] = {"status": "failed", "error": str(e)} + + logger.info("all_books_completed", results=results) + return results + + def close(self): + """Close connections""" + self.api_client.close() + + +def main(): + """Main entry point""" + parser = argparse.ArgumentParser( + description="Ingest hadiths from HadithAPI.com" + ) + parser.add_argument( + "--book-slug", + help="Book slug (e.g., sahih-bukhari). If not provided, ingests all books." 
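        # e.g. --book-slug sahih-bukhari; valid slugs are the keys of
        # BOOK_SLUG_MAPPING above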
+ ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of hadiths to ingest per book" + ) + parser.add_argument( + "--list-books", + action="store_true", + help="List available books and exit" + ) + + args = parser.parse_args() + + try: + service = HadithAPIIngestionService() + + # List books mode + if args.list_books: + logger.info("listing_available_books") + book_mapping = service.sync_books_from_api() + + print("\n=== Available Books ===\n") + for book_slug, info in book_mapping.items(): + print(f"Book Slug: {book_slug}") + print(f" Name: {info['book_name']}") + print(f" Hadiths: {info['hadiths_count']}") + print(f" Chapters: {info['chapters_count']}") + print() + + service.close() + return 0 + + # Ingest mode + if args.book_slug: + logger.info( + "script_started", + book_slug=args.book_slug, + limit=args.limit + ) + + stats = service.ingest_collection( + book_slug=args.book_slug, + limit=args.limit + ) + + logger.info("script_completed", stats=stats) + + print(f"\n=== Ingestion completed for {args.book_slug} ===") + print(f"Processed: {stats['processed']}") + print(f"Failed: {stats['failed']}") + print(f"Skipped: {stats['skipped']}") + + else: + # Ingest all books + logger.info("script_started_all_books", limit_per_book=args.limit) + + results = service.ingest_all_books(limit_per_book=args.limit) + + print("\n=== All Books Ingestion Summary ===\n") + for book_slug, result in results.items(): + print(f"{book_slug}: {result['status']}") + if result['status'] == 'success': + stats = result['stats'] + print(f" Processed: {stats['processed']}") + print(f" Failed: {stats['failed']}") + else: + print(f" Error: {result['error']}") + print() + + service.close() + return 0 + + except Exception as e: + logger.error( + "script_failed", + error=str(e), + exc_info=True + ) + + print(f"\nIngestion failed: {str(e)}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) +=== ./src/__init__.py === + +=== ./src/utils/__init__.py === + +=== ./src/utils/logger.py === + +=== ./src/utils/retry.py === + +=== ./src/processors/validator.py === + +=== ./src/processors/arabic_normalizer.py === + +=== ./src/processors/__init__.py === + +=== ./src/processors/text_cleaner.py === +""" +Text cleaning and normalization utilities +""" +import re +from typing import Optional +import unicodedata +import structlog + +logger = structlog.get_logger() + + +class ArabicTextProcessor: + """Process and normalize Arabic text""" + + # Arabic diacritics to remove + DIACRITICS = re.compile( + r'[\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]' + ) + + # Tatweel (elongation character) + TATWEEL = '\u0640' + + # Normalize Arabic letters + ALEF_VARIANTS = re.compile(r'[إأآا]') + ALEF_MAKSURA = 'ى' + YAA = 'ي' + TAA_MARBUTA = 'ة' + HAA = 'ه' + + @classmethod + def remove_diacritics(cls, text: str) -> str: + """Remove Arabic diacritics (tashkeel)""" + if not text: + return text + return cls.DIACRITICS.sub('', text) + + @classmethod + def remove_tatweel(cls, text: str) -> str: + """Remove tatweel (elongation) character""" + if not text: + return text + return text.replace(cls.TATWEEL, '') + + @classmethod + def normalize_alef(cls, text: str) -> str: + """Normalize all Alef variants to bare Alef""" + if not text: + return text + return cls.ALEF_VARIANTS.sub('ا', text) + + @classmethod + def normalize_yaa(cls, text: str) -> str: + """Normalize Alef Maksura to Yaa""" + if not text: + return text + return text.replace(cls.ALEF_MAKSURA, cls.YAA) + + @classmethod + def 
normalize_taa_marbuta(cls, text: str) -> str: + """Normalize Taa Marbuta to Haa""" + if not text: + return text + return text.replace(cls.TAA_MARBUTA, cls.HAA) + + @classmethod + def normalize_whitespace(cls, text: str) -> str: + """Normalize whitespace""" + if not text: + return text + # Replace multiple spaces with single space + text = re.sub(r'\s+', ' ', text) + # Trim + return text.strip() + + @classmethod + def normalize_full(cls, text: str) -> str: + """ + Apply full normalization: + - Remove diacritics + - Remove tatweel + - Normalize Alef variants + - Normalize Yaa + - Normalize Taa Marbuta + - Normalize whitespace + """ + if not text: + return text + + text = cls.remove_diacritics(text) + text = cls.remove_tatweel(text) + text = cls.normalize_alef(text) + text = cls.normalize_yaa(text) + text = cls.normalize_taa_marbuta(text) + text = cls.normalize_whitespace(text) + + return text + + @classmethod + def extract_sanad_matn(cls, text: str) -> tuple[Optional[str], Optional[str]]: + """ + Attempt to extract sanad (chain) and matn (text) from hadith + + Common patterns: + - حدثنا ... قال ... (sanad ends before reported speech) + - Simple heuristic: Split on first occurrence of قال or أن + + Returns: + Tuple of (sanad, matn) or (None, None) if cannot split + """ + if not text: + return None, None + + # Look for common sanad-matn separators + separators = [ + r'قال\s*رسول\s*الله', # "The Messenger of Allah said" + r'قال\s*النبي', # "The Prophet said" + r'عن\s*النبي', # "From the Prophet" + r'أن\s*رسول\s*الله', # "That the Messenger of Allah" + ] + + for pattern in separators: + match = re.search(pattern, text, re.IGNORECASE) + if match: + split_pos = match.start() + sanad = text[:split_pos].strip() + matn = text[split_pos:].strip() + + logger.debug( + "sanad_matn_extracted", + sanad_length=len(sanad), + matn_length=len(matn) + ) + + return sanad, matn + + # Could not split + logger.debug("sanad_matn_extraction_failed") + return None, None + + +class TextCleaner: + """General text cleaning utilities""" + + @staticmethod + def clean_html(text: str) -> str: + """Remove HTML tags""" + if not text: + return text + return re.sub(r'<[^>]+>', '', text) + + @staticmethod + def normalize_unicode(text: str) -> str: + """Normalize Unicode (NFC normalization)""" + if not text: + return text + return unicodedata.normalize('NFC', text) + + @staticmethod + def clean_text(text: str) -> str: + """Apply general cleaning""" + if not text: + return text + + # Remove HTML + text = TextCleaner.clean_html(text) + + # Normalize Unicode + text = TextCleaner.normalize_unicode(text) + + # Normalize whitespace + text = ArabicTextProcessor.normalize_whitespace(text) + + return text +=== ./src/api_clients/base_client.py === +""" +Base API client with retry logic and rate limiting +""" +import httpx +import time +from typing import Optional, Dict, Any +from tenacity import ( + retry, + stop_after_attempt, + wait_exponential, + retry_if_exception_type +) +import structlog +from config.settings import settings + +logger = structlog.get_logger() + + +class BaseAPIClient: + """Base class for API clients with built-in retry and rate limiting""" + + def __init__( + self, + base_url: str, + api_key: Optional[str] = None, + rate_limit: int = 90, + timeout: int = 30 + ): + self.base_url = base_url.rstrip('/') + self.api_key = api_key + self.rate_limit = rate_limit + self.timeout = timeout + + # Rate limiting + self.request_times = [] + self.min_interval = 60.0 / rate_limit # seconds between requests + + # HTTP client + 
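        # Throttling is a sliding 60-second window: _wait_for_rate_limit keeps
        # the timestamps of the last minute's requests and sleeps once
        # `rate_limit` of them accumulate (min_interval above is informational
        # only; the window check is what actually throttles).
        # A single shared httpx.Client reuses connections across requests.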
self.client = httpx.Client(timeout=timeout) + + logger.info( + "api_client_initialized", + base_url=base_url, + rate_limit=rate_limit + ) + + def _wait_for_rate_limit(self): + """Implement rate limiting""" + now = time.time() + + # Remove old timestamps (older than 1 minute) + self.request_times = [t for t in self.request_times if now - t < 60] + + # If we're at the limit, wait + if len(self.request_times) >= self.rate_limit: + sleep_time = 60 - (now - self.request_times[0]) + if sleep_time > 0: + logger.info( + "rate_limit_wait", + sleep_seconds=sleep_time, + requests_in_window=len(self.request_times) + ) + time.sleep(sleep_time) + self.request_times = [] + + # Add current timestamp + self.request_times.append(time.time()) + + @retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=2, max=10), + retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)), + reraise=True + ) + def _make_request( + self, + method: str, + endpoint: str, + params: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None + ) -> Dict[str, Any]: + """Make HTTP request with retry logic""" + + # Rate limiting + self._wait_for_rate_limit() + + # Prepare headers + request_headers = headers or {} + if self.api_key: + request_headers['X-API-Key'] = self.api_key + + # Make request + url = f"{self.base_url}/{endpoint.lstrip('/')}" + + logger.debug( + "api_request", + method=method, + url=url, + params=params + ) + + response = self.client.request( + method=method, + url=url, + params=params, + headers=request_headers + ) + + response.raise_for_status() + + logger.debug( + "api_response", + status_code=response.status_code, + response_size=len(response.content) + ) + + return response.json() + + def get(self, endpoint: str, params: Optional[Dict] = None) -> Dict[str, Any]: + """Make GET request""" + return self._make_request("GET", endpoint, params=params) + + def close(self): + """Close the HTTP client""" + self.client.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() +=== ./src/api_clients/hadith_one_client.py === + +=== ./src/api_clients/__init__.py === + +=== ./src/api_clients/hadithapi_client.py === +""" +Client for HadithAPI.com API +""" +from typing import List, Dict, Any, Optional, Generator +import structlog +from .base_client import BaseAPIClient +from config.settings import settings + +logger = structlog.get_logger() + + +class HadithAPIClient(BaseAPIClient): + """Client for interacting with hadithapi.com API""" + + def __init__(self, api_key: Optional[str] = None): + super().__init__( + base_url="https://hadithapi.com/api", + api_key=api_key or settings.HADITHAPI_KEY, + rate_limit=30 # Conservative: 30 req/min + ) + + def _add_api_key(self, params: Optional[Dict] = None) -> Dict: + """Add API key to request parameters""" + params = params or {} + params['apiKey'] = self.api_key + return params + + def get_books(self) -> List[Dict[str, Any]]: + """ + Get list of all available books/collections + + Returns: + List of book dictionaries + """ + logger.info("fetching_books") + + params = self._add_api_key() + response = self.get("books", params=params) + + if response.get('status') != 200: + logger.error( + "api_error", + status=response.get('status'), + message=response.get('message') + ) + raise Exception(f"API Error: {response.get('message')}") + + books = response.get('data', []) + + logger.info( + "books_fetched", + count=len(books) + ) + + return books + + def get_chapters(self, book_slug: 
str) -> List[Dict[str, Any]]: + """ + Get chapters for a specific book + + Args: + book_slug: Book slug identifier (e.g., 'sahih-bukhari') + + Returns: + List of chapter dictionaries + """ + logger.info( + "fetching_chapters", + book_slug=book_slug + ) + + params = self._add_api_key() + response = self.get(f"{book_slug}/chapters", params=params) + + if response.get('status') != 200: + logger.error( + "api_error", + status=response.get('status'), + message=response.get('message') + ) + raise Exception(f"API Error: {response.get('message')}") + + chapters = response.get('data', []) + + logger.info( + "chapters_fetched", + book_slug=book_slug, + count=len(chapters) + ) + + return chapters + + def get_hadiths_page( + self, + book_id: int, + chapter_id: Optional[int] = None, + page: int = 1, + limit: int = 100 + ) -> Dict[str, Any]: + """ + Get a page of hadiths + + Args: + book_id: Book ID + chapter_id: Optional chapter ID to filter by + page: Page number (1-indexed) + limit: Results per page (max 100) + + Returns: + Response dictionary with hadiths and pagination info + """ + params = self._add_api_key({ + 'book': book_id, + 'page': page, + 'limit': min(limit, 100) # Enforce max limit + }) + + if chapter_id: + params['chapter'] = chapter_id + + logger.debug( + "fetching_hadiths_page", + book_id=book_id, + chapter_id=chapter_id, + page=page, + limit=limit + ) + + response = self.get("hadiths", params=params) + + if response.get('status') != 200: + logger.error( + "api_error", + status=response.get('status'), + message=response.get('message') + ) + raise Exception(f"API Error: {response.get('message')}") + + return response.get('data', {}) + + def iter_all_hadiths_in_book( + self, + book_id: int, + book_slug: str, + chapter_id: Optional[int] = None, + batch_size: int = 100 + ) -> Generator[Dict[str, Any], None, None]: + """ + Iterator that yields all hadiths in a book, handling pagination automatically + + Args: + book_id: Book ID + book_slug: Book slug for logging + chapter_id: Optional chapter ID to filter by + batch_size: Number of hadiths to fetch per request (max 100) + + Yields: + Individual hadith dictionaries + """ + page = 1 + total_fetched = 0 + + while True: + response_data = self.get_hadiths_page( + book_id=book_id, + chapter_id=chapter_id, + page=page, + limit=batch_size + ) + + hadiths = response_data.get('hadiths', []) + pagination = response_data.get('pagination', {}) + + if not hadiths: + logger.info( + "book_complete", + book_slug=book_slug, + chapter_id=chapter_id, + total_hadiths=total_fetched + ) + break + + for hadith in hadiths: + yield hadith + total_fetched += 1 + + # Log progress + if total_fetched % 500 == 0: + logger.info( + "progress", + book_slug=book_slug, + fetched=total_fetched, + total=pagination.get('total', '?') + ) + + # Check if there are more pages + current_page = pagination.get('current_page', page) + last_page = pagination.get('last_page', 1) + + if current_page >= last_page: + logger.info( + "book_complete", + book_slug=book_slug, + total_hadiths=total_fetched, + total_pages=last_page + ) + break + + page += 1 + + def iter_all_hadiths_in_book_with_chapters( + self, + book_id: int, + book_slug: str, + batch_size: int = 100 + ) -> Generator[tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]: + """ + Iterator that yields all hadiths in a book, organized by chapter + + Args: + book_id: Book ID + book_slug: Book slug + batch_size: Number of hadiths to fetch per request + + Yields: + Tuple of (hadith_dict, chapter_dict or None) + """ + # First, get 
all chapters + try: + chapters = self.get_chapters(book_slug) + except Exception as e: + logger.warning( + "chapters_fetch_failed", + book_slug=book_slug, + error=str(e), + fallback="fetching_all_hadiths_without_chapter_filter" + ) + # Fallback: fetch all hadiths without chapter filter + for hadith in self.iter_all_hadiths_in_book( + book_id=book_id, + book_slug=book_slug, + batch_size=batch_size + ): + chapter_info = hadith.get('chapter') + yield hadith, chapter_info + return + + logger.info( + "starting_chapter_by_chapter_fetch", + book_slug=book_slug, + total_chapters=len(chapters) + ) + + # Process each chapter + for chapter in chapters: + chapter_id = chapter.get('id') + chapter_number = chapter.get('chapterNumber') + + logger.info( + "fetching_chapter", + book_slug=book_slug, + chapter_id=chapter_id, + chapter_number=chapter_number + ) + + try: + for hadith in self.iter_all_hadiths_in_book( + book_id=book_id, + book_slug=book_slug, + chapter_id=chapter_id, + batch_size=batch_size + ): + yield hadith, chapter + except Exception as e: + logger.error( + "chapter_fetch_failed", + book_slug=book_slug, + chapter_id=chapter_id, + error=str(e) + ) + continue + + def get_book_by_slug(self, book_slug: str) -> Optional[Dict[str, Any]]: + """ + Get book details by slug + + Args: + book_slug: Book slug identifier + + Returns: + Book dictionary or None if not found + """ + books = self.get_books() + for book in books: + if book.get('bookSlug') == book_slug: + return book + return None +=== ./src/api_clients/sunnah_client.py === +""" +Client for Sunnah.com API +""" +from typing import List, Dict, Any, Optional, Generator +import structlog +from .base_client import BaseAPIClient +from config.settings import settings + +logger = structlog.get_logger() + + +class SunnahAPIClient(BaseAPIClient): + """Client for interacting with Sunnah.com API""" + + def __init__(self, api_key: Optional[str] = None): + super().__init__( + base_url=settings.SUNNAH_BASE_URL, + api_key=api_key or settings.SUNNAH_API_KEY, + rate_limit=settings.API_RATE_LIMIT + ) + + def get_collections(self) -> List[Dict[str, Any]]: + """ + Get list of all hadith collections + + Returns: + List of collection dictionaries + """ + logger.info("fetching_collections") + + response = self.get("collections") + collections = response.get("data", []) + + logger.info( + "collections_fetched", + count=len(collections) + ) + + return collections + + def get_collection_details(self, collection_name: str) -> Dict[str, Any]: + """ + Get details for a specific collection + + Args: + collection_name: Collection abbreviation (e.g., 'bukhari') + + Returns: + Collection details dictionary + """ + logger.info( + "fetching_collection_details", + collection=collection_name + ) + + response = self.get(f"collections/{collection_name}") + + return response + + def get_books(self, collection_name: str) -> List[Dict[str, Any]]: + """ + Get all books in a collection + + Args: + collection_name: Collection abbreviation + + Returns: + List of book dictionaries + """ + logger.info( + "fetching_books", + collection=collection_name + ) + + response = self.get(f"collections/{collection_name}/books") + books = response.get("data", []) + + logger.info( + "books_fetched", + collection=collection_name, + count=len(books) + ) + + return books + + def get_hadiths_in_book( + self, + collection_name: str, + book_number: int, + limit: int = 50, + page: int = 1 + ) -> Dict[str, Any]: + """ + Get hadiths in a specific book with pagination + + Args: + collection_name: Collection 
abbreviation + book_number: Book number + limit: Number of hadiths per page + page: Page number + + Returns: + Response with hadiths and pagination info + """ + logger.debug( + "fetching_hadiths", + collection=collection_name, + book=book_number, + page=page, + limit=limit + ) + + response = self.get( + f"collections/{collection_name}/books/{book_number}/hadiths", + params={"limit": limit, "page": page} + ) + + return response + + def iter_all_hadiths_in_book( + self, + collection_name: str, + book_number: int, + batch_size: int = 50 + ) -> Generator[Dict[str, Any], None, None]: + """ + Iterator that yields all hadiths in a book, handling pagination automatically + + Args: + collection_name: Collection abbreviation + book_number: Book number + batch_size: Number of hadiths to fetch per request + + Yields: + Individual hadith dictionaries + """ + page = 1 + total_fetched = 0 + + while True: + response = self.get_hadiths_in_book( + collection_name=collection_name, + book_number=book_number, + limit=batch_size, + page=page + ) + + hadiths = response.get("data", []) + + if not hadiths: + logger.info( + "book_complete", + collection=collection_name, + book=book_number, + total_hadiths=total_fetched + ) + break + + for hadith in hadiths: + yield hadith + total_fetched += 1 + + # Check if there are more pages + pagination = response.get("pagination", {}) + if page >= pagination.get("total_pages", 1): + break + + page += 1 + + def iter_all_hadiths_in_collection( + self, + collection_name: str, + batch_size: int = 50 + ) -> Generator[tuple[Dict[str, Any], int], None, None]: + """ + Iterator that yields all hadiths in a collection + + Args: + collection_name: Collection abbreviation + batch_size: Number of hadiths to fetch per request + + Yields: + Tuple of (hadith_dict, book_number) + """ + # First, get all books in the collection + books = self.get_books(collection_name) + + logger.info( + "starting_collection_fetch", + collection=collection_name, + total_books=len(books) + ) + + for book in books: + book_number = book.get("bookNumber") + + if not book_number: + logger.warning( + "book_missing_number", + book=book + ) + continue + + logger.info( + "fetching_book", + collection=collection_name, + book=book_number + ) + + try: + for hadith in self.iter_all_hadiths_in_book( + collection_name=collection_name, + book_number=int(book_number), + batch_size=batch_size + ): + yield hadith, int(book_number) + except Exception as e: + logger.error( + "book_fetch_failed", + collection=collection_name, + book=book_number, + error=str(e) + ) + continue + + def get_specific_hadith( + self, + collection_name: str, + book_number: int, + hadith_number: int + ) -> Dict[str, Any]: + """ + Get a specific hadith by its number + + Args: + collection_name: Collection abbreviation + book_number: Book number + hadith_number: Hadith number + + Returns: + Hadith dictionary + """ + response = self.get( + f"hadiths/collection/{collection_name}/{book_number}/{hadith_number}" + ) + + return response.get("data", {}) +=== ./src/main.py === +""" +Main ingestion script for fetching hadiths from Sunnah.com API +""" +import sys +import argparse +from typing import Optional +from uuid import UUID +import structlog +from config.settings import settings +from api_clients.sunnah_client import SunnahAPIClient +from database.repository import HadithRepository +from processors.text_cleaner import ArabicTextProcessor, TextCleaner + +# Configure structured logging +structlog.configure( + processors=[ + structlog.stdlib.filter_by_level, + 
structlog.stdlib.add_logger_name, + structlog.stdlib.add_log_level, + structlog.processors.TimeStamper(fmt="iso"), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + structlog.processors.JSONRenderer() + ], + wrapper_class=structlog.stdlib.BoundLogger, + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, +) + +logger = structlog.get_logger() + + +class HadithIngestionService: + """Service for ingesting hadiths from Sunnah.com API""" + + def __init__(self): + self.api_client = SunnahAPIClient() + self.repo = HadithRepository() + self.text_processor = ArabicTextProcessor() + self.text_cleaner = TextCleaner() + + def ingest_collection( + self, + collection_abbr: str, + limit: Optional[int] = None + ) -> dict: + """ + Ingest entire collection from Sunnah.com API + + Args: + collection_abbr: Collection abbreviation (e.g., 'bukhari') + limit: Optional limit on number of hadiths to ingest + + Returns: + Statistics dictionary + """ + logger.info( + "ingestion_started", + collection=collection_abbr, + limit=limit + ) + + # Get collection from database + collection = self.repo.get_collection_by_abbreviation(collection_abbr) + if not collection: + logger.error( + "collection_not_found", + collection=collection_abbr + ) + raise ValueError(f"Collection '{collection_abbr}' not found in database") + + collection_id = UUID(collection['id']) + + # Create ingestion job + job_id = self.repo.create_ingestion_job( + job_name=f"ingest_{collection_abbr}", + job_type="api_fetch", + source_name="sunnah.com", + config={"collection": collection_abbr, "limit": limit} + ) + + logger.info( + "job_created", + job_id=str(job_id), + collection=collection_abbr + ) + + stats = { + "processed": 0, + "failed": 0, + "skipped": 0 + } + + try: + # Iterate through all hadiths in collection + for hadith_data, book_number in self.api_client.iter_all_hadiths_in_collection( + collection_name=collection_abbr, + batch_size=50 + ): + # Check limit + if limit and stats["processed"] >= limit: + logger.info("limit_reached", limit=limit) + break + + try: + # Process and store hadith + self._process_and_store_hadith( + collection_id=collection_id, + hadith_data=hadith_data, + book_number=book_number + ) + + stats["processed"] += 1 + + # Update job progress every 100 hadiths + if stats["processed"] % 100 == 0: + self.repo.update_job_progress( + job_id=job_id, + processed=stats["processed"], + failed=stats["failed"], + skipped=stats["skipped"] + ) + + logger.info( + "progress_update", + processed=stats["processed"], + failed=stats["failed"] + ) + + except Exception as e: + stats["failed"] += 1 + logger.error( + "hadith_processing_failed", + error=str(e), + hadith_number=hadith_data.get("hadithNumber") + ) + + self.repo.add_processing_log( + job_id=job_id, + level="ERROR", + message=f"Failed to process hadith: {str(e)}", + details={"hadith_data": hadith_data} + ) + + # Update final job progress + self.repo.update_job_progress( + job_id=job_id, + total=stats["processed"] + stats["failed"] + stats["skipped"], + processed=stats["processed"], + failed=stats["failed"], + skipped=stats["skipped"] + ) + + # Mark job as complete + self.repo.complete_job(job_id=job_id, status="success") + + # Update collection count + self.repo.update_collection_count( + collection_id=collection_id, + count=stats["processed"] + ) + + logger.info( + "ingestion_completed", + collection=collection_abbr, + stats=stats + ) + + return stats + + except Exception as e: + logger.error( + 
"ingestion_failed", + collection=collection_abbr, + error=str(e) + ) + + self.repo.complete_job( + job_id=job_id, + status="failed", + error_message=str(e) + ) + + raise + + def _process_and_store_hadith( + self, + collection_id: UUID, + hadith_data: dict, + book_number: int + ): + """Process and store a single hadith""" + + # Extract hadith number + hadith_number = hadith_data.get("hadithNumber") + if not hadith_number: + raise ValueError("Missing hadith number") + + # Extract text in multiple languages + hadith_texts = hadith_data.get("hadith", []) + + arabic_text = None + english_text = None + urdu_text = None + grade = None + grade_source = None + chapter_name = None + + for text_entry in hadith_texts: + lang = text_entry.get("lang", "").lower() + body = text_entry.get("body") + + if not body: + continue + + # Clean text + body = self.text_cleaner.clean_text(body) + + if lang == "ar": + arabic_text = body + chapter_name = text_entry.get("chapterTitle") + + # Extract grade from Arabic entry + grades = text_entry.get("grades", []) + if grades: + grade = grades[0].get("grade") + grade_source = grades[0].get("name") + + elif lang == "en": + english_text = body + + # Extract grade from English entry if not found + if not grade: + grades = text_entry.get("grades", []) + if grades: + grade = grades[0].get("grade") + grade_source = grades[0].get("name") + + elif lang == "ur": + urdu_text = body + + if not arabic_text: + raise ValueError("Missing Arabic text") + + # Get or create book + book = self.repo.get_book(collection_id, book_number) + if not book: + # Extract book name from hadith data + book_name_en = None + book_name_ar = None + + for text_entry in hadith_texts: + lang = text_entry.get("lang", "").lower() + book_data = text_entry.get("book", [{}])[0] if text_entry.get("book") else {} + + if lang == "en" and book_data.get("name"): + book_name_en = book_data.get("name") + elif lang == "ar" and book_data.get("name"): + book_name_ar = book_data.get("name") + + book_id = self.repo.upsert_book( + collection_id=collection_id, + book_number=book_number, + name_english=book_name_en, + name_arabic=book_name_ar + ) + else: + book_id = UUID(book["id"]) + + # Store hadith + hadith_id = self.repo.upsert_hadith( + collection_id=collection_id, + book_id=book_id, + hadith_number=int(hadith_number), + arabic_text=arabic_text, + english_text=english_text, + urdu_text=urdu_text, + grade=grade, + grade_source=grade_source, + chapter_name=chapter_name, + source_id=str(hadith_data.get("id", "")), + source_url=hadith_data.get("reference", {}).get("link"), + source_metadata=hadith_data + ) + + logger.debug( + "hadith_stored", + hadith_id=str(hadith_id), + hadith_number=hadith_number, + book_number=book_number + ) + + def close(self): + """Close connections""" + self.api_client.close() + + +def main(): + """Main entry point""" + parser = argparse.ArgumentParser(description="Ingest hadiths from Sunnah.com API") + parser.add_argument( + "collection", + help="Collection abbreviation (e.g., bukhari, muslim)" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of hadiths to ingest" + ) + + args = parser.parse_args() + + logger.info( + "script_started", + collection=args.collection, + limit=args.limit + ) + + try: + service = HadithIngestionService() + stats = service.ingest_collection( + collection_abbr=args.collection, + limit=args.limit + ) + + logger.info( + "script_completed", + stats=stats + ) + + print(f"\nIngestion completed successfully!") + print(f"Processed: {stats['processed']}") + 
print(f"Failed: {stats['failed']}") + print(f"Skipped: {stats['skipped']}") + + service.close() + + return 0 + + except Exception as e: + logger.error( + "script_failed", + error=str(e), + exc_info=True + ) + + print(f"\nIngestion failed: {str(e)}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hadith-ingestion/config/__pycache__/__init__.cpython-312.pyc b/hadith-ingestion/config/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a351fb02e81e6adb15ccebe235674a6e2f1d0d1 GIT binary patch literal 197 zcmZ9Fu?@m75JhbQ5<*JL4is<;AV#30ww&ZhPH}AHb3hq^37COd7=w}urAvj~@}>L# z-RY$N%JY?ox_{P`@}2Bo<2+s-Vh|g7dV4~?!UG+VMYav*1+boNX&C`FcnQoGAAE%%+PdV>eQ Qh@~qn-Y)Jg7aa4LdTnvz|UH^_;Chc5UN8IY1ZB`G)O=`r69L5~i1<^7#4 EzV=KsYybcN literal 0 HcmV?d00001 diff --git a/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc b/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97a1e318465e7d38771e47b77f56664398277296 GIT binary patch literal 3556 zcmdT{%}*Oi7Voy(#yFUd@M$pMfJq<;HiU!#6V1@EJK$h!Vw+^>ORI60!I^fqscy*F zngc6kB}Q{$Pn*?hr5rcPWsdnD_F}{VZ>?5}v?p%NFsqd~ZB=y}oXMQ_wwCzS`@Q$- zb=9j^AN2&w;Ews4-n!XFuh5gCP594cFMhz^#>PSsiDM2^aw z>JnWJW{FW;uNlP+9Pgc0f6?t=fcw22Ol&H16HS!|-l(qS3tPKTHVe8IG=OOqv@Ii; z*I{s5Rtn~J5OZMIR=8Kl0q-dZGQpyj$Fc!3b{EV&sS@#%)nv~83~e}@oL{hH0)P=& zJPF4-9=hmM93lr?z^ynRVK;>nn}7#17iDg-ncA8t^N3!`nr(KCGB07EMZqlZP+DGd zms9jzYJCmbR;5*JqdnVboOZFDvJRVdP4QZ`1}O>u~_ zTa=B9!<0=>c3TWmc89XN;td*elClwTgxbQCMZ{5Ro1$!53{iHEvKeuVvip=h5N}d8 zOPOE1)i9HXR5vG%Q;&JVX!$MR^84bw15^AHC(8bbN61qzmdY$&DyfCsuc}VfWd$g~ zJk*OppzRf)t`&i12KQuGkT+FeR0d1BVQxWSgq6*(v6llt)jr>E#E?~aTh_K@HS$V_-$*9h)xL>r0u!iPVAlYfWfK??1?)u%hAI~eImyUv z>#7VRR(>Zh50prb9zK!!sS_@hQOjkT*VBngcfDZ3jp|N39wUumpE+|gZGktjlR)?= z5UvhH{<`yH*U9wK(ezSvYPr_hcF3Kyj~um+RNF@1PQT5bj6FOWdsq$4ePBO?P8JeJ z3yJFd>Y0=ErPwp4v)Ox&qcZ;=3)+3Ppm$`WtmR5ucwu2Xo&fA1aGe0j73{)bts?Bf zBD&6JHj+`PF{hP*%T#}E>?i%N5*~URI|G;kNLMvvt-~EkxPZJv8!qFb~vXOBYV@R&32^+6)E#C zLifs{>8X&GeX5neo_Vnz)0c`bSK!J{rnm`meDb;R?8QnspD&uF`TVy<Mmpkyb45sJ zD#Ii_I6EdNgp2CuD~_77LR`YhLDz_ojjsx+^=##q-EBvKvS$9~C9p6%Gc$v{&thpl znWYgrtZ7RCCsR?#`v!H`=4dLB5Te<5Dv7vw@<|FcCsIpNLU=AD5cks1HRL6VC#m!* zpGBUhsmBt%E1Gfcq&zTl+mKD!i_wVq{&4fNysZnMP1odiQFPdJG-)KTRgarlrN|Nb<`yZXgQrs z7PaQ^GfgspW)#fA9(c#lJ!hQ+ZtJV?OFYUr{xis-@tUc1`VTy{z-ZOuulWbRZ?$?n z!M2Ip@GIQFMFKnq)FerYo|7bcdmyJ_LewTnvZm=Kz1|IMB6kXM7BrJyELS%U^5BhD z0?;hOUJ{TTG;n~xAOTW{aF{@lzzqzD+thUxju1|&3XT#85f~$IlfXEE+XU_s2osnh zFiqedff)i12s|V(Phf!ndB8w2Qz#H1%STfM$-PSfKhPQD2fV<)Xa3GD;v1-cet-3F z=9rtqxw53enm=Sop_+f(lE!O;cdK0Yp>fRJ{pj+YIXIX1P3~s~H|I{q-AWZ_Ep_Q_ z^w*|VW`Jk+lZVk`Zm|{^uX0_7xnpkp6LHekPS!^5SGoQ-*!O|Yrc&uX+ zIKuLq%pX=s*MZsn)M4hBo2`w5D%bPI^S1L(zGDu2^gp)-{k%3bPYljuZvJC`!1{Wi zHW0A-HApyiZ~y+&L+t6V_4e(r9CjXaeOTM|V1MZVf8&0w*KcY4#CMUkoWH6KPT6Uh z!up=Te*6HZXrP8A8ZCge!=wAl2iu3n_r+swq}DZz8J_y^e>kUW=w}9lwN7M-COmKn zH>jgjR`3x)o*(vqD3BguZnIp%yMh)Ea_9zKQslXG`gGMVJddm`_a%IWqmo^1Ok%)K u2A2KF;b6IcyBYS*Kbe7lFwMWXqbxgg@ZB#AX@5#On3k>|7JlcUgZnRM2BF>n literal 0 HcmV?d00001 diff --git a/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc b/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ed354132f6c23253be7b5cb3030591c4cc0725d GIT binary patch literal 2757 zcmb_eOII7m5}wfuAtYWl*cdoi`@GIe9rwvy}c`TS$lJDRqFvKWSP;KufMLU?y0V>>Q*zC%V_YhKj*z)$2IN0 z*tz&5pz{h|@e+V)M58*5`FcA>Vmk7;A8#i}Lh*#3B*`}#P14lw8cng-r#MNm)F+LM 
z(L_UA9_u~Ll)|9p9d)@Ad0~(jEQ-9~NaR}~&mTL~i;nZqXJVk`C!WhPXYgrlS(n)@ zN3h0V2=vpn4%#_>)kTYo&&cin=&(4A=-_&+2@aDujgbUPvJ{PPYBWK{SQ>Z|c#4cG z`55pt$tXUq_=Mt_O^sz)l1}_fr&*f&J@H8=xsha&PASPGohDPteg?+Ql4-@SC_bb3 zRp4`ER`Ge@*T@ycuLEBoSJ@o+a|8GynP=BnazTUsP3Yeu*OlFE;CINvNS9ar8%lQ< z_*Z06@q56R$W67r`@p{@w-kQ>{2Ovx@nzr-$sNTX0sofd6<+~fAa~U~Mc_;1Db-LECJYbvZMddf#X?-qq@5$kP;OQI`iUq~hp7j+Iw}S9}XV_20Dj+8-K?L90U> zhc-6o$DvI?Kkh=HnsS}|MljE8(p|)^UsE9 z?mSG79B~@Bog)Yrp8ywl79oc)i7#+oZPEaD>d7!miKFwszowtltZDB(|w&$Dw&Ojy<;8>B@MGw zH!YO(rUu)B5D|;2|ERMn{6z&;6 z+Yr)v#cZ5m;q;{_Ik3y-u5oZiQJUE@tkSmKs1WlEGg(gHK&bQ`v)<_4!SFzIBIXs^ z{^cE)W0qO5AvkgZiB;J%YfY>7U@+_h+oi!agtP*eX=(-Dvkc7-%AwWxb7vpQETP7 zRb5WWv~34Yo7uL^*mgUlT_1VYwm)AK?MQGQvZIM+kU-@D+q2!WzP3gmna5>EA_$WBTGfAzlMy z^h7+X4}P(jKAnPhY$VaM@oCUb$3TKF^#A&TKBneePV?t5Bbn)(Qb=dKFb03xcnst+ p6`giq_tcVQEEH;xOplDt&*1Z2OmFt{<0WRFz?-x0ZaO~w=Km)N$!`Dv literal 0 HcmV?d00001 diff --git a/hadith-ingestion/simple-pod.yaml b/hadith-ingestion/simple-pod.yaml new file mode 100644 index 0000000..8e5c67c --- /dev/null +++ b/hadith-ingestion/simple-pod.yaml @@ -0,0 +1,46 @@ +apiVersion: v1 +kind: Pod +metadata: + name: hadith-ingestion-list-books + namespace: ml +spec: + restartPolicy: Never + containers: + - name: hadith-ingestion + image: axxs/hadith-ingestion:latest + # command: ["python"] + # args: ["/app/src/main_hadithapi.py", "--list-books"] + command: ["sh","-c","sleep infinity"] + env: + - name: DATABASE_HOST + value: "postgres.db.svc.cluster.local" + - name: DATABASE_PORT + value: "5432" + - name: DATABASE_NAME + value: "hadith_db" + - name: DATABASE_USER + value: "hadith_ingest" + - name: DATABASE_PASSWORD + valueFrom: + secretKeyRef: + name: hadith-db-secret + key: password + - name: HADITHAPI_KEY + valueFrom: + secretKeyRef: + name: hadithapi-secret + key: api-key + - name: MINIO_ENDPOINT + value: "minio.storage.svc.cluster.local:9000" + - name: MINIO_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access-key + - name: MINIO_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret-key + - name: LOG_LEVEL + value: "INFO" \ No newline at end of file diff --git a/hadith-ingestion/src/.DS_Store b/hadith-ingestion/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..dd91efd90af8a79af470d0bee0b1877f745285c9 GIT binary patch literal 6148 zcmeHKO>fgc5S>j^;v_-}5=gxuS>jriKq)HXVnR7^#Rv|7j9pu;h3k!MhX_THe1`wP zU*O6w;lFT#H@jQpIH|;;LV$Ln**9L#+l`<1!%IYBGK=pJjflvBGj<0kHW=S$U$c&> zVV+QJq*PIW-yTiZvTcP`z$);!DZq1gsOD@+IW_0+ho_l5jxre`gO`U#@Vy_(5HTyn zEKk8brVN;XUs2g&efd3BG%u&7udmTGOUr6J{w6wG-R&zqr{@fvH~y)d`&C%Y%1Kzf z;;ZLUX3=>ZMlaH29*nL(mU$JXc~U4s8Yf73{W8sCIiJW`9+!&i>l;qr=?9~o#p2*# zyzAaO++XgxiwAc(-oL+G_MIEIZa;h$yvxpV`B6PJ39NL=t{OatPY5p1DWO5c9_Zb6kU-!CE8kz=Wm( zHC31+hR}4>yC%;wSZmaD66WwBOwYocP=x9o?YkV=plZ(&) zt0a466|f5YR|<&kk$*J7BbmK*?csQ@b>VN}Y@AnXR1_3uJ5~X2#XE3ixaV>J^bFP- SQ3JCd0!jv3SOxy50zUz>6TXZ9 literal 0 HcmV?d00001 diff --git a/hadith-ingestion/src/__pycache__/__init__.cpython-312.pyc b/hadith-ingestion/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77710c35260a50f08d778ef51bfe1e6eeb03eac6 GIT binary patch literal 194 zcmZ9Fu?@m75JhbQ5<*JL4is<;AV#30ww&Y?TQN4eb3hq^37COd7=w}urAvj~@}>L# z-RY$ND$A8fx_{O-=R4cK#(BCt#2_~E_!Klt+n9eQ4JSroVC}TgzSRzHfjSkb_Z?Ii z(Y4q!RzpOF2Crb*>9egB*CsekV02@n*;?NsM;kma)u5F1HY&Bd4AXLd4o&ayK#SBU N({P4wA*G}u(GToPHQxXL literal 0 HcmV?d00001 diff --git a/hadith-ingestion/src/__pycache__/__init__.cpython-38.pyc 
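For reference, the environment variables wired into simple-pod.yaml above are presumably consumed by config/settings.py via pydantic-settings (pinned in requirements.txt). settings.py itself only appears in this patch as compiled bytecode, so every field name below except HADITHAPI_KEY (which the API clients read as settings.HADITHAPI_KEY) is an assumption. A minimal sketch:

    # Hypothetical sketch of config/settings.py -- field names beyond
    # HADITHAPI_KEY are assumptions matched to the env block above.
    from pydantic_settings import BaseSettings

    class Settings(BaseSettings):
        DATABASE_HOST: str = "localhost"
        DATABASE_PORT: int = 5432
        DATABASE_NAME: str = "hadith_db"
        DATABASE_USER: str = "hadith_ingest"
        DATABASE_PASSWORD: str = ""
        HADITHAPI_KEY: str = ""
        MINIO_ENDPOINT: str = "localhost:9000"
        MINIO_ACCESS_KEY: str = ""
        MINIO_SECRET_KEY: str = ""
        LOG_LEVEL: str = "INFO"

    settings = Settings()  # field values are read from the environment at import time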
[GIT binary patches omitted: hadith-ingestion/src/.DS_Store plus the __pycache__/*.pyc files under hadith-ingestion/src/ and hadith-ingestion/src/api_clients/ (CPython 3.8 and 3.12 bytecode); the binary literals carry no reviewable content.]

diff --git a/hadith-ingestion/src/api_clients/hadithapi_client.py b/hadith-ingestion/src/api_clients/hadithapi_client.py
index 02dee1f..d9e2e5a 100644
--- a/hadith-ingestion/src/api_clients/hadithapi_client.py
+++ b/hadith-ingestion/src/api_clients/hadithapi_client.py
@@ -1,7 +1,7 @@
 """
 Client for HadithAPI.com API
 """
-from typing import List, Dict, Any, Optional, Generator
+from typing import List, Dict, Any, Optional, Generator, Tuple
 import structlog
 from .base_client import BaseAPIClient
 from config.settings import settings
@@ -45,7 +45,8 @@ class HadithAPIClient(BaseAPIClient):
             )
             raise Exception(f"API Error: {response.get('message')}")
 
-        books = response.get('data', [])
+        books = response.get('books', [])
+
         logger.info(
             "books_fetched",
             count=len(books)
@@ -80,7 +81,8 @@ class HadithAPIClient(BaseAPIClient):
             )
             raise Exception(f"API Error: {response.get('message')}")
 
-        chapters = response.get('data', [])
+        chapters = response.get('chapters', [])
+
         logger.info(
             "chapters_fetched",
             book_slug=book_slug,
@@ -127,7 +129,10 @@ class HadithAPIClient(BaseAPIClient):
         )
 
         response = self.get("hadiths", params=params)
-
+        # logger.debug(
+        #     "fetching_hadiths_page####",
+        #     response=response
+        # )
         if response.get('status') != 200:
             logger.error(
                 "api_error",
@@ -136,7 +141,7 @@ class HadithAPIClient(BaseAPIClient):
             )
             raise Exception(f"API Error: {response.get('message')}")
 
-        return response.get('data', {})
+        return response.get('hadiths', {})
 
     def iter_all_hadiths_in_book(
         self,
@@ -162,15 +167,21 @@ class HadithAPIClient(BaseAPIClient):
 
         while True:
             response_data = self.get_hadiths_page(
-                book_id=book_id,
+                book_id=book_slug,
                 chapter_id=chapter_id,
                 page=page,
                 limit=batch_size
             )
 
-            hadiths = response_data.get('hadiths', [])
+            hadiths = response_data.get('data', [])
             pagination = response_data.get('pagination', {})
-
+            # logger.info(
+            #     "book_complete",
+            #     book_slug=book_slug,
+            #     hadiths=hadiths,
+            #     pagination=pagination,
+            #     response = response_data
+            # )
             if not hadiths:
                 logger.info(
                     "book_complete",
@@ -190,12 +201,12 @@ class HadithAPIClient(BaseAPIClient):
                     "progress",
                     book_slug=book_slug,
                     fetched=total_fetched,
-                    total=pagination.get('total', '?')
+                    total=response_data.get('total', '?')
                 )
 
             # Check if there are more pages
-            current_page = pagination.get('current_page', page)
-            last_page = pagination.get('last_page', 1)
+            current_page = response_data.get('current_page', page)
+            last_page = response_data.get('last_page', 1)
 
             if current_page >= last_page:
                 logger.info(
@@ -213,15 +224,15 @@ class HadithAPIClient(BaseAPIClient):
         book_id: int,
         book_slug: str,
         batch_size: int = 100
-    ) -> Generator[tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
+    ) -> Generator[Tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
         """
         Iterator that yields all hadiths in a book, organized by chapter
-        
+
         Args:
             book_id: Book ID
             book_slug: Book slug
             batch_size: Number of hadiths to fetch per request
-        
+
         Yields:
             Tuple of (hadith_dict, chapter_dict or None)
         """
diff --git a/hadith-ingestion/src/main_hadithapi.py b/hadith-ingestion/src/main_hadithapi.py
index 6cd61e4..19dc566 100644
--- a/hadith-ingestion/src/main_hadithapi.py
+++ b/hadith-ingestion/src/main_hadithapi.py
@@ -151,7 +151,8 @@ class HadithAPIIngestionService:
 
         book_info = book_mapping[book_slug]
         collection_id = book_info['collection_id']
-        book_id = book_info['book_id']
+        # book_id = book_info['book_id']
+        book_id = book_slug
 
         # Create ingestion job
         job_id = self.repo.create_ingestion_job(
diff --git a/hadith-ingestion/test_hadithapi.py b/hadith-ingestion/test_hadithapi.py
new file mode 100644
index 0000000..0cd20c2
--- /dev/null
+++ b/hadith-ingestion/test_hadithapi.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""
+Quick test script for hadithapi_client.py
+"""
+import sys
+import structlog
+sys.path.insert(0, '/app')
+
+from src.api_clients.hadithapi_client import HadithAPIClient
+from config.settings import settings
+
+logger = structlog.get_logger()
+
+def test_api_connection():
+    """Test basic API connectivity"""
+    print("=== Testing HadithAPI Client ===\n")
+
+    client = HadithAPIClient()
+
+    # Test 1: Get books
+    print("Test 1: Fetching available books...")
+    try:
+        books = client.get_books()
+        print(f"✓ Success! Found {len(books)} books")
+        for book in books[:3]:  # Show first 3
+            print(f"  - {book.get('bookName')} ({book.get('bookSlug')})")
+            print(f"    Hadiths: {book.get('hadiths_count')}, Chapters: {book.get('chapters_count')}")
+        logger.info("books_fetched", count=len(books))
+
+    except Exception as e:
+        print(f"✗ Failed: {e}")
+        return False
+
+    # Test 2: Get chapters for Sahih Bukhari
+    print("\nTest 2: Fetching chapters for Sahih Bukhari...")
+    try:
+        chapters = client.get_chapters('sahih-bukhari')
+        print(f"✓ Success! Found {len(chapters)} chapters")
+        if chapters:
+            print(f"  First chapter: {chapters[0].get('chapterEnglish')}")
+    except Exception as e:
+        print(f"✗ Failed: {e}")
+        return False
+
+    # Test 3: Fetch first page of hadiths
+    print("\nTest 3: Fetching first page of hadiths...")
+    try:
+        book = client.get_book_by_slug('sahih-bukhari')
+        if not book:
+            print("✗ Failed: Book 'sahih-bukhari' not found")
+            return False
+        page_data = client.get_hadiths_page('sahih-bukhari', page=1, limit=5)
+        hadiths = page_data.get('data', [])
+        print(f"✓ Success! Fetched {len(hadiths)} hadiths")
+        if hadiths:
+            first = hadiths[0]
+            print(f"  First hadith number: {first.get('hadithNumber')}")
+            print(f"  Arabic text (first 100 chars): {first.get('hadithArabic', '')[:100]}...")
+    except Exception as e:
+        print(f"✗ Failed: {e}")
+        return False
+
+    # Test 4: Test iterator (fetch 3 hadiths)
+    print("\nTest 4: Testing hadith iterator (3 hadiths)...")
+    try:
+        count = 0
+        for hadith in client.iter_all_hadiths_in_book(book_id='sahih-bukhari', book_slug='sahih-bukhari', batch_size=10):
+            count += 1
+            print(f"  Hadith #{hadith.get('hadithNumber')}: narrated by {hadith.get('englishNarrator')}, graded {hadith.get('status')}")
+            if count >= 3:
+                break
+        print("✓ Success! Iterator working correctly")
+    except Exception as e:
+        print(f"✗ Failed: {e}")
+        return False
+
+    client.close()
+    print("\n=== All Tests Passed! ===")
+    return True
+
+if __name__ == "__main__":
+    success = test_api_connection()
+    sys.exit(0 if success else 1)
\ No newline at end of file