correct hadithapi iterators

This commit is contained in:
salahangal 2025-11-16 13:43:08 +01:00
parent e4546cd007
commit 53cd6e2415
21 changed files with 3368 additions and 15 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
hadith-ingestion/.DS_Store vendored Normal file

Binary file not shown.

5
hadith-ingestion/combine.sh Executable file
View File

@ -0,0 +1,5 @@
# Concatenate every relevant project file into combined.txt, each one preceded
# by a "=== <path> ===" header and followed by a blank line.
#
# The -name alternatives are grouped with \( ... \) so that "-type f" and the
# "!" exclusions apply to ALL of them; without the grouping, find's operator
# precedence (-a binds tighter than -o) attaches "-type f" only to "*.txt" and
# the exclusions only to "*.md".
#
# combined.txt is excluded explicitly: it matches "*.txt" and would otherwise
# be read while this loop is still appending to it.
find . -type f \
    \( -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" \
       -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" \) \
    ! -name "combined.txt" ! -name "*.xls" ! -name "*.xlsx" |
while IFS= read -r file; do
    # IFS= and -r keep leading/trailing whitespace and backslashes in paths intact.
    {
        echo "=== $file ==="
        cat "$file"
        echo ""
    } >> combined.txt
done

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
# One-off Pod in the "ml" namespace that starts the hadith-ingestion image
# with its database, HadithAPI and MinIO settings injected as environment
# variables.
apiVersion: v1
kind: Pod
metadata:
name: hadith-ingestion-list-books
namespace: ml
spec:
# Never restart: the Pod is not brought back once its container exits.
restartPolicy: Never
containers:
- name: hadith-ingestion
# NOTE(review): ":latest" is a floating tag — consider pinning a version.
image: axxs/hadith-ingestion:latest
# The --list-books invocation that the Pod name refers to is disabled below;
# the container currently just sleeps forever instead.
# NOTE(review): presumably kept idle for interactive debugging via
# `kubectl exec` — confirm, and restore the real command before relying on
# this manifest to list books.
# command: ["python"]
# args: ["/app/src/main_hadithapi.py", "--list-books"]
command: ["sh","-c","sleep infinity"]
env:
# PostgreSQL connection settings; only the password comes from a Secret.
- name: DATABASE_HOST
value: "postgres.db.svc.cluster.local"
- name: DATABASE_PORT
value: "5432"
- name: DATABASE_NAME
value: "hadith_db"
- name: DATABASE_USER
value: "hadith_ingest"
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: hadith-db-secret
key: password
# HadithAPI key, read from the "hadithapi-secret" Secret.
- name: HADITHAPI_KEY
valueFrom:
secretKeyRef:
name: hadithapi-secret
key: api-key
# MinIO endpoint plus credentials; both keys live in the "minio-secret" Secret.
- name: MINIO_ENDPOINT
value: "minio.storage.svc.cluster.local:9000"
- name: MINIO_ACCESS_KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: access-key
- name: MINIO_SECRET_KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: secret-key
- name: LOG_LEVEL
value: "INFO"

BIN
hadith-ingestion/src/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -1,7 +1,7 @@
""" """
Client for HadithAPI.com API Client for HadithAPI.com API
""" """
from typing import List, Dict, Any, Optional, Generator from typing import List, Dict, Any, Optional, Generator, Tuple
import structlog import structlog
from .base_client import BaseAPIClient from .base_client import BaseAPIClient
from config.settings import settings from config.settings import settings
@ -45,7 +45,8 @@ class HadithAPIClient(BaseAPIClient):
) )
raise Exception(f"API Error: {response.get('message')}") raise Exception(f"API Error: {response.get('message')}")
books = response.get('data', []) books = response.get('books', [])
logger.info( logger.info(
"books_fetched", "books_fetched",
@ -80,7 +81,8 @@ class HadithAPIClient(BaseAPIClient):
) )
raise Exception(f"API Error: {response.get('message')}") raise Exception(f"API Error: {response.get('message')}")
chapters = response.get('data', []) chapters = response.get('chapters', [])
logger.info( logger.info(
"chapters_fetched", "chapters_fetched",
@ -127,7 +129,10 @@ class HadithAPIClient(BaseAPIClient):
) )
response = self.get("hadiths", params=params) response = self.get("hadiths", params=params)
# logger.debug(
# "fetching_hadiths_page####",
# response=response
# )
if response.get('status') != 200: if response.get('status') != 200:
logger.error( logger.error(
"api_error", "api_error",
@ -136,7 +141,7 @@ class HadithAPIClient(BaseAPIClient):
) )
raise Exception(f"API Error: {response.get('message')}") raise Exception(f"API Error: {response.get('message')}")
return response.get('data', {}) return response.get('hadiths', {})
def iter_all_hadiths_in_book( def iter_all_hadiths_in_book(
self, self,
@ -162,15 +167,21 @@ class HadithAPIClient(BaseAPIClient):
while True: while True:
response_data = self.get_hadiths_page( response_data = self.get_hadiths_page(
book_id=book_id, book_id=book_slug,
chapter_id=chapter_id, chapter_id=chapter_id,
page=page, page=page,
limit=batch_size limit=batch_size
) )
hadiths = response_data.get('hadiths', []) hadiths = response_data.get('data', [])
pagination = response_data.get('pagination', {}) pagination = response_data.get('pagination', {})
# logger.info(
# "book_complete",
# book_slug=book_slug,
# hadiths=hadiths,
# pagination=pagination,
# response = response_data
# )
if not hadiths: if not hadiths:
logger.info( logger.info(
"book_complete", "book_complete",
@ -190,12 +201,12 @@ class HadithAPIClient(BaseAPIClient):
"progress", "progress",
book_slug=book_slug, book_slug=book_slug,
fetched=total_fetched, fetched=total_fetched,
total=pagination.get('total', '?') total=response_data.get('total', '?')
) )
# Check if there are more pages # Check if there are more pages
current_page = pagination.get('current_page', page) current_page = response_data.get('current_page', page)
last_page = pagination.get('last_page', 1) last_page = response_data.get('last_page', 1)
if current_page >= last_page: if current_page >= last_page:
logger.info( logger.info(
@ -213,15 +224,15 @@ class HadithAPIClient(BaseAPIClient):
book_id: int, book_id: int,
book_slug: str, book_slug: str,
batch_size: int = 100 batch_size: int = 100
) -> Generator[tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]: ) -> Generator[Tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
""" """
Iterator that yields all hadiths in a book, organized by chapter Iterator that yields all hadiths in a book, organized by chapter
Args: Args:
book_id: Book ID book_id: Book ID
book_slug: Book slug book_slug: Book slug
batch_size: Number of hadiths to fetch per request batch_size: Number of hadiths to fetch per request
Yields: Yields:
Tuple of (hadith_dict, chapter_dict or None) Tuple of (hadith_dict, chapter_dict or None)
""" """

View File

@ -151,7 +151,8 @@ class HadithAPIIngestionService:
book_info = book_mapping[book_slug] book_info = book_mapping[book_slug]
collection_id = book_info['collection_id'] collection_id = book_info['collection_id']
book_id = book_info['book_id'] # book_id = book_info['book_id']
book_id = book_slug
# Create ingestion job # Create ingestion job
job_id = self.repo.create_ingestion_job( job_id = self.repo.create_ingestion_job(

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""
Quick test script for hadithapi_client.py
"""
import sys
from venv import logger
sys.path.insert(0, '/app')
from src.api_clients.hadithapi_client import HadithAPIClient
from config.settings import settings
def test_api_connection():
    """Smoke-test the HadithAPI client end to end.

    Runs four checks in order, printing a human-readable result for each:
    list the books, list the chapters of Sahih Bukhari, fetch one page of
    hadiths, and pull three hadiths through the book iterator.

    Returns:
        True if every check succeeded, False as soon as one of them fails.
    """
    # Function-scope import: log through this module's own logger instead of
    # the one that merely happens to be exported by the stdlib ``venv`` module.
    import logging

    log = logging.getLogger(__name__)

    print("=== Testing HadithAPI Client ===\n")

    client = HadithAPIClient()
    try:
        # Test 1: Get books
        print("Test 1: Fetching available books...")
        try:
            books = client.get_books()
            print(f"✓ Success! Found {len(books)} books")
            for book in books[:3]:  # Show first 3
                print(f" - {book.get('bookName')} ({book.get('bookSlug')})")
                print(f" Hadiths: {book.get('hadiths_count')}, Chapters: {book.get('chapters_count')}")
            log.info("Fetched %d books successfully", len(books))
        except Exception as e:
            print(f"✗ Failed: {e}")
            return False

        # Test 2: Get chapters for Sahih Bukhari
        print("\nTest 2: Fetching chapters for Sahih Bukhari...")
        try:
            chapters = client.get_chapters('sahih-bukhari')
            print(f"✓ Success! Found {len(chapters)} chapters")
            if chapters:
                print(f" First chapter: {chapters[0].get('chapterEnglish')}")
        except Exception as e:
            print(f"✗ Failed: {e}")
            return False

        # Test 3: Fetch first page of hadiths
        print("\nTest 3: Fetching first page of hadiths...")
        book_id = None
        try:
            book = client.get_book_by_slug('sahih-bukhari')
            if not book:
                print("✗ Failed: Book 'sahih-bukhari' not found")
                return False
            book_id = book.get('id')

            page_data = client.get_hadiths_page('sahih-bukhari', page=1, limit=5)
            # get_hadiths_page already returns the paginated 'hadiths' object;
            # the records themselves sit under its 'data' key, which is the
            # same key iter_all_hadiths_in_book reads.
            hadiths = page_data.get('data', [])
            print(f"✓ Success! Fetched {len(hadiths)} hadiths")
            if hadiths:
                first = hadiths[0]
                print(f" First hadith number: {first.get('hadithNumber')}")
                print(f" Arabic text (first 100 chars): {first.get('hadithArabic', '')[:100]}...")
        except Exception as e:
            print(f"✗ Failed: {e}")
            return False

        if book_id is None:
            print("✗ Failed: Book ID unavailable for iterator test")
            return False

        # Test 4: Test iterator (fetch 3 hadiths)
        print("\nTest 4: Testing hadith iterator (3 hadiths)...")
        try:
            hadith_iter = client.iter_all_hadiths_in_book(
                book_id='sahih-bukhari',
                book_slug='sahih-bukhari',
                batch_size=10,
            )
            for count, hadith in enumerate(hadith_iter, start=1):
                print(f" Hadith #{hadith.get('hadithNumber')} is {hadith.get('englishNarrator')} and is {hadith.get('status')} ")
                if count >= 3:
                    break
            print("✓ Success! Iterator working correctly")
        except Exception as e:
            print(f"✗ Failed: {e}")
            return False
    finally:
        # Release the client on every exit path, including early failures.
        client.close()

    print("\n=== All Tests Passed! ===")
    return True
if __name__ == "__main__":
    # Map the boolean test outcome onto a conventional process exit status:
    # 0 when every check passed, 1 otherwise.
    exit_code = 0 if test_api_connection() else 1
    sys.exit(exit_code)