correct hadithapi iterators

This commit is contained in:
salahangal 2025-11-16 13:43:08 +01:00
parent e4546cd007
commit 53cd6e2415
21 changed files with 3368 additions and 15 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

BIN
hadith-ingestion/.DS_Store vendored Normal file

Binary file not shown.

5
hadith-ingestion/combine.sh Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# Concatenate the project's text-like files into combined.txt. Each file is
# preceded by a "=== path ===" header line and followed by one blank line.
# Output is appended, so remove combined.txt first for a fresh bundle.
#
# The -name alternatives are grouped with \( ... \): find's implicit AND binds
# tighter than -o, so without the grouping "-type f" only applied to "*.txt"
# and the "!" exclusions only applied to "*.md".
# combined.txt is excluded so the output file is never read back into itself.
# NOTE(review): "*.env" files are bundled too; confirm they hold no secrets.
find . -type f \( -name "*.txt" -o -name "production" -o -name "*.py" -o -name "*.yaml" -o -name "Dockerfile" -o -name "*.sh" -o -name "*.env" -o -name "*.md" \) ! -name "*.xls" ! -name "*.xlsx" ! -name "combined.txt" |
# IFS= and -r keep leading/trailing whitespace and backslashes in paths intact.
while IFS= read -r file; do
    echo "=== $file ===" >> combined.txt
    cat "$file" >> combined.txt
    echo "" >> combined.txt
done

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
# Pod manifest that starts the hadith-ingestion image in the "ml" namespace.
apiVersion: v1
kind: Pod
metadata:
name: hadith-ingestion-list-books
namespace: ml
spec:
# Containers of this pod are not restarted after they exit.
restartPolicy: Never
containers:
- name: hadith-ingestion
image: axxs/hadith-ingestion:latest
# Disabled invocation of main_hadithapi.py with --list-books:
# command: ["python"]
# args: ["/app/src/main_hadithapi.py", "--list-books"]
# Active command: keeps the container idle instead of running the script
# (presumably so one can exec into the pod for debugging; confirm).
command: ["sh","-c","sleep infinity"]
env:
# PostgreSQL connection settings; only the password comes from a Secret.
- name: DATABASE_HOST
value: "postgres.db.svc.cluster.local"
- name: DATABASE_PORT
value: "5432"
- name: DATABASE_NAME
value: "hadith_db"
- name: DATABASE_USER
value: "hadith_ingest"
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: hadith-db-secret
key: password
# HadithAPI key, read from the "hadithapi-secret" Secret.
- name: HADITHAPI_KEY
valueFrom:
secretKeyRef:
name: hadithapi-secret
key: api-key
# MinIO endpoint; both credentials are read from the "minio-secret" Secret.
- name: MINIO_ENDPOINT
value: "minio.storage.svc.cluster.local:9000"
- name: MINIO_ACCESS_KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: access-key
- name: MINIO_SECRET_KEY
valueFrom:
secretKeyRef:
name: minio-secret
key: secret-key
- name: LOG_LEVEL
value: "INFO"

BIN
hadith-ingestion/src/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -1,7 +1,7 @@
"""
Client for HadithAPI.com API
"""
from typing import List, Dict, Any, Optional, Generator
from typing import List, Dict, Any, Optional, Generator, Tuple
import structlog
from .base_client import BaseAPIClient
from config.settings import settings
@ -45,7 +45,8 @@ class HadithAPIClient(BaseAPIClient):
)
raise Exception(f"API Error: {response.get('message')}")
books = response.get('data', [])
books = response.get('books', [])
logger.info(
"books_fetched",
@ -80,7 +81,8 @@ class HadithAPIClient(BaseAPIClient):
)
raise Exception(f"API Error: {response.get('message')}")
chapters = response.get('data', [])
chapters = response.get('chapters', [])
logger.info(
"chapters_fetched",
@ -127,7 +129,10 @@ class HadithAPIClient(BaseAPIClient):
)
response = self.get("hadiths", params=params)
# logger.debug(
# "fetching_hadiths_page####",
# response=response
# )
if response.get('status') != 200:
logger.error(
"api_error",
@ -136,7 +141,7 @@ class HadithAPIClient(BaseAPIClient):
)
raise Exception(f"API Error: {response.get('message')}")
return response.get('data', {})
return response.get('hadiths', {})
def iter_all_hadiths_in_book(
self,
@ -162,15 +167,21 @@ class HadithAPIClient(BaseAPIClient):
while True:
response_data = self.get_hadiths_page(
book_id=book_id,
book_id=book_slug,
chapter_id=chapter_id,
page=page,
limit=batch_size
)
hadiths = response_data.get('hadiths', [])
hadiths = response_data.get('data', [])
pagination = response_data.get('pagination', {})
# logger.info(
# "book_complete",
# book_slug=book_slug,
# hadiths=hadiths,
# pagination=pagination,
# response = response_data
# )
if not hadiths:
logger.info(
"book_complete",
@ -190,12 +201,12 @@ class HadithAPIClient(BaseAPIClient):
"progress",
book_slug=book_slug,
fetched=total_fetched,
total=pagination.get('total', '?')
total=response_data.get('total', '?')
)
# Check if there are more pages
current_page = pagination.get('current_page', page)
last_page = pagination.get('last_page', 1)
current_page = response_data.get('current_page', page)
last_page = response_data.get('last_page', 1)
if current_page >= last_page:
logger.info(
@ -213,7 +224,7 @@ class HadithAPIClient(BaseAPIClient):
book_id: int,
book_slug: str,
batch_size: int = 100
) -> Generator[tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
) -> Generator[Tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
"""
Iterator that yields all hadiths in a book, organized by chapter

View File

@ -151,7 +151,8 @@ class HadithAPIIngestionService:
book_info = book_mapping[book_slug]
collection_id = book_info['collection_id']
book_id = book_info['book_id']
# book_id = book_info['book_id']
book_id = book_slug
# Create ingestion job
job_id = self.repo.create_ingestion_job(

View File

@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""
Quick test script for hadithapi_client.py
"""
import sys
from venv import logger
sys.path.insert(0, '/app')
from src.api_clients.hadithapi_client import HadithAPIClient
from config.settings import settings
def _run_api_checks(client, log):
    """Run the four smoke checks against *client*.

    Args:
        client: An object exposing ``get_books``, ``get_chapters``,
            ``get_book_by_slug``, ``get_hadiths_page`` and
            ``iter_all_hadiths_in_book``.
        log: Logger used for the post-check summary message.

    Returns:
        bool: True when every check succeeded, False at the first failure.
    """
    # Test 1: Get books
    print("Test 1: Fetching available books...")
    try:
        books = client.get_books()
        print(f"✓ Success! Found {len(books)} books")
        for book in books[:3]:  # Show first 3
            print(f" - {book.get('bookName')} ({book.get('bookSlug')})")
            print(f" Hadiths: {book.get('hadiths_count')}, Chapters: {book.get('chapters_count')}")
        log.info("Fetched %d books successfully", len(books))
    except Exception as e:
        print(f"✗ Failed: {e}")
        return False

    # Test 2: Get chapters for Sahih Bukhari
    print("\nTest 2: Fetching chapters for Sahih Bukhari...")
    try:
        chapters = client.get_chapters('sahih-bukhari')
        print(f"✓ Success! Found {len(chapters)} chapters")
        if chapters:
            print(f" First chapter: {chapters[0].get('chapterEnglish')}")
    except Exception as e:
        print(f"✗ Failed: {e}")
        return False

    # Test 3: Fetch first page of hadiths
    print("\nTest 3: Fetching first page of hadiths...")
    book_id = None
    try:
        book = client.get_book_by_slug('sahih-bukhari')
        if not book:
            print("✗ Failed: Book 'sahih-bukhari' not found")
            return False
        book_id = book.get('id')
        page_data = client.get_hadiths_page('sahih-bukhari', page=1, limit=5)
        # get_hadiths_page returns the paginated "hadiths" object; the items
        # themselves live under its "data" key (same key the iterator reads).
        hadiths = page_data.get('data', [])
        print(f"✓ Success! Fetched {len(hadiths)} hadiths")
        if hadiths:
            first = hadiths[0]
            print(f" First hadith number: {first.get('hadithNumber')}")
            # "or ''" guards against a present-but-null Arabic text field.
            print(f" Arabic text (first 100 chars): {(first.get('hadithArabic') or '')[:100]}...")
    except Exception as e:
        print(f"✗ Failed: {e}")
        return False

    # Sanity check only: the iterator below is driven by the slug, but a book
    # payload without an id is still treated as a failure.
    if book_id is None:
        print("✗ Failed: Book ID unavailable for iterator test")
        return False

    # Test 4: Test iterator (fetch 3 hadiths)
    print("\nTest 4: Testing hadith iterator (3 hadiths)...")
    try:
        count = 0
        for hadith in client.iter_all_hadiths_in_book(book_id='sahih-bukhari', book_slug='sahih-bukhari', batch_size=10):
            count += 1
            print(f" Hadith #{hadith.get('hadithNumber')} is {hadith.get('englishNarrator')} and is {hadith.get('status')} ")
            if count >= 3:
                break
        print("✓ Success! Iterator working correctly")
    except Exception as e:
        print(f"✗ Failed: {e}")
        return False

    return True


def test_api_connection():
    """Test basic API connectivity of HadithAPIClient.

    Creates a client, runs the book, chapter, hadith-page and iterator
    checks in order, and prints a ✓/✗ line for each one. The client is
    closed on every path, including early failures.

    Returns:
        bool: True when all checks passed, False otherwise.
    """
    # Function-scope import so this block does not depend on the module's
    # import list; the module-level `logger` comes from the stdlib `venv`
    # package and would attribute messages to the wrong logger.
    import logging

    log = logging.getLogger(__name__)

    print("=== Testing HadithAPI Client ===\n")
    client = HadithAPIClient()
    try:
        passed = _run_api_checks(client, log)
    finally:
        # Release the client even when a check returned early or raised.
        client.close()

    if passed:
        print("\n=== All Tests Passed! ===")
    return passed
if __name__ == "__main__":
    # Exit status 0 when every check passed, 1 otherwise.
    sys.exit(int(not test_api_connection()))