hadith-ingestion/hadith-ingestion/src/api_clients/hadithapi_client.py

312 lines
9.4 KiB
Python

"""
Client for HadithAPI.com API
"""
from typing import Any, Dict, Generator, List, Optional, Tuple, Union

import structlog

from config.settings import settings
from .base_client import BaseAPIClient
# Module-level structlog logger used by every method of the client below.
logger = structlog.get_logger()
class HadithAPIError(Exception):
    """Raised when a HadithAPI response body reports a ``status`` other than 200."""


class HadithAPIClient(BaseAPIClient):
    """Client for interacting with hadithapi.com API.

    Every request carries the API key as the ``apiKey`` query parameter.
    The value returned by ``self.get`` is treated as a dict; its ``status``
    field must equal 200, otherwise the failure is logged and raised as
    :class:`HadithAPIError` (a plain ``Exception`` subclass, so existing
    ``except Exception`` handlers keep working).
    """

    # Upper bound applied to the ``limit`` query parameter.
    _MAX_PAGE_SIZE = 100

    # Books whose chapter filter value is read from ``chapterNumber``;
    # every other book uses the chapter's ``id``.
    _CHAPTER_NUMBER_BOOKS = frozenset({
        'sahih-muslim',
        'al-tirmidhi',
        'al-silsila-sahiha',
    })

    def __init__(self, api_key: Optional[str] = None):
        """
        Create a client bound to https://hadithapi.com/api

        Args:
            api_key: API key to use; when omitted or empty,
                ``settings.HADITHAPI_KEY`` is used instead
        """
        super().__init__(
            base_url="https://hadithapi.com/api",
            api_key=api_key or settings.HADITHAPI_KEY,
            rate_limit=30,  # Conservative: 30 req/min
        )

    def _add_api_key(self, params: Optional[Dict] = None) -> Dict:
        """
        Return request parameters with the API key added

        The input mapping is copied, so the caller's dict is left untouched.

        Args:
            params: Optional query parameters to extend

        Returns:
            New dict containing ``params`` plus the ``apiKey`` entry
        """
        merged = dict(params) if params else {}
        merged['apiKey'] = self.api_key
        return merged

    @staticmethod
    def _raise_for_api_status(response: Dict[str, Any]) -> None:
        """
        Log and raise when the response body's ``status`` is not 200

        Args:
            response: Response dictionary as returned by ``self.get``

        Raises:
            HadithAPIError: If ``response['status']`` is missing or not 200
        """
        status = response.get('status')
        if status == 200:
            return
        message = response.get('message')
        logger.error("api_error", status=status, message=message)
        raise HadithAPIError(f"API Error: {message}")

    def get_books(self) -> List[Dict[str, Any]]:
        """
        Get list of all available books/collections

        Returns:
            List of book dictionaries (empty if the response has no ``books``)

        Raises:
            HadithAPIError: If the response status is not 200
        """
        logger.info("fetching_books")
        response = self.get("books", params=self._add_api_key())
        self._raise_for_api_status(response)
        books = response.get('books', [])
        logger.info("books_fetched", count=len(books))
        return books

    def get_chapters(self, book_slug: str) -> List[Dict[str, Any]]:
        """
        Get chapters for a specific book

        Args:
            book_slug: Book slug identifier (e.g., 'sahih-bukhari')

        Returns:
            List of chapter dictionaries (empty if the response has no
            ``chapters``)

        Raises:
            HadithAPIError: If the response status is not 200
        """
        logger.info("fetching_chapters", book_slug=book_slug)
        response = self.get(
            f"{book_slug}/chapters",
            params=self._add_api_key(),
        )
        self._raise_for_api_status(response)
        chapters = response.get('chapters', [])
        logger.info(
            "chapters_fetched",
            book_slug=book_slug,
            count=len(chapters),
        )
        return chapters

    def get_hadiths_page(
        self,
        book_id: Union[int, str],
        chapter_id: Optional[Union[int, str]] = None,
        page: int = 1,
        limit: int = 100
    ) -> Dict[str, Any]:
        """
        Get a page of hadiths

        Args:
            book_id: Value sent as the ``book`` query parameter. The
                iterators in this class pass the book slug here.
            chapter_id: Optional value sent as the ``chapter`` query
                parameter; ``None`` or an empty string means "no filter"
            page: Page number (1-indexed)
            limit: Results per page, clamped to the range 1..100

        Returns:
            The ``hadiths`` entry of the response (page items plus
            pagination info), or an empty dict when it is missing or null

        Raises:
            HadithAPIError: If the response status is not 200
        """
        # Clamp on both sides so a zero or negative limit is never sent.
        # NOTE(review): the public HadithAPI docs appear to name the
        # page-size parameter `paginate`; confirm that `limit` is honoured.
        page_size = max(1, min(limit, self._MAX_PAGE_SIZE))
        params = self._add_api_key({
            'book': book_id,
            'page': page,
            'limit': page_size,
        })
        # Compare against None rather than truthiness so that an id of 0
        # still filters instead of silently fetching the whole book.
        if chapter_id is not None and chapter_id != '':
            params['chapter'] = chapter_id
        logger.debug(
            "fetching_hadiths_page",
            book_id=book_id,
            chapter_id=chapter_id,
            page=page,
            limit=page_size,
        )
        response = self.get("hadiths", params=params)
        self._raise_for_api_status(response)
        return response.get('hadiths') or {}

    def iter_all_hadiths_in_book(
        self,
        book_id: int,
        book_slug: str,
        chapter_id: Optional[Union[int, str]] = None,
        batch_size: int = 100
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Iterator that yields all hadiths in a book, handling pagination automatically

        Pages are requested one after another until a page comes back with
        no ``data`` items or its ``current_page`` reaches ``last_page``.

        Args:
            book_id: Book ID. Accepted for interface compatibility only;
                requests are keyed by ``book_slug``.
            book_slug: Book slug; sent as the ``book`` filter and used in logs
            chapter_id: Optional chapter ID to filter by
            batch_size: Number of hadiths to fetch per request (max 100)

        Yields:
            Individual hadith dictionaries

        Raises:
            HadithAPIError: If any page request reports a non-200 status
        """
        page = 1
        total_fetched = 0
        while True:
            response_data = self.get_hadiths_page(
                book_id=book_slug,  # the slug, not the numeric id, is sent
                chapter_id=chapter_id,
                page=page,
                limit=batch_size,
            )
            hadiths = response_data.get('data', [])
            if not hadiths:
                logger.info(
                    "book_complete",
                    book_slug=book_slug,
                    chapter_id=chapter_id,
                    total_hadiths=total_fetched,
                )
                break

            for hadith in hadiths:
                yield hadith
                total_fetched += 1
                # Log progress every 500 hadiths
                if total_fetched % 500 == 0:
                    logger.info(
                        "progress",
                        book_slug=book_slug,
                        fetched=total_fetched,
                        total=response_data.get('total', '?'),
                    )

            # Check if there are more pages; a missing ``last_page`` is
            # treated as 1, which ends the iteration after this page.
            current_page = response_data.get('current_page', page)
            last_page = response_data.get('last_page', 1)
            if current_page >= last_page:
                logger.info(
                    "book_complete",
                    book_slug=book_slug,
                    total_hadiths=total_fetched,
                    total_pages=last_page,
                )
                break
            page += 1

    def iter_all_hadiths_in_book_with_chapters(
        self,
        book_id: int,
        book_slug: str,
        batch_size: int = 100
    ) -> Generator[Tuple[Dict[str, Any], Optional[Dict[str, Any]]], None, None]:
        """
        Iterator that yields all hadiths in a book, organized by chapter

        The chapter list is fetched first and each chapter is then iterated
        separately. If the chapter list cannot be fetched, the whole book is
        iterated without a chapter filter and each hadith is paired with its
        own ``chapter`` entry. A failure inside one chapter is logged and the
        remaining chapters are still processed.

        Args:
            book_id: Book ID (forwarded to ``iter_all_hadiths_in_book``)
            book_slug: Book slug
            batch_size: Number of hadiths to fetch per request

        Yields:
            Tuple of (hadith_dict, chapter_dict or None)
        """
        # First, get all chapters
        try:
            chapters = self.get_chapters(book_slug)
        except Exception as e:
            logger.warning(
                "chapters_fetch_failed",
                book_slug=book_slug,
                error=str(e),
                fallback="fetching_all_hadiths_without_chapter_filter",
            )
            # Fallback: fetch all hadiths without chapter filter
            for hadith in self.iter_all_hadiths_in_book(
                book_id=book_id,
                book_slug=book_slug,
                batch_size=batch_size,
            ):
                yield hadith, hadith.get('chapter')
            return

        logger.info(
            "starting_chapter_by_chapter_fetch",
            book_slug=book_slug,
            total_chapters=len(chapters),
        )

        # Which chapter field identifies the chapter depends only on the
        # book, so decide once instead of on every iteration.
        if book_slug in self._CHAPTER_NUMBER_BOOKS:
            id_field = 'chapterNumber'
        else:
            id_field = 'id'

        # Process each chapter
        for chapter in chapters:
            chapter_id = chapter.get(id_field)
            chapter_number = chapter.get('chapterNumber')

            # Without an identifier the request would carry no chapter
            # filter, returning the entire book and pairing every hadith
            # with this chapter. Skip instead of mislabelling.
            if chapter_id is None or chapter_id == '':
                logger.warning(
                    "chapter_skipped_missing_id",
                    book_slug=book_slug,
                    chapter_number=chapter_number,
                    id_field=id_field,
                )
                continue

            logger.info(
                "fetching_chapter",
                book_slug=book_slug,
                chapter_id=chapter_id,
                chapter_number=chapter_number,
                chapter_name=chapter.get('name'),
            )
            try:
                for hadith in self.iter_all_hadiths_in_book(
                    book_id=book_id,
                    book_slug=book_slug,
                    chapter_id=chapter_id,
                    batch_size=batch_size,
                ):
                    yield hadith, chapter
            except Exception as e:
                # Best effort: one failing chapter must not abort the book.
                logger.error(
                    "chapter_fetch_failed",
                    book_slug=book_slug,
                    chapter_id=chapter_id,
                    error=str(e),
                )

    def get_book_by_slug(self, book_slug: str) -> Optional[Dict[str, Any]]:
        """
        Get book details by slug

        Args:
            book_slug: Book slug identifier

        Returns:
            First book dictionary whose ``bookSlug`` equals ``book_slug``,
            or None if not found

        Raises:
            HadithAPIError: If fetching the book list reports a non-200 status
        """
        return next(
            (
                book for book in self.get_books()
                if book.get('bookSlug') == book_slug
            ),
            None,
        )