diff --git a/hadith-ingestion/.env b/hadith-ingestion/.env index db27494..ce151b8 100644 --- a/hadith-ingestion/.env +++ b/hadith-ingestion/.env @@ -1,5 +1,6 @@ # Database -DATABASE_HOST=postgres.db.svc.cluster.local +# DATABASE_HOST=postgres.db.svc.cluster.local +DATABASE_HOST = pg.betelgeusebytes.io DATABASE_PORT=5432 DATABASE_NAME=hadith_db DATABASE_USER=hadith_ingest diff --git a/hadith-ingestion/__pycache__/test_hadithapi.cpython-311-pytest-7.4.0.pyc b/hadith-ingestion/__pycache__/test_hadithapi.cpython-311-pytest-7.4.0.pyc new file mode 100644 index 0000000..030be74 Binary files /dev/null and b/hadith-ingestion/__pycache__/test_hadithapi.cpython-311-pytest-7.4.0.pyc differ diff --git a/hadith-ingestion/__pycache__/test_mainhadithapi.cpython-311-pytest-7.4.0.pyc b/hadith-ingestion/__pycache__/test_mainhadithapi.cpython-311-pytest-7.4.0.pyc new file mode 100644 index 0000000..37d4806 Binary files /dev/null and b/hadith-ingestion/__pycache__/test_mainhadithapi.cpython-311-pytest-7.4.0.pyc differ diff --git a/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc b/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc index 97a1e31..2d885a0 100644 Binary files a/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc and b/hadith-ingestion/config/__pycache__/settings.cpython-312.pyc differ diff --git a/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc b/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc index 3ed3541..43c8654 100644 Binary files a/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc and b/hadith-ingestion/config/__pycache__/settings.cpython-38.pyc differ diff --git a/hadith-ingestion/config/settings.py b/hadith-ingestion/config/settings.py index e92bafd..c19d513 100644 --- a/hadith-ingestion/config/settings.py +++ b/hadith-ingestion/config/settings.py @@ -10,7 +10,8 @@ class Settings(BaseSettings): """Application settings loaded from environment variables""" # Database - DATABASE_HOST: str = "postgres.db.svc.cluster.local" + # DATABASE_HOST: str = "postgres.db.svc.cluster.local" + DATABASE_HOST: str = "pg.betelgeusebytes.io" DATABASE_PORT: int = 5432 DATABASE_NAME: str = "hadith_db" DATABASE_USER: str = "hadith_ingest" @@ -29,7 +30,7 @@ class Settings(BaseSettings): f"postgresql+asyncpg://{self.DATABASE_USER}:{self.DATABASE_PASSWORD}" f"@{self.DATABASE_HOST}:{self.DATABASE_PORT}/{self.DATABASE_NAME}" ) - + # MinIO / S3 MINIO_ENDPOINT: str = "minio.storage.svc.cluster.local:9000" MINIO_ACCESS_KEY: str = "minioadmin" @@ -37,9 +38,6 @@ class Settings(BaseSettings): MINIO_BUCKET_RAW: str = "hadith-raw-data" MINIO_BUCKET_PROCESSED: str = "hadith-processed" MINIO_SECURE: bool = False - - # APIs - SUNNAH_API_KEY: Optional[str] = None SUNNAH_BASE_URL: str = "https://api.sunnah.com/v1" HADITH_ONE_API_KEY: Optional[str] = "$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK" # HadithAPI.com diff --git a/hadith-ingestion/src/__pycache__/__init__.cpython-311.pyc b/hadith-ingestion/src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..a7fd624 Binary files /dev/null and b/hadith-ingestion/src/__pycache__/__init__.cpython-311.pyc differ diff --git a/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-311.pyc b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-311.pyc new file mode 100644 index 0000000..2a9148c Binary files /dev/null and b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-311.pyc differ diff --git a/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-312.pyc b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-312.pyc new file mode 100644 index 0000000..68d7bf5 Binary files /dev/null and b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-312.pyc differ diff --git a/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-38.pyc b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-38.pyc new file mode 100644 index 0000000..87b5c92 Binary files /dev/null and b/hadith-ingestion/src/__pycache__/main_hadithapi.cpython-38.pyc differ diff --git a/hadith-ingestion/src/api_clients/__pycache__/__init__.cpython-311.pyc b/hadith-ingestion/src/api_clients/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..e6ce562 Binary files /dev/null and b/hadith-ingestion/src/api_clients/__pycache__/__init__.cpython-311.pyc differ diff --git a/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-311.pyc b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-311.pyc new file mode 100644 index 0000000..f178923 Binary files /dev/null and b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-311.pyc differ diff --git a/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc index 8c6dce5..a0ad7e7 100644 Binary files a/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc and b/hadith-ingestion/src/api_clients/__pycache__/hadithapi_client.cpython-312.pyc differ diff --git a/hadith-ingestion/src/database/__pycache__/__init__.cpython-312.pyc b/hadith-ingestion/src/database/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..789481c Binary files /dev/null and b/hadith-ingestion/src/database/__pycache__/__init__.cpython-312.pyc differ diff --git a/hadith-ingestion/src/database/__pycache__/__init__.cpython-38.pyc b/hadith-ingestion/src/database/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..679f0e9 Binary files /dev/null and b/hadith-ingestion/src/database/__pycache__/__init__.cpython-38.pyc differ diff --git a/hadith-ingestion/src/database/__pycache__/repository.cpython-312.pyc b/hadith-ingestion/src/database/__pycache__/repository.cpython-312.pyc new file mode 100644 index 0000000..9927d16 Binary files /dev/null and b/hadith-ingestion/src/database/__pycache__/repository.cpython-312.pyc differ diff --git a/hadith-ingestion/src/database/repository.py b/hadith-ingestion/src/database/repository.py index 85e8b0b..87011a8 100644 --- a/hadith-ingestion/src/database/repository.py +++ b/hadith-ingestion/src/database/repository.py @@ -2,6 +2,7 @@ Database repository for hadith data operations """ from typing import List, Dict, Any, Optional +import json from uuid import UUID import structlog from sqlalchemy import create_engine, text, select, insert, update @@ -14,31 +15,37 @@ logger = structlog.get_logger() class HadithRepository: """Repository for hadith database operations""" - + def __init__(self, database_url: Optional[str] = None): self.database_url = database_url or settings.DATABASE_URL self.engine = create_engine(self.database_url, pool_pre_ping=True) self.SessionLocal = sessionmaker(bind=self.engine) - + + @staticmethod + def _coerce_uuid(value: Any) -> UUID: + if isinstance(value, UUID): + return value + return UUID(str(value)) + def get_session(self) -> Session: """Get database session""" return self.SessionLocal() - + # ===== Collections ===== - + def get_collection_by_abbreviation(self, abbr: str) -> Optional[Dict[str, Any]]: """Get collection by abbreviation""" with self.get_session() as session: query = text(""" - SELECT * FROM collections + SELECT * FROM collections WHERE abbreviation = :abbr """) result = session.execute(query, {"abbr": abbr}).fetchone() - + if result: return dict(result._mapping) return None - + def get_all_collections(self) -> List[Dict[str, Any]]: """Get all collections""" with self.get_session() as session: @@ -68,34 +75,35 @@ class HadithRepository: metadata: Optional[Dict] = None ) -> UUID: """Insert or update a book""" + metadata_json = json.dumps(metadata or {}) with self.get_session() as session: query = text(""" INSERT INTO books (collection_id, book_number, name_english, name_arabic, metadata) VALUES (:collection_id, :book_number, :name_english, :name_arabic, :metadata) - ON CONFLICT (collection_id, book_number) - DO UPDATE SET + ON CONFLICT (collection_id, book_number) + DO UPDATE SET name_english = EXCLUDED.name_english, name_arabic = EXCLUDED.name_arabic, metadata = EXCLUDED.metadata RETURNING id """) - + result = session.execute(query, { "collection_id": str(collection_id), "book_number": book_number, "name_english": name_english, "name_arabic": name_arabic, - "metadata": metadata or {} + "metadata": metadata_json }) session.commit() - - return UUID(result.fetchone()[0]) - + + return self._coerce_uuid(result.fetchone()[0]) + def get_book(self, collection_id: UUID, book_number: int) -> Optional[Dict[str, Any]]: """Get book by collection and book number""" with self.get_session() as session: query = text(""" - SELECT * FROM books + SELECT * FROM books WHERE collection_id = :collection_id AND book_number = :book_number """) result = session.execute(query, { @@ -125,6 +133,7 @@ class HadithRepository: source_metadata: Optional[Dict] = None ) -> UUID: """Insert or update a hadith""" + with self.get_session() as session: query = text(""" INSERT INTO hadiths ( @@ -152,7 +161,8 @@ class HadithRepository: updated_at = NOW() RETURNING id """) - + metadata_json = json.dumps(source_metadata or {}) + result = session.execute(query, { "collection_id": str(collection_id), "book_id": str(book_id) if book_id else None, @@ -165,12 +175,12 @@ class HadithRepository: "chapter_name": chapter_name, "source_id": source_id, "source_url": source_url, - "source_metadata": source_metadata or {} + "source_metadata": metadata_json }) session.commit() - - return UUID(result.fetchone()[0]) - + + return self._coerce_uuid(result.fetchone()[0]) + def get_hadiths_without_embeddings( self, limit: int = 100, @@ -180,8 +190,8 @@ class HadithRepository: with self.get_session() as session: if collection_id: query = text(""" - SELECT * FROM hadiths - WHERE embedding_generated = FALSE + SELECT * FROM hadiths + WHERE embedding_generated = FALSE AND collection_id = :collection_id ORDER BY created_at ASC LIMIT :limit @@ -200,22 +210,26 @@ class HadithRepository: result = session.execute(query, {"limit": limit}).fetchall() return [dict(row._mapping) for row in result] - + def mark_embedding_generated(self, hadith_id: UUID, version: str = "v1"): """Mark hadith as having embedding generated""" with self.get_session() as session: + # Prepare the update query query = text(""" - UPDATE hadiths - SET embedding_generated = TRUE, + UPDATE hadiths + SET embedding_generated = TRUE, embedding_version = :version, updated_at = NOW() WHERE id = :id """) + # Pre-serialize parameters (keeping consistent with other methods that + # serialize payloads/configs before execution) + params = {"id": str(hadith_id), "version": version} session.execute(query, {"id": str(hadith_id), "version": version}) session.commit() - + # ===== Ingestion Jobs ===== - + def create_ingestion_job( self, job_name: str, @@ -230,15 +244,16 @@ class HadithRepository: VALUES (:job_name, :job_type, :source_name, :config, 'running', NOW()) RETURNING id """) + # serialize config as JSON for storage result = session.execute(query, { "job_name": job_name, "job_type": job_type, "source_name": source_name, - "config": config or {} + "config": json.dumps(config or {}) }) session.commit() - - return UUID(result.fetchone()[0]) + job_id = result.fetchone()[0] + return job_id if isinstance(job_id, UUID) else UUID(str(job_id)) def update_job_progress( self, @@ -297,7 +312,7 @@ class HadithRepository: "error_message": error_message }) session.commit() - + def add_processing_log( self, job_id: UUID, @@ -311,16 +326,17 @@ class HadithRepository: INSERT INTO processing_logs (job_id, log_level, message, details) VALUES (:job_id, :level, :message, :details) """) + details_json = json.dumps(details or {}) session.execute(query, { "job_id": str(job_id), "level": level, "message": message, - "details": details or {} + "details": details_json }) session.commit() - + # ===== Statistics ===== - + def get_collection_stats(self, collection_id: UUID) -> Dict[str, Any]: """Get statistics for a collection""" with self.get_session() as session: @@ -328,7 +344,7 @@ class HadithRepository: SELECT * FROM get_collection_statistics(:collection_id) """) result = session.execute(query, {"collection_id": str(collection_id)}).fetchone() - + if result: return dict(result._mapping) return {} \ No newline at end of file diff --git a/hadith-ingestion/src/main_hadithapi.py b/hadith-ingestion/src/main_hadithapi.py index 19dc566..73ce32e 100644 --- a/hadith-ingestion/src/main_hadithapi.py +++ b/hadith-ingestion/src/main_hadithapi.py @@ -7,9 +7,9 @@ from typing import Optional, Dict, Any from uuid import UUID import structlog from config.settings import settings -from api_clients.hadithapi_client import HadithAPIClient -from database.repository import HadithRepository -from processors.text_cleaner import ArabicTextProcessor, TextCleaner +from src.api_clients.hadithapi_client import HadithAPIClient +from src.database.repository import HadithRepository +from src.processors.text_cleaner import ArabicTextProcessor, TextCleaner # Configure structured logging structlog.configure( @@ -35,14 +35,15 @@ logger = structlog.get_logger() BOOK_SLUG_MAPPING = { 'sahih-bukhari': 'bukhari', 'sahih-muslim': 'muslim', - 'sunan-abu-dawood': 'abudawud', - 'jami-at-tirmidhi': 'tirmidhi', - 'sunan-an-nasai': 'nasai', - 'sunan-ibn-e-majah': 'ibnmajah', + 'abu-dawood': 'abudawud', + 'al-tirmidhi': 'tirmidhi', + 'sunan-nasai': 'nasai', + 'ibn-e-majah': 'ibnmajah', 'muwatta-imam-malik': 'malik', 'musnad-ahmad': 'ahmad', 'sunan-ad-darimi': 'darimi', - 'mishkat-al-masabih': 'mishkat' + 'mishkat': 'mishkat', + 'al-silsila-sahiha': 'al-silsila-sahiha' } @@ -66,12 +67,12 @@ class HadithAPIIngestionService: # Get books from API api_books = self.api_client.get_books() - + book_mapping = {} - + # print(BOOK_SLUG_MAPPING) for api_book in api_books: book_slug = api_book.get('bookSlug') - + # print(book_slug) # Map to our collection abbreviation collection_abbr = BOOK_SLUG_MAPPING.get(book_slug) @@ -82,7 +83,6 @@ class HadithAPIIngestionService: book_name=api_book.get('bookName') ) continue - # Get or verify collection exists in database collection = self.repo.get_collection_by_abbreviation(collection_abbr) @@ -93,8 +93,10 @@ class HadithAPIIngestionService: book_slug=book_slug ) continue - - collection_id = UUID(collection['id']) + + collection_id = collection['id'] + if not isinstance(collection_id, UUID): + collection_id = UUID(str(collection_id)) book_mapping[book_slug] = { 'collection_id': collection_id, 'book_id': api_book.get('id'), @@ -305,7 +307,7 @@ class HadithAPIIngestionService: if not arabic_text: raise ValueError("Missing Arabic text") - + # passed logger.warning("Arabic text extracted and validated", hadith_number=hadith_number) # Clean texts arabic_text = self.text_cleaner.clean_text(arabic_text) if english_text: @@ -319,14 +321,15 @@ class HadithAPIIngestionService: # Get or create chapter (book in our schema) book_id = None chapter_name = None - + # logger.warning("Processing chapter data####", chapter_data.get('positional_args', {}).get('id')) + # logger.info("Processing chapter data####2####", chapter_data.get('id')) if chapter_data: chapter_id = chapter_data.get('id') chapter_number = chapter_data.get('chapterNumber') chapter_name_en = chapter_data.get('chapterEnglish') chapter_name_ar = chapter_data.get('chapterArabic') chapter_name = chapter_name_en - + # print(chapter_number, chapter_name) if chapter_number: try: chapter_number = int(chapter_number) @@ -335,7 +338,8 @@ class HadithAPIIngestionService: # Get or create book (chapter in HadithAPI = book in our schema) existing_book = self.repo.get_book(collection_id, chapter_number) - + # logger.warning("EXISTING BOOK : ", existing_book) + # logger.warning("Fetched or created book", collection_id=collection_id, chapter_number=chapter_number, chapter_name=chapter_name) if not existing_book: book_id = self.repo.upsert_book( collection_id=collection_id, @@ -345,8 +349,12 @@ class HadithAPIIngestionService: metadata=chapter_data ) else: - book_id = UUID(existing_book['id']) - + # print(existing_book['id']) + existing_id = existing_book['id'] + if isinstance(existing_id, str): + existing_id = UUID(existing_id) + book_id = existing_id + # Build source metadata source_metadata = { 'api_id': hadith_data.get('id'), @@ -356,7 +364,7 @@ class HadithAPIIngestionService: 'chapterId': hadith_data.get('chapterId'), 'chapter': chapter_data } - + # logger.warning("Hadith metadata built", source_metadata) # Store hadith hadith_id = self.repo.upsert_hadith( collection_id=collection_id, diff --git a/hadith-ingestion/src/processors/__pycache__/__init__.cpython-312.pyc b/hadith-ingestion/src/processors/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..f24452c Binary files /dev/null and b/hadith-ingestion/src/processors/__pycache__/__init__.cpython-312.pyc differ diff --git a/hadith-ingestion/src/processors/__pycache__/text_cleaner.cpython-312.pyc b/hadith-ingestion/src/processors/__pycache__/text_cleaner.cpython-312.pyc new file mode 100644 index 0000000..d576a61 Binary files /dev/null and b/hadith-ingestion/src/processors/__pycache__/text_cleaner.cpython-312.pyc differ diff --git a/hadith-ingestion/test_mainhadithapi.py b/hadith-ingestion/test_mainhadithapi.py new file mode 100644 index 0000000..287fe83 --- /dev/null +++ b/hadith-ingestion/test_mainhadithapi.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +""" +Test script for main_hadithapi.py +""" +import sys +import os +sys.path.insert(0, '.') + +from src.main_hadithapi import HadithAPIIngestionService + +def test_main_hadithapi(): + """Test the main HadithAPI ingestion service""" + print("=== Testing HadithAPI Ingestion Service ===\n") + + try: + # Initialize the service + print("1. Initializing HadithAPIIngestionService...") + service = HadithAPIIngestionService() + print("✓ Service initialized successfully\n") + + # Test 1: List available books + print("2. Testing book synchronization...") + book_mapping = service.sync_books_from_api() + print(f"✓ Found {len(book_mapping)} mapped books") + for book_slug, info in list(book_mapping.items())[:3]: # Show first 3 + print(f" - {book_slug}: {info['book_name']} ({info['hadiths_count']} hadiths)") + print() + + # Test 2: Test ingestion with limit + print("3. Testing limited ingestion (10 hadiths from Sahih Bukhari)...") + stats = service.ingest_collection( + book_slug='sahih-bukhari', + limit=10 + ) + print(f"✓ Ingestion completed with stats:") + print(f" Processed: {stats['processed']}") + print(f" Failed: {stats['failed']}") + print(f" Skipped: {stats['skipped']}\n") + + # Test 3: List books functionality + print("4. Testing book listing...") + print("\n=== Available Books ===\n") + for book_slug, info in book_mapping.items(): + print(f"Book Slug: {book_slug}") + print(f" Name: {info['book_name']}") + print(f" Hadiths: {info['hadiths_count']}") + print(f" Chapters: {info['chapters_count']}") + print() + + # Clean up + service.close() + print("=== All Tests Passed! ===") + return True + + except Exception as e: + print(f"✗ Test failed with error: {e}") + import traceback + traceback.print_exc() + return False + +def test_command_line_args(): + """Test command line argument parsing""" + print("=== Testing Command Line Arguments ===\n") + + # We'll simulate command line arguments + import argparse + from src.main_hadithapi import main + + # Test --list-books argument + print("1. Testing --list-books argument...") + original_argv = sys.argv.copy() + + try: + sys.argv = ['main_hadithapi.py', '--list-books'] + # We won't actually run main() as it would exit, but we can check the parsing + parser = argparse.ArgumentParser(description="Ingest hadiths from HadithAPI.com") + parser.add_argument("--book-slug", help="Book slug (e.g., sahih-bukhari)") + parser.add_argument("--limit", type=int, help="Limit number of hadiths to ingest") + parser.add_argument("--list-books", action="store_true", help="List available books and exit") + + args = parser.parse_args(['--list-books']) + print(f"✓ Argument parsing successful: list_books={args.list_books}") + + # Test book-slug argument + args = parser.parse_args(['--book-slug', 'sahih-bukhari', '--limit', '5']) + print(f"✓ Argument parsing successful: book_slug={args.book_slug}, limit={args.limit}") + + print("✓ Command line argument parsing works correctly\n") + return True + + except Exception as e: + print(f"✗ Argument parsing failed: {e}") + return False + finally: + sys.argv = original_argv + +if __name__ == "__main__": + print("Starting tests for main_hadithapi.py...\n") + + # Test command line arguments + if not test_command_line_args(): + sys.exit(1) + + # Test main functionality + if not test_main_hadithapi(): + sys.exit(1) + + print("\n🎉 All tests passed successfully!") + sys.exit(0) \ No newline at end of file diff --git a/hadith-ingestion/tests/__pycache__/__init__.cpython-311.pyc b/hadith-ingestion/tests/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..49ca1f4 Binary files /dev/null and b/hadith-ingestion/tests/__pycache__/__init__.cpython-311.pyc differ diff --git a/hadith-ingestion/tests/__pycache__/test_clients.cpython-311-pytest-7.4.0.pyc b/hadith-ingestion/tests/__pycache__/test_clients.cpython-311-pytest-7.4.0.pyc new file mode 100644 index 0000000..97ad15a Binary files /dev/null and b/hadith-ingestion/tests/__pycache__/test_clients.cpython-311-pytest-7.4.0.pyc differ