Fix ingestion test errors

This commit is contained in:
salahangal 2025-11-17 13:49:40 +01:00
parent 53cd6e2415
commit 737ac54c1b
23 changed files with 194 additions and 62 deletions

View File

@ -1,5 +1,6 @@
# Database
DATABASE_HOST=postgres.db.svc.cluster.local
# DATABASE_HOST=postgres.db.svc.cluster.local
DATABASE_HOST=pg.betelgeusebytes.io
DATABASE_PORT=5432
DATABASE_NAME=hadith_db
DATABASE_USER=hadith_ingest

View File

@ -10,7 +10,8 @@ class Settings(BaseSettings):
"""Application settings loaded from environment variables"""
# Database
DATABASE_HOST: str = "postgres.db.svc.cluster.local"
# DATABASE_HOST: str = "postgres.db.svc.cluster.local"
DATABASE_HOST: str = "pg.betelgeusebytes.io"
DATABASE_PORT: int = 5432
DATABASE_NAME: str = "hadith_db"
DATABASE_USER: str = "hadith_ingest"
@ -37,9 +38,6 @@ class Settings(BaseSettings):
MINIO_BUCKET_RAW: str = "hadith-raw-data"
MINIO_BUCKET_PROCESSED: str = "hadith-processed"
MINIO_SECURE: bool = False
# APIs
SUNNAH_API_KEY: Optional[str] = None
SUNNAH_BASE_URL: str = "https://api.sunnah.com/v1"
HADITH_ONE_API_KEY: Optional[str] = "$2y$10$nTJnyX3WUDoGmjKrKqSmbecANVsQWKyffmtp9fxmsQwR15DEv4mK"
# HadithAPI.com

View File

@ -2,6 +2,7 @@
Database repository for hadith data operations
"""
from typing import List, Dict, Any, Optional
import json
from uuid import UUID
import structlog
from sqlalchemy import create_engine, text, select, insert, update
@ -20,6 +21,12 @@ class HadithRepository:
self.engine = create_engine(self.database_url, pool_pre_ping=True)
self.SessionLocal = sessionmaker(bind=self.engine)
@staticmethod
def _coerce_uuid(value: Any) -> UUID:
    """Return *value* as a ``UUID``.

    A value that is already a ``UUID`` is handed back unchanged; anything
    else is converted by parsing its ``str()`` form.
    """
    return value if isinstance(value, UUID) else UUID(str(value))
def get_session(self) -> Session:
    """Create a new database session from the repository's session factory."""
    new_session = self.SessionLocal()
    return new_session
@ -68,6 +75,7 @@ class HadithRepository:
metadata: Optional[Dict] = None
) -> UUID:
"""Insert or update a book"""
metadata_json = json.dumps(metadata or {})
with self.get_session() as session:
query = text("""
INSERT INTO books (collection_id, book_number, name_english, name_arabic, metadata)
@ -85,11 +93,11 @@ class HadithRepository:
"book_number": book_number,
"name_english": name_english,
"name_arabic": name_arabic,
"metadata": metadata or {}
"metadata": metadata_json
})
session.commit()
return UUID(result.fetchone()[0])
return self._coerce_uuid(result.fetchone()[0])
def get_book(self, collection_id: UUID, book_number: int) -> Optional[Dict[str, Any]]:
"""Get book by collection and book number"""
@ -125,6 +133,7 @@ class HadithRepository:
source_metadata: Optional[Dict] = None
) -> UUID:
"""Insert or update a hadith"""
with self.get_session() as session:
query = text("""
INSERT INTO hadiths (
@ -152,6 +161,7 @@ class HadithRepository:
updated_at = NOW()
RETURNING id
""")
metadata_json = json.dumps(source_metadata or {})
result = session.execute(query, {
"collection_id": str(collection_id),
@ -165,11 +175,11 @@ class HadithRepository:
"chapter_name": chapter_name,
"source_id": source_id,
"source_url": source_url,
"source_metadata": source_metadata or {}
"source_metadata": metadata_json
})
session.commit()
return UUID(result.fetchone()[0])
return self._coerce_uuid(result.fetchone()[0])
def get_hadiths_without_embeddings(
self,
@ -204,6 +214,7 @@ class HadithRepository:
def mark_embedding_generated(self, hadith_id: UUID, version: str = "v1"):
"""Mark hadith as having embedding generated"""
with self.get_session() as session:
# Prepare the update query
query = text("""
UPDATE hadiths
SET embedding_generated = TRUE,
@ -211,6 +222,9 @@ class HadithRepository:
updated_at = NOW()
WHERE id = :id
""")
# Pre-serialize parameters (keeping consistent with other methods that
# serialize payloads/configs before execution)
params = {"id": str(hadith_id), "version": version}
session.execute(query, {"id": str(hadith_id), "version": version})
session.commit()
@ -230,15 +244,16 @@ class HadithRepository:
VALUES (:job_name, :job_type, :source_name, :config, 'running', NOW())
RETURNING id
""")
# serialize config as JSON for storage
result = session.execute(query, {
"job_name": job_name,
"job_type": job_type,
"source_name": source_name,
"config": config or {}
"config": json.dumps(config or {})
})
session.commit()
return UUID(result.fetchone()[0])
job_id = result.fetchone()[0]
return job_id if isinstance(job_id, UUID) else UUID(str(job_id))
def update_job_progress(
self,
@ -311,11 +326,12 @@ class HadithRepository:
INSERT INTO processing_logs (job_id, log_level, message, details)
VALUES (:job_id, :level, :message, :details)
""")
details_json = json.dumps(details or {})
session.execute(query, {
"job_id": str(job_id),
"level": level,
"message": message,
"details": details or {}
"details": details_json
})
session.commit()

View File

@ -7,9 +7,9 @@ from typing import Optional, Dict, Any
from uuid import UUID
import structlog
from config.settings import settings
from api_clients.hadithapi_client import HadithAPIClient
from database.repository import HadithRepository
from processors.text_cleaner import ArabicTextProcessor, TextCleaner
from src.api_clients.hadithapi_client import HadithAPIClient
from src.database.repository import HadithRepository
from src.processors.text_cleaner import ArabicTextProcessor, TextCleaner
# Configure structured logging
structlog.configure(
@ -35,14 +35,15 @@ logger = structlog.get_logger()
BOOK_SLUG_MAPPING = {
'sahih-bukhari': 'bukhari',
'sahih-muslim': 'muslim',
'sunan-abu-dawood': 'abudawud',
'jami-at-tirmidhi': 'tirmidhi',
'sunan-an-nasai': 'nasai',
'sunan-ibn-e-majah': 'ibnmajah',
'abu-dawood': 'abudawud',
'al-tirmidhi': 'tirmidhi',
'sunan-nasai': 'nasai',
'ibn-e-majah': 'ibnmajah',
'muwatta-imam-malik': 'malik',
'musnad-ahmad': 'ahmad',
'sunan-ad-darimi': 'darimi',
'mishkat-al-masabih': 'mishkat'
'mishkat': 'mishkat',
'al-silsila-sahiha': 'al-silsila-sahiha'
}
@ -68,10 +69,10 @@ class HadithAPIIngestionService:
api_books = self.api_client.get_books()
book_mapping = {}
# print(BOOK_SLUG_MAPPING)
for api_book in api_books:
book_slug = api_book.get('bookSlug')
# print(book_slug)
# Map to our collection abbreviation
collection_abbr = BOOK_SLUG_MAPPING.get(book_slug)
@ -82,7 +83,6 @@ class HadithAPIIngestionService:
book_name=api_book.get('bookName')
)
continue
# Get or verify collection exists in database
collection = self.repo.get_collection_by_abbreviation(collection_abbr)
@ -94,7 +94,9 @@ class HadithAPIIngestionService:
)
continue
collection_id = UUID(collection['id'])
collection_id = collection['id']
if not isinstance(collection_id, UUID):
collection_id = UUID(str(collection_id))
book_mapping[book_slug] = {
'collection_id': collection_id,
'book_id': api_book.get('id'),
@ -305,7 +307,7 @@ class HadithAPIIngestionService:
if not arabic_text:
raise ValueError("Missing Arabic text")
# passed logger.warning("Arabic text extracted and validated", hadith_number=hadith_number)
# Clean texts
arabic_text = self.text_cleaner.clean_text(arabic_text)
if english_text:
@ -319,14 +321,15 @@ class HadithAPIIngestionService:
# Get or create chapter (book in our schema)
book_id = None
chapter_name = None
# logger.warning("Processing chapter data####", chapter_data.get('positional_args', {}).get('id'))
# logger.info("Processing chapter data####2####", chapter_data.get('id'))
if chapter_data:
chapter_id = chapter_data.get('id')
chapter_number = chapter_data.get('chapterNumber')
chapter_name_en = chapter_data.get('chapterEnglish')
chapter_name_ar = chapter_data.get('chapterArabic')
chapter_name = chapter_name_en
# print(chapter_number, chapter_name)
if chapter_number:
try:
chapter_number = int(chapter_number)
@ -335,7 +338,8 @@ class HadithAPIIngestionService:
# Get or create book (chapter in HadithAPI = book in our schema)
existing_book = self.repo.get_book(collection_id, chapter_number)
# logger.warning("EXISTING BOOK : ", existing_book)
# logger.warning("Fetched or created book", collection_id=collection_id, chapter_number=chapter_number, chapter_name=chapter_name)
if not existing_book:
book_id = self.repo.upsert_book(
collection_id=collection_id,
@ -345,7 +349,11 @@ class HadithAPIIngestionService:
metadata=chapter_data
)
else:
book_id = UUID(existing_book['id'])
# print(existing_book['id'])
existing_id = existing_book['id']
if isinstance(existing_id, str):
existing_id = UUID(existing_id)
book_id = existing_id
# Build source metadata
source_metadata = {
@ -356,7 +364,7 @@ class HadithAPIIngestionService:
'chapterId': hadith_data.get('chapterId'),
'chapter': chapter_data
}
# logger.warning("Hadith metadata built", source_metadata)
# Store hadith
hadith_id = self.repo.upsert_hadith(
collection_id=collection_id,

View File

@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
Test script for main_hadithapi.py
"""
import sys
import os
sys.path.insert(0, '.')
from src.main_hadithapi import HadithAPIIngestionService
def test_main_hadithapi():
    """Exercise HadithAPIIngestionService end to end and report the outcome.

    Steps, each announced on stdout:
      1. construct the service,
      2. call ``sync_books_from_api()`` and show the first three mapped books,
      3. ingest ``'sahih-bukhari'`` with ``limit=10`` and print the stats,
      4. print every mapped book with its hadith and chapter counts.

    Returns:
        bool: True when every step completes; False when any step raises.
        The error and its traceback are printed instead of being propagated.
    """
    print("=== Testing HadithAPI Ingestion Service ===\n")

    try:
        # Initialize the service
        print("1. Initializing HadithAPIIngestionService...")
        service = HadithAPIIngestionService()

        # From here on the service exists, so it must be closed even when a
        # later step raises; previously close() ran on the success path only.
        try:
            print("✓ Service initialized successfully\n")

            # Test 1: List available books
            print("2. Testing book synchronization...")
            book_mapping = service.sync_books_from_api()
            print(f"✓ Found {len(book_mapping)} mapped books")
            for book_slug, info in list(book_mapping.items())[:3]:  # Show first 3
                print(f" - {book_slug}: {info['book_name']} ({info['hadiths_count']} hadiths)")
            print()

            # Test 2: Test ingestion with limit
            print("3. Testing limited ingestion (10 hadiths from Sahih Bukhari)...")
            stats = service.ingest_collection(
                book_slug='sahih-bukhari',
                limit=10
            )
            print("✓ Ingestion completed with stats:")
            print(f" Processed: {stats['processed']}")
            print(f" Failed: {stats['failed']}")
            print(f" Skipped: {stats['skipped']}\n")

            # Test 3: List books functionality
            print("4. Testing book listing...")
            print("\n=== Available Books ===\n")
            for book_slug, info in book_mapping.items():
                print(f"Book Slug: {book_slug}")
                print(f" Name: {info['book_name']}")
                print(f" Hadiths: {info['hadiths_count']}")
                print(f" Chapters: {info['chapters_count']}")
                print()
        finally:
            # Clean up
            service.close()

        print("=== All Tests Passed! ===")
        return True

    except Exception as e:
        print(f"✗ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_command_line_args():
    """Check that the expected command-line options parse correctly.

    The real ``main()`` is deliberately not invoked (it would exit the
    process); instead a parser declaring ``--book-slug``, ``--limit`` and
    ``--list-books`` is built locally and fed two argument lists.

    Returns:
        bool: True when both argument lists parse, False when parsing fails.
    """
    import argparse

    print("=== Testing Command Line Arguments ===\n")

    # Test --list-books argument
    print("1. Testing --list-books argument...")

    # ``prog`` is pinned to the script name so usage/error text matches the
    # real entry point without overwriting the process-wide ``sys.argv``.
    parser = argparse.ArgumentParser(
        prog="main_hadithapi.py",
        description="Ingest hadiths from HadithAPI.com",
    )
    parser.add_argument("--book-slug", help="Book slug (e.g., sahih-bukhari)")
    parser.add_argument("--limit", type=int, help="Limit number of hadiths to ingest")
    parser.add_argument("--list-books", action="store_true", help="List available books and exit")

    try:
        args = parser.parse_args(['--list-books'])
        print(f"✓ Argument parsing successful: list_books={args.list_books}")

        # Test book-slug argument
        args = parser.parse_args(['--book-slug', 'sahih-bukhari', '--limit', '5'])
        print(f"✓ Argument parsing successful: book_slug={args.book_slug}, limit={args.limit}")

        print("✓ Command line argument parsing works correctly\n")
        return True
    except (Exception, SystemExit) as e:
        # argparse reports invalid input by raising SystemExit, which is not
        # an Exception subclass, so it has to be named explicitly here.
        print(f"✗ Argument parsing failed: {e}")
        return False
if __name__ == "__main__":
    print("Starting tests for main_hadithapi.py...\n")

    # Run the argument-parsing check first, then the service check; the first
    # one that reports failure ends the run with exit status 1.
    for check in (test_command_line_args, test_main_hadithapi):
        if not check():
            sys.exit(1)

    print("\n🎉 All tests passed successfully!")
    sys.exit(0)