# News_Scrapper_Demo/news-weather-backend/config/config.py

# config.py
import os
import configparser
from typing import List
from src.utils.validation import validate_config_values, validate_string, validate_integer_range, validate_boolean

class Config:
    """Application settings, stored as class attributes with built-in defaults.

    The defaults below can be overridden at runtime with
    :meth:`load_from_file`, which writes the loaded values back onto the
    class itself (not onto an instance).
    """

    SEARCH_URL: str = "https://news.google.com/search"
    HOMEPAGE_URL: str = "https://news.google.com"  # Google News homepage URL
    OUTPUT_FORMAT: str = "json"  # Can be "json" or "csv"
    OUTPUT_DIR: str = "data"
    LOG_LEVEL: str = "INFO"

    # Scheduler settings
    SCHEDULER_INTERVAL_MINUTES: int = 60
    SCHEDULER_DAILY_HOUR: int = 9
    SCHEDULER_DAILY_MINUTE: int = 0

    # Article collection settings
    DEFAULT_ARTICLE_LIMIT: int = 20  # Default number of articles to collect
    MAX_ARTICLE_LIMIT: int = 100  # Maximum number of articles to collect

    # Full article scraping settings
    SCRAPE_FULL_ARTICLES: bool = False  # Whether to scrape full articles
    FULL_ARTICLES_COUNT: int = 3  # Number of top articles to scrape fully

    # Image scraping settings
    SCRAPE_IMAGES: bool = False  # Whether to process images (default: False)
    IMAGE_SCRAPE_MODE: str = "url-only"  # Image scraping mode: "url-only", "download", or "both"

    # Retry mechanism settings
    MAX_RETRIES: int = 3  # Maximum number of retry attempts
    BASE_DELAY: float = 1.0  # Initial delay between retries in seconds
    MAX_DELAY: float = 60.0  # Maximum delay between retries in seconds
    FAILURE_THRESHOLD: int = 5  # Number of failures before circuit breaker opens
    RECOVERY_TIMEOUT: int = 60  # Seconds to wait before trying again after circuit breaker opens

    # Concurrency settings
    CONCURRENT_REQUESTS_LIMIT: int = 100  # Maximum concurrent requests
    CONCURRENT_REQUESTS_PER_HOST: int = 30  # Maximum concurrent requests per host
    CONCURRENT_ARTICLES_PROCESSING: int = 10  # Maximum concurrent articles processing

    # Topics for scheduled collection
    TOPICS: List[str] = [
        "artificial intelligence",
        "machine learning",
        "data science"
    ]

    # Weather API settings
    # Read from the environment once, when this class body executes; falls
    # back to the placeholder string when WEATHER_API_KEY is not set.
    WEATHER_API_KEY: str = os.getenv("WEATHER_API_KEY", "YOUR_WEATHER_API_KEY_HERE")  # Replace with your actual API key

    # Keys whose file values are parsed into a specific type by the loader.
    # Every other key is kept as a string.
    _BOOLEAN_KEYS = frozenset({"SCRAPE_FULL_ARTICLES", "SCRAPE_IMAGES"})
    _INTEGER_KEYS = frozenset({
        "FULL_ARTICLES_COUNT",
        "SCHEDULER_INTERVAL_MINUTES",
        "SCHEDULER_DAILY_HOUR",
        "SCHEDULER_DAILY_MINUTE",
    })
    _TRUE_STRINGS = frozenset({"true", "1", "yes", "on"})
    _QUOTE_CHARS = "\"'"

    @classmethod
    def load_from_file(cls, config_file: str = "config/settings.cfg") -> None:
        """Override class-level defaults with values read from *config_file*.

        File format, one entry per line:

        * Blank lines and lines starting with ``#`` are ignored, as are
          lines that contain no ``=``.
        * ``KEY = value`` sets a scalar. Surrounding quotes are stripped.
        * ``KEY = [`` opens a multi-line array: each following line is one
          element (quotes and a trailing comma are stripped) until a line
          consisting of ``]``.
        * ``SCRAPE_FULL_ARTICLES`` and ``SCRAPE_IMAGES`` are parsed as
          booleans: ``true``/``1``/``yes``/``on`` (case-insensitive) mean
          ``True``; anything else means ``False``.
        * ``FULL_ARTICLES_COUNT`` and the ``SCHEDULER_*`` keys are parsed as
          integers.

        The parsed values are passed through ``validate_config_values`` and
        the result is assigned onto the class with ``setattr``.

        If *config_file* does not exist, nothing happens and the defaults
        stay in place.

        Args:
            config_file: Path of the settings file to read.

        Raises:
            ValueError: If an integer setting cannot be parsed or
                ``validate_config_values`` rejects the values. The message
                names the offending file.
        """
        if not os.path.exists(config_file):
            return

        # Explicit encoding so parsing does not depend on the platform locale.
        with open(config_file, "r", encoding="utf-8") as f:
            stripped_lines = (raw.strip() for raw in f)
            lines = [
                text for text in stripped_lines
                if text and not text.startswith("#")
            ]

        try:
            config_values = cls._parse_lines(lines)
            validated_config = validate_config_values(config_values)
        except ValueError as e:
            # Keep the original exception as the cause for debugging.
            raise ValueError(f"Invalid configuration in {config_file}: {str(e)}") from e

        # Only touch the class once everything has parsed and validated.
        for key, value in validated_config.items():
            setattr(cls, key, value)

    @classmethod
    def _parse_lines(cls, lines: List[str]) -> dict:
        """Turn pre-stripped, comment-free lines into a ``{key: value}`` dict."""
        values = {}
        remaining = iter(lines)
        for line in remaining:
            key, separator, value = line.partition("=")
            if not separator:
                continue  # not a KEY=value line
            key = key.strip()
            value = value.strip()

            if value == "[":
                # The array helper consumes lines from the shared iterator,
                # so the outer loop resumes after the closing bracket.
                values[key] = cls._read_array_items(remaining)
            elif key in cls._BOOLEAN_KEYS:
                unquoted = value.strip(cls._QUOTE_CHARS)
                values[key] = unquoted.lower() in cls._TRUE_STRINGS
            elif key in cls._INTEGER_KEYS:
                try:
                    values[key] = int(value.strip(cls._QUOTE_CHARS))
                except ValueError as exc:
                    raise ValueError(
                        f"{key} must be an integer, got {value!r}"
                    ) from exc
            else:
                # NOTE(review): other numeric settings (e.g. MAX_RETRIES,
                # BASE_DELAY) are passed on as strings; confirm that
                # validate_config_values coerces them.
                values[key] = value.strip(cls._QUOTE_CHARS)
        return values

    @classmethod
    def _read_array_items(cls, remaining) -> List[str]:
        """Consume array element lines from *remaining* up to the closing ``]``."""
        items = []
        for entry in remaining:
            # Drop a trailing comma, then any whitespace that preceded it.
            entry = entry.rstrip(",").strip()
            if entry == "]":
                break
            cleaned = entry.strip(cls._QUOTE_CHARS)
            if cleaned:
                items.append(cleaned)
        return items