107 lines
4.6 KiB
Python
107 lines
4.6 KiB
Python
# config.py
|
|
import os
|
|
import configparser
|
|
from typing import List
|
|
from src.utils.validation import validate_config_values, validate_string, validate_integer_range, validate_boolean
|
|
|
|
class Config:
|
|
SEARCH_URL: str = "https://news.google.com/search"
|
|
HOMEPAGE_URL: str = "https://news.google.com" # Google News homepage URL
|
|
OUTPUT_FORMAT: str = "json" # Can be "json" or "csv"
|
|
OUTPUT_DIR: str = "data"
|
|
LOG_LEVEL: str = "INFO"
|
|
|
|
# Scheduler settings
|
|
SCHEDULER_INTERVAL_MINUTES: int = 60
|
|
SCHEDULER_DAILY_HOUR: int = 9
|
|
SCHEDULER_DAILY_MINUTE: int = 0
|
|
|
|
# Article collection settings
|
|
DEFAULT_ARTICLE_LIMIT: int = 20 # Default number of articles to collect
|
|
MAX_ARTICLE_LIMIT: int = 100 # Maximum number of articles to collect
|
|
|
|
# Full article scraping settings
|
|
SCRAPE_FULL_ARTICLES: bool = False # Whether to scrape full articles
|
|
FULL_ARTICLES_COUNT: int = 3 # Number of top articles to scrape fully
|
|
|
|
# Image scraping settings
|
|
SCRAPE_IMAGES: bool = False # Whether to process images (default: False)
|
|
IMAGE_SCRAPE_MODE: str = "url-only" # Image scraping mode: "url-only", "download", or "both"
|
|
|
|
# Retry mechanism settings
|
|
MAX_RETRIES: int = 3 # Maximum number of retry attempts
|
|
BASE_DELAY: float = 1.0 # Initial delay between retries in seconds
|
|
MAX_DELAY: float = 60.0 # Maximum delay between retries in seconds
|
|
FAILURE_THRESHOLD: int = 5 # Number of failures before circuit breaker opens
|
|
RECOVERY_TIMEOUT: int = 60 # Seconds to wait before trying again after circuit breaker opens
|
|
|
|
# Concurrency settings
|
|
CONCURRENT_REQUESTS_LIMIT: int = 100 # Maximum concurrent requests
|
|
CONCURRENT_REQUESTS_PER_HOST: int = 30 # Maximum concurrent requests per host
|
|
CONCURRENT_ARTICLES_PROCESSING: int = 10 # Maximum concurrent articles processing
|
|
|
|
# Topics for scheduled collection
|
|
TOPICS: List[str] = [
|
|
"artificial intelligence",
|
|
"machine learning",
|
|
"data science"
|
|
]
|
|
|
|
# Weather API settings
|
|
WEATHER_API_KEY: str = os.getenv("WEATHER_API_KEY", "YOUR_WEATHER_API_KEY_HERE") # Replace with your actual API key
|
|
|
|
@classmethod
|
|
def load_from_file(cls, config_file: str = "config/settings.cfg") -> None:
|
|
"""Load configuration from a file"""
|
|
if os.path.exists(config_file):
|
|
config_values = {}
|
|
lines = []
|
|
with open(config_file, 'r') as f:
|
|
lines = [line.strip() for line in f.readlines() if line.strip() and not line.strip().startswith('#')]
|
|
|
|
i = 0
|
|
while i < len(lines):
|
|
line = lines[i]
|
|
if '=' in line:
|
|
key, value = line.split('=', 1)
|
|
key = key.strip()
|
|
value = value.strip()
|
|
|
|
# Handle multi-line arrays
|
|
if value == '[':
|
|
# Read array values until we find the closing bracket
|
|
array_values = []
|
|
i += 1
|
|
while i < len(lines) and lines[i] != ']':
|
|
array_line = lines[i].strip().rstrip(',')
|
|
if array_line:
|
|
# Remove quotes and whitespace
|
|
cleaned_value = array_line.strip('"\'')
|
|
if cleaned_value:
|
|
array_values.append(cleaned_value)
|
|
i += 1
|
|
config_values[key] = array_values
|
|
# Handle boolean values
|
|
elif key in ("SCRAPE_FULL_ARTICLES", "SCRAPE_IMAGES"):
|
|
config_values[key] = value.lower() in ('true', '1', 'yes', 'on')
|
|
# Handle integer values
|
|
elif key in ("FULL_ARTICLES_COUNT", "SCHEDULER_INTERVAL_MINUTES",
|
|
"SCHEDULER_DAILY_HOUR", "SCHEDULER_DAILY_MINUTE"):
|
|
config_values[key] = int(value)
|
|
# Handle string values
|
|
else:
|
|
config_values[key] = value.strip('"\'')
|
|
i += 1
|
|
|
|
# Validate configuration values
|
|
try:
|
|
validated_config = validate_config_values(config_values)
|
|
|
|
# Update class attributes with validated values
|
|
for key, value in validated_config.items():
|
|
setattr(cls, key, value)
|
|
|
|
except ValueError as e:
|
|
raise ValueError(f"Invalid configuration in {config_file}: {str(e)}")
|
|
|