Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -2,8 +2,8 @@
import asyncio
import logging
from datetime import datetime
from typing import Any, Dict
from datetime import datetime, timedelta
from typing import Any, Dict, Optional
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger
@@ -11,6 +11,7 @@ from sqlalchemy import select
from app.db.session import async_session_factory
from app.models.datasource import DataSource
from app.models.task import CollectionTask
from app.services.collectors.registry import collector_registry
logger = logging.getLogger(__name__)
@@ -89,6 +90,35 @@ async def run_collector_task(collector_name: str):
logger.exception("Collector %s failed: %s", collector_name, exc)
async def cleanup_stale_running_tasks(max_age_hours: int = 2) -> int:
    """Fail collection tasks that have been "running" for too long.

    A task is treated as stale when its status is ``"running"`` and its
    ``started_at`` is set and older than ``max_age_hours`` before the current
    UTC time. Each stale task gets status and phase ``"failed"``, a
    ``completed_at`` timestamp, and a cleanup note appended to any existing
    error message.

    Args:
        max_age_hours: Age threshold, in hours, beyond which a running task
            is considered stale.

    Returns:
        The number of tasks that were marked as failed.
    """
    cleanup_note = "Marked failed automatically after stale running task cleanup"
    stale_before = datetime.utcnow() - timedelta(hours=max_age_hours)

    async with async_session_factory() as db:
        stale_query = select(CollectionTask).where(
            CollectionTask.status == "running",
            CollectionTask.started_at.is_not(None),
            CollectionTask.started_at < stale_before,
        )
        query_result = await db.execute(stale_query)
        stale_tasks = query_result.scalars().all()

        for task in stale_tasks:
            task.status = "failed"
            task.phase = "failed"
            task.completed_at = datetime.utcnow()

            # Keep whatever error was already recorded and add the note below it.
            previous_error = (task.error_message or "").strip()
            if previous_error:
                task.error_message = f"{previous_error}\n{cleanup_note}".strip()
            else:
                task.error_message = cleanup_note

        # Only commit (and log) when something actually changed.
        if stale_tasks:
            await db.commit()
            logger.warning("Cleaned up %s stale running collection task(s)", len(stale_tasks))

    return len(stale_tasks)
def start_scheduler() -> None:
"""Start the scheduler."""
if not scheduler.running:
@@ -144,6 +174,19 @@ def get_scheduler_jobs() -> list[Dict[str, Any]]:
return jobs
async def get_latest_task_id_for_datasource(datasource_id: int) -> Optional[int]:
    """Look up the id of the most recent collection task for a datasource.

    Tasks are ordered by ``created_at`` descending, with ``id`` descending as
    the tie-breaker, and only the first row is fetched.

    Args:
        datasource_id: Value matched against ``CollectionTask.datasource_id``.

    Returns:
        The id of the newest matching task, or ``None`` when no task matches.
    """
    from app.models.task import CollectionTask

    newest_first = (CollectionTask.created_at.desc(), CollectionTask.id.desc())
    latest_id_query = (
        select(CollectionTask.id)
        .where(CollectionTask.datasource_id == datasource_id)
        .order_by(*newest_first)
        .limit(1)
    )

    async with async_session_factory() as db:
        query_result = await db.execute(latest_id_query)
        return query_result.scalar_one_or_none()
def run_collector_now(collector_name: str) -> bool:
"""Run a collector immediately (not scheduled)."""
collector = collector_registry.get(collector_name)