Refine data management and collection workflows
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
from typing import Dict, Any, List
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from app.services.collectors.base import BaseCollector
|
||||
from app.core.data_sources import get_data_sources_config
|
||||
|
||||
from app.services.collectors.base import BaseCollector
|
||||
|
||||
|
||||
class ArcGISCableLandingRelationCollector(BaseCollector):
|
||||
@@ -18,45 +19,129 @@ class ArcGISCableLandingRelationCollector(BaseCollector):
|
||||
def base_url(self) -> str:
|
||||
if self._resolved_url:
|
||||
return self._resolved_url
|
||||
from app.core.data_sources import get_data_sources_config
|
||||
|
||||
config = get_data_sources_config()
|
||||
return config.get_yaml_url("arcgis_cable_landing_relation")
|
||||
|
||||
def _layer_url(self, layer_id: int) -> str:
    """Return the ``query`` URL for ``layer_id`` on the configured FeatureServer.

    The part of ``self.base_url`` before the first ``/FeatureServer/`` segment
    is reused as the service root. When the base URL has no such segment it is
    returned unchanged.
    """
    service_root, marker, _ = self.base_url.partition("/FeatureServer/")
    if not marker:
        # No FeatureServer segment to rewrite: fall back to the base URL itself.
        return self.base_url
    return f"{service_root}/FeatureServer/{layer_id}/query"
|
||||
|
||||
async def _fetch_layer_attributes(
    self, client: httpx.AsyncClient, layer_id: int
) -> List[Dict[str, Any]]:
    """Fetch the attribute rows (no geometry) of one FeatureServer layer.

    Args:
        client: Open HTTP client used to issue the GET request.
        layer_id: Layer number passed to ``self._layer_url``.

    Returns:
        One attribute dict per feature; features without attributes yield ``{}``.

    Raises:
        RuntimeError: If the JSON body carries an ``error`` object.
        Exception: Whatever ``response.raise_for_status()`` raises on a
            non-success HTTP status.
    """
    response = await client.get(
        self._layer_url(layer_id),
        params={
            "where": "1=1",
            "outFields": "*",
            "returnGeometry": "false",
            "f": "json",
        },
    )
    response.raise_for_status()
    data = response.json()

    # NOTE(review): ArcGIS REST services commonly answer HTTP 200 with an
    # {"error": {...}} body on query failure. Without this check the failure
    # would be read as "layer has no rows".
    if isinstance(data, dict) and data.get("error"):
        raise RuntimeError(f"ArcGIS layer {layer_id} query failed: {data['error']}")

    # `or` guards against explicit nulls, which `.get(key, default)` passes through.
    return [feature.get("attributes") or {} for feature in data.get("features") or []]
|
||||
|
||||
async def _fetch_relation_features(self, client: httpx.AsyncClient) -> List[Dict[str, Any]]:
    """Fetch all relation features (with geometry) from ``self.base_url`` as GeoJSON.

    Args:
        client: Open HTTP client used to issue the GET request.

    Returns:
        The ``features`` list of the response, or ``[]`` when it is absent or null.

    Raises:
        RuntimeError: If the JSON body carries an ``error`` object.
        Exception: Whatever ``response.raise_for_status()`` raises on a
            non-success HTTP status.
    """
    response = await client.get(
        self.base_url,
        params={
            "where": "1=1",
            "outFields": "*",
            "returnGeometry": "true",
            "f": "geojson",
        },
    )
    response.raise_for_status()
    data = response.json()

    # NOTE(review): ArcGIS REST services commonly answer HTTP 200 with an
    # {"error": {...}} body on query failure. Surface it instead of returning
    # an empty feature list.
    if isinstance(data, dict) and data.get("error"):
        raise RuntimeError(f"ArcGIS relation query failed: {data['error']}")

    return data.get("features") or []
|
||||
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
params = {"where": "1=1", "outFields": "*", "returnGeometry": "true", "f": "geojson"}
|
||||
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
response = await client.get(self.base_url, params=params)
|
||||
response.raise_for_status()
|
||||
return self.parse_response(response.json())
|
||||
relation_features, landing_rows, cable_rows = await asyncio.gather(
|
||||
self._fetch_relation_features(client),
|
||||
self._fetch_layer_attributes(client, 1),
|
||||
self._fetch_layer_attributes(client, 2),
|
||||
)
|
||||
return self.parse_response(relation_features, landing_rows, cable_rows)
|
||||
|
||||
def parse_response(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
||||
result = []
|
||||
def _build_landing_lookup(self, landing_rows: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
    """Index landing-point attribute rows by their integer ``city_id``.

    Rows whose ``city_id`` is missing or cannot be converted to ``int`` are
    skipped, so a single malformed row does not abort the whole lookup.

    Args:
        landing_rows: Attribute dicts of the landing-point layer.

    Returns:
        Mapping of ``city_id`` to a dict with the keys ``landing_point_id``,
        ``landing_point_name``, ``facility``, ``status`` and ``country``.
        Missing text fields become ``""``.
    """
    lookup: Dict[int, Dict[str, Any]] = {}
    for row in landing_rows:
        city_id = row.get("city_id")
        if city_id is None:
            continue
        try:
            key = int(city_id)
        except (TypeError, ValueError):
            # Non-numeric id: this row cannot be joined to a relation, skip it.
            continue
        lookup[key] = {
            # Falls back to the raw city_id when the row has no dedicated id.
            "landing_point_id": row.get("landing_point_id") or city_id,
            "landing_point_name": row.get("Name") or row.get("name") or "",
            "facility": row.get("facility") or "",
            "status": row.get("status") or "",
            "country": row.get("country") or "",
        }
    return lookup
|
||||
|
||||
features = data.get("features", [])
|
||||
for feature in features:
|
||||
def _build_cable_lookup(self, cable_rows: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
    """Index cable attribute rows by their integer ``cable_id``.

    Rows whose ``cable_id`` is missing or cannot be converted to ``int`` are
    skipped, so a single malformed row does not abort the whole lookup.

    Args:
        cable_rows: Attribute dicts of the cable layer.

    Returns:
        Mapping of ``cable_id`` to ``{"cable_name": ..., "status": ...}``.
        A missing name becomes ``""`` and a missing status becomes ``"active"``.
    """
    lookup: Dict[int, Dict[str, Any]] = {}
    for row in cable_rows:
        cable_id = row.get("cable_id")
        if cable_id is None:
            continue
        try:
            key = int(cable_id)
        except (TypeError, ValueError):
            # Non-numeric id: this row cannot be joined to a relation, skip it.
            continue
        lookup[key] = {
            # Accept both field-name casings, like the landing-point lookup does.
            "cable_name": row.get("Name") or row.get("name") or "",
            "status": row.get("status") or "active",
        }
    return lookup
|
||||
|
||||
def parse_response(
|
||||
self,
|
||||
relation_features: List[Dict[str, Any]],
|
||||
landing_rows: List[Dict[str, Any]],
|
||||
cable_rows: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
result: List[Dict[str, Any]] = []
|
||||
landing_lookup = self._build_landing_lookup(landing_rows)
|
||||
cable_lookup = self._build_cable_lookup(cable_rows)
|
||||
|
||||
for feature in relation_features:
|
||||
props = feature.get("properties", {})
|
||||
|
||||
try:
|
||||
city_id = props.get("city_id")
|
||||
cable_id = props.get("cable_id")
|
||||
landing_info = landing_lookup.get(int(city_id), {}) if city_id is not None else {}
|
||||
cable_info = cable_lookup.get(int(cable_id), {}) if cable_id is not None else {}
|
||||
|
||||
cable_name = cable_info.get("cable_name") or props.get("cable_name") or "Unknown"
|
||||
landing_point_name = (
|
||||
landing_info.get("landing_point_name")
|
||||
or props.get("landing_point_name")
|
||||
or "Unknown"
|
||||
)
|
||||
facility = landing_info.get("facility") or props.get("facility") or "-"
|
||||
status = cable_info.get("status") or landing_info.get("status") or props.get("status") or "-"
|
||||
country = landing_info.get("country") or props.get("country") or ""
|
||||
landing_point_id = landing_info.get("landing_point_id") or props.get("landing_point_id") or city_id
|
||||
|
||||
entry = {
|
||||
"source_id": f"arcgis_relation_{props.get('OBJECTID', props.get('id', ''))}",
|
||||
"name": f"{props.get('cable_name', 'Unknown')} - {props.get('landing_point_name', 'Unknown')}",
|
||||
"country": props.get("country", ""),
|
||||
"city": props.get("landing_point_name", ""),
|
||||
"name": f"{cable_name} - {landing_point_name}",
|
||||
"country": country,
|
||||
"city": landing_point_name,
|
||||
"latitude": str(props.get("latitude", "")) if props.get("latitude") else "",
|
||||
"longitude": str(props.get("longitude", "")) if props.get("longitude") else "",
|
||||
"value": "",
|
||||
"unit": "",
|
||||
"metadata": {
|
||||
"objectid": props.get("OBJECTID"),
|
||||
"city_id": props.get("city_id"),
|
||||
"cable_id": props.get("cable_id"),
|
||||
"cable_name": props.get("cable_name"),
|
||||
"landing_point_id": props.get("landing_point_id"),
|
||||
"landing_point_name": props.get("landing_point_name"),
|
||||
"facility": props.get("facility"),
|
||||
"status": props.get("status"),
|
||||
"city_id": city_id,
|
||||
"cable_id": cable_id,
|
||||
"cable_name": cable_name,
|
||||
"landing_point_id": landing_point_id,
|
||||
"landing_point_name": landing_point_name,
|
||||
"facility": facility,
|
||||
"status": status,
|
||||
},
|
||||
"reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
|
||||
}
|
||||
|
||||
@@ -4,10 +4,12 @@ from abc import ABC, abstractmethod
|
||||
from typing import Dict, List, Any, Optional
|
||||
from datetime import datetime
|
||||
import httpx
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.collected_data_fields import build_dynamic_metadata, get_record_field
|
||||
from app.core.config import settings
|
||||
from app.core.countries import normalize_country
|
||||
|
||||
|
||||
class BaseCollector(ABC):
|
||||
@@ -39,6 +41,11 @@ class BaseCollector(ABC):
|
||||
records_processed / self._current_task.total_records
|
||||
) * 100
|
||||
|
||||
async def set_phase(self, phase: str):
    """Store ``phase`` on the current task and commit it.

    Does nothing unless both ``self._current_task`` and ``self._db_session``
    are set.
    """
    if not self._current_task or not self._db_session:
        return
    self._current_task.phase = phase
    await self._db_session.commit()
|
||||
|
||||
@abstractmethod
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
"""Fetch raw data from source"""
|
||||
@@ -48,14 +55,87 @@ class BaseCollector(ABC):
|
||||
"""Transform raw data to internal format (default: pass through)"""
|
||||
return raw_data
|
||||
|
||||
def _parse_reference_date(self, value: Any) -> Optional[datetime]:
    """Convert a raw ``reference_date`` value to a ``datetime``.

    Args:
        value: A ``datetime`` (returned as-is), an ISO-8601 string (a ``Z``
            suffix is read as ``+00:00``), or anything else.

    Returns:
        The parsed ``datetime``; ``None`` for falsy values, unsupported types
        and strings that are not valid ISO-8601. Strings with an offset yield
        timezone-aware values, all others naive ones.
    """
    if not value:
        return None
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        try:
            # Older `fromisoformat` versions reject a literal "Z" suffix.
            return datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
        except ValueError:
            # One unparsable date must not abort the whole collection run.
            return None
    return None
|
||||
|
||||
def _build_comparable_payload(self, record: Any) -> Dict[str, Any]:
    """Reduce ``record`` to a plain dict that can be compared with ``==``.

    ``name``, ``title`` and ``description`` are read straight off the record,
    the location/value fields go through ``get_record_field``, ``extra_data``
    is exposed as ``metadata`` (``{}`` when empty), and ``reference_date`` is
    rendered with ``isoformat()`` or left as ``None``.
    """
    payload: Dict[str, Any] = {
        attribute: getattr(record, attribute, None)
        for attribute in ("name", "title", "description")
    }
    for field_name in ("country", "city", "latitude", "longitude", "value", "unit"):
        payload[field_name] = get_record_field(record, field_name)

    payload["metadata"] = getattr(record, "extra_data", None) or {}

    if getattr(record, "reference_date", None):
        payload["reference_date"] = getattr(record, "reference_date", None).isoformat()
    else:
        payload["reference_date"] = None
    return payload
|
||||
|
||||
async def _create_snapshot(
    self,
    db: AsyncSession,
    task_id: int,
    data: List[Dict[str, Any]],
    started_at: datetime,
) -> int:
    """Create the ``running`` snapshot row for this collection run.

    The newest snapshot of this source that is still flagged current becomes
    the parent of the new one and loses its ``is_current`` flag.

    Args:
        db: Session used for the lookup, the insert and the commit.
        task_id: Id of the collection task; also part of ``snapshot_key``.
        data: Transformed items. The latest parseable ``reference_date`` among
            them becomes the snapshot's reference date.
        started_at: Start time recorded on the snapshot.

    Returns:
        Primary key of the new snapshot.
    """
    from app.models.data_snapshot import DataSnapshot

    reference_dates = [
        parsed
        for parsed in (self._parse_reference_date(item.get("reference_date")) for item in data)
        if parsed is not None
    ]
    reference_date = max(reference_dates) if reference_dates else None

    result = await db.execute(
        select(DataSnapshot)
        .where(DataSnapshot.source == self.name, DataSnapshot.is_current == True)  # noqa: E712
        .order_by(DataSnapshot.completed_at.desc().nullslast(), DataSnapshot.id.desc())
        .limit(1)
    )
    previous_snapshot = result.scalar_one_or_none()

    snapshot = DataSnapshot(
        datasource_id=getattr(self, "_datasource_id", 1),
        task_id=task_id,
        source=self.name,
        snapshot_key=f"{self.name}:{task_id}",
        reference_date=reference_date,
        started_at=started_at,
        status="running",
        is_current=True,
        parent_snapshot_id=previous_snapshot.id if previous_snapshot else None,
        summary={},
    )
    db.add(snapshot)

    if previous_snapshot:
        previous_snapshot.is_current = False

    # Flush so the primary key is assigned, and read it *before* committing.
    # With expire-on-commit sessions, touching `snapshot.id` after commit would
    # need an implicit refresh, which an AsyncSession cannot do on attribute access.
    await db.flush()
    snapshot_id = snapshot.id

    await db.commit()
    return snapshot_id
|
||||
|
||||
async def run(self, db: AsyncSession) -> Dict[str, Any]:
|
||||
"""Full pipeline: fetch -> transform -> save"""
|
||||
from app.services.collectors.registry import collector_registry
|
||||
from app.models.task import CollectionTask
|
||||
from app.models.collected_data import CollectedData
|
||||
from app.models.data_snapshot import DataSnapshot
|
||||
|
||||
start_time = datetime.utcnow()
|
||||
datasource_id = getattr(self, "_datasource_id", 1)
|
||||
snapshot_id: Optional[int] = None
|
||||
|
||||
if not collector_registry.is_active(self.name):
|
||||
return {"status": "skipped", "reason": "Collector is disabled"}
|
||||
@@ -63,6 +143,7 @@ class BaseCollector(ABC):
|
||||
task = CollectionTask(
|
||||
datasource_id=datasource_id,
|
||||
status="running",
|
||||
phase="queued",
|
||||
started_at=start_time,
|
||||
)
|
||||
db.add(task)
|
||||
@@ -75,15 +156,20 @@ class BaseCollector(ABC):
|
||||
await self.resolve_url(db)
|
||||
|
||||
try:
|
||||
await self.set_phase("fetching")
|
||||
raw_data = await self.fetch()
|
||||
task.total_records = len(raw_data)
|
||||
await db.commit()
|
||||
|
||||
await self.set_phase("transforming")
|
||||
data = self.transform(raw_data)
|
||||
snapshot_id = await self._create_snapshot(db, task_id, data, start_time)
|
||||
|
||||
records_count = await self._save_data(db, data)
|
||||
await self.set_phase("saving")
|
||||
records_count = await self._save_data(db, data, task_id=task_id, snapshot_id=snapshot_id)
|
||||
|
||||
task.status = "success"
|
||||
task.phase = "completed"
|
||||
task.records_processed = records_count
|
||||
task.progress = 100.0
|
||||
task.completed_at = datetime.utcnow()
|
||||
@@ -97,8 +183,15 @@ class BaseCollector(ABC):
|
||||
}
|
||||
except Exception as e:
|
||||
task.status = "failed"
|
||||
task.phase = "failed"
|
||||
task.error_message = str(e)
|
||||
task.completed_at = datetime.utcnow()
|
||||
if snapshot_id is not None:
|
||||
snapshot = await db.get(DataSnapshot, snapshot_id)
|
||||
if snapshot:
|
||||
snapshot.status = "failed"
|
||||
snapshot.completed_at = datetime.utcnow()
|
||||
snapshot.summary = {"error": str(e)}
|
||||
await db.commit()
|
||||
|
||||
return {
|
||||
@@ -108,53 +201,163 @@ class BaseCollector(ABC):
|
||||
"execution_time_seconds": (datetime.utcnow() - start_time).total_seconds(),
|
||||
}
|
||||
|
||||
async def _save_data(self, db: AsyncSession, data: List[Dict[str, Any]]) -> int:
|
||||
async def _save_data(
|
||||
self,
|
||||
db: AsyncSession,
|
||||
data: List[Dict[str, Any]],
|
||||
task_id: Optional[int] = None,
|
||||
snapshot_id: Optional[int] = None,
|
||||
) -> int:
|
||||
"""Save transformed data to database"""
|
||||
from app.models.collected_data import CollectedData
|
||||
from app.models.data_snapshot import DataSnapshot
|
||||
|
||||
if not data:
|
||||
if snapshot_id is not None:
|
||||
snapshot = await db.get(DataSnapshot, snapshot_id)
|
||||
if snapshot:
|
||||
snapshot.record_count = 0
|
||||
snapshot.summary = {"created": 0, "updated": 0, "unchanged": 0}
|
||||
snapshot.status = "success"
|
||||
snapshot.completed_at = datetime.utcnow()
|
||||
await db.commit()
|
||||
return 0
|
||||
|
||||
collected_at = datetime.utcnow()
|
||||
records_added = 0
|
||||
created_count = 0
|
||||
updated_count = 0
|
||||
unchanged_count = 0
|
||||
seen_entity_keys: set[str] = set()
|
||||
previous_current_keys: set[str] = set()
|
||||
|
||||
previous_current_result = await db.execute(
|
||||
select(CollectedData.entity_key).where(
|
||||
CollectedData.source == self.name,
|
||||
CollectedData.is_current == True,
|
||||
)
|
||||
)
|
||||
previous_current_keys = {row[0] for row in previous_current_result.fetchall() if row[0]}
|
||||
|
||||
for i, item in enumerate(data):
|
||||
print(
|
||||
f"DEBUG: Saving item {i}: name={item.get('name')}, metadata={item.get('metadata', 'NOT FOUND')}"
|
||||
)
|
||||
raw_metadata = item.get("metadata", {})
|
||||
extra_data = build_dynamic_metadata(
|
||||
raw_metadata,
|
||||
country=item.get("country"),
|
||||
city=item.get("city"),
|
||||
latitude=item.get("latitude"),
|
||||
longitude=item.get("longitude"),
|
||||
value=item.get("value"),
|
||||
unit=item.get("unit"),
|
||||
)
|
||||
normalized_country = normalize_country(item.get("country"))
|
||||
if normalized_country is not None:
|
||||
extra_data["country"] = normalized_country
|
||||
|
||||
if item.get("country") and normalized_country != item.get("country"):
|
||||
extra_data["raw_country"] = item.get("country")
|
||||
if normalized_country is None:
|
||||
extra_data["country_validation"] = "invalid"
|
||||
|
||||
source_id = item.get("source_id") or item.get("id")
|
||||
reference_date = (
|
||||
self._parse_reference_date(item.get("reference_date"))
|
||||
)
|
||||
source_id_str = str(source_id) if source_id is not None else None
|
||||
entity_key = f"{self.name}:{source_id_str}" if source_id_str else f"{self.name}:{i}"
|
||||
previous_record = None
|
||||
|
||||
if entity_key and entity_key not in seen_entity_keys:
|
||||
result = await db.execute(
|
||||
select(CollectedData)
|
||||
.where(
|
||||
CollectedData.source == self.name,
|
||||
CollectedData.entity_key == entity_key,
|
||||
CollectedData.is_current == True,
|
||||
)
|
||||
.order_by(CollectedData.collected_at.desc().nullslast(), CollectedData.id.desc())
|
||||
)
|
||||
previous_records = result.scalars().all()
|
||||
if previous_records:
|
||||
previous_record = previous_records[0]
|
||||
for old_record in previous_records:
|
||||
old_record.is_current = False
|
||||
|
||||
record = CollectedData(
|
||||
snapshot_id=snapshot_id,
|
||||
task_id=task_id,
|
||||
source=self.name,
|
||||
source_id=item.get("source_id") or item.get("id"),
|
||||
source_id=source_id_str,
|
||||
entity_key=entity_key,
|
||||
data_type=self.data_type,
|
||||
name=item.get("name"),
|
||||
title=item.get("title"),
|
||||
description=item.get("description"),
|
||||
country=item.get("country"),
|
||||
city=item.get("city"),
|
||||
latitude=str(item.get("latitude", ""))
|
||||
if item.get("latitude") is not None
|
||||
else None,
|
||||
longitude=str(item.get("longitude", ""))
|
||||
if item.get("longitude") is not None
|
||||
else None,
|
||||
value=item.get("value"),
|
||||
unit=item.get("unit"),
|
||||
extra_data=item.get("metadata", {}),
|
||||
extra_data=extra_data,
|
||||
collected_at=collected_at,
|
||||
reference_date=datetime.fromisoformat(
|
||||
item.get("reference_date").replace("Z", "+00:00")
|
||||
)
|
||||
if item.get("reference_date")
|
||||
else None,
|
||||
reference_date=reference_date,
|
||||
is_valid=1,
|
||||
is_current=True,
|
||||
previous_record_id=previous_record.id if previous_record else None,
|
||||
deleted_at=None,
|
||||
)
|
||||
|
||||
if previous_record is None:
|
||||
record.change_type = "created"
|
||||
record.change_summary = {}
|
||||
created_count += 1
|
||||
else:
|
||||
previous_payload = self._build_comparable_payload(previous_record)
|
||||
current_payload = self._build_comparable_payload(record)
|
||||
if current_payload == previous_payload:
|
||||
record.change_type = "unchanged"
|
||||
record.change_summary = {}
|
||||
unchanged_count += 1
|
||||
else:
|
||||
changed_fields = [
|
||||
key for key in current_payload.keys() if current_payload[key] != previous_payload.get(key)
|
||||
]
|
||||
record.change_type = "updated"
|
||||
record.change_summary = {"changed_fields": changed_fields}
|
||||
updated_count += 1
|
||||
|
||||
db.add(record)
|
||||
seen_entity_keys.add(entity_key)
|
||||
records_added += 1
|
||||
|
||||
if i % 100 == 0:
|
||||
self.update_progress(i + 1)
|
||||
await db.commit()
|
||||
|
||||
if snapshot_id is not None:
|
||||
deleted_keys = previous_current_keys - seen_entity_keys
|
||||
await db.execute(
|
||||
text(
|
||||
"""
|
||||
UPDATE collected_data
|
||||
SET is_current = FALSE
|
||||
WHERE source = :source
|
||||
AND snapshot_id IS DISTINCT FROM :snapshot_id
|
||||
AND COALESCE(is_current, TRUE) = TRUE
|
||||
"""
|
||||
),
|
||||
{"source": self.name, "snapshot_id": snapshot_id},
|
||||
)
|
||||
snapshot = await db.get(DataSnapshot, snapshot_id)
|
||||
if snapshot:
|
||||
snapshot.record_count = records_added
|
||||
snapshot.status = "success"
|
||||
snapshot.completed_at = datetime.utcnow()
|
||||
snapshot.summary = {
|
||||
"created": created_count,
|
||||
"updated": updated_count,
|
||||
"unchanged": unchanged_count,
|
||||
"deleted": len(deleted_keys),
|
||||
}
|
||||
|
||||
await db.commit()
|
||||
self.update_progress(len(data))
|
||||
return records_added
|
||||
|
||||
@@ -76,7 +76,7 @@ class PeeringDBIXPCollector(HTTPCollector):
|
||||
print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
|
||||
return {}
|
||||
|
||||
async def collect(self) -> List[Dict[str, Any]]:
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
"""Collect IXP data from PeeringDB with rate limit handling"""
|
||||
response_data = await self.fetch_with_retry()
|
||||
if not response_data:
|
||||
@@ -177,7 +177,7 @@ class PeeringDBNetworkCollector(HTTPCollector):
|
||||
print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
|
||||
return {}
|
||||
|
||||
async def collect(self) -> List[Dict[str, Any]]:
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
"""Collect Network data from PeeringDB with rate limit handling"""
|
||||
response_data = await self.fetch_with_retry()
|
||||
if not response_data:
|
||||
@@ -280,7 +280,7 @@ class PeeringDBFacilityCollector(HTTPCollector):
|
||||
print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
|
||||
return {}
|
||||
|
||||
async def collect(self) -> List[Dict[str, Any]]:
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
"""Collect Facility data from PeeringDB with rate limit handling"""
|
||||
response_data = await self.fetch_with_retry()
|
||||
if not response_data:
|
||||
|
||||
@@ -4,9 +4,9 @@ Collects data from TOP500 supercomputer rankings.
|
||||
https://top500.org/lists/top500/
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from typing import Dict, Any, List
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import httpx
|
||||
|
||||
@@ -21,14 +21,108 @@ class TOP500Collector(BaseCollector):
|
||||
data_type = "supercomputer"
|
||||
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
|
||||
"""Fetch TOP500 data from website (scraping)"""
|
||||
# Get the latest list page
|
||||
"""Fetch TOP500 list data and enrich each row with detail-page metadata."""
|
||||
url = "https://top500.org/lists/top500/list/2025/11/"
|
||||
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
|
||||
response = await client.get(url)
|
||||
response.raise_for_status()
|
||||
return self.parse_response(response.text)
|
||||
entries = self.parse_response(response.text)
|
||||
|
||||
semaphore = asyncio.Semaphore(8)
|
||||
|
||||
async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]:
|
||||
detail_url = entry.pop("_detail_url", "")
|
||||
if not detail_url:
|
||||
return entry
|
||||
|
||||
async with semaphore:
|
||||
try:
|
||||
detail_response = await client.get(detail_url)
|
||||
detail_response.raise_for_status()
|
||||
entry["metadata"].update(self.parse_detail_response(detail_response.text))
|
||||
except Exception:
|
||||
entry["metadata"]["detail_fetch_failed"] = True
|
||||
return entry
|
||||
|
||||
return await asyncio.gather(*(enrich(entry) for entry in entries))
|
||||
|
||||
def _extract_system_fields(self, system_cell) -> Dict[str, str]:
    """Split the "System" table cell into name, manufacturer, site and country.

    Args:
        system_cell: Parsed table cell; only ``find("a")`` and ``get_text`` are
            used on it, plus ``get("href")`` / ``next_sibling`` on the link.

    Returns:
        Dict with the keys ``name``, ``manufacturer``, ``site``, ``country``
        and ``detail_url``; values that cannot be determined are ``""``.
    """
    from urllib.parse import urljoin

    link = system_cell.find("a")
    system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True)

    detail_url = ""
    if link and link.get("href"):
        # urljoin keeps absolute hrefs intact and resolves relative ones.
        detail_url = urljoin("https://top500.org/", link.get("href"))

    manufacturer = ""
    sibling = link.next_sibling if link else None
    # Only a text node can carry the manufacturer; a tag sibling such as <br/>
    # would otherwise be stringified into markup.
    if isinstance(sibling, str):
        manufacturer = sibling.strip(" ,\n\t")

    cell_text = system_cell.get_text("\n", strip=True)
    lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()]

    site = ""
    country = ""
    if lines:
        system_name = lines[0]
    if len(lines) >= 3:
        site = lines[-2]
        country = lines[-1]
    elif len(lines) == 2:
        country = lines[-1]

    if not manufacturer and len(lines) >= 2:
        manufacturer = lines[1]

    return {
        "name": system_name,
        "manufacturer": manufacturer,
        "site": site,
        "country": country,
        "detail_url": detail_url,
    }
|
||||
|
||||
def parse_detail_response(self, html: str) -> Dict[str, Any]:
    """Extract known fields from a system detail page.

    Looks for the first ``<table>`` carrying both the ``table`` and
    ``table-condensed`` classes and reads its ``<th>``/``<td>`` rows.

    Args:
        html: Raw HTML of the detail page.

    Returns:
        Mapping of internal field name to the cell text for every recognised
        row label; ``{}`` when no matching table exists.
    """
    soup = BeautifulSoup(html, "html.parser")
    # A CSS selector matches the two classes in any order and alongside extra
    # classes; a `{"class": "table table-condensed"}` filter only matches that
    # exact attribute string.
    detail_table = soup.select_one("table.table.table-condensed")
    if not detail_table:
        return {}

    # Row label as shown on the page -> metadata key.
    label_aliases = {
        "Site": "site",
        "Manufacturer": "manufacturer",
        "Cores": "cores",
        "Processor": "processor",
        "Interconnect": "interconnect",
        "Installation Year": "installation_year",
        "Linpack Performance (Rmax)": "rmax",
        "Theoretical Peak (Rpeak)": "rpeak",
        "Nmax": "nmax",
        "HPCG": "hpcg",
        "Power": "power",
        "Power Measurement Level": "power_measurement_level",
        "Operating System": "operating_system",
        "Compiler": "compiler",
        "Math Library": "math_library",
        "MPI": "mpi",
    }

    detail_map: Dict[str, Any] = {}
    for row in detail_table.find_all("tr"):
        header = row.find("th")
        value_cell = row.find("td")
        if not header or not value_cell:
            continue

        # Labels may end with a colon on the page.
        label = header.get_text(" ", strip=True).rstrip(":")
        key = label_aliases.get(label)
        if not key:
            continue

        detail_map[key] = value_cell.get_text(" ", strip=True)

    return detail_map
|
||||
|
||||
def parse_response(self, html: str) -> List[Dict[str, Any]]:
|
||||
"""Parse TOP500 HTML response"""
|
||||
@@ -36,27 +130,26 @@ class TOP500Collector(BaseCollector):
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Find the table with TOP500 data
|
||||
table = soup.find("table", {"class": "top500-table"})
|
||||
if not table:
|
||||
# Try alternative table selector
|
||||
table = soup.find("table", {"id": "top500"})
|
||||
table = None
|
||||
for candidate in soup.find_all("table"):
|
||||
header_cells = [
|
||||
cell.get_text(" ", strip=True) for cell in candidate.select("thead th")
|
||||
]
|
||||
normalized_headers = [header.lower() for header in header_cells]
|
||||
if (
|
||||
"rank" in normalized_headers
|
||||
and "system" in normalized_headers
|
||||
and any("cores" in header for header in normalized_headers)
|
||||
and any("rmax" in header for header in normalized_headers)
|
||||
):
|
||||
table = candidate
|
||||
break
|
||||
|
||||
if not table:
|
||||
# Try to find any table with rank data
|
||||
tables = soup.find_all("table")
|
||||
for t in tables:
|
||||
if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
|
||||
table = t
|
||||
break
|
||||
|
||||
if not table:
|
||||
# Fallback: try to extract data from any table
|
||||
tables = soup.find_all("table")
|
||||
if tables:
|
||||
table = tables[0]
|
||||
table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})
|
||||
|
||||
if table:
|
||||
rows = table.find_all("tr")
|
||||
rows = table.select("tr")
|
||||
for row in rows[1:]: # Skip header row
|
||||
cells = row.find_all(["td", "th"])
|
||||
if len(cells) >= 6:
|
||||
@@ -68,43 +161,26 @@ class TOP500Collector(BaseCollector):
|
||||
|
||||
rank = int(rank_text)
|
||||
|
||||
# System name (may contain link)
|
||||
system_cell = cells[1]
|
||||
system_name = system_cell.get_text(strip=True)
|
||||
# Try to get full name from link title or data attribute
|
||||
link = system_cell.find("a")
|
||||
if link and link.get("title"):
|
||||
system_name = link.get("title")
|
||||
system_fields = self._extract_system_fields(system_cell)
|
||||
system_name = system_fields["name"]
|
||||
manufacturer = system_fields["manufacturer"]
|
||||
site = system_fields["site"]
|
||||
country = system_fields["country"]
|
||||
detail_url = system_fields["detail_url"]
|
||||
|
||||
# Country
|
||||
country_cell = cells[2]
|
||||
country = country_cell.get_text(strip=True)
|
||||
# Try to get country from data attribute or image alt
|
||||
img = country_cell.find("img")
|
||||
if img and img.get("alt"):
|
||||
country = img.get("alt")
|
||||
|
||||
# Extract location (city)
|
||||
city = ""
|
||||
location_text = country_cell.get_text(strip=True)
|
||||
if "(" in location_text and ")" in location_text:
|
||||
city = location_text.split("(")[0].strip()
|
||||
cores = cells[2].get_text(strip=True).replace(",", "")
|
||||
|
||||
# Cores
|
||||
cores = cells[3].get_text(strip=True).replace(",", "")
|
||||
|
||||
# Rmax
|
||||
rmax_text = cells[4].get_text(strip=True)
|
||||
rmax_text = cells[3].get_text(strip=True)
|
||||
rmax = self._parse_performance(rmax_text)
|
||||
|
||||
# Rpeak
|
||||
rpeak_text = cells[5].get_text(strip=True)
|
||||
rpeak_text = cells[4].get_text(strip=True)
|
||||
rpeak = self._parse_performance(rpeak_text)
|
||||
|
||||
# Power (optional)
|
||||
power = ""
|
||||
if len(cells) >= 7:
|
||||
power = cells[6].get_text(strip=True)
|
||||
if len(cells) >= 6:
|
||||
power = cells[5].get_text(strip=True).replace(",", "")
|
||||
|
||||
entry = {
|
||||
"source_id": f"top500_{rank}",
|
||||
@@ -117,10 +193,14 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": rank,
|
||||
"r_peak": rpeak,
|
||||
"power": power,
|
||||
"cores": cores,
|
||||
"rmax": rmax_text,
|
||||
"rpeak": rpeak_text,
|
||||
"power": power,
|
||||
"manufacturer": manufacturer,
|
||||
"site": site,
|
||||
},
|
||||
"_detail_url": detail_url,
|
||||
"reference_date": "2025-11-01",
|
||||
}
|
||||
data.append(entry)
|
||||
@@ -184,10 +264,15 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 1,
|
||||
"r_peak": 2746.38,
|
||||
"power": 29581,
|
||||
"cores": 11039616,
|
||||
"cores": "11039616",
|
||||
"rmax": "1742.00",
|
||||
"rpeak": "2746.38",
|
||||
"power": "29581",
|
||||
"manufacturer": "HPE",
|
||||
"site": "DOE/NNSA/LLNL",
|
||||
"processor": "AMD 4th Gen EPYC 24C 1.8GHz",
|
||||
"interconnect": "Slingshot-11",
|
||||
"installation_year": "2025",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
},
|
||||
@@ -202,10 +287,12 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 2,
|
||||
"r_peak": 2055.72,
|
||||
"power": 24607,
|
||||
"cores": 9066176,
|
||||
"cores": "9066176",
|
||||
"rmax": "1353.00",
|
||||
"rpeak": "2055.72",
|
||||
"power": "24607",
|
||||
"manufacturer": "HPE",
|
||||
"site": "DOE/SC/Oak Ridge National Laboratory",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
},
|
||||
@@ -220,9 +307,10 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 3,
|
||||
"r_peak": 1980.01,
|
||||
"power": 38698,
|
||||
"cores": 9264128,
|
||||
"cores": "9264128",
|
||||
"rmax": "1012.00",
|
||||
"rpeak": "1980.01",
|
||||
"power": "38698",
|
||||
"manufacturer": "Intel",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
@@ -11,6 +11,7 @@ from sqlalchemy import select
|
||||
|
||||
from app.db.session import async_session_factory
|
||||
from app.models.datasource import DataSource
|
||||
from app.models.task import CollectionTask
|
||||
from app.services.collectors.registry import collector_registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -89,6 +90,35 @@ async def run_collector_task(collector_name: str):
|
||||
logger.exception("Collector %s failed: %s", collector_name, exc)
|
||||
|
||||
|
||||
async def cleanup_stale_running_tasks(max_age_hours: int = 2) -> int:
    """Mark stale running tasks as failed after restarts or collector hangs.

    A task is stale when its status is ``running`` and its ``started_at`` is
    older than ``max_age_hours``. Each stale task gets status and phase
    ``failed``, a completion time, and the cleanup note appended to any
    existing error message.

    Returns:
        Number of tasks that were marked failed.
    """
    cleanup_error = "Marked failed automatically after stale running task cleanup"
    cutoff = datetime.utcnow() - timedelta(hours=max_age_hours)

    async with async_session_factory() as db:
        stale_query = select(CollectionTask).where(
            CollectionTask.status == "running",
            CollectionTask.started_at.is_not(None),
            CollectionTask.started_at < cutoff,
        )
        stale_tasks = (await db.execute(stale_query)).scalars().all()

        for task in stale_tasks:
            task.status = "failed"
            task.phase = "failed"
            task.completed_at = datetime.utcnow()
            existing_error = (task.error_message or "").strip()
            if existing_error:
                task.error_message = f"{existing_error}\n{cleanup_error}".strip()
            else:
                task.error_message = cleanup_error

        if not stale_tasks:
            return 0

        await db.commit()
        logger.warning("Cleaned up %s stale running collection task(s)", len(stale_tasks))
        return len(stale_tasks)
|
||||
|
||||
|
||||
def start_scheduler() -> None:
|
||||
"""Start the scheduler."""
|
||||
if not scheduler.running:
|
||||
@@ -144,6 +174,19 @@ def get_scheduler_jobs() -> list[Dict[str, Any]]:
|
||||
return jobs
|
||||
|
||||
|
||||
async def get_latest_task_id_for_datasource(datasource_id: int) -> Optional[int]:
    """Return the id of the newest collection task of a datasource.

    "Newest" means highest ``created_at``, with the task id as tie-breaker.
    Returns ``None`` when the datasource has no tasks.
    """
    from app.models.task import CollectionTask

    newest_first = (CollectionTask.created_at.desc(), CollectionTask.id.desc())
    statement = (
        select(CollectionTask.id)
        .where(CollectionTask.datasource_id == datasource_id)
        .order_by(*newest_first)
        .limit(1)
    )

    async with async_session_factory() as db:
        rows = await db.execute(statement)
        return rows.scalar_one_or_none()
|
||||
|
||||
|
||||
def run_collector_now(collector_name: str) -> bool:
|
||||
"""Run a collector immediately (not scheduled)."""
|
||||
collector = collector_registry.get(collector_name)
|
||||
|
||||
Reference in New Issue
Block a user