Refine data management and collection workflows

2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions
--- a/backend/app/services/collectors/arcgis_relation.py
+++ b/backend/app/services/collectors/arcgis_relation.py
@@ -1,10 +1,11 @@
-from typing import Dict, Any, List
+import asyncio
 from datetime import datetime
+from typing import Any, Dict, List, Optional
+
 import httpx

-from app.services.collectors.base import BaseCollector
 from app.core.data_sources import get_data_sources_config
-
+from app.services.collectors.base import BaseCollector


 class ArcGISCableLandingRelationCollector(BaseCollector):
@@ -18,45 +19,129 @@ class ArcGISCableLandingRelationCollector(BaseCollector):
    def base_url(self) -> str:
        if self._resolved_url:
            return self._resolved_url
-        from app.core.data_sources import get_data_sources_config
-
        config = get_data_sources_config()
        return config.get_yaml_url("arcgis_cable_landing_relation")

+    def _layer_url(self, layer_id: int) -> str:
+        if "/FeatureServer/" not in self.base_url:
+            return self.base_url
+        prefix = self.base_url.split("/FeatureServer/")[0]
+        return f"{prefix}/FeatureServer/{layer_id}/query"
+
+    async def _fetch_layer_attributes(
+        self, client: httpx.AsyncClient, layer_id: int
+    ) -> List[Dict[str, Any]]:
+        response = await client.get(
+            self._layer_url(layer_id),
+            params={
+                "where": "1=1",
+                "outFields": "*",
+                "returnGeometry": "false",
+                "f": "json",
+            },
+        )
+        response.raise_for_status()
+        data = response.json()
+        return [feature.get("attributes", {}) for feature in data.get("features", [])]
+
+    async def _fetch_relation_features(self, client: httpx.AsyncClient) -> List[Dict[str, Any]]:
+        response = await client.get(
+            self.base_url,
+            params={
+                "where": "1=1",
+                "outFields": "*",
+                "returnGeometry": "true",
+                "f": "geojson",
+            },
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data.get("features", [])
+
    async def fetch(self) -> List[Dict[str, Any]]:
-        params = {"where": "1=1", "outFields": "*", "returnGeometry": "true", "f": "geojson"}
-
        async with httpx.AsyncClient(timeout=60.0) as client:
-            response = await client.get(self.base_url, params=params)
-            response.raise_for_status()
-            return self.parse_response(response.json())
+            relation_features, landing_rows, cable_rows = await asyncio.gather(
+                self._fetch_relation_features(client),
+                self._fetch_layer_attributes(client, 1),
+                self._fetch_layer_attributes(client, 2),
+            )
+        return self.parse_response(relation_features, landing_rows, cable_rows)

-    def parse_response(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        result = []
+    def _build_landing_lookup(self, landing_rows: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
+        lookup: Dict[int, Dict[str, Any]] = {}
+        for row in landing_rows:
+            city_id = row.get("city_id")
+            if city_id is None:
+                continue
+            lookup[int(city_id)] = {
+                "landing_point_id": row.get("landing_point_id") or city_id,
+                "landing_point_name": row.get("Name") or row.get("name") or "",
+                "facility": row.get("facility") or "",
+                "status": row.get("status") or "",
+                "country": row.get("country") or "",
+            }
+        return lookup

-        features = data.get("features", [])
-        for feature in features:
+    def _build_cable_lookup(self, cable_rows: List[Dict[str, Any]]) -> Dict[int, Dict[str, Any]]:
+        lookup: Dict[int, Dict[str, Any]] = {}
+        for row in cable_rows:
+            cable_id = row.get("cable_id")
+            if cable_id is None:
+                continue
+            lookup[int(cable_id)] = {
+                "cable_name": row.get("Name") or "",
+                "status": row.get("status") or "active",
+            }
+        return lookup
+
+    def parse_response(
+        self,
+        relation_features: List[Dict[str, Any]],
+        landing_rows: List[Dict[str, Any]],
+        cable_rows: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        result: List[Dict[str, Any]] = []
+        landing_lookup = self._build_landing_lookup(landing_rows)
+        cable_lookup = self._build_cable_lookup(cable_rows)
+
+        for feature in relation_features:
            props = feature.get("properties", {})

            try:
+                city_id = props.get("city_id")
+                cable_id = props.get("cable_id")
+                landing_info = landing_lookup.get(int(city_id), {}) if city_id is not None else {}
+                cable_info = cable_lookup.get(int(cable_id), {}) if cable_id is not None else {}
+
+                cable_name = cable_info.get("cable_name") or props.get("cable_name") or "Unknown"
+                landing_point_name = (
+                    landing_info.get("landing_point_name")
+                    or props.get("landing_point_name")
+                    or "Unknown"
+                )
+                facility = landing_info.get("facility") or props.get("facility") or "-"
+                status = cable_info.get("status") or landing_info.get("status") or props.get("status") or "-"
+                country = landing_info.get("country") or props.get("country") or ""
+                landing_point_id = landing_info.get("landing_point_id") or props.get("landing_point_id") or city_id
+
                entry = {
                    "source_id": f"arcgis_relation_{props.get('OBJECTID', props.get('id', ''))}",
-                    "name": f"{props.get('cable_name', 'Unknown')} - {props.get('landing_point_name', 'Unknown')}",
-                    "country": props.get("country", ""),
-                    "city": props.get("landing_point_name", ""),
+                    "name": f"{cable_name} - {landing_point_name}",
+                    "country": country,
+                    "city": landing_point_name,
                    "latitude": str(props.get("latitude", "")) if props.get("latitude") else "",
                    "longitude": str(props.get("longitude", "")) if props.get("longitude") else "",
                    "value": "",
                    "unit": "",
                    "metadata": {
                        "objectid": props.get("OBJECTID"),
-                        "city_id": props.get("city_id"),
-                        "cable_id": props.get("cable_id"),
-                        "cable_name": props.get("cable_name"),
-                        "landing_point_id": props.get("landing_point_id"),
-                        "landing_point_name": props.get("landing_point_name"),
-                        "facility": props.get("facility"),
-                        "status": props.get("status"),
+                        "city_id": city_id,
+                        "cable_id": cable_id,
+                        "cable_name": cable_name,
+                        "landing_point_id": landing_point_id,
+                        "landing_point_name": landing_point_name,
+                        "facility": facility,
+                        "status": status,
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
--- a/backend/app/services/collectors/base.py
+++ b/backend/app/services/collectors/base.py
@@ -4,10 +4,12 @@ from abc import ABC, abstractmethod
 from typing import Dict, List, Any, Optional
 from datetime import datetime
 import httpx
-from sqlalchemy import text
+from sqlalchemy import select, text
 from sqlalchemy.ext.asyncio import AsyncSession

+from app.core.collected_data_fields import build_dynamic_metadata, get_record_field
 from app.core.config import settings
+from app.core.countries import normalize_country


 class BaseCollector(ABC):
@@ -39,6 +41,11 @@ class BaseCollector(ABC):
                records_processed / self._current_task.total_records
            ) * 100

+    async def set_phase(self, phase: str):
+        if self._current_task and self._db_session:
+            self._current_task.phase = phase
+            await self._db_session.commit()
+
    @abstractmethod
    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch raw data from source"""
@@ -48,14 +55,87 @@ class BaseCollector(ABC):
        """Transform raw data to internal format (default: pass through)"""
        return raw_data

+    def _parse_reference_date(self, value: Any) -> Optional[datetime]:
+        if not value:
+            return None
+        if isinstance(value, datetime):
+            return value
+        if isinstance(value, str):
+            return datetime.fromisoformat(value.replace("Z", "+00:00"))
+        return None
+
+    def _build_comparable_payload(self, record: Any) -> Dict[str, Any]:
+        return {
+            "name": getattr(record, "name", None),
+            "title": getattr(record, "title", None),
+            "description": getattr(record, "description", None),
+            "country": get_record_field(record, "country"),
+            "city": get_record_field(record, "city"),
+            "latitude": get_record_field(record, "latitude"),
+            "longitude": get_record_field(record, "longitude"),
+            "value": get_record_field(record, "value"),
+            "unit": get_record_field(record, "unit"),
+            "metadata": getattr(record, "extra_data", None) or {},
+            "reference_date": (
+                getattr(record, "reference_date", None).isoformat()
+                if getattr(record, "reference_date", None)
+                else None
+            ),
+        }
+
+    async def _create_snapshot(
+        self,
+        db: AsyncSession,
+        task_id: int,
+        data: List[Dict[str, Any]],
+        started_at: datetime,
+    ) -> int:
+        from app.models.data_snapshot import DataSnapshot
+
+        reference_dates = [
+            parsed
+            for parsed in (self._parse_reference_date(item.get("reference_date")) for item in data)
+            if parsed is not None
+        ]
+        reference_date = max(reference_dates) if reference_dates else None
+
+        result = await db.execute(
+            select(DataSnapshot)
+            .where(DataSnapshot.source == self.name, DataSnapshot.is_current == True)
+            .order_by(DataSnapshot.completed_at.desc().nullslast(), DataSnapshot.id.desc())
+            .limit(1)
+        )
+        previous_snapshot = result.scalar_one_or_none()
+
+        snapshot = DataSnapshot(
+            datasource_id=getattr(self, "_datasource_id", 1),
+            task_id=task_id,
+            source=self.name,
+            snapshot_key=f"{self.name}:{task_id}",
+            reference_date=reference_date,
+            started_at=started_at,
+            status="running",
+            is_current=True,
+            parent_snapshot_id=previous_snapshot.id if previous_snapshot else None,
+            summary={},
+        )
+        db.add(snapshot)
+
+        if previous_snapshot:
+            previous_snapshot.is_current = False
+
+        await db.commit()
+        return snapshot.id
+
    async def run(self, db: AsyncSession) -> Dict[str, Any]:
        """Full pipeline: fetch -> transform -> save"""
        from app.services.collectors.registry import collector_registry
        from app.models.task import CollectionTask
-        from app.models.collected_data import CollectedData
+        from app.models.data_snapshot import DataSnapshot

        start_time = datetime.utcnow()
        datasource_id = getattr(self, "_datasource_id", 1)
+        snapshot_id: Optional[int] = None

        if not collector_registry.is_active(self.name):
            return {"status": "skipped", "reason": "Collector is disabled"}
@@ -63,6 +143,7 @@ class BaseCollector(ABC):
        task = CollectionTask(
            datasource_id=datasource_id,
            status="running",
+            phase="queued",
            started_at=start_time,
        )
        db.add(task)
@@ -75,15 +156,20 @@ class BaseCollector(ABC):
        await self.resolve_url(db)

        try:
+            await self.set_phase("fetching")
            raw_data = await self.fetch()
            task.total_records = len(raw_data)
            await db.commit()

+            await self.set_phase("transforming")
            data = self.transform(raw_data)
+            snapshot_id = await self._create_snapshot(db, task_id, data, start_time)

-            records_count = await self._save_data(db, data)
+            await self.set_phase("saving")
+            records_count = await self._save_data(db, data, task_id=task_id, snapshot_id=snapshot_id)

            task.status = "success"
+            task.phase = "completed"
            task.records_processed = records_count
            task.progress = 100.0
            task.completed_at = datetime.utcnow()
@@ -97,8 +183,15 @@ class BaseCollector(ABC):
            }
        except Exception as e:
            task.status = "failed"
+            task.phase = "failed"
            task.error_message = str(e)
            task.completed_at = datetime.utcnow()
+            if snapshot_id is not None:
+                snapshot = await db.get(DataSnapshot, snapshot_id)
+                if snapshot:
+                    snapshot.status = "failed"
+                    snapshot.completed_at = datetime.utcnow()
+                    snapshot.summary = {"error": str(e)}
            await db.commit()

            return {
@@ -108,53 +201,163 @@ class BaseCollector(ABC):
                "execution_time_seconds": (datetime.utcnow() - start_time).total_seconds(),
            }

-    async def _save_data(self, db: AsyncSession, data: List[Dict[str, Any]]) -> int:
+    async def _save_data(
+        self,
+        db: AsyncSession,
+        data: List[Dict[str, Any]],
+        task_id: Optional[int] = None,
+        snapshot_id: Optional[int] = None,
+    ) -> int:
        """Save transformed data to database"""
        from app.models.collected_data import CollectedData
+        from app.models.data_snapshot import DataSnapshot

        if not data:
+            if snapshot_id is not None:
+                snapshot = await db.get(DataSnapshot, snapshot_id)
+                if snapshot:
+                    snapshot.record_count = 0
+                    snapshot.summary = {"created": 0, "updated": 0, "unchanged": 0}
+                    snapshot.status = "success"
+                    snapshot.completed_at = datetime.utcnow()
+                    await db.commit()
            return 0

        collected_at = datetime.utcnow()
        records_added = 0
+        created_count = 0
+        updated_count = 0
+        unchanged_count = 0
+        seen_entity_keys: set[str] = set()
+        previous_current_keys: set[str] = set()
+
+        previous_current_result = await db.execute(
+            select(CollectedData.entity_key).where(
+                CollectedData.source == self.name,
+                CollectedData.is_current == True,
+            )
+        )
+        previous_current_keys = {row[0] for row in previous_current_result.fetchall() if row[0]}

        for i, item in enumerate(data):
            print(
                f"DEBUG: Saving item {i}: name={item.get('name')}, metadata={item.get('metadata', 'NOT FOUND')}"
            )
+            raw_metadata = item.get("metadata", {})
+            extra_data = build_dynamic_metadata(
+                raw_metadata,
+                country=item.get("country"),
+                city=item.get("city"),
+                latitude=item.get("latitude"),
+                longitude=item.get("longitude"),
+                value=item.get("value"),
+                unit=item.get("unit"),
+            )
+            normalized_country = normalize_country(item.get("country"))
+            if normalized_country is not None:
+                extra_data["country"] = normalized_country
+
+            if item.get("country") and normalized_country != item.get("country"):
+                extra_data["raw_country"] = item.get("country")
+                if normalized_country is None:
+                    extra_data["country_validation"] = "invalid"
+
+            source_id = item.get("source_id") or item.get("id")
+            reference_date = (
+                self._parse_reference_date(item.get("reference_date"))
+            )
+            source_id_str = str(source_id) if source_id is not None else None
+            entity_key = f"{self.name}:{source_id_str}" if source_id_str else f"{self.name}:{i}"
+            previous_record = None
+
+            if entity_key and entity_key not in seen_entity_keys:
+                result = await db.execute(
+                    select(CollectedData)
+                    .where(
+                        CollectedData.source == self.name,
+                        CollectedData.entity_key == entity_key,
+                        CollectedData.is_current == True,
+                    )
+                    .order_by(CollectedData.collected_at.desc().nullslast(), CollectedData.id.desc())
+                )
+                previous_records = result.scalars().all()
+                if previous_records:
+                    previous_record = previous_records[0]
+                    for old_record in previous_records:
+                        old_record.is_current = False
+
            record = CollectedData(
+                snapshot_id=snapshot_id,
+                task_id=task_id,
                source=self.name,
-                source_id=item.get("source_id") or item.get("id"),
+                source_id=source_id_str,
+                entity_key=entity_key,
                data_type=self.data_type,
                name=item.get("name"),
                title=item.get("title"),
                description=item.get("description"),
-                country=item.get("country"),
-                city=item.get("city"),
-                latitude=str(item.get("latitude", ""))
-                if item.get("latitude") is not None
-                else None,
-                longitude=str(item.get("longitude", ""))
-                if item.get("longitude") is not None
-                else None,
-                value=item.get("value"),
-                unit=item.get("unit"),
-                extra_data=item.get("metadata", {}),
+                extra_data=extra_data,
                collected_at=collected_at,
-                reference_date=datetime.fromisoformat(
-                    item.get("reference_date").replace("Z", "+00:00")
-                )
-                if item.get("reference_date")
-                else None,
+                reference_date=reference_date,
                is_valid=1,
+                is_current=True,
+                previous_record_id=previous_record.id if previous_record else None,
+                deleted_at=None,
            )
+
+            if previous_record is None:
+                record.change_type = "created"
+                record.change_summary = {}
+                created_count += 1
+            else:
+                previous_payload = self._build_comparable_payload(previous_record)
+                current_payload = self._build_comparable_payload(record)
+                if current_payload == previous_payload:
+                    record.change_type = "unchanged"
+                    record.change_summary = {}
+                    unchanged_count += 1
+                else:
+                    changed_fields = [
+                        key for key in current_payload.keys() if current_payload[key] != previous_payload.get(key)
+                    ]
+                    record.change_type = "updated"
+                    record.change_summary = {"changed_fields": changed_fields}
+                    updated_count += 1
+
            db.add(record)
+            seen_entity_keys.add(entity_key)
            records_added += 1

            if i % 100 == 0:
                self.update_progress(i + 1)
                await db.commit()

+        if snapshot_id is not None:
+            deleted_keys = previous_current_keys - seen_entity_keys
+            await db.execute(
+                text(
+                    """
+                    UPDATE collected_data
+                    SET is_current = FALSE
+                    WHERE source = :source
+                      AND snapshot_id IS DISTINCT FROM :snapshot_id
+                      AND COALESCE(is_current, TRUE) = TRUE
+                    """
+                ),
+                {"source": self.name, "snapshot_id": snapshot_id},
+            )
+            snapshot = await db.get(DataSnapshot, snapshot_id)
+            if snapshot:
+                snapshot.record_count = records_added
+                snapshot.status = "success"
+                snapshot.completed_at = datetime.utcnow()
+                snapshot.summary = {
+                    "created": created_count,
+                    "updated": updated_count,
+                    "unchanged": unchanged_count,
+                    "deleted": len(deleted_keys),
+                }
+
        await db.commit()
        self.update_progress(len(data))
        return records_added
--- a/backend/app/services/collectors/peeringdb.py
+++ b/backend/app/services/collectors/peeringdb.py
@@ -76,7 +76,7 @@ class PeeringDBIXPCollector(HTTPCollector):
        print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
        return {}

-    async def collect(self) -> List[Dict[str, Any]]:
+    async def fetch(self) -> List[Dict[str, Any]]:
        """Collect IXP data from PeeringDB with rate limit handling"""
        response_data = await self.fetch_with_retry()
        if not response_data:
@@ -177,7 +177,7 @@ class PeeringDBNetworkCollector(HTTPCollector):
        print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
        return {}

-    async def collect(self) -> List[Dict[str, Any]]:
+    async def fetch(self) -> List[Dict[str, Any]]:
        """Collect Network data from PeeringDB with rate limit handling"""
        response_data = await self.fetch_with_retry()
        if not response_data:
@@ -280,7 +280,7 @@ class PeeringDBFacilityCollector(HTTPCollector):
        print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
        return {}

-    async def collect(self) -> List[Dict[str, Any]]:
+    async def fetch(self) -> List[Dict[str, Any]]:
        """Collect Facility data from PeeringDB with rate limit handling"""
        response_data = await self.fetch_with_retry()
        if not response_data:
--- a/backend/app/services/collectors/top500.py
+++ b/backend/app/services/collectors/top500.py
@@ -4,9 +4,9 @@ Collects data from TOP500 supercomputer rankings.
 https://top500.org/lists/top500/
 """

+import asyncio
 import re
 from typing import Dict, Any, List
-from datetime import datetime
 from bs4 import BeautifulSoup
 import httpx

@@ -21,14 +21,108 @@ class TOP500Collector(BaseCollector):
    data_type = "supercomputer"

    async def fetch(self) -> List[Dict[str, Any]]:
-        """Fetch TOP500 data from website (scraping)"""
-        # Get the latest list page
+        """Fetch TOP500 list data and enrich each row with detail-page metadata."""
        url = "https://top500.org/lists/top500/list/2025/11/"

-        async with httpx.AsyncClient(timeout=60.0) as client:
+        async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
            response = await client.get(url)
            response.raise_for_status()
-            return self.parse_response(response.text)
+            entries = self.parse_response(response.text)
+
+            semaphore = asyncio.Semaphore(8)
+
+            async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]:
+                detail_url = entry.pop("_detail_url", "")
+                if not detail_url:
+                    return entry
+
+                async with semaphore:
+                    try:
+                        detail_response = await client.get(detail_url)
+                        detail_response.raise_for_status()
+                        entry["metadata"].update(self.parse_detail_response(detail_response.text))
+                    except Exception:
+                        entry["metadata"]["detail_fetch_failed"] = True
+                return entry
+
+            return await asyncio.gather(*(enrich(entry) for entry in entries))
+
+    def _extract_system_fields(self, system_cell) -> Dict[str, str]:
+        link = system_cell.find("a")
+        system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True)
+        detail_url = ""
+        if link and link.get("href"):
+            detail_url = f"https://top500.org{link.get('href')}"
+
+        manufacturer = ""
+        if link and link.next_sibling:
+            manufacturer = str(link.next_sibling).strip(" ,\n\t")
+
+        cell_text = system_cell.get_text("\n", strip=True)
+        lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()]
+
+        site = ""
+        country = ""
+        if lines:
+            system_name = lines[0]
+        if len(lines) >= 3:
+            site = lines[-2]
+            country = lines[-1]
+        elif len(lines) == 2:
+            country = lines[-1]
+
+        if not manufacturer and len(lines) >= 2:
+            manufacturer = lines[1]
+
+        return {
+            "name": system_name,
+            "manufacturer": manufacturer,
+            "site": site,
+            "country": country,
+            "detail_url": detail_url,
+        }
+
+    def parse_detail_response(self, html: str) -> Dict[str, Any]:
+        soup = BeautifulSoup(html, "html.parser")
+        detail_table = soup.find("table", {"class": "table table-condensed"})
+        if not detail_table:
+            return {}
+
+        detail_map: Dict[str, Any] = {}
+        label_aliases = {
+            "Site": "site",
+            "Manufacturer": "manufacturer",
+            "Cores": "cores",
+            "Processor": "processor",
+            "Interconnect": "interconnect",
+            "Installation Year": "installation_year",
+            "Linpack Performance (Rmax)": "rmax",
+            "Theoretical Peak (Rpeak)": "rpeak",
+            "Nmax": "nmax",
+            "HPCG": "hpcg",
+            "Power": "power",
+            "Power Measurement Level": "power_measurement_level",
+            "Operating System": "operating_system",
+            "Compiler": "compiler",
+            "Math Library": "math_library",
+            "MPI": "mpi",
+        }
+
+        for row in detail_table.find_all("tr"):
+            header = row.find("th")
+            value_cell = row.find("td")
+            if not header or not value_cell:
+                continue
+
+            label = header.get_text(" ", strip=True).rstrip(":")
+            key = label_aliases.get(label)
+            if not key:
+                continue
+
+            value = value_cell.get_text(" ", strip=True)
+            detail_map[key] = value
+
+        return detail_map

    def parse_response(self, html: str) -> List[Dict[str, Any]]:
        """Parse TOP500 HTML response"""
@@ -36,27 +130,26 @@ class TOP500Collector(BaseCollector):
        soup = BeautifulSoup(html, "html.parser")

        # Find the table with TOP500 data
-        table = soup.find("table", {"class": "top500-table"})
-        if not table:
-            # Try alternative table selector
-            table = soup.find("table", {"id": "top500"})
+        table = None
+        for candidate in soup.find_all("table"):
+            header_cells = [
+                cell.get_text(" ", strip=True) for cell in candidate.select("thead th")
+            ]
+            normalized_headers = [header.lower() for header in header_cells]
+            if (
+                "rank" in normalized_headers
+                and "system" in normalized_headers
+                and any("cores" in header for header in normalized_headers)
+                and any("rmax" in header for header in normalized_headers)
+            ):
+                table = candidate
+                break

        if not table:
-            # Try to find any table with rank data
-            tables = soup.find_all("table")
-            for t in tables:
-                if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
-                    table = t
-                    break
-
-        if not table:
-            # Fallback: try to extract data from any table
-            tables = soup.find_all("table")
-            if tables:
-                table = tables[0]
+            table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})

        if table:
-            rows = table.find_all("tr")
+            rows = table.select("tr")
            for row in rows[1:]:  # Skip header row
                cells = row.find_all(["td", "th"])
                if len(cells) >= 6:
@@ -68,43 +161,26 @@ class TOP500Collector(BaseCollector):

                        rank = int(rank_text)

-                        # System name (may contain link)
                        system_cell = cells[1]
-                        system_name = system_cell.get_text(strip=True)
-                        # Try to get full name from link title or data attribute
-                        link = system_cell.find("a")
-                        if link and link.get("title"):
-                            system_name = link.get("title")
+                        system_fields = self._extract_system_fields(system_cell)
+                        system_name = system_fields["name"]
+                        manufacturer = system_fields["manufacturer"]
+                        site = system_fields["site"]
+                        country = system_fields["country"]
+                        detail_url = system_fields["detail_url"]

-                        # Country
-                        country_cell = cells[2]
-                        country = country_cell.get_text(strip=True)
-                        # Try to get country from data attribute or image alt
-                        img = country_cell.find("img")
-                        if img and img.get("alt"):
-                            country = img.get("alt")
-
-                        # Extract location (city)
                        city = ""
-                        location_text = country_cell.get_text(strip=True)
-                        if "(" in location_text and ")" in location_text:
-                            city = location_text.split("(")[0].strip()
+                        cores = cells[2].get_text(strip=True).replace(",", "")

-                        # Cores
-                        cores = cells[3].get_text(strip=True).replace(",", "")
-
-                        # Rmax
-                        rmax_text = cells[4].get_text(strip=True)
+                        rmax_text = cells[3].get_text(strip=True)
                        rmax = self._parse_performance(rmax_text)

-                        # Rpeak
-                        rpeak_text = cells[5].get_text(strip=True)
+                        rpeak_text = cells[4].get_text(strip=True)
                        rpeak = self._parse_performance(rpeak_text)

-                        # Power (optional)
                        power = ""
-                        if len(cells) >= 7:
-                            power = cells[6].get_text(strip=True)
+                        if len(cells) >= 6:
+                            power = cells[5].get_text(strip=True).replace(",", "")

                        entry = {
                            "source_id": f"top500_{rank}",
@@ -117,10 +193,14 @@ class TOP500Collector(BaseCollector):
                            "unit": "PFlop/s",
                            "metadata": {
                                "rank": rank,
-                                "r_peak": rpeak,
-                                "power": power,
                                "cores": cores,
+                                "rmax": rmax_text,
+                                "rpeak": rpeak_text,
+                                "power": power,
+                                "manufacturer": manufacturer,
+                                "site": site,
                            },
+                            "_detail_url": detail_url,
                            "reference_date": "2025-11-01",
                        }
                        data.append(entry)
@@ -184,10 +264,15 @@ class TOP500Collector(BaseCollector):
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 1,
-                    "r_peak": 2746.38,
-                    "power": 29581,
-                    "cores": 11039616,
+                    "cores": "11039616",
+                    "rmax": "1742.00",
+                    "rpeak": "2746.38",
+                    "power": "29581",
                    "manufacturer": "HPE",
+                    "site": "DOE/NNSA/LLNL",
+                    "processor": "AMD 4th Gen EPYC 24C 1.8GHz",
+                    "interconnect": "Slingshot-11",
+                    "installation_year": "2025",
                },
                "reference_date": "2025-11-01",
            },
@@ -202,10 +287,12 @@ class TOP500Collector(BaseCollector):
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 2,
-                    "r_peak": 2055.72,
-                    "power": 24607,
-                    "cores": 9066176,
+                    "cores": "9066176",
+                    "rmax": "1353.00",
+                    "rpeak": "2055.72",
+                    "power": "24607",
                    "manufacturer": "HPE",
+                    "site": "DOE/SC/Oak Ridge National Laboratory",
                },
                "reference_date": "2025-11-01",
            },
@@ -220,9 +307,10 @@ class TOP500Collector(BaseCollector):
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 3,
-                    "r_peak": 1980.01,
-                    "power": 38698,
-                    "cores": 9264128,
+                    "cores": "9264128",
+                    "rmax": "1012.00",
+                    "rpeak": "1980.01",
+                    "power": "38698",
                    "manufacturer": "Intel",
                },
                "reference_date": "2025-11-01",
--- a/backend/app/services/scheduler.py
+++ b/backend/app/services/scheduler.py
@@ -2,8 +2,8 @@

 import asyncio
 import logging
-from datetime import datetime
-from typing import Any, Dict
+from datetime import datetime, timedelta
+from typing import Any, Dict, Optional

 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger
@@ -11,6 +11,7 @@ from sqlalchemy import select

 from app.db.session import async_session_factory
 from app.models.datasource import DataSource
+from app.models.task import CollectionTask
 from app.services.collectors.registry import collector_registry

 logger = logging.getLogger(__name__)
@@ -89,6 +90,35 @@ async def run_collector_task(collector_name: str):
            logger.exception("Collector %s failed: %s", collector_name, exc)


+async def cleanup_stale_running_tasks(max_age_hours: int = 2) -> int:
+    """Fail tasks left "running" for over max_age_hours (e.g. after a restart or hang); return the count."""
+    cutoff = datetime.utcnow() - timedelta(hours=max_age_hours)  # naive UTC; earlier start times are stale
+
+    async with async_session_factory() as db:
+        result = await db.execute(
+            select(CollectionTask).where(
+                CollectionTask.status == "running",
+                CollectionTask.started_at.is_not(None),  # ignore tasks with no recorded start time
+                CollectionTask.started_at < cutoff,
+            )
+        )
+        stale_tasks = result.scalars().all()
+
+        for task in stale_tasks:
+            task.status = "failed"
+            task.phase = "failed"
+            task.completed_at = datetime.utcnow()
+            existing_error = (task.error_message or "").strip()  # keep any error text already stored
+            cleanup_error = "Marked failed automatically after stale running task cleanup"
+            task.error_message = f"{existing_error}\n{cleanup_error}".strip() if existing_error else cleanup_error
+
+        if stale_tasks:  # commit and log only when at least one task was changed
+            await db.commit()
+            logger.warning("Cleaned up %s stale running collection task(s)", len(stale_tasks))
+
+        return len(stale_tasks)
+
+
 def start_scheduler() -> None:
    """Start the scheduler."""
    if not scheduler.running:
@@ -144,6 +174,19 @@ def get_scheduler_jobs() -> list[Dict[str, Any]]:
    return jobs


+async def get_latest_task_id_for_datasource(datasource_id: int) -> Optional[int]:
+    """Return the id of the newest CollectionTask for ``datasource_id``, or None if there is none."""
+
+    async with async_session_factory() as db:
+        result = await db.execute(
+            select(CollectionTask.id)
+            .where(CollectionTask.datasource_id == datasource_id)
+            .order_by(CollectionTask.created_at.desc(), CollectionTask.id.desc())  # id breaks created_at ties
+            .limit(1)
+        )
+        return result.scalar_one_or_none()
+
+
 def run_collector_now(collector_name: str) -> bool:
    """Run a collector immediately (not scheduled)."""
    collector = collector_registry.get(collector_name)