Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -4,9 +4,9 @@ Collects data from TOP500 supercomputer rankings.
https://top500.org/lists/top500/
"""
import asyncio
import re
from typing import Dict, Any, List
from datetime import datetime
from bs4 import BeautifulSoup
import httpx
@@ -21,14 +21,108 @@ class TOP500Collector(BaseCollector):
data_type = "supercomputer"
async def fetch(self) -> List[Dict[str, Any]]:
"""Fetch TOP500 data from website (scraping)"""
# Get the latest list page
"""Fetch TOP500 list data and enrich each row with detail-page metadata."""
url = "https://top500.org/lists/top500/list/2025/11/"
async with httpx.AsyncClient(timeout=60.0) as client:
async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
response = await client.get(url)
response.raise_for_status()
return self.parse_response(response.text)
entries = self.parse_response(response.text)
semaphore = asyncio.Semaphore(8)
async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]:
detail_url = entry.pop("_detail_url", "")
if not detail_url:
return entry
async with semaphore:
try:
detail_response = await client.get(detail_url)
detail_response.raise_for_status()
entry["metadata"].update(self.parse_detail_response(detail_response.text))
except Exception:
entry["metadata"]["detail_fetch_failed"] = True
return entry
return await asyncio.gather(*(enrich(entry) for entry in entries))
def _extract_system_fields(self, system_cell) -> Dict[str, str]:
link = system_cell.find("a")
system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True)
detail_url = ""
if link and link.get("href"):
detail_url = f"https://top500.org{link.get('href')}"
manufacturer = ""
if link and link.next_sibling:
manufacturer = str(link.next_sibling).strip(" ,\n\t")
cell_text = system_cell.get_text("\n", strip=True)
lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()]
site = ""
country = ""
if lines:
system_name = lines[0]
if len(lines) >= 3:
site = lines[-2]
country = lines[-1]
elif len(lines) == 2:
country = lines[-1]
if not manufacturer and len(lines) >= 2:
manufacturer = lines[1]
return {
"name": system_name,
"manufacturer": manufacturer,
"site": site,
"country": country,
"detail_url": detail_url,
}
def parse_detail_response(self, html: str) -> Dict[str, Any]:
    """Parse a TOP500 system detail page into a flat metadata dict.

    Only rows whose <th> label appears in the alias table below are kept, so
    unknown labels or layout additions are silently ignored. Returns an empty
    dict when the expected detail table is not found in the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "table table-condensed"})
    if not table:
        return {}

    # Map the page's human-readable labels onto our metadata keys.
    aliases = {
        "Site": "site",
        "Manufacturer": "manufacturer",
        "Cores": "cores",
        "Processor": "processor",
        "Interconnect": "interconnect",
        "Installation Year": "installation_year",
        "Linpack Performance (Rmax)": "rmax",
        "Theoretical Peak (Rpeak)": "rpeak",
        "Nmax": "nmax",
        "HPCG": "hpcg",
        "Power": "power",
        "Power Measurement Level": "power_measurement_level",
        "Operating System": "operating_system",
        "Compiler": "compiler",
        "Math Library": "math_library",
        "MPI": "mpi",
    }

    details: Dict[str, Any] = {}
    for row in table.find_all("tr"):
        label_cell = row.find("th")
        value_cell = row.find("td")
        # Skip separator/heading rows that lack a label/value pair.
        if not label_cell or not value_cell:
            continue
        # Trailing colons vary between labels; normalize before lookup.
        field = aliases.get(label_cell.get_text(" ", strip=True).rstrip(":"))
        if not field:
            continue
        details[field] = value_cell.get_text(" ", strip=True)
    return details
def parse_response(self, html: str) -> List[Dict[str, Any]]:
"""Parse TOP500 HTML response"""
@@ -36,27 +130,26 @@ class TOP500Collector(BaseCollector):
soup = BeautifulSoup(html, "html.parser")
# Find the table with TOP500 data
table = soup.find("table", {"class": "top500-table"})
if not table:
# Try alternative table selector
table = soup.find("table", {"id": "top500"})
table = None
for candidate in soup.find_all("table"):
header_cells = [
cell.get_text(" ", strip=True) for cell in candidate.select("thead th")
]
normalized_headers = [header.lower() for header in header_cells]
if (
"rank" in normalized_headers
and "system" in normalized_headers
and any("cores" in header for header in normalized_headers)
and any("rmax" in header for header in normalized_headers)
):
table = candidate
break
if not table:
# Try to find any table with rank data
tables = soup.find_all("table")
for t in tables:
if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
table = t
break
if not table:
# Fallback: try to extract data from any table
tables = soup.find_all("table")
if tables:
table = tables[0]
table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})
if table:
rows = table.find_all("tr")
rows = table.select("tr")
for row in rows[1:]: # Skip header row
cells = row.find_all(["td", "th"])
if len(cells) >= 6:
@@ -68,43 +161,26 @@ class TOP500Collector(BaseCollector):
rank = int(rank_text)
# System name (may contain link)
system_cell = cells[1]
system_name = system_cell.get_text(strip=True)
# Try to get full name from link title or data attribute
link = system_cell.find("a")
if link and link.get("title"):
system_name = link.get("title")
system_fields = self._extract_system_fields(system_cell)
system_name = system_fields["name"]
manufacturer = system_fields["manufacturer"]
site = system_fields["site"]
country = system_fields["country"]
detail_url = system_fields["detail_url"]
# Country
country_cell = cells[2]
country = country_cell.get_text(strip=True)
# Try to get country from data attribute or image alt
img = country_cell.find("img")
if img and img.get("alt"):
country = img.get("alt")
# Extract location (city)
city = ""
location_text = country_cell.get_text(strip=True)
if "(" in location_text and ")" in location_text:
city = location_text.split("(")[0].strip()
cores = cells[2].get_text(strip=True).replace(",", "")
# Cores
cores = cells[3].get_text(strip=True).replace(",", "")
# Rmax
rmax_text = cells[4].get_text(strip=True)
rmax_text = cells[3].get_text(strip=True)
rmax = self._parse_performance(rmax_text)
# Rpeak
rpeak_text = cells[5].get_text(strip=True)
rpeak_text = cells[4].get_text(strip=True)
rpeak = self._parse_performance(rpeak_text)
# Power (optional)
power = ""
if len(cells) >= 7:
power = cells[6].get_text(strip=True)
if len(cells) >= 6:
power = cells[5].get_text(strip=True).replace(",", "")
entry = {
"source_id": f"top500_{rank}",
@@ -117,10 +193,14 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": rank,
"r_peak": rpeak,
"power": power,
"cores": cores,
"rmax": rmax_text,
"rpeak": rpeak_text,
"power": power,
"manufacturer": manufacturer,
"site": site,
},
"_detail_url": detail_url,
"reference_date": "2025-11-01",
}
data.append(entry)
@@ -184,10 +264,15 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 1,
"r_peak": 2746.38,
"power": 29581,
"cores": 11039616,
"cores": "11039616",
"rmax": "1742.00",
"rpeak": "2746.38",
"power": "29581",
"manufacturer": "HPE",
"site": "DOE/NNSA/LLNL",
"processor": "AMD 4th Gen EPYC 24C 1.8GHz",
"interconnect": "Slingshot-11",
"installation_year": "2025",
},
"reference_date": "2025-11-01",
},
@@ -202,10 +287,12 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 2,
"r_peak": 2055.72,
"power": 24607,
"cores": 9066176,
"cores": "9066176",
"rmax": "1353.00",
"rpeak": "2055.72",
"power": "24607",
"manufacturer": "HPE",
"site": "DOE/SC/Oak Ridge National Laboratory",
},
"reference_date": "2025-11-01",
},
@@ -220,9 +307,10 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 3,
"r_peak": 1980.01,
"power": 38698,
"cores": 9264128,
"cores": "9264128",
"rmax": "1012.00",
"rpeak": "1980.01",
"power": "38698",
"manufacturer": "Intel",
},
"reference_date": "2025-11-01",