Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -4,9 +4,9 @@ Collects data from TOP500 supercomputer rankings.
https://top500.org/lists/top500/
"""
import asyncio
import re
from typing import Dict, Any, List
from datetime import datetime
from bs4 import BeautifulSoup
import httpx
@@ -21,14 +21,108 @@ class TOP500Collector(BaseCollector):
data_type = "supercomputer"
async def fetch(self) -> List[Dict[str, Any]]:
"""Fetch TOP500 data from website (scraping)"""
# Get the latest list page
"""Fetch TOP500 list data and enrich each row with detail-page metadata."""
url = "https://top500.org/lists/top500/list/2025/11/"
async with httpx.AsyncClient(timeout=60.0) as client:
async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
response = await client.get(url)
response.raise_for_status()
return self.parse_response(response.text)
entries = self.parse_response(response.text)
semaphore = asyncio.Semaphore(8)
async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]:
detail_url = entry.pop("_detail_url", "")
if not detail_url:
return entry
async with semaphore:
try:
detail_response = await client.get(detail_url)
detail_response.raise_for_status()
entry["metadata"].update(self.parse_detail_response(detail_response.text))
except Exception:
entry["metadata"]["detail_fetch_failed"] = True
return entry
return await asyncio.gather(*(enrich(entry) for entry in entries))
def _extract_system_fields(self, system_cell) -> Dict[str, str]:
link = system_cell.find("a")
system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True)
detail_url = ""
if link and link.get("href"):
detail_url = f"https://top500.org{link.get('href')}"
manufacturer = ""
if link and link.next_sibling:
manufacturer = str(link.next_sibling).strip(" ,\n\t")
cell_text = system_cell.get_text("\n", strip=True)
lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()]
site = ""
country = ""
if lines:
system_name = lines[0]
if len(lines) >= 3:
site = lines[-2]
country = lines[-1]
elif len(lines) == 2:
country = lines[-1]
if not manufacturer and len(lines) >= 2:
manufacturer = lines[1]
return {
"name": system_name,
"manufacturer": manufacturer,
"site": site,
"country": country,
"detail_url": detail_url,
}
def parse_detail_response(self, html: str) -> Dict[str, Any]:
    """Parse a TOP500 system detail page into a flat metadata dict.

    Only rows whose <th> label appears in the alias table below are kept, so
    unknown labels or layout additions are silently ignored. Returns an empty
    dict when the expected detail table is not found in the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"class": "table table-condensed"})
    if not table:
        return {}

    # Map the page's human-readable labels onto our metadata keys.
    aliases = {
        "Site": "site",
        "Manufacturer": "manufacturer",
        "Cores": "cores",
        "Processor": "processor",
        "Interconnect": "interconnect",
        "Installation Year": "installation_year",
        "Linpack Performance (Rmax)": "rmax",
        "Theoretical Peak (Rpeak)": "rpeak",
        "Nmax": "nmax",
        "HPCG": "hpcg",
        "Power": "power",
        "Power Measurement Level": "power_measurement_level",
        "Operating System": "operating_system",
        "Compiler": "compiler",
        "Math Library": "math_library",
        "MPI": "mpi",
    }

    details: Dict[str, Any] = {}
    for row in table.find_all("tr"):
        label_cell = row.find("th")
        value_cell = row.find("td")
        # Skip separator/heading rows that lack a label/value pair.
        if not label_cell or not value_cell:
            continue
        # Trailing colons vary between labels; normalize before lookup.
        field = aliases.get(label_cell.get_text(" ", strip=True).rstrip(":"))
        if not field:
            continue
        details[field] = value_cell.get_text(" ", strip=True)
    return details
def parse_response(self, html: str) -> List[Dict[str, Any]]:
"""Parse TOP500 HTML response"""
@@ -36,27 +130,26 @@ class TOP500Collector(BaseCollector):
soup = BeautifulSoup(html, "html.parser")
# Find the table with TOP500 data
table = soup.find("table", {"class": "top500-table"})
if not table:
# Try alternative table selector
table = soup.find("table", {"id": "top500"})
table = None
for candidate in soup.find_all("table"):
header_cells = [
cell.get_text(" ", strip=True) for cell in candidate.select("thead th")
]
normalized_headers = [header.lower() for header in header_cells]
if (
"rank" in normalized_headers
and "system" in normalized_headers
and any("cores" in header for header in normalized_headers)
and any("rmax" in header for header in normalized_headers)
):
table = candidate
break
if not table:
# Try to find any table with rank data
tables = soup.find_all("table")
for t in tables:
if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
table = t
break
if not table:
# Fallback: try to extract data from any table
tables = soup.find_all("table")
if tables:
table = tables[0]
table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})
if table:
rows = table.find_all("tr")
rows = table.select("tr")
for row in rows[1:]: # Skip header row
cells = row.find_all(["td", "th"])
if len(cells) >= 6:
@@ -68,43 +161,26 @@ class TOP500Collector(BaseCollector):
rank = int(rank_text)
# System name (may contain link)
system_cell = cells[1]
system_name = system_cell.get_text(strip=True)
# Try to get full name from link title or data attribute
link = system_cell.find("a")
if link and link.get("title"):
system_name = link.get("title")
system_fields = self._extract_system_fields(system_cell)
system_name = system_fields["name"]
manufacturer = system_fields["manufacturer"]
site = system_fields["site"]
country = system_fields["country"]
detail_url = system_fields["detail_url"]
# Country
country_cell = cells[2]
country = country_cell.get_text(strip=True)
# Try to get country from data attribute or image alt
img = country_cell.find("img")
if img and img.get("alt"):
country = img.get("alt")
# Extract location (city)
city = ""
location_text = country_cell.get_text(strip=True)
if "(" in location_text and ")" in location_text:
city = location_text.split("(")[0].strip()
cores = cells[2].get_text(strip=True).replace(",", "")
# Cores
cores = cells[3].get_text(strip=True).replace(",", "")
# Rmax
rmax_text = cells[4].get_text(strip=True)
rmax_text = cells[3].get_text(strip=True)
rmax = self._parse_performance(rmax_text)
# Rpeak
rpeak_text = cells[5].get_text(strip=True)
rpeak_text = cells[4].get_text(strip=True)
rpeak = self._parse_performance(rpeak_text)
# Power (optional)
power = ""
if len(cells) >= 7:
power = cells[6].get_text(strip=True)
if len(cells) >= 6:
power = cells[5].get_text(strip=True).replace(",", "")
entry = {
"source_id": f"top500_{rank}",
@@ -117,10 +193,14 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": rank,
"r_peak": rpeak,
"power": power,
"cores": cores,
"rmax": rmax_text,
"rpeak": rpeak_text,
"power": power,
"manufacturer": manufacturer,
"site": site,
},
"_detail_url": detail_url,
"reference_date": "2025-11-01",
}
data.append(entry)
@@ -184,10 +264,15 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 1,
"r_peak": 2746.38,
"power": 29581,
"cores": 11039616,
"cores": "11039616",
"rmax": "1742.00",
"rpeak": "2746.38",
"power": "29581",
"manufacturer": "HPE",
"site": "DOE/NNSA/LLNL",
"processor": "AMD 4th Gen EPYC 24C 1.8GHz",
"interconnect": "Slingshot-11",
"installation_year": "2025",
},
"reference_date": "2025-11-01",
},
@@ -202,10 +287,12 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 2,
"r_peak": 2055.72,
"power": 24607,
"cores": 9066176,
"cores": "9066176",
"rmax": "1353.00",
"rpeak": "2055.72",
"power": "24607",
"manufacturer": "HPE",
"site": "DOE/SC/Oak Ridge National Laboratory",
},
"reference_date": "2025-11-01",
},
@@ -220,9 +307,10 @@ class TOP500Collector(BaseCollector):
"unit": "PFlop/s",
"metadata": {
"rank": 3,
"r_peak": 1980.01,
"power": 38698,
"cores": 9264128,
"cores": "9264128",
"rmax": "1012.00",
"rpeak": "1980.01",
"power": "38698",
"manufacturer": "Intel",
},
"reference_date": "2025-11-01",