Refine data management and collection workflows
This commit is contained in:
@@ -4,9 +4,9 @@ Collects data from TOP500 supercomputer rankings.
|
||||
https://top500.org/lists/top500/
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from typing import Dict, Any, List
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import httpx
|
||||
|
||||
@@ -21,14 +21,108 @@ class TOP500Collector(BaseCollector):
|
||||
data_type = "supercomputer"
|
||||
|
||||
async def fetch(self) -> List[Dict[str, Any]]:
    """Fetch TOP500 list data and enrich each row with detail-page metadata.

    The list page is downloaded and parsed with ``parse_response``. For
    every parsed entry that carries a ``_detail_url``, the detail page is
    fetched and the result of ``parse_detail_response`` is merged into the
    entry's ``metadata`` dict. The temporary ``_detail_url`` key is removed
    from every entry before it is returned.

    Returns:
        The parsed entries, in the same order as ``parse_response``
        produced them.

    Raises:
        httpx.HTTPError: If the list page itself cannot be fetched.
            Failures on individual detail pages are not raised; they are
            recorded as ``metadata["detail_fetch_failed"] = True``.
    """
    url = "https://top500.org/lists/top500/list/2025/11/"

    async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
        response = await client.get(url)
        response.raise_for_status()
        entries = self.parse_response(response.text)

        # Allow at most 8 detail-page requests in flight at once.
        semaphore = asyncio.Semaphore(8)

        async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]:
            # Always strip the internal key, even when there is nothing to fetch.
            detail_url = entry.pop("_detail_url", "")
            if not detail_url:
                return entry

            # Guarantee the dict exists so the failure flag below can
            # always be written.
            metadata = entry.setdefault("metadata", {})

            async with semaphore:
                try:
                    detail_response = await client.get(detail_url)
                    detail_response.raise_for_status()
                    metadata.update(self.parse_detail_response(detail_response.text))
                except Exception:
                    # Deliberate best-effort: one broken detail page must
                    # not discard the whole list.
                    metadata["detail_fetch_failed"] = True
            return entry

        # The client must stay open until every enrichment task finishes,
        # so the gather happens inside the ``async with`` block.
        return await asyncio.gather(*(enrich(entry) for entry in entries))
|
||||
|
||||
def _extract_system_fields(self, system_cell) -> Dict[str, str]:
    """Extract name, manufacturer, site, country and detail URL from a system cell.

    Args:
        system_cell: The parsed table cell for one system. It must offer
            ``find("a")`` and ``get_text(separator, strip=...)`` the way a
            BeautifulSoup tag does.

    Returns:
        A dict with the string keys ``name``, ``manufacturer``, ``site``,
        ``country`` and ``detail_url``. Any value that cannot be determined
        is an empty string.
    """
    from urllib.parse import urljoin

    link = system_cell.find("a")

    detail_url = ""
    manufacturer = ""
    if link:
        href = link.get("href")
        if href:
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the site root; plain concatenation would corrupt
            # an href that already includes the host.
            detail_url = urljoin("https://top500.org", href)

        # Only a text node can hold a manufacturer name. A tag sibling such
        # as <br/> would otherwise be stringified into literal markup.
        sibling = link.next_sibling
        if isinstance(sibling, str):
            manufacturer = sibling.strip(" ,\n\t")

    cell_text = system_cell.get_text("\n", strip=True)
    # Drop lines that are empty after trimming separators so they cannot
    # shift the positional site/country lookup below.
    stripped = (raw.strip(" ,") for raw in cell_text.splitlines())
    lines = [line for line in stripped if line]

    if lines:
        system_name = lines[0]
    elif link:
        system_name = link.get_text(" ", strip=True)
    else:
        system_name = system_cell.get_text(" ", strip=True)

    site = ""
    country = ""
    if len(lines) >= 3:
        site = lines[-2]
        country = lines[-1]
    elif len(lines) == 2:
        country = lines[-1]

    # Fallback when no text follows the link: use the second line of the cell.
    if not manufacturer and len(lines) >= 2:
        manufacturer = lines[1]

    return {
        "name": system_name,
        "manufacturer": manufacturer,
        "site": site,
        "country": country,
        "detail_url": detail_url,
    }
|
||||
|
||||
def parse_detail_response(self, html: str) -> Dict[str, Any]:
    """Parse a system detail page into a flat metadata dict.

    The first table carrying both the ``table`` and ``table-condensed``
    CSS classes is read row by row. Each row's ``<th>`` text is looked up
    in a fixed label-to-key mapping, and the ``<td>`` text is stored under
    the mapped key. Rows with unknown labels are ignored.

    Args:
        html: Raw HTML of one detail page.

    Returns:
        A dict mapping metadata keys (for example ``"cores"`` or
        ``"processor"``) to the cell text as an unconverted string. It is
        empty when no matching table is found. Keys whose cell text is
        empty are omitted.
    """
    soup = BeautifulSoup(html, "html.parser")
    # A CSS selector matches regardless of class order or extra classes,
    # whereas an exact class-string comparison breaks on either.
    detail_table = soup.select_one("table.table.table-condensed")
    if detail_table is None:
        return {}

    label_aliases = {
        "Site": "site",
        "Manufacturer": "manufacturer",
        "Cores": "cores",
        "Processor": "processor",
        "Interconnect": "interconnect",
        "Installation Year": "installation_year",
        "Linpack Performance (Rmax)": "rmax",
        "Theoretical Peak (Rpeak)": "rpeak",
        "Nmax": "nmax",
        "HPCG": "hpcg",
        "Power": "power",
        "Power Measurement Level": "power_measurement_level",
        "Operating System": "operating_system",
        "Compiler": "compiler",
        "Math Library": "math_library",
        "MPI": "mpi",
    }

    detail_map: Dict[str, Any] = {}
    for row in detail_table.find_all("tr"):
        header = row.find("th")
        value_cell = row.find("td")
        if header is None or value_cell is None:
            continue

        # Labels are rendered with a trailing colon (e.g. "Cores:").
        label = header.get_text(" ", strip=True).rstrip(": ")
        key = label_aliases.get(label)
        if key is None:
            continue

        value = value_cell.get_text(" ", strip=True)
        # Skip blank cells: the result is merged into existing metadata
        # with dict.update, and an empty string would overwrite a value
        # already taken from the list page.
        if value:
            detail_map[key] = value

    return detail_map
|
||||
|
||||
def parse_response(self, html: str) -> List[Dict[str, Any]]:
|
||||
"""Parse TOP500 HTML response"""
|
||||
@@ -36,27 +130,26 @@ class TOP500Collector(BaseCollector):
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Find the table with TOP500 data
|
||||
table = soup.find("table", {"class": "top500-table"})
|
||||
if not table:
|
||||
# Try alternative table selector
|
||||
table = soup.find("table", {"id": "top500"})
|
||||
table = None
|
||||
for candidate in soup.find_all("table"):
|
||||
header_cells = [
|
||||
cell.get_text(" ", strip=True) for cell in candidate.select("thead th")
|
||||
]
|
||||
normalized_headers = [header.lower() for header in header_cells]
|
||||
if (
|
||||
"rank" in normalized_headers
|
||||
and "system" in normalized_headers
|
||||
and any("cores" in header for header in normalized_headers)
|
||||
and any("rmax" in header for header in normalized_headers)
|
||||
):
|
||||
table = candidate
|
||||
break
|
||||
|
||||
if not table:
|
||||
# Try to find any table with rank data
|
||||
tables = soup.find_all("table")
|
||||
for t in tables:
|
||||
if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
|
||||
table = t
|
||||
break
|
||||
|
||||
if not table:
|
||||
# Fallback: try to extract data from any table
|
||||
tables = soup.find_all("table")
|
||||
if tables:
|
||||
table = tables[0]
|
||||
table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})
|
||||
|
||||
if table:
|
||||
rows = table.find_all("tr")
|
||||
rows = table.select("tr")
|
||||
for row in rows[1:]: # Skip header row
|
||||
cells = row.find_all(["td", "th"])
|
||||
if len(cells) >= 6:
|
||||
@@ -68,43 +161,26 @@ class TOP500Collector(BaseCollector):
|
||||
|
||||
rank = int(rank_text)
|
||||
|
||||
# System name (may contain link)
|
||||
system_cell = cells[1]
|
||||
system_name = system_cell.get_text(strip=True)
|
||||
# Try to get full name from link title or data attribute
|
||||
link = system_cell.find("a")
|
||||
if link and link.get("title"):
|
||||
system_name = link.get("title")
|
||||
system_fields = self._extract_system_fields(system_cell)
|
||||
system_name = system_fields["name"]
|
||||
manufacturer = system_fields["manufacturer"]
|
||||
site = system_fields["site"]
|
||||
country = system_fields["country"]
|
||||
detail_url = system_fields["detail_url"]
|
||||
|
||||
# Country
|
||||
country_cell = cells[2]
|
||||
country = country_cell.get_text(strip=True)
|
||||
# Try to get country from data attribute or image alt
|
||||
img = country_cell.find("img")
|
||||
if img and img.get("alt"):
|
||||
country = img.get("alt")
|
||||
|
||||
# Extract location (city)
|
||||
city = ""
|
||||
location_text = country_cell.get_text(strip=True)
|
||||
if "(" in location_text and ")" in location_text:
|
||||
city = location_text.split("(")[0].strip()
|
||||
cores = cells[2].get_text(strip=True).replace(",", "")
|
||||
|
||||
# Cores
|
||||
cores = cells[3].get_text(strip=True).replace(",", "")
|
||||
|
||||
# Rmax
|
||||
rmax_text = cells[4].get_text(strip=True)
|
||||
rmax_text = cells[3].get_text(strip=True)
|
||||
rmax = self._parse_performance(rmax_text)
|
||||
|
||||
# Rpeak
|
||||
rpeak_text = cells[5].get_text(strip=True)
|
||||
rpeak_text = cells[4].get_text(strip=True)
|
||||
rpeak = self._parse_performance(rpeak_text)
|
||||
|
||||
# Power (optional)
|
||||
power = ""
|
||||
if len(cells) >= 7:
|
||||
power = cells[6].get_text(strip=True)
|
||||
if len(cells) >= 6:
|
||||
power = cells[5].get_text(strip=True).replace(",", "")
|
||||
|
||||
entry = {
|
||||
"source_id": f"top500_{rank}",
|
||||
@@ -117,10 +193,14 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": rank,
|
||||
"r_peak": rpeak,
|
||||
"power": power,
|
||||
"cores": cores,
|
||||
"rmax": rmax_text,
|
||||
"rpeak": rpeak_text,
|
||||
"power": power,
|
||||
"manufacturer": manufacturer,
|
||||
"site": site,
|
||||
},
|
||||
"_detail_url": detail_url,
|
||||
"reference_date": "2025-11-01",
|
||||
}
|
||||
data.append(entry)
|
||||
@@ -184,10 +264,15 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 1,
|
||||
"r_peak": 2746.38,
|
||||
"power": 29581,
|
||||
"cores": 11039616,
|
||||
"cores": "11039616",
|
||||
"rmax": "1742.00",
|
||||
"rpeak": "2746.38",
|
||||
"power": "29581",
|
||||
"manufacturer": "HPE",
|
||||
"site": "DOE/NNSA/LLNL",
|
||||
"processor": "AMD 4th Gen EPYC 24C 1.8GHz",
|
||||
"interconnect": "Slingshot-11",
|
||||
"installation_year": "2025",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
},
|
||||
@@ -202,10 +287,12 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 2,
|
||||
"r_peak": 2055.72,
|
||||
"power": 24607,
|
||||
"cores": 9066176,
|
||||
"cores": "9066176",
|
||||
"rmax": "1353.00",
|
||||
"rpeak": "2055.72",
|
||||
"power": "24607",
|
||||
"manufacturer": "HPE",
|
||||
"site": "DOE/SC/Oak Ridge National Laboratory",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
},
|
||||
@@ -220,9 +307,10 @@ class TOP500Collector(BaseCollector):
|
||||
"unit": "PFlop/s",
|
||||
"metadata": {
|
||||
"rank": 3,
|
||||
"r_peak": 1980.01,
|
||||
"power": 38698,
|
||||
"cores": 9264128,
|
||||
"cores": "9264128",
|
||||
"rmax": "1012.00",
|
||||
"rpeak": "1980.01",
|
||||
"power": "38698",
|
||||
"manufacturer": "Intel",
|
||||
},
|
||||
"reference_date": "2025-11-01",
|
||||
|
||||
Reference in New Issue
Block a user