"""TOP500 Supercomputer Collector Collects data from TOP500 supercomputer rankings. https://top500.org/lists/top500/ """ import asyncio import re from typing import Dict, Any, List from bs4 import BeautifulSoup import httpx from app.services.collectors.base import BaseCollector class TOP500Collector(BaseCollector): name = "top500" priority = "P0" module = "L1" frequency_hours = 4 data_type = "supercomputer" async def fetch(self) -> List[Dict[str, Any]]: """Fetch TOP500 list data and enrich each row with detail-page metadata.""" url = "https://top500.org/lists/top500/list/2025/11/" async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client: response = await client.get(url) response.raise_for_status() entries = self.parse_response(response.text) semaphore = asyncio.Semaphore(8) async def enrich(entry: Dict[str, Any]) -> Dict[str, Any]: detail_url = entry.pop("_detail_url", "") if not detail_url: return entry async with semaphore: try: detail_response = await client.get(detail_url) detail_response.raise_for_status() entry["metadata"].update(self.parse_detail_response(detail_response.text)) except Exception: entry["metadata"]["detail_fetch_failed"] = True return entry return await asyncio.gather(*(enrich(entry) for entry in entries)) def _extract_system_fields(self, system_cell) -> Dict[str, str]: link = system_cell.find("a") system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True) detail_url = "" if link and link.get("href"): detail_url = f"https://top500.org{link.get('href')}" manufacturer = "" if link and link.next_sibling: manufacturer = str(link.next_sibling).strip(" ,\n\t") cell_text = system_cell.get_text("\n", strip=True) lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()] site = "" country = "" if lines: system_name = lines[0] if len(lines) >= 3: site = lines[-2] country = lines[-1] elif len(lines) == 2: country = lines[-1] if not manufacturer and len(lines) >= 2: manufacturer = lines[1] return { "name": system_name, "manufacturer": manufacturer, "site": site, "country": country, "detail_url": detail_url, } def parse_detail_response(self, html: str) -> Dict[str, Any]: soup = BeautifulSoup(html, "html.parser") detail_table = soup.find("table", {"class": "table table-condensed"}) if not detail_table: return {} detail_map: Dict[str, Any] = {} label_aliases = { "Site": "site", "Manufacturer": "manufacturer", "Cores": "cores", "Processor": "processor", "Interconnect": "interconnect", "Installation Year": "installation_year", "Linpack Performance (Rmax)": "rmax", "Theoretical Peak (Rpeak)": "rpeak", "Nmax": "nmax", "HPCG": "hpcg", "Power": "power", "Power Measurement Level": "power_measurement_level", "Operating System": "operating_system", "Compiler": "compiler", "Math Library": "math_library", "MPI": "mpi", } for row in detail_table.find_all("tr"): header = row.find("th") value_cell = row.find("td") if not header or not value_cell: continue label = header.get_text(" ", strip=True).rstrip(":") key = label_aliases.get(label) if not key: continue value = value_cell.get_text(" ", strip=True) detail_map[key] = value return detail_map def parse_response(self, html: str) -> List[Dict[str, Any]]: """Parse TOP500 HTML response""" data = [] soup = BeautifulSoup(html, "html.parser") # Find the table with TOP500 data table = None for candidate in soup.find_all("table"): header_cells = [ cell.get_text(" ", strip=True) for cell in candidate.select("thead th") ] normalized_headers = [header.lower() for header in header_cells] if ( "rank" in normalized_headers and "system" in normalized_headers and any("cores" in header for header in normalized_headers) and any("rmax" in header for header in normalized_headers) ): table = candidate break if not table: table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"}) if table: rows = table.select("tr") for row in rows[1:]: # Skip header row cells = row.find_all(["td", "th"]) if len(cells) >= 6: try: # Parse the row data rank_text = cells[0].get_text(strip=True) if not rank_text or not rank_text.isdigit(): continue rank = int(rank_text) system_cell = cells[1] system_fields = self._extract_system_fields(system_cell) system_name = system_fields["name"] manufacturer = system_fields["manufacturer"] site = system_fields["site"] country = system_fields["country"] detail_url = system_fields["detail_url"] city = "" cores = cells[2].get_text(strip=True).replace(",", "") rmax_text = cells[3].get_text(strip=True) rmax = self._parse_performance(rmax_text) rpeak_text = cells[4].get_text(strip=True) rpeak = self._parse_performance(rpeak_text) power = "" if len(cells) >= 6: power = cells[5].get_text(strip=True).replace(",", "") entry = { "source_id": f"top500_{rank}", "name": system_name, "country": country, "city": city, "latitude": 0.0, "longitude": 0.0, "value": str(rmax), "unit": "PFlop/s", "metadata": { "rank": rank, "cores": cores, "rmax": rmax_text, "rpeak": rpeak_text, "power": power, "manufacturer": manufacturer, "site": site, }, "_detail_url": detail_url, "reference_date": "2025-11-01", } data.append(entry) except (ValueError, IndexError, AttributeError) as e: continue # If scraping failed, return sample data for testing if not data: data = self._get_sample_data() return data def _parse_coordinate(self, value: Any) -> float: """Parse coordinate value""" if isinstance(value, (int, float)): return float(value) if isinstance(value, str): try: return float(value) except ValueError: return 0.0 return 0.0 def _parse_performance(self, text: str) -> float: """Parse performance value from text (handles E, P, T suffixes)""" text = text.strip().upper() multipliers = { "E": 1e18, "P": 1e15, "T": 1e12, "G": 1e9, "M": 1e6, "K": 1e3, } match = re.match(r"([\d.]+)\s*([EPTGMK])?F?LOP/?S?", text) if match: value = float(match.group(1)) suffix = match.group(2) if suffix: value *= multipliers.get(suffix, 1) return value # Try simple float parsing try: return float(text.replace(",", "")) except ValueError: return 0.0 def _get_sample_data(self) -> List[Dict[str, Any]]: """Return sample data for testing when scraping fails""" return [ { "source_id": "top500_1", "name": "El Capitan - HPE Cray EX255a, AMD 4th Gen EPYC 24C 1.8GHz, AMD Instinct MI300A", "country": "United States", "city": "Livermore, CA", "latitude": 37.6819, "longitude": -121.7681, "value": "1742.00", "unit": "PFlop/s", "metadata": { "rank": 1, "cores": "11039616", "rmax": "1742.00", "rpeak": "2746.38", "power": "29581", "manufacturer": "HPE", "site": "DOE/NNSA/LLNL", "processor": "AMD 4th Gen EPYC 24C 1.8GHz", "interconnect": "Slingshot-11", "installation_year": "2025", }, "reference_date": "2025-11-01", }, { "source_id": "top500_2", "name": "Frontier - HPE Cray EX235a, AMD Optimized 3rd Generation EPYC 64C 2GHz, AMD Instinct MI250X", "country": "United States", "city": "Oak Ridge, TN", "latitude": 36.0107, "longitude": -84.2663, "value": "1353.00", "unit": "PFlop/s", "metadata": { "rank": 2, "cores": "9066176", "rmax": "1353.00", "rpeak": "2055.72", "power": "24607", "manufacturer": "HPE", "site": "DOE/SC/Oak Ridge National Laboratory", }, "reference_date": "2025-11-01", }, { "source_id": "top500_3", "name": "Aurora - HPE Cray EX - Intel Exascale Compute Blade, Xeon CPU Max 9470 52C 2.4GHz, Intel Data Center GPU Max", "country": "United States", "city": "Argonne, IL", "latitude": 41.3784, "longitude": -87.8600, "value": "1012.00", "unit": "PFlop/s", "metadata": { "rank": 3, "cores": "9264128", "rmax": "1012.00", "rpeak": "1980.01", "power": "38698", "manufacturer": "Intel", }, "reference_date": "2025-11-01", }, ]