# planet/backend/app/services/collectors/top500.py

"""TOP500 Supercomputer Collector

Collects data from TOP500 supercomputer rankings.
https://top500.org/lists/top500/
"""

import asyncio
import re
from typing import Dict, Any, List
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

from app.services.collectors.base import BaseCollector


class TOP500Collector(BaseCollector):
    name = "top500"
    priority = "P0"
    module = "L1"
    frequency_hours = 4
    data_type = "supercomputer"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Download the TOP500 list page and merge in per-system detail data.

        The list page is fetched and parsed with ``parse_response``. For every
        parsed entry that carries a non-empty ``_detail_url``, the detail page
        is requested and the result of ``parse_detail_response`` is merged into
        the entry's ``metadata`` dict.

        Returns:
            The parsed entries, with the private ``_detail_url`` key removed.
            Entries whose detail request or parsing raised get
            ``metadata["detail_fetch_failed"] = True`` instead of extra fields.

        Raises:
            httpx.HTTPStatusError: If the list page itself answers with an
                error status (via ``raise_for_status``).
        """
        list_url = "https://top500.org/lists/top500/list/2025/11/"

        async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
            listing = await client.get(list_url)
            listing.raise_for_status()
            rows = self.parse_response(listing.text)

            # Cap the number of detail-page requests in flight at once.
            limiter = asyncio.Semaphore(8)

            async def add_details(row: Dict[str, Any]) -> Dict[str, Any]:
                # The URL is internal bookkeeping; strip it from the entry
                # whether or not it is usable.
                link = row.pop("_detail_url", "")
                if link:
                    async with limiter:
                        try:
                            page = await client.get(link)
                            page.raise_for_status()
                            extra = self.parse_detail_response(page.text)
                            row["metadata"].update(extra)
                        except Exception:
                            # Best effort: keep the list-level data and flag
                            # the row rather than failing the whole run.
                            row["metadata"]["detail_fetch_failed"] = True
                return row

            pending = [add_details(row) for row in rows]
            return await asyncio.gather(*pending)

    def _extract_system_fields(self, system_cell) -> Dict[str, str]:
        link = system_cell.find("a")
        system_name = link.get_text(" ", strip=True) if link else system_cell.get_text(" ", strip=True)
        detail_url = ""
        if link and link.get("href"):
            detail_url = f"https://top500.org{link.get('href')}"

        manufacturer = ""
        if link and link.next_sibling:
            manufacturer = str(link.next_sibling).strip(" ,\n\t")

        cell_text = system_cell.get_text("\n", strip=True)
        lines = [line.strip(" ,") for line in cell_text.splitlines() if line.strip()]

        site = ""
        country = ""
        if lines:
            system_name = lines[0]
        if len(lines) >= 3:
            site = lines[-2]
            country = lines[-1]
        elif len(lines) == 2:
            country = lines[-1]

        if not manufacturer and len(lines) >= 2:
            manufacturer = lines[1]

        return {
            "name": system_name,
            "manufacturer": manufacturer,
            "site": site,
            "country": country,
            "detail_url": detail_url,
        }

    def parse_detail_response(self, html: str) -> Dict[str, Any]:
        """Extract the recognised label/value rows from a system detail page.

        Args:
            html: Raw HTML of a system detail page.

        Returns:
            Mapping from normalized keys (for example ``"cores"`` or
            ``"installation_year"``) to the text of the matching value cell.
            Rows whose label is not in the alias table are skipped. An empty
            dict is returned when no detail table is found.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Passing a multi-word string as ``class`` only matches that exact
        # attribute value. Try it first to keep the established behaviour,
        # then fall back to a CSS selector that also accepts the two classes
        # in another order or alongside additional classes.
        detail_table = soup.find("table", {"class": "table table-condensed"})
        if detail_table is None:
            detail_table = soup.select_one("table.table.table-condensed")
        if detail_table is None:
            return {}

        detail_map: Dict[str, Any] = {}
        # Page label (without trailing colon) -> metadata key.
        label_aliases = {
            "Site": "site",
            "Manufacturer": "manufacturer",
            "Cores": "cores",
            "Processor": "processor",
            "Interconnect": "interconnect",
            "Installation Year": "installation_year",
            "Linpack Performance (Rmax)": "rmax",
            "Theoretical Peak (Rpeak)": "rpeak",
            "Nmax": "nmax",
            "HPCG": "hpcg",
            "Power": "power",
            "Power Measurement Level": "power_measurement_level",
            "Operating System": "operating_system",
            "Compiler": "compiler",
            "Math Library": "math_library",
            "MPI": "mpi",
        }

        for row in detail_table.find_all("tr"):
            header = row.find("th")
            value_cell = row.find("td")
            if not header or not value_cell:
                # Not a label/value row.
                continue

            label = header.get_text(" ", strip=True).rstrip(":")
            key = label_aliases.get(label)
            if not key:
                continue

            detail_map[key] = value_cell.get_text(" ", strip=True)

        return detail_map

    def parse_response(self, html: str) -> List[Dict[str, Any]]:
        """Parse the TOP500 list page into collector entries.

        The ranking table is located by its header cells (it must have
        "rank" and "system" columns plus columns mentioning "cores" and
        "rmax"). If no table matches, a table with class ``top500-table`` or
        id ``top500`` is used instead. Every row after the first that has at
        least six cells and a numeric rank becomes one entry.

        Args:
            html: Raw HTML of the list page.

        Returns:
            One dict per parsed row. Each carries a private ``_detail_url``
            key with the detail-page URL taken from the system cell (``fetch``
            pops it before returning). When no row could be parsed, the
            hard-coded records from ``_get_sample_data`` are returned instead.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Prefer the table whose header row looks like the ranking table.
        table = None
        for candidate in soup.find_all("table"):
            headers = [
                cell.get_text(" ", strip=True).lower()
                for cell in candidate.select("thead th")
            ]
            if (
                "rank" in headers
                and "system" in headers
                and any("cores" in header for header in headers)
                and any("rmax" in header for header in headers)
            ):
                table = candidate
                break

        if not table:
            table = soup.find("table", {"class": "top500-table"}) or soup.find("table", {"id": "top500"})

        data: List[Dict[str, Any]] = []
        # The first row is skipped as the header row.
        rows = table.select("tr")[1:] if table else []
        for row in rows:
            cells = row.find_all(["td", "th"])
            if len(cells) < 6:
                continue

            rank_text = cells[0].get_text(strip=True)
            if not rank_text.isdigit():
                # Empty or non-numeric rank: not a data row.
                continue

            try:
                rank = int(rank_text)
                system_fields = self._extract_system_fields(cells[1])
                rmax_text = cells[3].get_text(strip=True)
                rpeak_text = cells[4].get_text(strip=True)

                entry = {
                    "source_id": f"top500_{rank}",
                    "name": system_fields["name"],
                    "country": system_fields["country"],
                    "city": "",
                    "latitude": 0.0,
                    "longitude": 0.0,
                    "value": str(self._parse_performance(rmax_text)),
                    "unit": "PFlop/s",
                    "metadata": {
                        "rank": rank,
                        "cores": cells[2].get_text(strip=True).replace(",", ""),
                        "rmax": rmax_text,
                        "rpeak": rpeak_text,
                        "power": cells[5].get_text(strip=True).replace(",", ""),
                        "manufacturer": system_fields["manufacturer"],
                        "site": system_fields["site"],
                    },
                    "_detail_url": system_fields["detail_url"],
                    "reference_date": "2025-11-01",
                }
            except (ValueError, IndexError, AttributeError):
                # Malformed row: skip it and keep parsing the rest.
                continue

            data.append(entry)

        # If scraping produced nothing, fall back to the built-in sample data.
        if not data:
            data = self._get_sample_data()

        return data

    def _parse_coordinate(self, value: Any) -> float:
        """Parse coordinate value"""
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return 0.0
        return 0.0

    def _parse_performance(self, text: str) -> float:
        """Parse performance value from text (handles E, P, T suffixes)"""
        text = text.strip().upper()
        multipliers = {
            "E": 1e18,
            "P": 1e15,
            "T": 1e12,
            "G": 1e9,
            "M": 1e6,
            "K": 1e3,
        }

        match = re.match(r"([\d.]+)\s*([EPTGMK])?F?LOP/?S?", text)
        if match:
            value = float(match.group(1))
            suffix = match.group(2)
            if suffix:
                value *= multipliers.get(suffix, 1)
            return value

        # Try simple float parsing
        try:
            return float(text.replace(",", ""))
        except ValueError:
            return 0.0

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample data for testing when scraping fails"""
        return [
            {
                "source_id": "top500_1",
                "name": "El Capitan - HPE Cray EX255a, AMD 4th Gen EPYC 24C 1.8GHz, AMD Instinct MI300A",
                "country": "United States",
                "city": "Livermore, CA",
                "latitude": 37.6819,
                "longitude": -121.7681,
                "value": "1742.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 1,
                    "cores": "11039616",
                    "rmax": "1742.00",
                    "rpeak": "2746.38",
                    "power": "29581",
                    "manufacturer": "HPE",
                    "site": "DOE/NNSA/LLNL",
                    "processor": "AMD 4th Gen EPYC 24C 1.8GHz",
                    "interconnect": "Slingshot-11",
                    "installation_year": "2025",
                },
                "reference_date": "2025-11-01",
            },
            {
                "source_id": "top500_2",
                "name": "Frontier - HPE Cray EX235a, AMD Optimized 3rd Generation EPYC 64C 2GHz, AMD Instinct MI250X",
                "country": "United States",
                "city": "Oak Ridge, TN",
                "latitude": 36.0107,
                "longitude": -84.2663,
                "value": "1353.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 2,
                    "cores": "9066176",
                    "rmax": "1353.00",
                    "rpeak": "2055.72",
                    "power": "24607",
                    "manufacturer": "HPE",
                    "site": "DOE/SC/Oak Ridge National Laboratory",
                },
                "reference_date": "2025-11-01",
            },
            {
                "source_id": "top500_3",
                "name": "Aurora - HPE Cray EX - Intel Exascale Compute Blade, Xeon CPU Max 9470 52C 2.4GHz, Intel Data Center GPU Max",
                "country": "United States",
                "city": "Argonne, IL",
                "latitude": 41.3784,
                "longitude": -87.8600,
                "value": "1012.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 3,
                    "cores": "9264128",
                    "rmax": "1012.00",
                    "rpeak": "1980.01",
                    "power": "38698",
                    "manufacturer": "Intel",
                },
                "reference_date": "2025-11-01",
            },
        ]