first commit
backend/app/services/__pycache__/scheduler.cpython-311.pyc (new binary file, not shown)
backend/app/services/collectors/__init__.py (new file)
@@ -0,0 +1,41 @@
"""__init__.py for collectors package"""

from app.services.collectors.base import BaseCollector, HTTPCollector, IntervalCollector
from app.services.collectors.registry import collector_registry, CollectorRegistry
from app.services.collectors.top500 import TOP500Collector
from app.services.collectors.epoch_ai import EpochAIGPUCollector
from app.services.collectors.huggingface import (
    HuggingFaceModelCollector,
    HuggingFaceDatasetCollector,
    HuggingFaceSpacesCollector,
)
from app.services.collectors.peeringdb import (
    PeeringDBIXPCollector,
    PeeringDBNetworkCollector,
    PeeringDBFacilityCollector,
)
from app.services.collectors.telegeography import (
    TeleGeographyCableCollector,
    TeleGeographyLandingPointCollector,
    TeleGeographyCableSystemCollector,
)
from app.services.collectors.cloudflare import (
    CloudflareRadarDeviceCollector,
    CloudflareRadarTrafficCollector,
    CloudflareRadarTopASCollector,
)

collector_registry.register(TOP500Collector())
collector_registry.register(EpochAIGPUCollector())
collector_registry.register(HuggingFaceModelCollector())
collector_registry.register(HuggingFaceDatasetCollector())
collector_registry.register(HuggingFaceSpacesCollector())
collector_registry.register(PeeringDBIXPCollector())
collector_registry.register(PeeringDBNetworkCollector())
collector_registry.register(PeeringDBFacilityCollector())
collector_registry.register(TeleGeographyCableCollector())
collector_registry.register(TeleGeographyLandingPointCollector())
collector_registry.register(TeleGeographyCableSystemCollector())
collector_registry.register(CloudflareRadarDeviceCollector())
collector_registry.register(CloudflareRadarTrafficCollector())
collector_registry.register(CloudflareRadarTopASCollector())
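
A minimal usage sketch (not part of the commit): looking up a registered collector by name and running its pipeline, assuming the async session factory from app.db.session that scheduler.py uses:

    import asyncio

    from app.db.session import async_session_factory
    from app.services.collectors import collector_registry

    async def run_one(name: str) -> None:
        collector = collector_registry.get(name)
        if collector is None:
            raise ValueError(f"Unknown collector: {name}")
        async with async_session_factory() as db:
            result = await collector.run(db)  # fetch -> transform -> save
            print(result)

    asyncio.run(run_one("top500"))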
backend/app/services/collectors/__pycache__/base.cpython-311.pyc (new binary file, not shown)
(eight further binary files in this commit; their names were not preserved in this capture)
backend/app/services/collectors/base.py (new file)
@@ -0,0 +1,179 @@
"""Base collector class for all data sources"""

from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from datetime import datetime

import httpx
from sqlalchemy.ext.asyncio import AsyncSession


class BaseCollector(ABC):
    """Abstract base class for data collectors"""

    name: str = "base_collector"
    priority: str = "P1"
    module: str = "L1"
    frequency_hours: int = 4
    data_type: str = "generic"  # Override in subclass: "supercomputer", "model", "dataset", etc.

    @abstractmethod
    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch raw data from source"""
        pass

    def transform(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Transform raw data to internal format (default: pass through)"""
        return raw_data

    async def run(self, db: AsyncSession) -> Dict[str, Any]:
        """Full pipeline: fetch -> transform -> save"""
        from app.services.collectors.registry import collector_registry
        from app.models.task import CollectionTask

        start_time = datetime.utcnow()
        datasource_id = getattr(self, "_datasource_id", 1)  # Default to 1 for built-in collectors

        # Check if collector is active
        if not collector_registry.is_active(self.name):
            return {"status": "skipped", "reason": "Collector is disabled"}

        # Log task start
        task = CollectionTask(
            datasource_id=datasource_id,
            status="running",
            started_at=start_time,
        )
        db.add(task)
        await db.commit()
        task_id = task.id

        try:
            raw_data = await self.fetch()
            data = self.transform(raw_data)

            # Save data to database
            records_count = await self._save_data(db, data)

            # Log task success
            task.status = "success"
            task.records_processed = records_count
            task.completed_at = datetime.utcnow()
            await db.commit()

            return {
                "status": "success",
                "task_id": task_id,
                "records_processed": records_count,
                "execution_time_seconds": (datetime.utcnow() - start_time).total_seconds(),
            }
        except Exception as e:
            # Log task failure
            task.status = "failed"
            task.error_message = str(e)
            task.completed_at = datetime.utcnow()
            await db.commit()

            return {
                "status": "failed",
                "task_id": task_id,
                "error": str(e),
                "execution_time_seconds": (datetime.utcnow() - start_time).total_seconds(),
            }

    async def _save_data(self, db: AsyncSession, data: List[Dict[str, Any]]) -> int:
        """Save transformed data to database"""
        from app.models.collected_data import CollectedData

        if not data:
            return 0

        collected_at = datetime.utcnow()
        records_added = 0

        for item in data:
            # Create CollectedData entry
            record = CollectedData(
                source=self.name,
                source_id=item.get("source_id") or item.get("id"),
                data_type=self.data_type,
                name=item.get("name"),
                title=item.get("title"),
                description=item.get("description"),
                country=item.get("country"),
                city=item.get("city"),
                latitude=str(item.get("latitude", ""))
                if item.get("latitude") is not None
                else None,
                longitude=str(item.get("longitude", ""))
                if item.get("longitude") is not None
                else None,
                value=item.get("value"),
                unit=item.get("unit"),
                extra_data=item.get("metadata", {}),
                collected_at=collected_at,
                reference_date=datetime.fromisoformat(
                    item.get("reference_date").replace("Z", "+00:00")
                )
                if item.get("reference_date")
                else None,
                is_valid=1,
            )
            db.add(record)
            records_added += 1

        await db.commit()
        return records_added

    async def save(self, db: AsyncSession, data: List[Dict[str, Any]]) -> int:
        """Save data to database (legacy method, use _save_data instead)"""
        return await self._save_data(db, data)


class HTTPCollector(BaseCollector):
    """Base class for HTTP API collectors"""

    base_url: str = ""
    headers: Dict[str, str] = {}

    async def fetch(self) -> List[Dict[str, Any]]:
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(self.base_url, headers=self.headers)
            response.raise_for_status()
            return self.parse_response(response.json())

    @abstractmethod
    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        pass


class IntervalCollector(BaseCollector):
    """Base class for collectors that run on intervals"""

    async def run(self, db: AsyncSession) -> Dict[str, Any]:
        return await super().run(db)


async def log_task(
    db: AsyncSession,
    datasource_id: int,
    status: str,
    records_processed: int = 0,
    error_message: Optional[str] = None,
):
    """Log collection task to database"""
    from app.models.task import CollectionTask

    task = CollectionTask(
        datasource_id=datasource_id,
        status=status,
        records_processed=records_processed,
        error_message=error_message,
        started_at=datetime.utcnow(),
        completed_at=datetime.utcnow(),
    )
    db.add(task)
    await db.commit()
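
To add a new source, only parse_response has to be written when HTTPCollector fits; fetch, transform, run, and _save_data all come from the base classes. A sketch with a hypothetical endpoint and field names (everything below is illustrative, not part of the commit):

    from typing import Dict, Any, List
    from datetime import datetime

    from app.services.collectors.base import HTTPCollector

    class ExampleStatusCollector(HTTPCollector):
        name = "example_status"  # hypothetical collector
        priority = "P2"
        module = "L3"
        frequency_hours = 24
        data_type = "generic"
        base_url = "https://example.com/api/status"  # hypothetical endpoint

        def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
            # Map the raw payload into the record shape _save_data() expects.
            return [
                {
                    "source_id": f"example_{item.get('id', '')}",
                    "name": item.get("name", "Unknown"),
                    "value": str(item.get("value", "")),
                    "unit": item.get("unit", ""),
                    "metadata": {"raw": item},
                    "reference_date": datetime.utcnow().isoformat(),
                }
                for item in response.get("items", [])
            ]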
backend/app/services/collectors/cloudflare.py (new file)
@@ -0,0 +1,163 @@
"""Cloudflare Radar Traffic Collector

Collects Internet traffic data from the Cloudflare Radar API.
https://developers.cloudflare.com/radar/

Note: the Radar API provides free access to global Internet traffic data.
Some endpoints require authentication for higher rate limits.
"""

import os
from typing import Dict, Any, List
from datetime import datetime

from app.services.collectors.base import HTTPCollector

# Cloudflare API token (optional - for higher rate limits)
CLOUDFLARE_API_TOKEN = os.environ.get("CLOUDFLARE_API_TOKEN", "")


class CloudflareRadarDeviceCollector(HTTPCollector):
    """Collects device type distribution data (mobile vs desktop)"""

    name = "cloudflare_radar_device"
    priority = "P2"
    module = "L3"
    frequency_hours = 24
    data_type = "device_stats"
    base_url = "https://api.cloudflare.com/client/v4/radar/http/summary/device_type"

    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Planet-Intelligence-System/1.0 (Python/collector)",
            "Accept": "application/json",
        }
        if CLOUDFLARE_API_TOKEN:
            self.headers["Authorization"] = f"Bearer {CLOUDFLARE_API_TOKEN}"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Cloudflare Radar device type response"""
        data = []
        result = response.get("result", {})
        summary = result.get("summary_0", {})

        try:
            entry = {
                "source_id": "cloudflare_radar_device_global",
                "name": "Global Device Distribution",
                "country": "GLOBAL",
                "city": "",
                "latitude": 0.0,
                "longitude": 0.0,
                "metadata": {
                    "desktop_percent": float(summary.get("desktop", 0)),
                    "mobile_percent": float(summary.get("mobile", 0)),
                    "other_percent": float(summary.get("other", 0)),
                    "date_range": result.get("meta", {}).get("dateRange", {}),
                },
                "reference_date": datetime.utcnow().isoformat(),
            }
            data.append(entry)
        except (ValueError, TypeError, KeyError):
            pass

        return data


class CloudflareRadarTrafficCollector(HTTPCollector):
    """Collects traffic volume trends"""

    name = "cloudflare_radar_traffic"
    priority = "P2"
    module = "L3"
    frequency_hours = 24
    data_type = "traffic_stats"
    base_url = "https://api.cloudflare.com/client/v4/radar/http/timeseries/requests"

    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Planet-Intelligence-System/1.0 (Python/collector)",
            "Accept": "application/json",
        }
        if CLOUDFLARE_API_TOKEN:
            self.headers["Authorization"] = f"Bearer {CLOUDFLARE_API_TOKEN}"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Cloudflare Radar traffic timeseries response"""
        data = []
        result = response.get("result", {})
        timeseries = result.get("requests_0", {}).get("timeseries", [])

        for item in timeseries:
            try:
                entry = {
                    "source_id": f"cloudflare_traffic_{item.get('datetime', '')}",
                    "name": f"Traffic {item.get('datetime', '')[:10]}",
                    "country": "GLOBAL",
                    "city": "",
                    "latitude": 0.0,
                    "longitude": 0.0,
                    "metadata": {
                        "datetime": item.get("datetime"),
                        "requests": item.get("requests"),
                        "visit_duration": item.get("visitDuration"),
                    },
                    "reference_date": item.get("datetime", datetime.utcnow().isoformat()),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data


class CloudflareRadarTopASCollector(HTTPCollector):
    """Collects top locations by HTTP traffic (despite the AS-oriented class
    name, the queried endpoint returns location records)"""

    name = "cloudflare_radar_top_as"
    priority = "P2"
    module = "L2"
    frequency_hours = 24
    data_type = "as_stats"
    base_url = "https://api.cloudflare.com/client/v4/radar/http/top/locations"

    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Planet-Intelligence-System/1.0 (Python/collector)",
            "Accept": "application/json",
        }
        if CLOUDFLARE_API_TOKEN:
            self.headers["Authorization"] = f"Bearer {CLOUDFLARE_API_TOKEN}"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Cloudflare Radar top locations response"""
        data = []
        result = response.get("result", {})
        top_locations = result.get("top_locations_0", [])

        for idx, item in enumerate(top_locations):
            try:
                entry = {
                    "source_id": f"cloudflare_as_{item.get('rank', idx)}",
                    "name": item.get("location", {}).get("countryName", "Unknown"),
                    "country": item.get("location", {}).get("countryCode", "XX"),
                    "city": item.get("location", {}).get("cityName", ""),
                    "latitude": float(item.get("location", {}).get("latitude", 0)),
                    "longitude": float(item.get("location", {}).get("longitude", 0)),
                    "metadata": {
                        "rank": item.get("rank"),
                        "traffic_share": item.get("trafficShare"),
                        "country_code": item.get("location", {}).get("countryCode"),
                    },
                    "reference_date": datetime.utcnow().isoformat(),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data
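
Because the parsers are pure functions of the JSON payload, they can be exercised without network access. A sketch with a fabricated payload in the shape parse_response expects:

    from app.services.collectors.cloudflare import CloudflareRadarDeviceCollector

    sample = {
        "result": {
            "summary_0": {"desktop": "41.2", "mobile": "57.6", "other": "1.2"},
            "meta": {"dateRange": {}},
        }
    }
    entries = CloudflareRadarDeviceCollector().parse_response(sample)
    assert entries[0]["metadata"]["desktop_percent"] == 41.2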
backend/app/services/collectors/epoch_ai.py (new file)
@@ -0,0 +1,118 @@
"""Epoch AI GPU Clusters Collector

Collects data from Epoch AI GPU clusters tracking.
https://epoch.ai/data/gpu-clusters
"""

import re
from typing import Dict, Any, List
from datetime import datetime

from bs4 import BeautifulSoup
import httpx

from app.services.collectors.base import BaseCollector


class EpochAIGPUCollector(BaseCollector):
    name = "epoch_ai_gpu"
    priority = "P0"
    module = "L1"
    frequency_hours = 6
    data_type = "gpu_cluster"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch Epoch AI GPU clusters data from webpage"""
        url = "https://epoch.ai/data/gpu-clusters"

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.text)

    def parse_response(self, html: str) -> List[Dict[str, Any]]:
        """Parse Epoch AI webpage to extract GPU cluster data"""
        data = []
        soup = BeautifulSoup(html, "html.parser")

        # Try to find data table on the page
        tables = soup.find_all("table")
        for table in tables:
            rows = table.find_all("tr")
            for row in rows[1:]:  # Skip header
                cells = row.find_all(["td", "th"])
                if len(cells) >= 5:
                    try:
                        cluster_name = cells[0].get_text(strip=True)
                        if not cluster_name or cluster_name in ["Cluster", "System", "Name"]:
                            continue

                        location_cell = cells[1].get_text(strip=True) if len(cells) > 1 else ""
                        country, city = self._parse_location(location_cell)

                        perf_cell = cells[2].get_text(strip=True) if len(cells) > 2 else ""

                        entry = {
                            "source_id": f"epoch_{re.sub(r'[^a-zA-Z0-9]', '_', cluster_name.lower())}",
                            "name": cluster_name,
                            "country": country,
                            "city": city,
                            "latitude": "",
                            "longitude": "",
                            "value": self._parse_performance(perf_cell),
                            "unit": "TFlop/s",
                            "metadata": {
                                "raw_data": perf_cell,
                            },
                            "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                        }
                        data.append(entry)
                    except (ValueError, IndexError, AttributeError):
                        continue

        # If no table found, return sample data
        if not data:
            data = self._get_sample_data()

        return data

    def _parse_location(self, location: str) -> tuple:
        """Parse location string into country and city"""
        if not location:
            return "", ""
        if "," in location:
            parts = location.rsplit(",", 1)
            city = parts[0].strip()
            country = parts[1].strip() if len(parts) > 1 else ""
            return country, city
        return location, ""

    def _parse_performance(self, perf: str) -> str:
        """Parse performance string to extract value"""
        if not perf:
            return "0"
        match = re.search(r"([\d,.]+)\s*(TFlop/s|PFlop/s|GFlop/s)?", perf, re.I)
        if match:
            return match.group(1).replace(",", "")
        match = re.search(r"([\d,.]+)", perf)
        if match:
            return match.group(1).replace(",", "")
        return "0"

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample data for testing when scraping fails"""
        return [
            {
                "source_id": "epoch_sample_1",
                "name": "Sample GPU Cluster",
                "country": "United States",
                "city": "San Francisco, CA",
                "latitude": "",
                "longitude": "",
                "value": "1000",
                "unit": "TFlop/s",
                "metadata": {
                    "note": "Sample data - Epoch AI page structure may vary",
                },
                "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
            },
        ]
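
The two parsing helpers can be sanity-checked in isolation; expected outputs under the logic above:

    from app.services.collectors.epoch_ai import EpochAIGPUCollector

    c = EpochAIGPUCollector()
    # _parse_location returns (country, city); a single token is treated as country
    assert c._parse_location("Memphis, United States") == ("United States", "Memphis")
    assert c._parse_location("Japan") == ("Japan", "")
    # _parse_performance strips thousands separators and keeps the number as a string
    assert c._parse_performance("1,234.5 TFlop/s") == "1234.5"
    assert c._parse_performance("") == "0"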
backend/app/services/collectors/huggingface.py (new file)
@@ -0,0 +1,136 @@
"""Hugging Face Model Ecosystem Collector

Collects data from Hugging Face model hub.
https://huggingface.co/models
https://huggingface.co/datasets
https://huggingface.co/spaces
"""

from typing import Dict, Any, List
from datetime import datetime

from app.services.collectors.base import HTTPCollector


class HuggingFaceModelCollector(HTTPCollector):
    name = "huggingface_models"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "model"
    base_url = "https://huggingface.co/api/models"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face models API response"""
        data = []
        models = (
            response
            if isinstance(response, list)
            else response.get("models", response.get("items", []))
        )

        for item in models[:100]:
            try:
                entry = {
                    "source_id": f"hf_model_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "pipeline_tag": item.get("pipeline_tag"),
                        "library_name": item.get("library_name"),
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data


class HuggingFaceDatasetCollector(HTTPCollector):
    name = "huggingface_datasets"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "dataset"
    base_url = "https://huggingface.co/api/datasets"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face datasets API response"""
        data = []
        datasets = (
            response
            if isinstance(response, list)
            else response.get("datasets", response.get("items", []))
        )

        for item in datasets[:100]:
            try:
                entry = {
                    "source_id": f"hf_dataset_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "size": item.get("size"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data


class HuggingFaceSpacesCollector(HTTPCollector):
    name = "huggingface_spaces"
    priority = "P2"
    module = "L2"
    frequency_hours = 24
    data_type = "space"
    base_url = "https://huggingface.co/api/spaces"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face Spaces API response"""
        data = []
        spaces = (
            response
            if isinstance(response, list)
            else response.get("spaces", response.get("items", []))
        )

        for item in spaces[:100]:
            try:
                entry = {
                    "source_id": f"hf_space_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "views": item.get("views"),
                        "sdk": item.get("sdk"),
                        "hardware": item.get("hardware"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data
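
The collectors cap results at 100 entries client-side via the [:100] slice. The Hub API also accepts query parameters such as limit and sort, so the cap can be pushed to the server; a sketch assuming those parameters:

    import asyncio

    import httpx

    from app.services.collectors.huggingface import HuggingFaceModelCollector

    async def top_models() -> list:
        collector = HuggingFaceModelCollector()
        async with httpx.AsyncClient(timeout=60.0) as client:
            resp = await client.get(
                collector.base_url,
                params={"limit": 100, "sort": "downloads"},  # assumed Hub API params
            )
            resp.raise_for_status()
            return collector.parse_response(resp.json())

    entries = asyncio.run(top_models())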
backend/app/services/collectors/peeringdb.py (new file)
@@ -0,0 +1,331 @@
"""PeeringDB Collectors (IXPs, networks, facilities)

Collects data from the PeeringDB directory.
https://www.peeringdb.com

Note: PeeringDB API has rate limits:
- Anonymous: 20 requests/minute
- Authenticated: 40 requests/minute (with API key)

To get higher limits, set the PEERINGDB_API_KEY environment variable.
"""

import asyncio
import os
from typing import Dict, Any, List
from datetime import datetime

import httpx

from app.services.collectors.base import HTTPCollector

# PeeringDB API key - read from environment variable
PEERINGDB_API_KEY = os.environ.get("PEERINGDB_API_KEY", "")


class PeeringDBCollectorBase(HTTPCollector):
    """Shared plumbing for the PeeringDB collectors: common headers,
    API-key handling, and rate-limit-aware fetching."""

    def __init__(self):
        super().__init__()
        # Set headers with User-Agent
        self.headers = {
            "User-Agent": "Planet-Intelligence-System/1.0 (Python/collector)",
            "Accept": "application/json",
        }
        # API key is added to the URL as a query parameter
        if PEERINGDB_API_KEY:
            self.base_url = f"{self.base_url}?key={PEERINGDB_API_KEY}"

    async def fetch_with_retry(
        self, max_retries: int = 3, base_delay: float = 2.0
    ) -> Dict[str, Any]:
        """Fetch data with exponential backoff for rate limiting"""
        last_error = None

        for attempt in range(max_retries):
            try:
                async with httpx.AsyncClient(timeout=60.0) as client:
                    response = await client.get(self.base_url, headers=self.headers)

                    if response.status_code == 429:
                        # Rate limited - wait and retry with exponential backoff
                        delay = base_delay * (2**attempt)
                        print(f"PeeringDB rate limited, waiting {delay}s before retry...")
                        await asyncio.sleep(delay)
                        last_error = "Rate limited"
                        continue

                    response.raise_for_status()
                    return response.json()

            except httpx.HTTPStatusError as e:
                if e.response.status_code == 429:
                    delay = base_delay * (2**attempt)
                    print(f"PeeringDB rate limited, waiting {delay}s before retry...")
                    await asyncio.sleep(delay)
                    last_error = "Rate limited"
                    continue
                raise

        print(f"Warning: PeeringDB collection failed after {max_retries} retries: {last_error}")
        return {}

    async def fetch(self) -> List[Dict[str, Any]]:
        """Collect data from PeeringDB with rate limit handling.

        Overrides HTTPCollector.fetch so that BaseCollector.run() goes
        through the retry logic above.
        """
        response_data = await self.fetch_with_retry()
        if not response_data:
            return []
        return self.parse_response(response_data)

    def _parse_coordinate(self, value: Any) -> float:
        if value is None:
            return 0.0
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return 0.0
        return 0.0


class PeeringDBIXPCollector(PeeringDBCollectorBase):
    name = "peeringdb_ixp"
    priority = "P1"
    module = "L2"
    frequency_hours = 24
    data_type = "ixp"
    base_url = "https://www.peeringdb.com/api/ix"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse PeeringDB IXP API response"""
        data = []
        ixps = response.get("data", response.get("ixps", []))

        for item in ixps:
            try:
                entry = {
                    "source_id": f"peeringdb_ixp_{item.get('id', '')}",
                    "name": item.get("name", "Unknown"),
                    "country": item.get("country", "Unknown"),
                    "city": item.get("city", ""),
                    "latitude": self._parse_coordinate(item.get("latitude")),
                    "longitude": self._parse_coordinate(item.get("longitude")),
                    "metadata": {
                        "org_name": item.get("org_name"),
                        "url": item.get("url"),
                        "tech_email": item.get("tech_email"),
                        "tech_phone": item.get("tech_phone"),
                        "network_count": len(item.get("net_set", [])),
                        "created": item.get("created"),
                        "updated": item.get("updated"),
                    },
                    "reference_date": datetime.utcnow().isoformat(),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data


class PeeringDBNetworkCollector(PeeringDBCollectorBase):
    name = "peeringdb_network"
    priority = "P2"
    module = "L2"
    frequency_hours = 48
    data_type = "network"
    base_url = "https://www.peeringdb.com/api/net"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse PeeringDB Network API response"""
        data = []
        networks = response.get("data", response.get("networks", []))

        for item in networks:
            try:
                entry = {
                    "source_id": f"peeringdb_net_{item.get('id', '')}",
                    "name": item.get("name", "Unknown"),
                    "country": item.get("country", "Unknown"),
                    "city": item.get("city", ""),
                    "latitude": self._parse_coordinate(item.get("latitude")),
                    "longitude": self._parse_coordinate(item.get("longitude")),
                    "metadata": {
                        "asn": item.get("asn"),
                        "irr_as_set": item.get("irr_as_set"),
                        "url": item.get("url"),
                        "info_type": item.get("info_type"),
                        "info_traffic": item.get("info_traffic"),
                        "info_ratio": item.get("info_ratio"),
                        "ix_count": len(item.get("ix_set", [])),
                        "created": item.get("created"),
                        "updated": item.get("updated"),
                    },
                    "reference_date": datetime.utcnow().isoformat(),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data


class PeeringDBFacilityCollector(PeeringDBCollectorBase):
    name = "peeringdb_facility"
    priority = "P2"
    module = "L2"
    frequency_hours = 48
    data_type = "facility"
    base_url = "https://www.peeringdb.com/api/fac"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse PeeringDB Facility API response"""
        data = []
        facilities = response.get("data", response.get("facilities", []))

        for item in facilities:
            try:
                entry = {
                    "source_id": f"peeringdb_fac_{item.get('id', '')}",
                    "name": item.get("name", "Unknown"),
                    "country": item.get("country", "Unknown"),
                    "city": item.get("city", ""),
                    "latitude": self._parse_coordinate(item.get("latitude")),
                    "longitude": self._parse_coordinate(item.get("longitude")),
                    "metadata": {
                        "org_name": item.get("org_name"),
                        "address": item.get("address"),
                        "url": item.get("url"),
                        "rack_count": item.get("rack_count"),
                        "power": item.get("power"),
                        "network_count": len(item.get("net_set", [])),
                        "created": item.get("created"),
                        "updated": item.get("updated"),
                    },
                    "reference_date": datetime.utcnow().isoformat(),
                }
                data.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        return data
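
The retry policy is plain exponential backoff: the wait before retrying attempt n is base_delay * 2**n. With the defaults (base_delay=2.0, max_retries=3) that is at most three waits totalling 14 seconds:

    delays = [2.0 * (2**attempt) for attempt in range(3)]
    assert delays == [2.0, 4.0, 8.0]
    assert sum(delays) == 14.0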
backend/app/services/collectors/registry.py (new file)
@@ -0,0 +1,43 @@
"""Collector registry for managing all data collectors"""

from typing import Dict, Optional
from app.services.collectors.base import BaseCollector


class CollectorRegistry:
    """Registry for all data collectors"""

    _collectors: Dict[str, BaseCollector] = {}
    _active_collectors: set = set()

    @classmethod
    def register(cls, collector: BaseCollector):
        """Register a collector"""
        cls._collectors[collector.name] = collector
        cls._active_collectors.add(collector.name)

    @classmethod
    def get(cls, name: str) -> Optional[BaseCollector]:
        """Get a collector by name"""
        return cls._collectors.get(name)

    @classmethod
    def all(cls) -> Dict[str, BaseCollector]:
        """Get all collectors"""
        return cls._collectors.copy()

    @classmethod
    def is_active(cls, name: str) -> bool:
        """Check if a collector is active"""
        return name in cls._active_collectors

    @classmethod
    def set_active(cls, name: str, active: bool = True):
        """Set collector active status"""
        if active:
            cls._active_collectors.add(name)
        else:
            cls._active_collectors.discard(name)


collector_registry = CollectorRegistry()
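
Because BaseCollector.run() consults is_active() before doing any work, a collector can be switched off at runtime without touching the scheduler. A short sketch:

    from app.services.collectors import collector_registry

    collector_registry.set_active("huggingface_spaces", False)
    assert not collector_registry.is_active("huggingface_spaces")
    # run() now returns {"status": "skipped", "reason": "Collector is disabled"}
    collector_registry.set_active("huggingface_spaces", True)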
backend/app/services/collectors/telegeography.py (new file)
@@ -0,0 +1,286 @@
"""TeleGeography Submarine Cables Collector

Collects data from the TeleGeography submarine cable database.
Uses the Wayback Machine as a backup data source, since live data requires JavaScript rendering.
"""

import json
import re
from typing import Dict, Any, List
from datetime import datetime

from bs4 import BeautifulSoup
import httpx

from app.services.collectors.base import BaseCollector


class TeleGeographyCableCollector(BaseCollector):
    name = "telegeography_cables"
    priority = "P1"
    module = "L2"
    frequency_hours = 168  # 7 days
    data_type = "submarine_cable"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch submarine cable data from the Wayback Machine"""
        # Try multiple data sources
        sources = [
            # Wayback Machine archive of TeleGeography
            "https://web.archive.org/web/2024/https://www.submarinecablemap.com/api/v3/cable",
            # Alternative: try scraping the page
            "https://www.submarinecablemap.com",
        ]

        for url in sources:
            try:
                async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
                    response = await client.get(url)
                    response.raise_for_status()

                    # Check if the response is JSON
                    content_type = response.headers.get("content-type", "")
                    if "application/json" in content_type or url.endswith(".json"):
                        return self.parse_response(response.json())
                    else:
                        # It's HTML, try to scrape
                        data = self.scrape_cables_from_html(response.text)
                        if data:
                            return data
            except Exception:
                # Source unavailable - try the next one
                continue

        # Fallback to sample data
        return self._get_sample_data()

    def scrape_cables_from_html(self, html: str) -> List[Dict[str, Any]]:
        """Try to extract cable data from an HTML page"""
        data = []
        soup = BeautifulSoup(html, "html.parser")

        # Look for embedded JSON data in scripts
        scripts = soup.find_all("script")
        for script in scripts:
            text = script.string or ""
            if "cable" in text.lower() and ("{" in text or "[" in text):
                # Try to find JSON data
                match = re.search(r"\[.+\]", text, re.DOTALL)
                if match:
                    try:
                        potential_data = json.loads(match.group())
                        if isinstance(potential_data, list):
                            return potential_data
                    except ValueError:
                        # Not valid JSON - keep looking
                        pass

        return data

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Parse submarine cable data"""
        result = []

        if not isinstance(data, list):
            data = [data]

        for item in data:
            try:
                entry = {
                    "source_id": f"telegeo_cable_{item.get('id', item.get('cable_id', ''))}",
                    "name": item.get("name", item.get("cable_name", "Unknown")),
                    "country": "",
                    "city": "",
                    "latitude": "",
                    "longitude": "",
                    "value": str(item.get("length", item.get("length_km", 0))),
                    "unit": "km",
                    "metadata": {
                        "owner": item.get("owner"),
                        "operator": item.get("operator"),
                        "length_km": item.get("length", item.get("length_km")),
                        "rfs": item.get("rfs"),
                        "status": item.get("status", "active"),
                        "cable_type": item.get("type", "fiber optic"),
                        "capacity_tbps": item.get("capacity"),
                        "url": item.get("url"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                result.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        if not result:
            result = self._get_sample_data()

        return result

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample submarine cable data"""
        return [
            {
                "source_id": "telegeo_sample_1",
                "name": "2Africa",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "45000",
                "unit": "km",
                "metadata": {
                    "note": "Sample data - TeleGeography requires browser/scraper for live data",
                    "owner": "Meta, Orange, Vodafone, etc.",
                    "status": "active",
                },
                "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
            },
            {
                "source_id": "telegeo_sample_2",
                "name": "Asia Connect Cable 1",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "12000",
                "unit": "km",
                "metadata": {
                    "note": "Sample data",
                    "owner": "Alibaba, NEC",
                    "status": "planned",
                },
                "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
            },
        ]


class TeleGeographyLandingPointCollector(BaseCollector):
    name = "telegeography_landing"
    priority = "P2"
    module = "L2"
    frequency_hours = 168
    data_type = "landing_point"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch landing point data from a GitHub mirror"""
        url = "https://raw.githubusercontent.com/lintaojlu/submarine_cable_information/main/landing_point.json"

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.json())

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Parse landing point data"""
        result = []

        for item in data:
            try:
                entry = {
                    "source_id": f"telegeo_lp_{item.get('id', '')}",
                    "name": item.get("name", "Unknown"),
                    "country": item.get("country", "Unknown"),
                    "city": item.get("city", item.get("name", "")),
                    "latitude": str(item.get("latitude", "")),
                    "longitude": str(item.get("longitude", "")),
                    "value": "",
                    "unit": "",
                    "metadata": {
                        "cable_count": len(item.get("cables", [])),
                        "url": item.get("url"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                result.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        if not result:
            result = self._get_sample_data()

        return result

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample landing point data"""
        return [
            {
                "source_id": "telegeo_lp_sample_1",
                "name": "Sample Landing Point",
                "country": "United States",
                "city": "Los Angeles, CA",
                "latitude": "34.0522",
                "longitude": "-118.2437",
                "value": "",
                "unit": "",
                "metadata": {"note": "Sample data"},
                "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
            },
        ]


class TeleGeographyCableSystemCollector(BaseCollector):
    name = "telegeography_systems"
    priority = "P2"
    module = "L2"
    frequency_hours = 168
    data_type = "cable_system"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch cable system data"""
        url = "https://raw.githubusercontent.com/lintaojlu/submarine_cable_information/main/cable.json"

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.json())

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Parse cable system data"""
        result = []

        for item in data:
            try:
                entry = {
                    "source_id": f"telegeo_sys_{item.get('id', item.get('cable_id', ''))}",
                    "name": item.get("name", item.get("cable_name", "Unknown")),
                    "country": "",
                    "city": "",
                    "latitude": "",
                    "longitude": "",
                    "value": str(item.get("length", 0)),
                    "unit": "km",
                    "metadata": {
                        "owner": item.get("owner"),
                        "operator": item.get("operator"),
                        "route": item.get("route"),
                        "countries": item.get("countries", []),
                        "length_km": item.get("length"),
                        "rfs": item.get("rfs"),
                        "status": item.get("status", "active"),
                        "investment": item.get("investment"),
                        "url": item.get("url"),
                    },
                    "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
                }
                result.append(entry)
            except (ValueError, TypeError, KeyError):
                continue

        if not result:
            result = self._get_sample_data()

        return result

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample cable system data"""
        return [
            {
                "source_id": "telegeo_sys_sample_1",
                "name": "Sample Cable System",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "5000",
                "unit": "km",
                "metadata": {"note": "Sample data"},
                "reference_date": datetime.utcnow().strftime("%Y-%m-%d"),
            },
        ]
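
scrape_cables_from_html returns the first JSON array found inside any script tag that mentions "cable"; a sketch against a fabricated page:

    from app.services.collectors.telegeography import TeleGeographyCableCollector

    html = '''
    <html><script>
    var cables = [{"id": "c1", "name": "Example Cable", "length": 5000}];
    </script></html>
    '''
    cables = TeleGeographyCableCollector().scrape_cables_from_html(html)
    assert cables[0]["name"] == "Example Cable"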
backend/app/services/collectors/top500.py (new file)
@@ -0,0 +1,230 @@
"""TOP500 Supercomputer Collector

Collects data from TOP500 supercomputer rankings.
https://top500.org/lists/top500/
"""

import re
from typing import Dict, Any, List
from datetime import datetime

from bs4 import BeautifulSoup
import httpx

from app.services.collectors.base import BaseCollector


class TOP500Collector(BaseCollector):
    name = "top500"
    priority = "P0"
    module = "L1"
    frequency_hours = 4
    data_type = "supercomputer"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch TOP500 data from website (scraping)"""
        # The list page (hardcoded to the November 2025 edition)
        url = "https://top500.org/lists/top500/list/2025/11/"

        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.text)

    def parse_response(self, html: str) -> List[Dict[str, Any]]:
        """Parse TOP500 HTML response"""
        data = []
        soup = BeautifulSoup(html, "html.parser")

        # Find the table with TOP500 data
        table = soup.find("table", {"class": "top500-table"})
        if not table:
            # Try alternative table selector
            table = soup.find("table", {"id": "top500"})

        if not table:
            # Try to find any table with rank data
            tables = soup.find_all("table")
            for t in tables:
                if t.find(string=re.compile(r"Rank.*System.*Cores.*Rmax", re.I)):
                    table = t
                    break

        if not table:
            # Fallback: try to extract data from any table
            tables = soup.find_all("table")
            if tables:
                table = tables[0]

        if table:
            rows = table.find_all("tr")
            for row in rows[1:]:  # Skip header row
                cells = row.find_all(["td", "th"])
                if len(cells) >= 6:
                    try:
                        # Parse the row data
                        rank_text = cells[0].get_text(strip=True)
                        if not rank_text or not rank_text.isdigit():
                            continue

                        rank = int(rank_text)

                        # System name (may contain a link)
                        system_cell = cells[1]
                        system_name = system_cell.get_text(strip=True)
                        # Try to get the full name from the link title
                        link = system_cell.find("a")
                        if link and link.get("title"):
                            system_name = link.get("title")

                        # Country
                        country_cell = cells[2]
                        country = country_cell.get_text(strip=True)
                        # Try to get the country from an image alt attribute
                        img = country_cell.find("img")
                        if img and img.get("alt"):
                            country = img.get("alt")

                        # Extract location (city)
                        city = ""
                        location_text = country_cell.get_text(strip=True)
                        if "(" in location_text and ")" in location_text:
                            city = location_text.split("(")[0].strip()

                        # Cores
                        cores = cells[3].get_text(strip=True).replace(",", "")

                        # Rmax
                        rmax_text = cells[4].get_text(strip=True)
                        rmax = self._parse_performance(rmax_text)

                        # Rpeak
                        rpeak_text = cells[5].get_text(strip=True)
                        rpeak = self._parse_performance(rpeak_text)

                        # Power (optional)
                        power = ""
                        if len(cells) >= 7:
                            power = cells[6].get_text(strip=True)

                        entry = {
                            "source_id": f"top500_{rank}",
                            "name": system_name,
                            "country": country,
                            "city": city,
                            "latitude": 0.0,
                            "longitude": 0.0,
                            "value": str(rmax),
                            "unit": "PFlop/s",
                            "metadata": {
                                "rank": rank,
                                "r_peak": rpeak,
                                "power": power,
                                "cores": cores,
                            },
                            "reference_date": "2025-11-01",
                        }
                        data.append(entry)
                    except (ValueError, IndexError, AttributeError):
                        continue

        # If scraping failed, return sample data for testing
        if not data:
            data = self._get_sample_data()

        return data

    def _parse_coordinate(self, value: Any) -> float:
        """Parse coordinate value"""
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return 0.0
        return 0.0

    def _parse_performance(self, text: str) -> float:
        """Parse performance value from text (handles E, P, T suffixes)"""
        # Strip thousands separators first; TOP500 renders Rmax/Rpeak like "1,742.00"
        text = text.strip().upper().replace(",", "")
        multipliers = {
            "E": 1e18,
            "P": 1e15,
            "T": 1e12,
            "G": 1e9,
            "M": 1e6,
            "K": 1e3,
        }

        match = re.match(r"([\d.]+)\s*([EPTGMK])?F?LOP/?S?", text)
        if match:
            value = float(match.group(1))
            suffix = match.group(2)
            if suffix:
                # A magnitude suffix converts the value to raw FLOP/s;
                # unsuffixed values pass through in the table's own unit.
                value *= multipliers.get(suffix, 1)
            return value

        # Try simple float parsing
        try:
            return float(text)
        except ValueError:
            return 0.0

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return sample data for testing when scraping fails"""
        return [
            {
                "source_id": "top500_1",
                "name": "El Capitan - HPE Cray EX255a, AMD 4th Gen EPYC 24C 1.8GHz, AMD Instinct MI300A",
                "country": "United States",
                "city": "Livermore, CA",
                "latitude": 37.6819,
                "longitude": -121.7681,
                "value": "1742.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 1,
                    "r_peak": 2746.38,
                    "power": 29581,
                    "cores": 11039616,
                    "manufacturer": "HPE",
                },
                "reference_date": "2025-11-01",
            },
            {
                "source_id": "top500_2",
                "name": "Frontier - HPE Cray EX235a, AMD Optimized 3rd Generation EPYC 64C 2GHz, AMD Instinct MI250X",
                "country": "United States",
                "city": "Oak Ridge, TN",
                "latitude": 36.0107,
                "longitude": -84.2663,
                "value": "1353.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 2,
                    "r_peak": 2055.72,
                    "power": 24607,
                    "cores": 9066176,
                    "manufacturer": "HPE",
                },
                "reference_date": "2025-11-01",
            },
            {
                "source_id": "top500_3",
                "name": "Aurora - HPE Cray EX - Intel Exascale Compute Blade, Xeon CPU Max 9470 52C 2.4GHz, Intel Data Center GPU Max",
                "country": "United States",
                "city": "Argonne, IL",
                "latitude": 41.3784,
                "longitude": -87.8600,
                "value": "1012.00",
                "unit": "PFlop/s",
                "metadata": {
                    "rank": 3,
                    "r_peak": 1980.01,
                    "power": 38698,
                    "cores": 9264128,
                    "manufacturer": "Intel",
                },
                "reference_date": "2025-11-01",
            },
        ]
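
_parse_performance strips thousands separators, passes plain numbers through in the table's own unit, and converts suffixed magnitudes to raw FLOP/s; expected values under that logic:

    from app.services.collectors.top500 import TOP500Collector

    c = TOP500Collector()
    assert c._parse_performance("1,742.00") == 1742.0         # Rmax column, PFlop/s
    assert c._parse_performance("1.742 EFlop/s") == 1.742e18  # suffix -> raw FLOP/s
    assert c._parse_performance("n/a") == 0.0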
backend/app/services/scheduler.py (new file)
@@ -0,0 +1,146 @@
"""Task Scheduler for running collection jobs"""

import asyncio
import logging
from typing import Dict, Any

from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

from app.db.session import async_session_factory
from app.services.collectors.registry import collector_registry

logger = logging.getLogger(__name__)

scheduler = AsyncIOScheduler()


COLLECTOR_TO_ID = {
    "top500": 1,
    "epoch_ai_gpu": 2,
    "huggingface_models": 3,
    "huggingface_datasets": 4,
    "huggingface_spaces": 5,
    "peeringdb_ixp": 6,
    "peeringdb_network": 7,
    "peeringdb_facility": 8,
    "telegeography_cables": 9,
    "telegeography_landing": 10,
    "telegeography_systems": 11,
    # The Cloudflare collectors registered in collectors/__init__.py need
    # entries here too, or they fall back to datasource_id 1. The IDs below
    # assume datasources are seeded in registration order.
    "cloudflare_radar_device": 12,
    "cloudflare_radar_traffic": 13,
    "cloudflare_radar_top_as": 14,
}


async def run_collector_task(collector_name: str):
    """Run a single collector task"""
    collector = collector_registry.get(collector_name)
    if not collector:
        logger.error(f"Collector not found: {collector_name}")
        return

    # Get the correct datasource_id
    datasource_id = COLLECTOR_TO_ID.get(collector_name, 1)

    async with async_session_factory() as db:
        try:
            # Set the datasource_id on the collector instance
            collector._datasource_id = datasource_id

            logger.info(f"Running collector: {collector_name} (datasource_id={datasource_id})")
            result = await collector.run(db)
            logger.info(f"Collector {collector_name} completed: {result}")
        except Exception as e:
            logger.error(f"Collector {collector_name} failed: {e}")


def start_scheduler():
    """Start the scheduler with all registered collectors"""
    collectors = collector_registry.all()

    for name, collector in collectors.items():
        if collector_registry.is_active(name):
            scheduler.add_job(
                run_collector_task,
                trigger=IntervalTrigger(hours=collector.frequency_hours),
                id=name,
                name=name,
                replace_existing=True,
                kwargs={"collector_name": name},
            )
            logger.info(f"Scheduled collector: {name} (every {collector.frequency_hours}h)")

    scheduler.start()
    logger.info("Scheduler started")


def stop_scheduler():
    """Stop the scheduler"""
    scheduler.shutdown()
    logger.info("Scheduler stopped")


def get_scheduler_jobs() -> list[Dict[str, Any]]:
    """Get all scheduled jobs"""
    jobs = []
    for job in scheduler.get_jobs():
        jobs.append(
            {
                "id": job.id,
                "name": job.name,
                "next_run_time": job.next_run_time.isoformat() if job.next_run_time else None,
                "trigger": str(job.trigger),
            }
        )
    return jobs


def add_job(collector_name: str, hours: int = 4):
    """Add a new scheduled job"""
    collector = collector_registry.get(collector_name)
    if not collector:
        raise ValueError(f"Collector not found: {collector_name}")

    scheduler.add_job(
        run_collector_task,
        trigger=IntervalTrigger(hours=hours),
        id=collector_name,
        name=collector_name,
        replace_existing=True,
        kwargs={"collector_name": collector_name},
    )
    logger.info(f"Added scheduled job: {collector_name} (every {hours}h)")


def remove_job(collector_name: str):
    """Remove a scheduled job"""
    scheduler.remove_job(collector_name)
    logger.info(f"Removed scheduled job: {collector_name}")


def pause_job(collector_name: str):
    """Pause a scheduled job"""
    scheduler.pause_job(collector_name)
    logger.info(f"Paused job: {collector_name}")


def resume_job(collector_name: str):
    """Resume a scheduled job"""
    scheduler.resume_job(collector_name)
    logger.info(f"Resumed job: {collector_name}")


def run_collector_now(collector_name: str) -> bool:
    """Run a collector immediately (not scheduled)"""
    collector = collector_registry.get(collector_name)
    if not collector:
        logger.error(f"Collector not found: {collector_name}")
        return False

    try:
        asyncio.create_task(run_collector_task(collector_name))
        logger.info(f"Triggered collector: {collector_name}")
        return True
    except Exception as e:
        logger.error(f"Failed to trigger collector {collector_name}: {e}")
        return False
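
A sketch of wiring the scheduler into the application lifecycle, assuming a FastAPI entry point (app/main.py is not part of this commit):

    from contextlib import asynccontextmanager

    from fastapi import FastAPI

    from app.services.scheduler import start_scheduler, stop_scheduler

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        start_scheduler()  # schedules every active collector on its interval
        yield
        stop_scheduler()

    app = FastAPI(lifespan=lifespan)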