first commit

This commit is contained in:
rayd1o
2026-03-05 11:46:58 +08:00
commit e7033775d8
20657 changed files with 1988940 additions and 0 deletions

View File

@@ -0,0 +1,286 @@
"""TeleGeography Submarine Cables Collector
Collects data from TeleGeography submarine cable database.
Uses Wayback Machine as backup data source since live data requires JavaScript rendering.
"""
import json
import logging
import re
from datetime import datetime, timezone
from typing import Any, Dict, List

import httpx
from bs4 import BeautifulSoup

from app.services.collectors.base import BaseCollector
class TeleGeographyCableCollector(BaseCollector):
    """Collect submarine cable records from TeleGeography-derived sources.

    ``fetch`` tries each remote source in turn and falls back to built-in
    sample records when none of them yields data. Every record produced by
    this class has the keys ``source_id``, ``name``, ``country``, ``city``,
    ``latitude``, ``longitude``, ``value``, ``unit``, ``metadata`` and
    ``reference_date``; the four location fields are always empty strings.
    """

    name = "telegeography_cables"
    priority = "P1"
    module = "L2"
    frequency_hours = 168  # 7 days
    data_type = "submarine_cable"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Fetch submarine cable data, trying each known source in order.

        Returns:
            Normalized cable records from the first source that yields data,
            or the built-in sample records when every source fails.
        """
        log = logging.getLogger(__name__)
        sources = [
            # Wayback Machine archive of the TeleGeography cable API
            "https://web.archive.org/web/2024/https://www.submarinecablemap.com/api/v3/cable",
            # Alternative: scrape the live page for embedded JSON
            "https://www.submarinecablemap.com",
        ]
        for url in sources:
            try:
                async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
                    response = await client.get(url)
                    response.raise_for_status()
                    content_type = response.headers.get("content-type", "")
                    if "application/json" in content_type or url.endswith(".json"):
                        return self.parse_response(response.json())
                    # Not JSON, so treat the body as HTML and try to scrape it.
                    scraped = self.scrape_cables_from_html(response.text)
                    if scraped:
                        # Scraped objects are raw; normalize them exactly like
                        # the JSON path so callers always get one schema.
                        return self.parse_response(scraped)
            except Exception:
                # Deliberately best-effort: any failure just moves on to the
                # next source, but it is logged instead of silently dropped.
                log.warning("Cable source %s failed", url, exc_info=True)
                continue
        # Every source failed or produced nothing usable.
        return self._get_sample_data()

    def scrape_cables_from_html(self, html: str) -> List[Dict[str, Any]]:
        """Extract a JSON array embedded in a ``<script>`` tag of ``html``.

        Args:
            html: Markup of the page to inspect.

        Returns:
            The first JSON list that can be decoded from a script whose text
            mentions "cable" (case-insensitive); an empty list when no script
            contains one. The list is returned as decoded, not normalized.
        """
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script"):
            text = script.string or ""
            if "cable" not in text.lower():
                continue
            # Greedy on purpose: spans from the first "[" to the last "]".
            match = re.search(r"\[.+\]", text, re.DOTALL)
            if match is None:
                continue
            try:
                candidate = json.loads(match.group())
            except ValueError:
                # json.JSONDecodeError subclasses ValueError; the script
                # simply held no valid JSON array, so try the next one.
                continue
            if isinstance(candidate, list):
                return candidate
        return []

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Normalize raw cable objects into collector records.

        Args:
            data: Decoded JSON. A single non-list value is treated as a
                one-element list; items that are not dicts are skipped.

        Returns:
            One record per dict item, or the sample records when no item
            could be converted.
        """
        if not isinstance(data, list):
            data = [data]
        # Computed once so every record of a batch carries the same date.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        pick = self._first_present
        result = []
        for item in data:
            if not isinstance(item, dict):
                continue
            result.append(
                {
                    "source_id": f"telegeo_cable_{pick(item, 'id', 'cable_id', default='')}",
                    "name": pick(item, "name", "cable_name", default="Unknown"),
                    "country": "",
                    "city": "",
                    "latitude": "",
                    "longitude": "",
                    "value": str(pick(item, "length", "length_km", default=0)),
                    "unit": "km",
                    "metadata": {
                        "owner": item.get("owner"),
                        "operator": item.get("operator"),
                        "length_km": pick(item, "length", "length_km"),
                        "rfs": item.get("rfs"),
                        "status": item.get("status", "active"),
                        "cable_type": item.get("type", "fiber optic"),
                        "capacity_tbps": item.get("capacity"),
                        "url": item.get("url"),
                    },
                    "reference_date": reference_date,
                }
            )
        return result or self._get_sample_data()

    @staticmethod
    def _first_present(item: Dict[str, Any], *keys: str, default: Any = None) -> Any:
        """Return the first non-None value among ``keys``, else ``default``.

        Unlike chained ``dict.get`` calls, a key that is present but holds
        ``None`` (JSON ``null``) does not stop the search.
        """
        for key in keys:
            value = item.get(key)
            if value is not None:
                return value
        return default

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return hard-coded sample cable records dated today (UTC)."""
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        return [
            {
                "source_id": "telegeo_sample_1",
                "name": "2Africa",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "45000",
                "unit": "km",
                "metadata": {
                    "note": "Sample data - TeleGeography requires browser/scraper for live data",
                    "owner": "Meta, Orange, Vodafone, etc.",
                    "status": "active",
                },
                "reference_date": reference_date,
            },
            {
                "source_id": "telegeo_sample_2",
                "name": "Asia Connect Cable 1",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "12000",
                "unit": "km",
                "metadata": {
                    "note": "Sample data",
                    "owner": "Alibaba, NEC",
                    "status": "planned",
                },
                "reference_date": reference_date,
            },
        ]
class TeleGeographyLandingPointCollector(BaseCollector):
    """Collect submarine cable landing points from a GitHub-hosted JSON file."""

    name = "telegeography_landing"
    priority = "P2"
    module = "L2"
    frequency_hours = 168  # 7 days
    data_type = "landing_point"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Download the landing point JSON and normalize it.

        Returns:
            The records produced by ``parse_response``.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
        """
        url = "https://raw.githubusercontent.com/lintaojlu/submarine_cable_information/main/landing_point.json"
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.json())

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Normalize raw landing point objects into collector records.

        Args:
            data: Decoded JSON. A single non-list value is treated as a
                one-element list; items that are not dicts are skipped.

        Returns:
            One record per dict item, or the sample records when no item
            could be converted.
        """
        if not isinstance(data, list):
            data = [data]
        # Computed once so every record of a batch carries the same date.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        pick = self._first_present
        result = []
        for item in data:
            if not isinstance(item, dict):
                continue
            # A missing or null "cables" counts as zero cables instead of
            # discarding the whole landing point.
            cables = item.get("cables") or []
            try:
                cable_count = len(cables)
            except TypeError:
                cable_count = 0
            result.append(
                {
                    "source_id": f"telegeo_lp_{pick(item, 'id', default='')}",
                    "name": pick(item, "name", default="Unknown"),
                    "country": pick(item, "country", default="Unknown"),
                    "city": pick(item, "city", "name", default=""),
                    # Null coordinates become "" rather than the text "None".
                    "latitude": str(pick(item, "latitude", default="")),
                    "longitude": str(pick(item, "longitude", default="")),
                    "value": "",
                    "unit": "",
                    "metadata": {
                        "cable_count": cable_count,
                        "url": item.get("url"),
                    },
                    "reference_date": reference_date,
                }
            )
        return result or self._get_sample_data()

    @staticmethod
    def _first_present(item: Dict[str, Any], *keys: str, default: Any = None) -> Any:
        """Return the first non-None value among ``keys``, else ``default``.

        Unlike chained ``dict.get`` calls, a key that is present but holds
        ``None`` (JSON ``null``) does not stop the search.
        """
        for key in keys:
            value = item.get(key)
            if value is not None:
                return value
        return default

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return one hard-coded sample landing point dated today (UTC)."""
        return [
            {
                "source_id": "telegeo_lp_sample_1",
                "name": "Sample Landing Point",
                "country": "United States",
                "city": "Los Angeles, CA",
                "latitude": "34.0522",
                "longitude": "-118.2437",
                "value": "",
                "unit": "",
                "metadata": {"note": "Sample data"},
                "reference_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            },
        ]
class TeleGeographyCableSystemCollector(BaseCollector):
    """Collect submarine cable system records from a GitHub-hosted JSON file."""

    name = "telegeography_systems"
    priority = "P2"
    module = "L2"
    frequency_hours = 168  # 7 days
    data_type = "cable_system"

    async def fetch(self) -> List[Dict[str, Any]]:
        """Download the cable system JSON and normalize it.

        Returns:
            The records produced by ``parse_response``.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error status.
        """
        url = "https://raw.githubusercontent.com/lintaojlu/submarine_cable_information/main/cable.json"
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url)
            response.raise_for_status()
            return self.parse_response(response.json())

    def parse_response(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Normalize raw cable system objects into collector records.

        Args:
            data: Decoded JSON. A single non-list value is treated as a
                one-element list; items that are not dicts are skipped.

        Returns:
            One record per dict item, or the sample records when no item
            could be converted.
        """
        if not isinstance(data, list):
            data = [data]
        # Computed once so every record of a batch carries the same date.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        pick = self._first_present
        result = []
        for item in data:
            if not isinstance(item, dict):
                continue
            result.append(
                {
                    "source_id": f"telegeo_sys_{pick(item, 'id', 'cable_id', default='')}",
                    "name": pick(item, "name", "cable_name", default="Unknown"),
                    "country": "",
                    "city": "",
                    "latitude": "",
                    "longitude": "",
                    # A null length is reported as "0", not the text "None".
                    "value": str(pick(item, "length", default=0)),
                    "unit": "km",
                    "metadata": {
                        "owner": item.get("owner"),
                        "operator": item.get("operator"),
                        "route": item.get("route"),
                        "countries": item.get("countries", []),
                        "length_km": item.get("length"),
                        "rfs": item.get("rfs"),
                        "status": item.get("status", "active"),
                        "investment": item.get("investment"),
                        "url": item.get("url"),
                    },
                    "reference_date": reference_date,
                }
            )
        return result or self._get_sample_data()

    @staticmethod
    def _first_present(item: Dict[str, Any], *keys: str, default: Any = None) -> Any:
        """Return the first non-None value among ``keys``, else ``default``.

        Unlike chained ``dict.get`` calls, a key that is present but holds
        ``None`` (JSON ``null``) does not stop the search.
        """
        for key in keys:
            value = item.get(key)
            if value is not None:
                return value
        return default

    def _get_sample_data(self) -> List[Dict[str, Any]]:
        """Return one hard-coded sample cable system dated today (UTC)."""
        return [
            {
                "source_id": "telegeo_sys_sample_1",
                "name": "Sample Cable System",
                "country": "",
                "city": "",
                "latitude": "",
                "longitude": "",
                "value": "5000",
                "unit": "km",
                "metadata": {"note": "Sample data"},
                "reference_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            },
        ]