# Files
# planet/backend/app/services/collectors/huggingface.py
# 2026-03-05 11:46:58 +08:00
#
# 137 lines
# 4.8 KiB
# Python

"""Hugging Face Model Ecosystem Collector

Collects model, dataset, and Space listings from the Hugging Face Hub.

https://huggingface.co/models
https://huggingface.co/datasets
https://huggingface.co/spaces
"""
from datetime import datetime, timezone
from typing import Any, Dict, List

from app.services.collectors.base import HTTPCollector
class HuggingFaceModelCollector(HTTPCollector):
    """Collector for the Hugging Face models listing endpoint.

    Turns the decoded JSON payload fetched from ``base_url`` into flat
    entry dicts carrying ``source_id``, ``name``, ``description``,
    ``metadata`` and ``reference_date``.
    """

    # Collector configuration; presumably read by the HTTPCollector base
    # class / scheduler -- confirm against app.services.collectors.base.
    name = "huggingface_models"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "model"
    base_url = "https://huggingface.co/api/models"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face models API response.

        Args:
            response: Decoded JSON payload. Either a bare list of model
                records, or a dict wrapping that list under ``"models"``
                or, failing that, ``"items"``.

        Returns:
            One entry dict per usable record among the first 100. Records
            that are not dicts, or whose ``description`` / ``tags`` cannot
            be sliced, are skipped. A wrapper key holding ``null`` yields
            an empty list.
        """
        if isinstance(response, list):
            models = response
        else:
            # ``or []`` covers a wrapper key that is present but null, which
            # would otherwise fail on the slice below.
            models = response.get("models", response.get("items", [])) or []

        # Computed once so every entry of a run carries the same date, using
        # an aware UTC clock instead of the deprecated datetime.utcnow().
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in models[:100]:
            # A non-dict record has no .get(); the resulting AttributeError
            # is not one of the exceptions handled below, so skip it here
            # rather than letting one bad record abort the whole parse.
            if not isinstance(item, dict):
                continue
            try:
                # Only these two slices can raise on unexpected field types.
                description = (item.get("description", "") or "")[:500]
                tags = (item.get("tags", []) or [])[:10]
            except (ValueError, TypeError, KeyError):
                continue
            data.append(
                {
                    "source_id": f"hf_model_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": description,
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "language": item.get("language"),
                        "tags": tags,
                        "pipeline_tag": item.get("pipeline_tag"),
                        "library_name": item.get("library_name"),
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            )
        return data
class HuggingFaceDatasetCollector(HTTPCollector):
    """Collector for the Hugging Face datasets listing endpoint.

    Turns the decoded JSON payload fetched from ``base_url`` into flat
    entry dicts carrying ``source_id``, ``name``, ``description``,
    ``metadata`` and ``reference_date``.
    """

    # Collector configuration; presumably read by the HTTPCollector base
    # class / scheduler -- confirm against app.services.collectors.base.
    name = "huggingface_datasets"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "dataset"
    base_url = "https://huggingface.co/api/datasets"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face datasets API response.

        Args:
            response: Decoded JSON payload. Either a bare list of dataset
                records, or a dict wrapping that list under ``"datasets"``
                or, failing that, ``"items"``.

        Returns:
            One entry dict per usable record among the first 100. Records
            that are not dicts, or whose ``description`` / ``tags`` cannot
            be sliced, are skipped. A wrapper key holding ``null`` yields
            an empty list.
        """
        if isinstance(response, list):
            datasets = response
        else:
            # ``or []`` covers a wrapper key that is present but null, which
            # would otherwise fail on the slice below.
            datasets = (
                response.get("datasets", response.get("items", [])) or []
            )

        # Computed once so every entry of a run carries the same date, using
        # an aware UTC clock instead of the deprecated datetime.utcnow().
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in datasets[:100]:
            # A non-dict record has no .get(); the resulting AttributeError
            # is not one of the exceptions handled below, so skip it here
            # rather than letting one bad record abort the whole parse.
            if not isinstance(item, dict):
                continue
            try:
                # Only these two slices can raise on unexpected field types.
                description = (item.get("description", "") or "")[:500]
                tags = (item.get("tags", []) or [])[:10]
            except (ValueError, TypeError, KeyError):
                continue
            data.append(
                {
                    "source_id": f"hf_dataset_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": description,
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "size": item.get("size"),
                        "language": item.get("language"),
                        "tags": tags,
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            )
        return data
class HuggingFaceSpacesCollector(HTTPCollector):
    """Collector for the Hugging Face Spaces listing endpoint.

    Turns the decoded JSON payload fetched from ``base_url`` into flat
    entry dicts carrying ``source_id``, ``name``, ``description``,
    ``metadata`` and ``reference_date``.
    """

    # Collector configuration; presumably read by the HTTPCollector base
    # class / scheduler -- confirm against app.services.collectors.base.
    name = "huggingface_spaces"
    priority = "P2"
    module = "L2"
    frequency_hours = 24
    data_type = "space"
    base_url = "https://huggingface.co/api/spaces"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face Spaces API response.

        Args:
            response: Decoded JSON payload. Either a bare list of Space
                records, or a dict wrapping that list under ``"spaces"``
                or, failing that, ``"items"``.

        Returns:
            One entry dict per usable record among the first 100. Records
            that are not dicts, or whose ``description`` / ``tags`` cannot
            be sliced, are skipped. A wrapper key holding ``null`` yields
            an empty list.
        """
        if isinstance(response, list):
            spaces = response
        else:
            # ``or []`` covers a wrapper key that is present but null, which
            # would otherwise fail on the slice below.
            spaces = response.get("spaces", response.get("items", [])) or []

        # Computed once so every entry of a run carries the same date, using
        # an aware UTC clock instead of the deprecated datetime.utcnow().
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in spaces[:100]:
            # A non-dict record has no .get(); the resulting AttributeError
            # is not one of the exceptions handled below, so skip it here
            # rather than letting one bad record abort the whole parse.
            if not isinstance(item, dict):
                continue
            try:
                # Only these two slices can raise on unexpected field types.
                description = (item.get("description", "") or "")[:500]
                tags = (item.get("tags", []) or [])[:10]
            except (ValueError, TypeError, KeyError):
                continue
            data.append(
                {
                    "source_id": f"hf_space_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": description,
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "views": item.get("views"),
                        "sdk": item.get("sdk"),
                        "hardware": item.get("hardware"),
                        "tags": tags,
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            )
        return data