137 lines
4.8 KiB
Python
137 lines
4.8 KiB
Python
"""Hugging Face Model Ecosystem Collector

Collects model, dataset, and Space listings from the Hugging Face Hub.

https://huggingface.co/models
https://huggingface.co/datasets
https://huggingface.co/spaces
"""
|
|
|
|
from datetime import datetime, timezone
from typing import Any, Dict, List

from app.services.collectors.base import HTTPCollector
|
|
|
|
|
|
class HuggingFaceModelCollector(HTTPCollector):
    """Collector that normalizes the Hugging Face models listing payload.

    The class attributes below configure the collector; how they are consumed
    is defined by ``HTTPCollector`` (not visible here -- confirm against base).
    """

    name = "huggingface_models"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "model"
    base_url = "https://huggingface.co/api/models"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face models API response.

        Args:
            response: Decoded payload. Either a bare list of model objects, or
                a dict wrapping that list under ``"models"`` (falling back to
                ``"items"``).

        Returns:
            At most 100 normalized entries, each with ``source_id``, ``name``,
            ``description`` (truncated to 500 characters), ``metadata`` and
            ``reference_date`` (current UTC date, ``YYYY-MM-DD``). Payloads or
            items of an unexpected shape yield no entries instead of raising.
        """
        if isinstance(response, list):
            models = response
        elif isinstance(response, dict):
            models = response.get("models", response.get("items", []))
        else:
            # None or any other unexpected payload: nothing to parse.
            return []

        if not isinstance(models, list):
            # A wrapper key holding null / an object cannot be sliced safely.
            return []

        # Computed once so every entry of a batch carries the same date, and
        # timezone-aware because datetime.utcnow() is deprecated (3.12+).
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in models[:100]:
            if not isinstance(item, dict):
                # item.get() would raise AttributeError, which the handler
                # below does not cover; skip the malformed element instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_model_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "pipeline_tag": item.get("pipeline_tag"),
                        "library_name": item.get("library_name"),
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Best effort: a field of the wrong type (e.g. a non-sliceable
                # description or tags value) drops only this item.
                continue
            data.append(entry)

        return data
|
|
|
|
|
|
class HuggingFaceDatasetCollector(HTTPCollector):
    """Collector that normalizes the Hugging Face datasets listing payload.

    The class attributes below configure the collector; how they are consumed
    is defined by ``HTTPCollector`` (not visible here -- confirm against base).
    """

    name = "huggingface_datasets"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "dataset"
    base_url = "https://huggingface.co/api/datasets"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face datasets API response.

        Args:
            response: Decoded payload. Either a bare list of dataset objects,
                or a dict wrapping that list under ``"datasets"`` (falling back
                to ``"items"``).

        Returns:
            At most 100 normalized entries, each with ``source_id``, ``name``,
            ``description`` (truncated to 500 characters), ``metadata`` and
            ``reference_date`` (current UTC date, ``YYYY-MM-DD``). Payloads or
            items of an unexpected shape yield no entries instead of raising.
        """
        if isinstance(response, list):
            datasets = response
        elif isinstance(response, dict):
            datasets = response.get("datasets", response.get("items", []))
        else:
            # None or any other unexpected payload: nothing to parse.
            return []

        if not isinstance(datasets, list):
            # A wrapper key holding null / an object cannot be sliced safely.
            return []

        # Computed once so every entry of a batch carries the same date, and
        # timezone-aware because datetime.utcnow() is deprecated (3.12+).
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in datasets[:100]:
            if not isinstance(item, dict):
                # item.get() would raise AttributeError, which the handler
                # below does not cover; skip the malformed element instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_dataset_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "size": item.get("size"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Best effort: a field of the wrong type (e.g. a non-sliceable
                # description or tags value) drops only this item.
                continue
            data.append(entry)

        return data
|
|
|
|
|
|
class HuggingFaceSpacesCollector(HTTPCollector):
    """Collector that normalizes the Hugging Face Spaces listing payload.

    The class attributes below configure the collector; how they are consumed
    is defined by ``HTTPCollector`` (not visible here -- confirm against base).
    """

    name = "huggingface_spaces"
    priority = "P2"
    module = "L2"
    frequency_hours = 24
    data_type = "space"
    base_url = "https://huggingface.co/api/spaces"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face Spaces API response.

        Args:
            response: Decoded payload. Either a bare list of Space objects, or
                a dict wrapping that list under ``"spaces"`` (falling back to
                ``"items"``).

        Returns:
            At most 100 normalized entries, each with ``source_id``, ``name``,
            ``description`` (truncated to 500 characters), ``metadata`` and
            ``reference_date`` (current UTC date, ``YYYY-MM-DD``). Payloads or
            items of an unexpected shape yield no entries instead of raising.
        """
        if isinstance(response, list):
            spaces = response
        elif isinstance(response, dict):
            spaces = response.get("spaces", response.get("items", []))
        else:
            # None or any other unexpected payload: nothing to parse.
            return []

        if not isinstance(spaces, list):
            # A wrapper key holding null / an object cannot be sliced safely.
            return []

        # Computed once so every entry of a batch carries the same date, and
        # timezone-aware because datetime.utcnow() is deprecated (3.12+).
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in spaces[:100]:
            if not isinstance(item, dict):
                # item.get() would raise AttributeError, which the handler
                # below does not cover; skip the malformed element instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_space_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "views": item.get("views"),
                        "sdk": item.get("sdk"),
                        "hardware": item.get("hardware"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Best effort: a field of the wrong type (e.g. a non-sliceable
                # description or tags value) drops only this item.
                continue
            data.append(entry)

        return data
|