first commit
This commit is contained in:
136
backend/app/services/collectors/huggingface.py
Normal file
136
backend/app/services/collectors/huggingface.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""Hugging Face Model Ecosystem Collector
|
||||
|
||||
Collects data from Hugging Face model hub.
|
||||
https://huggingface.co/models
|
||||
https://huggingface.co/datasets
|
||||
https://huggingface.co/spaces
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
from typing import Any, Dict, List

from app.services.collectors.base import HTTPCollector
|
||||
|
||||
|
||||
class HuggingFaceModelCollector(HTTPCollector):
    """Collector for the Hugging Face models listing endpoint.

    The class attributes below are configuration read by the collector
    framework (presumably ``HTTPCollector`` -- confirm against the base
    class); this class itself only implements response parsing.
    """

    name = "huggingface_models"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "model"
    base_url = "https://huggingface.co/api/models"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face models API response.

        Args:
            response: Decoded JSON payload. Either a bare list of model
                records, or a dict wrapping that list under ``"models"``
                (falling back to ``"items"`` when ``"models"`` is absent).
                Any other payload shape is treated as "no records".

        Returns:
            A list of normalized entries built from at most the first 100
            records. Each entry has ``source_id``, ``name``, ``description``
            (truncated to 500 characters), ``metadata`` (with ``tags``
            truncated to 10) and ``reference_date`` (UTC, ``YYYY-MM-DD``).
            Records that are not dicts, or whose fields cannot be sliced,
            are skipped rather than aborting the whole parse.
        """
        if isinstance(response, list):
            models = response
        elif isinstance(response, dict):
            models = response.get("models", response.get("items", []))
        else:
            # None / str / number payloads have no records to offer.
            models = []
        if not isinstance(models, (list, tuple)):
            # e.g. {"models": null}: slicing None below would raise.
            models = []

        # Computed once so every entry of a batch carries the same date.
        # datetime.utcnow() is deprecated since Python 3.12; the aware
        # equivalent renders the identical UTC date string.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in models[:100]:
            if not isinstance(item, dict):
                # .get() on a non-dict raises AttributeError, which the
                # handler below does not cover; skip the record instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_model_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "pipeline_tag": item.get("pipeline_tag"),
                        "library_name": item.get("library_name"),
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Unsliceable description/tags values: drop this record only.
                continue
            data.append(entry)

        return data
|
||||
|
||||
|
||||
class HuggingFaceDatasetCollector(HTTPCollector):
    """Collector for the Hugging Face datasets listing endpoint.

    The class attributes below are configuration read by the collector
    framework (presumably ``HTTPCollector`` -- confirm against the base
    class); this class itself only implements response parsing.
    """

    name = "huggingface_datasets"
    priority = "P1"
    module = "L2"
    frequency_hours = 12
    data_type = "dataset"
    base_url = "https://huggingface.co/api/datasets"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face datasets API response.

        Args:
            response: Decoded JSON payload. Either a bare list of dataset
                records, or a dict wrapping that list under ``"datasets"``
                (falling back to ``"items"`` when ``"datasets"`` is absent).
                Any other payload shape is treated as "no records".

        Returns:
            A list of normalized entries built from at most the first 100
            records. Each entry has ``source_id``, ``name``, ``description``
            (truncated to 500 characters), ``metadata`` (with ``tags``
            truncated to 10) and ``reference_date`` (UTC, ``YYYY-MM-DD``).
            Records that are not dicts, or whose fields cannot be sliced,
            are skipped rather than aborting the whole parse.
        """
        if isinstance(response, list):
            datasets = response
        elif isinstance(response, dict):
            datasets = response.get("datasets", response.get("items", []))
        else:
            # None / str / number payloads have no records to offer.
            datasets = []
        if not isinstance(datasets, (list, tuple)):
            # e.g. {"datasets": null}: slicing None below would raise.
            datasets = []

        # Computed once so every entry of a batch carries the same date.
        # datetime.utcnow() is deprecated since Python 3.12; the aware
        # equivalent renders the identical UTC date string.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in datasets[:100]:
            if not isinstance(item, dict):
                # .get() on a non-dict raises AttributeError, which the
                # handler below does not cover; skip the record instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_dataset_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "downloads": item.get("downloads"),
                        "size": item.get("size"),
                        "language": item.get("language"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Unsliceable description/tags values: drop this record only.
                continue
            data.append(entry)

        return data
|
||||
|
||||
|
||||
class HuggingFaceSpacesCollector(HTTPCollector):
    """Collector for the Hugging Face Spaces listing endpoint.

    The class attributes below are configuration read by the collector
    framework (presumably ``HTTPCollector`` -- confirm against the base
    class); this class itself only implements response parsing.
    """

    name = "huggingface_spaces"
    priority = "P2"
    module = "L2"
    frequency_hours = 24
    data_type = "space"
    base_url = "https://huggingface.co/api/spaces"

    def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Parse Hugging Face Spaces API response.

        Args:
            response: Decoded JSON payload. Either a bare list of space
                records, or a dict wrapping that list under ``"spaces"``
                (falling back to ``"items"`` when ``"spaces"`` is absent).
                Any other payload shape is treated as "no records".

        Returns:
            A list of normalized entries built from at most the first 100
            records. Each entry has ``source_id``, ``name``, ``description``
            (truncated to 500 characters), ``metadata`` (with ``tags``
            truncated to 10) and ``reference_date`` (UTC, ``YYYY-MM-DD``).
            Records that are not dicts, or whose fields cannot be sliced,
            are skipped rather than aborting the whole parse.
        """
        if isinstance(response, list):
            spaces = response
        elif isinstance(response, dict):
            spaces = response.get("spaces", response.get("items", []))
        else:
            # None / str / number payloads have no records to offer.
            spaces = []
        if not isinstance(spaces, (list, tuple)):
            # e.g. {"spaces": null}: slicing None below would raise.
            spaces = []

        # Computed once so every entry of a batch carries the same date.
        # datetime.utcnow() is deprecated since Python 3.12; the aware
        # equivalent renders the identical UTC date string.
        reference_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

        data = []
        for item in spaces[:100]:
            if not isinstance(item, dict):
                # .get() on a non-dict raises AttributeError, which the
                # handler below does not cover; skip the record instead.
                continue
            try:
                entry = {
                    "source_id": f"hf_space_{item.get('id', '')}",
                    "name": item.get("id", "Unknown"),
                    "description": (item.get("description", "") or "")[:500],
                    "metadata": {
                        "author": item.get("author"),
                        "likes": item.get("likes"),
                        "views": item.get("views"),
                        "sdk": item.get("sdk"),
                        "hardware": item.get("hardware"),
                        "tags": (item.get("tags", []) or [])[:10],
                        "created_at": item.get("createdAt"),
                    },
                    "reference_date": reference_date,
                }
            except (ValueError, TypeError, KeyError):
                # Unsliceable description/tags values: drop this record only.
                continue
            data.append(entry)

        return data
|
||||
Reference in New Issue
Block a user