"""Hugging Face Model Ecosystem Collector Collects data from Hugging Face model hub. https://huggingface.co/models https://huggingface.co/datasets https://huggingface.co/spaces """ from typing import Dict, Any, List from datetime import datetime from app.services.collectors.base import HTTPCollector class HuggingFaceModelCollector(HTTPCollector): name = "huggingface_models" priority = "P1" module = "L2" frequency_hours = 12 data_type = "model" base_url = "https://huggingface.co/api/models" def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: """Parse Hugging Face models API response""" data = [] models = ( response if isinstance(response, list) else response.get("models", response.get("items", [])) ) for item in models[:100]: try: entry = { "source_id": f"hf_model_{item.get('id', '')}", "name": item.get("id", "Unknown"), "description": (item.get("description", "") or "")[:500], "metadata": { "author": item.get("author"), "likes": item.get("likes"), "downloads": item.get("downloads"), "language": item.get("language"), "tags": (item.get("tags", []) or [])[:10], "pipeline_tag": item.get("pipeline_tag"), "library_name": item.get("library_name"), "created_at": item.get("createdAt"), }, "reference_date": datetime.utcnow().strftime("%Y-%m-%d"), } data.append(entry) except (ValueError, TypeError, KeyError): continue return data class HuggingFaceDatasetCollector(HTTPCollector): name = "huggingface_datasets" priority = "P1" module = "L2" frequency_hours = 12 data_type = "dataset" base_url = "https://huggingface.co/api/datasets" def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: """Parse Hugging Face datasets API response""" data = [] datasets = ( response if isinstance(response, list) else response.get("datasets", response.get("items", [])) ) for item in datasets[:100]: try: entry = { "source_id": f"hf_dataset_{item.get('id', '')}", "name": item.get("id", "Unknown"), "description": (item.get("description", "") or "")[:500], "metadata": { "author": item.get("author"), "likes": item.get("likes"), "downloads": item.get("downloads"), "size": item.get("size"), "language": item.get("language"), "tags": (item.get("tags", []) or [])[:10], "created_at": item.get("createdAt"), }, "reference_date": datetime.utcnow().strftime("%Y-%m-%d"), } data.append(entry) except (ValueError, TypeError, KeyError): continue return data class HuggingFaceSpacesCollector(HTTPCollector): name = "huggingface_spaces" priority = "P2" module = "L2" frequency_hours = 24 data_type = "space" base_url = "https://huggingface.co/api/spaces" def parse_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]: """Parse Hugging Face Spaces API response""" data = [] spaces = ( response if isinstance(response, list) else response.get("spaces", response.get("items", [])) ) for item in spaces[:100]: try: entry = { "source_id": f"hf_space_{item.get('id', '')}", "name": item.get("id", "Unknown"), "description": (item.get("description", "") or "")[:500], "metadata": { "author": item.get("author"), "likes": item.get("likes"), "views": item.get("views"), "sdk": item.get("sdk"), "hardware": item.get("hardware"), "tags": (item.get("tags", []) or [])[:10], "created_at": item.get("createdAt"), }, "reference_date": datetime.utcnow().strftime("%Y-%m-%d"), } data.append(entry) except (ValueError, TypeError, KeyError): continue return data