"""Collected Data model for storing data from all collectors""" from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, JSON, Index from sqlalchemy.sql import func from app.core.collected_data_fields import get_record_field from app.db.session import Base class CollectedData(Base): """Generic model for storing collected data from all sources""" __tablename__ = "collected_data" id = Column(Integer, primary_key=True, autoincrement=True) snapshot_id = Column(Integer, ForeignKey("data_snapshots.id"), nullable=True, index=True) task_id = Column(Integer, ForeignKey("collection_tasks.id"), nullable=True, index=True) source = Column(String(100), nullable=False, index=True) # e.g., "top500", "huggingface_models" source_id = Column(String(100), index=True) # Original ID from source, e.g., "rank_1" entity_key = Column(String(255), index=True) data_type = Column( String(50), nullable=False, index=True ) # e.g., "supercomputer", "model", "dataset" # Core data fields name = Column(String(500)) title = Column(String(500)) description = Column(Text) # Additional metadata as JSON extra_data = Column( "metadata", JSON, default={} ) # Using 'extra_data' as attribute name but 'metadata' as column name # Timestamps collected_at = Column(DateTime(timezone=True), server_default=func.now(), index=True) reference_date = Column(DateTime(timezone=True)) # Data reference date (e.g., TOP500 list date) # Status is_valid = Column(Integer, default=1) # 1=valid, 0=invalid is_current = Column(Boolean, default=True, index=True) previous_record_id = Column(Integer, ForeignKey("collected_data.id"), nullable=True, index=True) change_type = Column(String(20), nullable=True) change_summary = Column(JSON, default={}) deleted_at = Column(DateTime(timezone=True), nullable=True) # Indexes for common queries __table_args__ = ( Index("idx_collected_data_source_collected", "source", "collected_at"), Index("idx_collected_data_source_type", "source", "data_type"), Index("idx_collected_data_source_source_id", "source", "source_id"), ) def __repr__(self): return f"" def to_dict(self) -> dict: """Convert to dictionary""" return { "id": self.id, "snapshot_id": self.snapshot_id, "task_id": self.task_id, "source": self.source, "source_id": self.source_id, "entity_key": self.entity_key, "data_type": self.data_type, "name": self.name, "title": self.title, "description": self.description, "country": get_record_field(self, "country"), "city": get_record_field(self, "city"), "latitude": get_record_field(self, "latitude"), "longitude": get_record_field(self, "longitude"), "value": get_record_field(self, "value"), "unit": get_record_field(self, "unit"), "metadata": self.extra_data, "collected_at": self.collected_at.isoformat() if self.collected_at is not None else None, "reference_date": self.reference_date.isoformat() if self.reference_date is not None else None, "is_current": self.is_current, "previous_record_id": self.previous_record_id, "change_type": self.change_type, "change_summary": self.change_summary, "deleted_at": self.deleted_at.isoformat() if self.deleted_at is not None else None, }