89 lines
3.6 KiB
Python
89 lines
3.6 KiB
Python
"""Collected Data model for storing data from all collectors"""
|
|
|
|
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, JSON, Index
|
|
from sqlalchemy.sql import func
|
|
|
|
from app.core.collected_data_fields import get_record_field
|
|
from app.db.session import Base
|
|
|
|
|
|
class CollectedData(Base):
    """Generic model for storing collected data from all sources.

    Rows are mapped to the ``collected_data`` table. Every row records which
    source it came from (``source`` / ``source_id``), what kind of record it is
    (``data_type``), a few descriptive text fields, and a free-form JSON
    payload exposed as ``extra_data``.
    """

    __tablename__ = "collected_data"

    id = Column(Integer, primary_key=True, autoincrement=True)
    snapshot_id = Column(Integer, ForeignKey("data_snapshots.id"), nullable=True, index=True)
    task_id = Column(Integer, ForeignKey("collection_tasks.id"), nullable=True, index=True)
    source = Column(String(100), nullable=False, index=True)  # e.g., "top500", "huggingface_models"
    source_id = Column(String(100), index=True)  # Original ID from source, e.g., "rank_1"
    entity_key = Column(String(255), index=True)
    data_type = Column(
        String(50), nullable=False, index=True
    )  # e.g., "supercomputer", "model", "dataset"

    # Core data fields
    name = Column(String(500))
    title = Column(String(500))
    description = Column(Text)

    # Additional metadata as JSON.
    # The Python attribute is 'extra_data' while the database column is named
    # 'metadata' (presumably because 'metadata' is reserved on declarative
    # classes -- confirm against Base).
    # The default is the callable `dict`, not a literal `{}`: a literal would be
    # one shared mutable object reused for every row that gets the default.
    extra_data = Column("metadata", JSON, default=dict)

    # Timestamps
    collected_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
    reference_date = Column(DateTime(timezone=True))  # Data reference date (e.g., TOP500 list date)

    # Status
    is_valid = Column(Integer, default=1)  # 1=valid, 0=invalid
    is_current = Column(Boolean, default=True, index=True)
    # Self-referential link to another row of this same table.
    previous_record_id = Column(Integer, ForeignKey("collected_data.id"), nullable=True, index=True)
    change_type = Column(String(20), nullable=True)
    # Callable default for the same reason as extra_data: a fresh dict per row.
    change_summary = Column(JSON, default=dict)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    # Indexes for common queries
    __table_args__ = (
        Index("idx_collected_data_source_collected", "source", "collected_at"),
        Index("idx_collected_data_source_type", "source", "data_type"),
        Index("idx_collected_data_source_source_id", "source", "source_id"),
    )

    def __repr__(self) -> str:
        """Return a short debug representation: id, source and data type."""
        return f"<CollectedData {self.id}: {self.source}/{self.data_type}>"

    def to_dict(self) -> dict:
        """Convert to dictionary.

        Returns:
            A dict of the record's fields. ``country``, ``city``, ``latitude``,
            ``longitude``, ``value`` and ``unit`` are not columns of this model;
            they are looked up through ``get_record_field``. The JSON payload
            is returned under the key ``"metadata"``. Datetime fields are
            rendered with ``isoformat()`` or are ``None`` when unset.
            ``is_valid`` is not part of the output.
        """
        return {
            "id": self.id,
            "snapshot_id": self.snapshot_id,
            "task_id": self.task_id,
            "source": self.source,
            "source_id": self.source_id,
            "entity_key": self.entity_key,
            "data_type": self.data_type,
            "name": self.name,
            "title": self.title,
            "description": self.description,
            "country": get_record_field(self, "country"),
            "city": get_record_field(self, "city"),
            "latitude": get_record_field(self, "latitude"),
            "longitude": get_record_field(self, "longitude"),
            "value": get_record_field(self, "value"),
            "unit": get_record_field(self, "unit"),
            "metadata": self.extra_data,
            "collected_at": self.collected_at.isoformat()
            if self.collected_at is not None
            else None,
            "reference_date": self.reference_date.isoformat()
            if self.reference_date is not None
            else None,
            "is_current": self.is_current,
            "previous_record_id": self.previous_record_id,
            "change_type": self.change_type,
            "change_summary": self.change_summary,
            "deleted_at": self.deleted_at.isoformat() if self.deleted_at is not None else None,
        }
|