Files
planet/backend/app/models/collected_data.py
2026-03-25 17:19:10 +08:00

89 lines
3.6 KiB
Python

"""Collected Data model for storing data from all collectors"""
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, JSON, Index
from sqlalchemy.sql import func
from app.core.collected_data_fields import get_record_field
from app.db.session import Base
class CollectedData(Base):
    """Generic model for storing collected data from all sources.

    Each row is one record produced by a collector. Rows can be linked to a
    snapshot and a collection task, and can point at another row of this same
    table through ``previous_record_id``.
    """

    __tablename__ = "collected_data"

    id = Column(Integer, primary_key=True, autoincrement=True)
    snapshot_id = Column(Integer, ForeignKey("data_snapshots.id"), nullable=True, index=True)
    task_id = Column(Integer, ForeignKey("collection_tasks.id"), nullable=True, index=True)
    source = Column(String(100), nullable=False, index=True)  # e.g., "top500", "huggingface_models"
    source_id = Column(String(100), index=True)  # Original ID from source, e.g., "rank_1"
    entity_key = Column(String(255), index=True)
    data_type = Column(
        String(50), nullable=False, index=True
    )  # e.g., "supercomputer", "model", "dataset"

    # Core data fields
    name = Column(String(500))
    title = Column(String(500))
    description = Column(Text)

    # Additional metadata as JSON.
    # The Python attribute is 'extra_data' while the database column is named
    # 'metadata' (presumably because 'metadata' is reserved on SQLAlchemy
    # declarative classes -- confirm against app.db.session.Base).
    # The default is the callable ``dict`` rather than a ``{}`` literal so that
    # every insert gets its own fresh dict instead of sharing one module-level
    # object that could be mutated in place.
    extra_data = Column("metadata", JSON, default=dict)

    # Timestamps
    # collected_at is filled in by the database (func.now()) when not supplied.
    collected_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
    reference_date = Column(DateTime(timezone=True))  # Data reference date (e.g., TOP500 list date)

    # Status
    is_valid = Column(Integer, default=1)  # 1=valid, 0=invalid
    is_current = Column(Boolean, default=True, index=True)
    # Self-referential link to another row of this table.
    previous_record_id = Column(Integer, ForeignKey("collected_data.id"), nullable=True, index=True)
    change_type = Column(String(20), nullable=True)
    # Callable default for the same reason as extra_data: avoid a shared dict.
    change_summary = Column(JSON, default=dict)
    deleted_at = Column(DateTime(timezone=True), nullable=True)

    # Indexes for common queries
    __table_args__ = (
        Index("idx_collected_data_source_collected", "source", "collected_at"),
        Index("idx_collected_data_source_type", "source", "data_type"),
        Index("idx_collected_data_source_source_id", "source", "source_id"),
    )

    def __repr__(self):
        """Return a short debug representation: id, source and data type."""
        return f"<CollectedData {self.id}: {self.source}/{self.data_type}>"

    def to_dict(self) -> dict:
        """Convert the record to a plain dictionary.

        Column values are copied as-is, except that the datetime columns
        (``collected_at``, ``reference_date``, ``deleted_at``) are rendered
        with ``isoformat()`` or left as ``None`` when unset. The ``country``,
        ``city``, ``latitude``, ``longitude``, ``value`` and ``unit`` entries
        are resolved through ``get_record_field``. ``extra_data`` is exposed
        under the ``"metadata"`` key.

        NOTE(review): ``is_valid`` is not part of the output -- confirm
        whether that omission is intentional.
        """
        return {
            "id": self.id,
            "snapshot_id": self.snapshot_id,
            "task_id": self.task_id,
            "source": self.source,
            "source_id": self.source_id,
            "entity_key": self.entity_key,
            "data_type": self.data_type,
            "name": self.name,
            "title": self.title,
            "description": self.description,
            "country": get_record_field(self, "country"),
            "city": get_record_field(self, "city"),
            "latitude": get_record_field(self, "latitude"),
            "longitude": get_record_field(self, "longitude"),
            "value": get_record_field(self, "value"),
            "unit": get_record_field(self, "unit"),
            "metadata": self.extra_data,
            "collected_at": self.collected_at.isoformat()
            if self.collected_at is not None
            else None,
            "reference_date": self.reference_date.isoformat()
            if self.reference_date is not None
            else None,
            "is_current": self.is_current,
            "previous_record_id": self.previous_record_id,
            "change_type": self.change_type,
            "change_summary": self.change_summary,
            "deleted_at": self.deleted_at.isoformat() if self.deleted_at is not None else None,
        }