Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -1,6 +1,7 @@
from app.models.user import User
from app.models.gpu_cluster import GPUCluster
from app.models.task import CollectionTask
from app.models.data_snapshot import DataSnapshot
from app.models.datasource import DataSource
from app.models.datasource_config import DataSourceConfig
from app.models.alert import Alert, AlertSeverity, AlertStatus
@@ -10,6 +11,7 @@ __all__ = [
"User",
"GPUCluster",
"CollectionTask",
"DataSnapshot",
"DataSource",
"DataSourceConfig",
"SystemSetting",

View File

@@ -1,8 +1,9 @@
"""Collected Data model for storing data from all collectors"""
from sqlalchemy import Column, DateTime, Integer, String, Text, JSON, Index
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, JSON, Index
from sqlalchemy.sql import func
from app.core.collected_data_fields import get_record_field
from app.db.session import Base
@@ -12,8 +13,11 @@ class CollectedData(Base):
__tablename__ = "collected_data"
id = Column(Integer, primary_key=True, autoincrement=True)
snapshot_id = Column(Integer, ForeignKey("data_snapshots.id"), nullable=True, index=True)
task_id = Column(Integer, ForeignKey("collection_tasks.id"), nullable=True, index=True)
source = Column(String(100), nullable=False, index=True) # e.g., "top500", "huggingface_models"
source_id = Column(String(100), index=True) # Original ID from source, e.g., "rank_1"
entity_key = Column(String(255), index=True)
data_type = Column(
String(50), nullable=False, index=True
) # e.g., "supercomputer", "model", "dataset"
@@ -23,16 +27,6 @@ class CollectedData(Base):
title = Column(String(500))
description = Column(Text)
# Location data (for geo visualization)
country = Column(String(100))
city = Column(String(100))
latitude = Column(String(50))
longitude = Column(String(50))
# Performance metrics
value = Column(String(100)) # Generic value field (Rmax, Rpeak, etc.)
unit = Column(String(20))
# Additional metadata as JSON
extra_data = Column(
"metadata", JSON, default={}
@@ -44,11 +38,17 @@ class CollectedData(Base):
# Status
is_valid = Column(Integer, default=1) # 1=valid, 0=invalid
is_current = Column(Boolean, default=True, index=True)
previous_record_id = Column(Integer, ForeignKey("collected_data.id"), nullable=True, index=True)
change_type = Column(String(20), nullable=True)
change_summary = Column(JSON, default={})
deleted_at = Column(DateTime(timezone=True), nullable=True)
# Indexes for common queries
__table_args__ = (
Index("idx_collected_data_source_collected", "source", "collected_at"),
Index("idx_collected_data_source_type", "source", "data_type"),
Index("idx_collected_data_source_source_id", "source", "source_id"),
)
def __repr__(self):
@@ -58,18 +58,21 @@ class CollectedData(Base):
"""Convert to dictionary"""
return {
"id": self.id,
"snapshot_id": self.snapshot_id,
"task_id": self.task_id,
"source": self.source,
"source_id": self.source_id,
"entity_key": self.entity_key,
"data_type": self.data_type,
"name": self.name,
"title": self.title,
"description": self.description,
"country": self.country,
"city": self.city,
"latitude": self.latitude,
"longitude": self.longitude,
"value": self.value,
"unit": self.unit,
"country": get_record_field(self, "country"),
"city": get_record_field(self, "city"),
"latitude": get_record_field(self, "latitude"),
"longitude": get_record_field(self, "longitude"),
"value": get_record_field(self, "value"),
"unit": get_record_field(self, "unit"),
"metadata": self.extra_data,
"collected_at": self.collected_at.isoformat()
if self.collected_at is not None
@@ -77,4 +80,9 @@ class CollectedData(Base):
"reference_date": self.reference_date.isoformat()
if self.reference_date is not None
else None,
"is_current": self.is_current,
"previous_record_id": self.previous_record_id,
"change_type": self.change_type,
"change_summary": self.change_summary,
"deleted_at": self.deleted_at.isoformat() if self.deleted_at is not None else None,
}

View File

@@ -0,0 +1,26 @@
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, JSON, String
from sqlalchemy.sql import func
from app.db.session import Base
class DataSnapshot(Base):
    """ORM model for a row in the ``data_snapshots`` table.

    Each row carries a source name, optional links to a collection task and
    to a parent snapshot, timing columns, a record count, a status string
    and a free-form JSON summary.
    """

    __tablename__ = "data_snapshots"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Plain indexed integer; no ForeignKey constraint is declared for it here.
    datasource_id = Column(Integer, nullable=False, index=True)
    task_id = Column(
        Integer, ForeignKey("collection_tasks.id"), nullable=True, index=True
    )
    source = Column(String(100), nullable=False, index=True)
    snapshot_key = Column(String(100), nullable=True, index=True)
    reference_date = Column(DateTime(timezone=True), nullable=True)

    # started_at is filled by the database at insert time; completed_at stays
    # NULL until something sets it explicitly.
    started_at = Column(DateTime(timezone=True), server_default=func.now())
    completed_at = Column(DateTime(timezone=True), nullable=True)

    record_count = Column(Integer, default=0)
    status = Column(String(20), nullable=False, default="running")
    is_current = Column(Boolean, default=True, index=True)

    # Self-referential link to another row of this same table.
    parent_snapshot_id = Column(
        Integer, ForeignKey("data_snapshots.id"), nullable=True, index=True
    )

    # Use the ``dict`` callable rather than a ``{}`` literal so every insert
    # gets a fresh dict instead of sharing one mutable module-level object.
    summary = Column(JSON, default=dict)
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    def __repr__(self):
        """Return a short debug representation: id, source and status."""
        return f"<DataSnapshot {self.id}: {self.source}/{self.status}>"

View File

@@ -12,6 +12,7 @@ class CollectionTask(Base):
id = Column(Integer, primary_key=True, autoincrement=True)
datasource_id = Column(Integer, nullable=False, index=True)
status = Column(String(20), nullable=False) # pending, running, success, failed, cancelled
phase = Column(String(30), default="queued")
started_at = Column(DateTime(timezone=True))
completed_at = Column(DateTime(timezone=True))
records_processed = Column(Integer, default=0)