Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""Backfill legacy collected_data columns into metadata."""
import asyncio
import os
import sys
# Repository root: the parent of the directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
BACKEND_DIR = os.path.join(ROOT_DIR, "backend")
# Prepend both directories to the import search path. BACKEND_DIR is inserted
# last, so it ends up at index 0 and is searched first.
# NOTE(review): the `app.*` imports that follow presumably resolve from
# BACKEND_DIR -- confirm against the repository layout.
sys.path.insert(0, ROOT_DIR)
sys.path.insert(0, BACKEND_DIR)
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
from app.core.collected_data_fields import build_dynamic_metadata
from app.models.collected_data import CollectedData
async def main():
    """Backfill legacy collected_data column values into each row's metadata.

    Connects to the database named by the ``DATABASE_URL`` environment
    variable (falling back to a local PostgreSQL default), loads every
    ``CollectedData`` row, and asks ``build_dynamic_metadata`` to merge the
    legacy column values (country, city, latitude, longitude, value, unit)
    into the row's ``extra_data``. Rows whose merged metadata differs from
    what is currently stored are updated; everything is committed in one
    transaction and the number of changed rows is printed.

    The engine is always disposed, even when the query or commit fails, so
    the connection pool is not leaked on error.
    """
    database_url = os.environ.get(
        "DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/planet_db"
    )
    engine = create_async_engine(database_url, echo=False)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    updated = 0
    try:
        async with async_session() as session:
            result = await session.execute(select(CollectedData))
            # NOTE: .all() materialises the whole table in memory at once.
            records = result.scalars().all()
            for record in records:
                merged_metadata = build_dynamic_metadata(
                    record.extra_data or {},
                    country=record.country,
                    city=record.city,
                    latitude=record.latitude,
                    longitude=record.longitude,
                    value=record.value,
                    unit=record.unit,
                )
                # Only touch rows that actually change, so the reported count
                # reflects rows modified by this run.
                if merged_metadata != (record.extra_data or {}):
                    record.extra_data = merged_metadata
                    updated += 1
            # Single commit after the loop: the backfill is all-or-nothing.
            await session.commit()
    finally:
        # Release pooled connections whether or not the backfill succeeded.
        await engine.dispose()
    print(f"Backfill completed. Updated {updated} collected_data rows.")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""Check whether collected_data is ready for strong-coupled column removal."""
import asyncio
import os
import sys
# Repository root: the parent of the directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
BACKEND_DIR = os.path.join(ROOT_DIR, "backend")
# Prepend both directories to the import search path. BACKEND_DIR is inserted
# last, so it ends up at index 0 and is searched first.
# NOTE(review): no project-local (`app.*`) import is visible in this script,
# so this path setup may be unnecessary here -- confirm before removing.
sys.path.insert(0, ROOT_DIR)
sys.path.insert(0, BACKEND_DIR)
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
from sqlalchemy.orm import sessionmaker
# Readiness checks, keyed by report label. Each value is a raw SQL statement
# that returns a single COUNT(*) over the collected_data table.
#
# The "*_missing_in_metadata" checks count rows where a legacy column holds a
# non-empty value but the same key in the `metadata` JSON column is absent or
# empty; main() treats any non-zero count among them as blocking.
# NOTE(review): the `!= ''` comparisons presumably require the legacy columns
# (including latitude/longitude/value) to be text-typed -- confirm against the
# table definition.
CHECKS = {
"country_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE country IS NOT NULL
AND country != ''
AND COALESCE(metadata->>'country', '') = ''
""",
"city_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE city IS NOT NULL
AND city != ''
AND COALESCE(metadata->>'city', '') = ''
""",
"latitude_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE latitude IS NOT NULL
AND latitude != ''
AND COALESCE(metadata->>'latitude', '') = ''
""",
"longitude_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE longitude IS NOT NULL
AND longitude != ''
AND COALESCE(metadata->>'longitude', '') = ''
""",
"value_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE value IS NOT NULL
AND value != ''
AND COALESCE(metadata->>'value', '') = ''
""",
"unit_missing_in_metadata": """
SELECT COUNT(*)
FROM collected_data
WHERE unit IS NOT NULL
AND unit != ''
AND COALESCE(metadata->>'unit', '') = ''
""",
# Informational only: rows that still carry at least one non-empty legacy value.
"rows_with_any_legacy_value": """
SELECT COUNT(*)
FROM collected_data
WHERE COALESCE(country, '') != ''
OR COALESCE(city, '') != ''
OR COALESCE(latitude, '') != ''
OR COALESCE(longitude, '') != ''
OR COALESCE(value, '') != ''
OR COALESCE(unit, '') != ''
""",
# Informational only: table size, for context when reading the other counts.
"total_rows": """
SELECT COUNT(*) FROM collected_data
""",
}
async def scalar(session: AsyncSession, sql: str) -> int:
    """Run a raw SQL statement and return its scalar result as an int.

    Args:
        session: Open async session used to execute the statement.
        sql: SQL text expected to yield a single value (e.g. ``COUNT(*)``).

    Returns:
        The scalar result converted with ``int()``; ``0`` when the result
        is ``None`` or otherwise falsy.
    """
    outcome = await session.execute(text(sql))
    first_value = outcome.scalar()
    # A missing/NULL (or any falsy) result is reported as zero.
    if not first_value:
        return 0
    return int(first_value)
async def main():
    """Run every readiness check and print a READY / NOT READY report.

    Connects to the database named by the ``DATABASE_URL`` environment
    variable (falling back to a local PostgreSQL default), executes each
    query in ``CHECKS`` through ``scalar``, and prints one ``name: count``
    line per check. Any check whose name ends in ``_missing_in_metadata``
    and whose count is greater than zero is listed as blocking.

    The engine is always disposed, even when a check query fails, so the
    connection pool is not leaked on error.
    """
    database_url = os.environ.get(
        "DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/planet_db"
    )
    engine = create_async_engine(database_url, echo=False)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
    try:
        async with async_session() as session:
            # Checks run sequentially on one session, in CHECKS order.
            results = {name: await scalar(session, sql) for name, sql in CHECKS.items()}
    finally:
        # Release pooled connections whether or not the checks succeeded.
        await engine.dispose()

    print("Collected Data Column Removal Readiness")
    print("=" * 44)
    for key, value in results.items():
        print(f"{key}: {value}")

    # Only the per-field "missing in metadata" counts block the removal;
    # the remaining checks are informational.
    blocking_checks = {
        key: value
        for key, value in results.items()
        if key.endswith("_missing_in_metadata") and value > 0
    }

    print("\nConclusion:")
    if blocking_checks:
        print("NOT READY")
        print("The following fields still have legacy column values not mirrored into metadata:")
        for key, value in blocking_checks.items():
            print(f"- {key}: {value}")
    else:
        print("READY FOR COLUMN REMOVAL CHECKPOINT")
        print("All legacy column values are mirrored into metadata.")
        print("You can proceed to the SQL migration after one more functional verification round.")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,41 @@
"""Drop legacy collected_data columns after metadata backfill verification."""
from __future__ import annotations
import asyncio
import sys
from pathlib import Path
from sqlalchemy import text
# Repository root: one level above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
BACKEND_DIR = ROOT / "backend"

# Make both directories importable, skipping any that are already on sys.path
# so repeated imports of this module do not stack duplicate entries.
for path in (ROOT, BACKEND_DIR):
    path_str = str(path)
    if path_str in sys.path:
        continue
    sys.path.insert(0, path_str)
from app.db.session import engine # noqa: E402
# Single ALTER TABLE statement that removes all six legacy columns from
# collected_data. `IF EXISTS` on every clause lets the statement succeed when
# some or all of the columns are already gone, so re-running it is harmless.
# The data in these columns is discarded by this statement.
DROP_SQL = """
ALTER TABLE collected_data
DROP COLUMN IF EXISTS country,
DROP COLUMN IF EXISTS city,
DROP COLUMN IF EXISTS latitude,
DROP COLUMN IF EXISTS longitude,
DROP COLUMN IF EXISTS value,
DROP COLUMN IF EXISTS unit;
"""
async def main() -> None:
    """Drop the legacy collected_data columns in a single transaction.

    Executes ``DROP_SQL`` inside ``engine.begin()``, which commits when the
    block exits normally and rolls back if the statement raises. The
    confirmation message is printed only after the transaction block has
    exited without error.

    The engine is disposed afterwards, success or failure, so its pooled
    connections are closed before ``asyncio.run`` tears down the event loop.
    """
    try:
        async with engine.begin() as conn:
            await conn.execute(text(DROP_SQL))
    finally:
        # Close pooled connections while the event loop is still running.
        await engine.dispose()
    print("Dropped legacy collected_data columns: country, city, latitude, longitude, value, unit.")


if __name__ == "__main__":
    asyncio.run(main())