Refine data management and collection workflows
This commit is contained in:
57
scripts/backfill_collected_data_metadata.py
Normal file
57
scripts/backfill_collected_data_metadata.py
Normal file
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Backfill legacy collected_data columns into metadata."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
BACKEND_DIR = os.path.join(ROOT_DIR, "backend")
|
||||
|
||||
sys.path.insert(0, ROOT_DIR)
|
||||
sys.path.insert(0, BACKEND_DIR)
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from app.core.collected_data_fields import build_dynamic_metadata
|
||||
from app.models.collected_data import CollectedData
|
||||
|
||||
|
||||
async def main():
    """Backfill legacy collected_data columns into the metadata JSON field.

    Loads every CollectedData row, merges the legacy scalar columns
    (country, city, latitude, longitude, value, unit) into the dynamic
    metadata via build_dynamic_metadata, and persists only rows whose
    metadata actually changed. Prints a summary count when done.

    Fix vs. original: engine.dispose() now runs in a finally block, so
    pooled asyncpg connections are closed even when the query or commit
    raises (the original leaked them on failure).
    """
    database_url = os.environ.get(
        "DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/planet_db"
    )
    engine = create_async_engine(database_url, echo=False)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    updated = 0

    try:
        async with async_session() as session:
            result = await session.execute(select(CollectedData))
            records = result.scalars().all()

            for record in records:
                # Merge legacy column values into the existing metadata dict.
                # NOTE(review): assumes build_dynamic_metadata returns a new
                # dict and does not mutate its first argument — TODO confirm,
                # otherwise the inequality check below can never fire.
                merged_metadata = build_dynamic_metadata(
                    record.extra_data or {},
                    country=record.country,
                    city=record.city,
                    latitude=record.latitude,
                    longitude=record.longitude,
                    value=record.value,
                    unit=record.unit,
                )

                # Only touch rows whose metadata actually changes, keeping the
                # UPDATE set minimal and making re-runs idempotent.
                if merged_metadata != (record.extra_data or {}):
                    record.extra_data = merged_metadata
                    updated += 1

            await session.commit()
    finally:
        # Always release the pool, even on failure, so the event loop can
        # shut down cleanly.
        await engine.dispose()

    print(f"Backfill completed. Updated {updated} collected_data rows.")
|
||||
|
||||
# Script entry point: run the async backfill under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
119
scripts/check_collected_data_column_removal_ready.py
Normal file
119
scripts/check_collected_data_column_removal_ready.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check whether collected_data is ready for strong-coupled column removal."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
BACKEND_DIR = os.path.join(ROOT_DIR, "backend")
|
||||
|
||||
sys.path.insert(0, ROOT_DIR)
|
||||
sys.path.insert(0, BACKEND_DIR)
|
||||
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
# Readiness queries, keyed by check name; each returns a single COUNT(*).
# A "<field>_missing_in_metadata" check counts rows where the legacy column
# still holds a non-empty value that is NOT mirrored into the metadata JSON —
# any non-zero count blocks column removal.
#
# NOTE(review): these queries read the raw DB column "metadata" (JSONB ->>
# operator); the ORM attribute elsewhere is named extra_data — presumably
# mapped to this column, verify against the model.
# NOTE(review): comparisons like `latitude != ''` assume the legacy columns
# are text-typed — confirm against the schema, as this would error on
# numeric columns in PostgreSQL.
CHECKS = {
    "country_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE country IS NOT NULL
          AND country != ''
          AND COALESCE(metadata->>'country', '') = ''
    """,
    "city_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE city IS NOT NULL
          AND city != ''
          AND COALESCE(metadata->>'city', '') = ''
    """,
    "latitude_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE latitude IS NOT NULL
          AND latitude != ''
          AND COALESCE(metadata->>'latitude', '') = ''
    """,
    "longitude_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE longitude IS NOT NULL
          AND longitude != ''
          AND COALESCE(metadata->>'longitude', '') = ''
    """,
    "value_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE value IS NOT NULL
          AND value != ''
          AND COALESCE(metadata->>'value', '') = ''
    """,
    "unit_missing_in_metadata": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE unit IS NOT NULL
          AND unit != ''
          AND COALESCE(metadata->>'unit', '') = ''
    """,
    # Informational: rows where any legacy column still carries a value.
    "rows_with_any_legacy_value": """
        SELECT COUNT(*)
        FROM collected_data
        WHERE COALESCE(country, '') != ''
           OR COALESCE(city, '') != ''
           OR COALESCE(latitude, '') != ''
           OR COALESCE(longitude, '') != ''
           OR COALESCE(value, '') != ''
           OR COALESCE(unit, '') != ''
    """,
    # Informational: table size, for context in the printed report.
    "total_rows": """
        SELECT COUNT(*) FROM collected_data
    """,
}
|
||||
|
||||
|
||||
async def scalar(session: AsyncSession, sql: str) -> int:
    """Execute *sql* and return its single scalar result as an int.

    A NULL (or otherwise falsy) result is reported as 0.
    """
    value = (await session.execute(text(sql))).scalar()
    return int(value or 0)
|
||||
|
||||
|
||||
async def main():
    """Run all readiness checks and report whether legacy columns can be dropped.

    Executes each COUNT query in CHECKS, prints the raw numbers, then declares
    NOT READY if any *_missing_in_metadata check is non-zero (i.e. a legacy
    column still holds values absent from the metadata JSON).

    Fix vs. original: engine.dispose() now runs in a finally block, so pooled
    asyncpg connections are closed even when a check query raises (the
    original leaked them on failure).
    """
    database_url = os.environ.get(
        "DATABASE_URL", "postgresql+asyncpg://postgres:postgres@localhost:5432/planet_db"
    )
    engine = create_async_engine(database_url, echo=False)
    async_session = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    try:
        async with async_session() as session:
            results = {name: await scalar(session, sql) for name, sql in CHECKS.items()}
    finally:
        # Always release the pool, even on failure, so the event loop can
        # shut down cleanly.
        await engine.dispose()

    print("Collected Data Column Removal Readiness")
    print("=" * 44)
    for key, value in results.items():
        print(f"{key}: {value}")

    # Any non-zero *_missing_in_metadata count means data would be lost by
    # dropping that column now.
    blocking_checks = {
        key: value
        for key, value in results.items()
        if key.endswith("_missing_in_metadata") and value > 0
    }

    print("\nConclusion:")
    if blocking_checks:
        print("NOT READY")
        print("The following fields still have legacy column values not mirrored into metadata:")
        for key, value in blocking_checks.items():
            print(f"- {key}: {value}")
    else:
        print("READY FOR COLUMN REMOVAL CHECKPOINT")
        print("All legacy column values are mirrored into metadata.")
        print("You can proceed to the SQL migration after one more functional verification round.")
||||
|
||||
|
||||
# Script entry point: run the async readiness report under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
41
scripts/drop_collected_data_legacy_columns.py
Normal file
41
scripts/drop_collected_data_legacy_columns.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Drop legacy collected_data columns after metadata backfill verification."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[1]
|
||||
BACKEND_DIR = ROOT / "backend"
|
||||
|
||||
for path in (ROOT, BACKEND_DIR):
|
||||
path_str = str(path)
|
||||
if path_str not in sys.path:
|
||||
sys.path.insert(0, path_str)
|
||||
|
||||
from app.db.session import engine # noqa: E402
|
||||
|
||||
|
||||
DROP_SQL = """
|
||||
ALTER TABLE collected_data
|
||||
DROP COLUMN IF EXISTS country,
|
||||
DROP COLUMN IF EXISTS city,
|
||||
DROP COLUMN IF EXISTS latitude,
|
||||
DROP COLUMN IF EXISTS longitude,
|
||||
DROP COLUMN IF EXISTS value,
|
||||
DROP COLUMN IF EXISTS unit;
|
||||
"""
|
||||
|
||||
|
||||
async def main() -> None:
    """Drop the legacy collected_data columns in a single transaction.

    engine.begin() commits on success and rolls back if the DDL raises.

    Fix vs. original: the shared engine is now disposed in a finally block —
    a one-shot script that exits without disposing leaves asyncpg connections
    open and can emit "Event loop is closed" noise at shutdown.
    """
    try:
        async with engine.begin() as conn:
            await conn.execute(text(DROP_SQL))
    finally:
        # Close pooled connections before the event loop exits.
        await engine.dispose()

    # Only reached on success; a failed DROP propagates out of the try above.
    print("Dropped legacy collected_data columns: country, city, latitude, longitude, value, unit.")
|
||||
|
||||
|
||||
# Script entry point: run the async column drop under a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user