Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -7,6 +7,8 @@ import json
import csv
import io
from app.core.collected_data_fields import get_metadata_field
from app.core.countries import COUNTRY_OPTIONS, get_country_search_variants, normalize_country
from app.db.session import get_db
from app.models.user import User
from app.core.security import get_current_user
@@ -15,8 +17,119 @@ from app.models.collected_data import CollectedData
router = APIRouter()
COUNTRY_SQL = "metadata->>'country'"
SEARCHABLE_SQL = [
"name",
"title",
"description",
"source",
"data_type",
"source_id",
"metadata::text",
]
def parse_multi_values(value: Optional[str]) -> list[str]:
    """Split a comma-separated query value into trimmed, non-empty tokens.

    Returns an empty list for None or an empty string.
    """
    if value is None or value == "":
        return []
    tokens: list[str] = []
    for raw in value.split(","):
        token = raw.strip()
        if token:
            tokens.append(token)
    return tokens
def build_in_condition(field_sql: str, values: list[str], param_prefix: str, params: dict) -> str:
    """Render an SQL ``IN (...)`` clause for *field_sql*.

    One bind parameter per value is registered into *params* (mutated in
    place), named ``{param_prefix}_{i}``; the clause references them as
    ``:{param_prefix}_{i}`` placeholders.
    """
    names: list[str] = []
    for idx, val in enumerate(values):
        bind = f"{param_prefix}_{idx}"
        params[bind] = val
        names.append(f":{bind}")
    joined = ", ".join(names)
    return f"{field_sql} IN ({joined})"
def build_search_condition(search: Optional[str], params: dict) -> Optional[str]:
    """Build an OR'ed ILIKE condition over SEARCHABLE_SQL for a search string.

    Registers fuzzy bind params (``search_N``, one per term variant) plus the
    exact/prefix params consumed by ``build_search_rank_sql`` into *params*
    (mutated in place). Returns None when *search* is missing or blank.
    """
    if not search:
        return None
    normalized = search.strip()
    if not normalized:
        return None
    # Compute country variants once; the original called
    # get_country_search_variants twice and rebuilt the casefolded dedup set
    # on every loop iteration (accidental O(n^2)).
    variants = get_country_search_variants(normalized)
    search_terms = [normalized]
    seen = {normalized.casefold()}
    for variant in variants:
        folded = variant.casefold()
        if folded not in seen:
            seen.add(folded)
            search_terms.append(variant)
    conditions = []
    for index, term in enumerate(search_terms):
        params[f"search_{index}"] = f"%{term}%"
        conditions.extend(f"{field} ILIKE :search_{index}" for field in SEARCHABLE_SQL)
    # Exact/prefix params used by the rank CASE expression.
    params["search_exact"] = normalized
    params["search_prefix"] = f"{normalized}%"
    canonical = variants[0] if variants else None
    params["country_search_exact"] = canonical or normalized
    params["country_search_prefix"] = f"{(canonical or normalized)}%"
    return "(" + " OR ".join(conditions) + ")"
def build_search_rank_sql(search: Optional[str]) -> str:
    """Return an SQL CASE expression ranking how well a row matches *search*.

    Higher scores favor exact/prefix name and title matches; falls back to
    fuzzy matches on the remaining searchable columns. Returns the constant
    "0" when no usable search term is provided.
    """
    has_term = search is not None and search.strip() != ""
    if not has_term:
        return "0"
    return """
        CASE
            WHEN name ILIKE :search_exact THEN 700
            WHEN name ILIKE :search_prefix THEN 600
            WHEN title ILIKE :search_exact THEN 500
            WHEN title ILIKE :search_prefix THEN 400
            WHEN metadata->>'country' ILIKE :country_search_exact THEN 380
            WHEN metadata->>'country' ILIKE :country_search_prefix THEN 340
            WHEN source_id ILIKE :search_exact THEN 350
            WHEN source ILIKE :search_exact THEN 300
            WHEN data_type ILIKE :search_exact THEN 250
            WHEN description ILIKE :search_0 THEN 150
            WHEN metadata::text ILIKE :search_0 THEN 100
            WHEN title ILIKE :search_0 THEN 80
            WHEN name ILIKE :search_0 THEN 60
            WHEN source ILIKE :search_0 THEN 40
            WHEN data_type ILIKE :search_0 THEN 30
            WHEN source_id ILIKE :search_0 THEN 20
            ELSE 0
        END
    """
def serialize_collected_row(row) -> dict:
    """Serialize a collected_data row into the API response dict.

    Expects the first 11 columns in the order: id, source, source_id,
    data_type, name, title, description, metadata, collected_at,
    reference_date, is_valid. Geographic and measurement fields are pulled
    out of the JSON metadata column via get_metadata_field. Key order is
    significant for the JSON response and is preserved.
    """
    meta = row[7]
    collected_at = row[8]
    reference_date = row[9]
    result = {
        "id": row[0],
        "source": row[1],
        "source_id": row[2],
        "data_type": row[3],
        "name": row[4],
        "title": row[5],
        "description": row[6],
    }
    # Location/value fields derived from metadata.
    for field in ("country", "city", "latitude", "longitude", "value", "unit"):
        result[field] = get_metadata_field(meta, field)
    result["metadata"] = meta
    # Supercomputer-specific metrics, also stored in metadata.
    for field in ("cores", "rmax", "rpeak", "power"):
        result[field] = get_metadata_field(meta, field)
    result["collected_at"] = collected_at.isoformat() if collected_at else None
    result["reference_date"] = reference_date.isoformat() if reference_date else None
    result["is_valid"] = row[10]
    return result
@router.get("")
async def list_collected_data(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -27,25 +140,30 @@ async def list_collected_data(
db: AsyncSession = Depends(get_db),
):
"""查询采集的数据列表"""
normalized_country = normalize_country(country) if country else None
source_values = parse_multi_values(source)
data_type_values = parse_multi_values(data_type)
# Build WHERE clause
conditions = []
params = {}
if source:
conditions.append("source = :source")
params["source"] = source
if data_type:
conditions.append("data_type = :data_type")
params["data_type"] = data_type
if country:
conditions.append("country = :country")
params["country"] = country
if search:
conditions.append("(name ILIKE :search OR title ILIKE :search)")
params["search"] = f"%{search}%"
if mode != "history":
conditions.append("COALESCE(is_current, TRUE) = TRUE")
if source_values:
conditions.append(build_in_condition("source", source_values, "source", params))
if data_type_values:
conditions.append(build_in_condition("data_type", data_type_values, "data_type", params))
if normalized_country:
conditions.append(f"{COUNTRY_SQL} = :country")
params["country"] = normalized_country
search_condition = build_search_condition(search, params)
if search_condition:
conditions.append(search_condition)
where_sql = " AND ".join(conditions) if conditions else "1=1"
search_rank_sql = build_search_rank_sql(search)
# Calculate offset
offset = (page - 1) * page_size
@@ -58,11 +176,11 @@ async def list_collected_data(
# Query data
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
metadata, collected_at, reference_date, is_valid,
{search_rank_sql} AS search_rank
FROM collected_data
WHERE {where_sql}
ORDER BY collected_at DESC
ORDER BY search_rank DESC, collected_at DESC
LIMIT :limit OFFSET :offset
""")
params["limit"] = page_size
@@ -73,27 +191,7 @@ async def list_collected_data(
data = []
for row in rows:
data.append(
{
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
)
data.append(serialize_collected_row(row[:11]))
return {
"total": total,
@@ -105,16 +203,19 @@ async def list_collected_data(
@router.get("/summary")
async def get_data_summary(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
"""获取数据汇总统计"""
where_sql = "WHERE COALESCE(is_current, TRUE) = TRUE" if mode != "history" else ""
# By source and data_type
result = await db.execute(
text("""
SELECT source, data_type, COUNT(*) as count
FROM collected_data
""" + where_sql + """
GROUP BY source, data_type
ORDER BY source, data_type
""")
@@ -138,6 +239,7 @@ async def get_data_summary(
text("""
SELECT source, COUNT(*) as count
FROM collected_data
""" + where_sql + """
GROUP BY source
ORDER BY count DESC
""")
@@ -153,6 +255,7 @@ async def get_data_summary(
@router.get("/sources")
async def get_data_sources(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
@@ -160,7 +263,9 @@ async def get_data_sources(
result = await db.execute(
text("""
SELECT DISTINCT source FROM collected_data ORDER BY source
SELECT DISTINCT source FROM collected_data
""" + ("WHERE COALESCE(is_current, TRUE) = TRUE " if mode != "history" else "") + """
ORDER BY source
""")
)
rows = result.fetchall()
@@ -172,6 +277,7 @@ async def get_data_sources(
@router.get("/types")
async def get_data_types(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
@@ -179,7 +285,9 @@ async def get_data_types(
result = await db.execute(
text("""
SELECT DISTINCT data_type FROM collected_data ORDER BY data_type
SELECT DISTINCT data_type FROM collected_data
""" + ("WHERE COALESCE(is_current, TRUE) = TRUE " if mode != "history" else "") + """
ORDER BY data_type
""")
)
rows = result.fetchall()
@@ -196,17 +304,8 @@ async def get_countries(
):
"""获取所有国家列表"""
result = await db.execute(
text("""
SELECT DISTINCT country FROM collected_data
WHERE country IS NOT NULL AND country != ''
ORDER BY country
""")
)
rows = result.fetchall()
return {
"countries": [row[0] for row in rows],
"countries": COUNTRY_OPTIONS,
}
@@ -221,7 +320,6 @@ async def get_collected_data(
result = await db.execute(
text("""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE id = :id
@@ -236,25 +334,7 @@ async def get_collected_data(
detail="数据不存在",
)
return {
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
return serialize_collected_row(row)
def build_where_clause(
@@ -263,19 +343,21 @@ def build_where_clause(
"""Build WHERE clause and params for queries"""
conditions = []
params = {}
source_values = parse_multi_values(source)
data_type_values = parse_multi_values(data_type)
if source:
conditions.append("source = :source")
params["source"] = source
if data_type:
conditions.append("data_type = :data_type")
params["data_type"] = data_type
if country:
conditions.append("country = :country")
params["country"] = country
if search:
conditions.append("(name ILIKE :search OR title ILIKE :search)")
params["search"] = f"%{search}%"
if source_values:
conditions.append(build_in_condition("source", source_values, "source", params))
if data_type_values:
conditions.append(build_in_condition("data_type", data_type_values, "data_type", params))
normalized_country = normalize_country(country) if country else None
if normalized_country:
conditions.append(f"{COUNTRY_SQL} = :country")
params["country"] = normalized_country
search_condition = build_search_condition(search, params)
if search_condition:
conditions.append(search_condition)
where_sql = " AND ".join(conditions) if conditions else "1=1"
return where_sql, params
@@ -283,6 +365,7 @@ def build_where_clause(
@router.get("/export/json")
async def export_json(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -294,11 +377,12 @@ async def export_json(
"""导出数据为 JSON 格式"""
where_sql, params = build_where_clause(source, data_type, country, search)
if mode != "history":
where_sql = f"({where_sql}) AND COALESCE(is_current, TRUE) = TRUE"
params["limit"] = limit
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE {where_sql}
@@ -311,27 +395,7 @@ async def export_json(
data = []
for row in rows:
data.append(
{
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
)
data.append(serialize_collected_row(row))
json_str = json.dumps({"data": data, "total": len(data)}, ensure_ascii=False, indent=2)
@@ -346,6 +410,7 @@ async def export_json(
@router.get("/export/csv")
async def export_csv(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -357,11 +422,12 @@ async def export_csv(
"""导出数据为 CSV 格式"""
where_sql, params = build_where_clause(source, data_type, country, search)
if mode != "history":
where_sql = f"({where_sql}) AND COALESCE(is_current, TRUE) = TRUE"
params["limit"] = limit
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE {where_sql}
@@ -409,16 +475,16 @@ async def export_csv(
row[4],
row[5],
row[6],
row[7],
row[8],
row[9],
get_metadata_field(row[7], "country"),
get_metadata_field(row[7], "city"),
get_metadata_field(row[7], "latitude"),
get_metadata_field(row[7], "longitude"),
get_metadata_field(row[7], "value"),
get_metadata_field(row[7], "unit"),
json.dumps(row[7]) if row[7] else "",
row[8].isoformat() if row[8] else "",
row[9].isoformat() if row[9] else "",
row[10],
row[11],
row[12],
json.dumps(row[13]) if row[13] else "",
row[14].isoformat() if row[14] else "",
row[15].isoformat() if row[15] else "",
row[16],
]
)

View File

@@ -5,12 +5,13 @@ from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.security import get_current_user
from app.core.data_sources import get_data_sources_config
from app.db.session import get_db
from app.models.collected_data import CollectedData
from app.models.datasource import DataSource
from app.models.task import CollectionTask
from app.models.user import User
from app.services.scheduler import run_collector_now, sync_datasource_job
from app.services.scheduler import get_latest_task_id_for_datasource, run_collector_now, sync_datasource_job
router = APIRouter()
@@ -83,9 +84,11 @@ async def list_datasources(
datasources = result.scalars().all()
collector_list = []
config = get_data_sources_config()
for datasource in datasources:
running_task = await get_running_task(db, datasource.id)
last_task = await get_last_completed_task(db, datasource.id)
endpoint = await config.get_url(datasource.source, db)
data_count_result = await db.execute(
select(func.count(CollectedData.id)).where(CollectedData.source == datasource.source)
)
@@ -105,10 +108,12 @@ async def list_datasources(
"frequency_minutes": datasource.frequency_minutes,
"is_active": datasource.is_active,
"collector_class": datasource.collector_class,
"endpoint": endpoint,
"last_run": last_run,
"is_running": running_task is not None,
"task_id": running_task.id if running_task else None,
"progress": running_task.progress if running_task else None,
"phase": running_task.phase if running_task else None,
"records_processed": running_task.records_processed if running_task else None,
"total_records": running_task.total_records if running_task else None,
}
@@ -127,6 +132,9 @@ async def get_datasource(
if not datasource:
raise HTTPException(status_code=404, detail="Data source not found")
config = get_data_sources_config()
endpoint = await config.get_url(datasource.source, db)
return {
"id": datasource.id,
"name": datasource.name,
@@ -136,6 +144,7 @@ async def get_datasource(
"frequency_minutes": datasource.frequency_minutes,
"collector_class": datasource.collector_class,
"source": datasource.source,
"endpoint": endpoint,
"is_active": datasource.is_active,
}
@@ -212,9 +221,16 @@ async def trigger_datasource(
if not success:
raise HTTPException(status_code=500, detail=f"Failed to trigger collector '{datasource.source}'")
task_id = None
for _ in range(10):
task_id = await get_latest_task_id_for_datasource(datasource.id)
if task_id is not None:
break
return {
"status": "triggered",
"source_id": datasource.id,
"task_id": task_id,
"collector_name": datasource.source,
"message": f"Collector '{datasource.source}' has been triggered",
}
@@ -252,21 +268,29 @@ async def clear_datasource_data(
@router.get("/{source_id}/task-status")
async def get_task_status(
source_id: str,
task_id: Optional[int] = None,
db: AsyncSession = Depends(get_db),
):
datasource = await get_datasource_record(db, source_id)
if not datasource:
raise HTTPException(status_code=404, detail="Data source not found")
running_task = await get_running_task(db, datasource.id)
if not running_task:
return {"is_running": False, "task_id": None, "progress": None}
if task_id is not None:
task = await db.get(CollectionTask, task_id)
if not task or task.datasource_id != datasource.id:
raise HTTPException(status_code=404, detail="Task not found")
else:
task = await get_running_task(db, datasource.id)
if not task:
return {"is_running": False, "task_id": None, "progress": None, "phase": None, "status": "idle"}
return {
"is_running": True,
"task_id": running_task.id,
"progress": running_task.progress,
"records_processed": running_task.records_processed,
"total_records": running_task.total_records,
"status": running_task.status,
}
"is_running": task.status == "running",
"task_id": task.id,
"progress": task.progress,
"phase": task.phase,
"records_processed": task.records_processed,
"total_records": task.total_records,
"status": task.status,
}

View File

@@ -10,6 +10,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select, func
from typing import List, Dict, Any, Optional
from app.core.collected_data_fields import get_record_field
from app.db.session import get_db
from app.models.collected_data import CollectedData
from app.services.cable_graph import build_graph_from_data, CableGraph
@@ -83,9 +84,9 @@ def convert_cable_to_geojson(records: List[CollectedData]) -> Dict[str, Any]:
"rfs": metadata.get("rfs"),
"RFS": metadata.get("rfs"),
"status": metadata.get("status", "active"),
"length": record.value,
"length_km": record.value,
"SHAPE__Length": record.value,
"length": get_record_field(record, "value"),
"length_km": get_record_field(record, "value"),
"SHAPE__Length": get_record_field(record, "value"),
"url": metadata.get("url"),
"color": metadata.get("color"),
"year": metadata.get("year"),
@@ -101,8 +102,10 @@ def convert_landing_point_to_geojson(records: List[CollectedData], city_to_cable
for record in records:
try:
lat = float(record.latitude) if record.latitude else None
lon = float(record.longitude) if record.longitude else None
latitude = get_record_field(record, "latitude")
longitude = get_record_field(record, "longitude")
lat = float(latitude) if latitude else None
lon = float(longitude) if longitude else None
except (ValueError, TypeError):
continue
@@ -116,8 +119,8 @@ def convert_landing_point_to_geojson(records: List[CollectedData], city_to_cable
"id": record.id,
"source_id": record.source_id,
"name": record.name,
"country": record.country,
"city": record.city,
"country": get_record_field(record, "country"),
"city": get_record_field(record, "city"),
"is_tbd": metadata.get("is_tbd", False),
}
@@ -185,9 +188,11 @@ def convert_supercomputer_to_geojson(records: List[CollectedData]) -> Dict[str,
for record in records:
try:
lat = float(record.latitude) if record.latitude and record.latitude != "0.0" else None
latitude = get_record_field(record, "latitude")
longitude = get_record_field(record, "longitude")
lat = float(latitude) if latitude and latitude != "0.0" else None
lon = (
float(record.longitude) if record.longitude and record.longitude != "0.0" else None
float(longitude) if longitude and longitude != "0.0" else None
)
except (ValueError, TypeError):
lat, lon = None, None
@@ -203,12 +208,12 @@ def convert_supercomputer_to_geojson(records: List[CollectedData]) -> Dict[str,
"id": record.id,
"name": record.name,
"rank": metadata.get("rank"),
"r_max": record.value,
"r_peak": metadata.get("r_peak"),
"cores": metadata.get("cores"),
"power": metadata.get("power"),
"country": record.country,
"city": record.city,
"r_max": get_record_field(record, "rmax"),
"r_peak": get_record_field(record, "rpeak"),
"cores": get_record_field(record, "cores"),
"power": get_record_field(record, "power"),
"country": get_record_field(record, "country"),
"city": get_record_field(record, "city"),
"data_type": "supercomputer",
},
}
@@ -223,8 +228,10 @@ def convert_gpu_cluster_to_geojson(records: List[CollectedData]) -> Dict[str, An
for record in records:
try:
lat = float(record.latitude) if record.latitude else None
lon = float(record.longitude) if record.longitude else None
latitude = get_record_field(record, "latitude")
longitude = get_record_field(record, "longitude")
lat = float(latitude) if latitude else None
lon = float(longitude) if longitude else None
except (ValueError, TypeError):
lat, lon = None, None
@@ -238,8 +245,8 @@ def convert_gpu_cluster_to_geojson(records: List[CollectedData]) -> Dict[str, An
"properties": {
"id": record.id,
"name": record.name,
"country": record.country,
"city": record.city,
"country": get_record_field(record, "country"),
"city": get_record_field(record, "city"),
"metadata": metadata,
"data_type": "gpu_cluster",
},