Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -7,6 +7,8 @@ import json
import csv
import io
from app.core.collected_data_fields import get_metadata_field
from app.core.countries import COUNTRY_OPTIONS, get_country_search_variants, normalize_country
from app.db.session import get_db
from app.models.user import User
from app.core.security import get_current_user
@@ -15,8 +17,119 @@ from app.models.collected_data import CollectedData
router = APIRouter()
# SQL expression that extracts the country value from the JSONB metadata column.
COUNTRY_SQL = "metadata->>'country'"
# Columns/expressions scanned by the free-text search; metadata is cast to
# text so values inside the JSON document are searchable as well.
SEARCHABLE_SQL = [
    "name",
    "title",
    "description",
    "source",
    "data_type",
    "source_id",
    "metadata::text",
]
def parse_multi_values(value: Optional[str]) -> list[str]:
    """Split a comma-separated query value into trimmed, non-empty tokens.

    Returns an empty list for ``None`` or an empty string.
    """
    if not value:
        return []
    tokens: list[str] = []
    for piece in value.split(","):
        piece = piece.strip()
        if piece:
            tokens.append(piece)
    return tokens
def build_in_condition(field_sql: str, values: list[str], param_prefix: str, params: dict) -> str:
    """Render ``field IN (:p_0, :p_1, ...)`` with one bound parameter per value.

    Mutates *params* in place, adding ``{param_prefix}_{i}`` -> value entries.
    Callers must ensure *values* is non-empty (an empty IN list is invalid SQL).
    """
    names = [f"{param_prefix}_{i}" for i in range(len(values))]
    params.update(zip(names, values))
    joined = ", ".join(f":{name}" for name in names)
    return f"{field_sql} IN ({joined})"
def build_search_condition(search: Optional[str], params: dict) -> Optional[str]:
    """Build an OR'd ILIKE condition over SEARCHABLE_SQL for *search*.

    Also installs the bound parameters consumed by ``build_search_rank_sql``
    (``search_exact``/``search_prefix``/``country_search_exact``/
    ``country_search_prefix``), so the two must be used together.
    Mutates *params* in place. Returns ``None`` when the search text is empty.
    """
    if not search:
        return None
    normalized = search.strip()
    if not normalized:
        return None

    # Country-name variants (e.g. alternative spellings) widen the match set.
    # Hoisted to a single call: the result is reused for the canonical value.
    variants = get_country_search_variants(normalized)

    # Deduplicate case-insensitively while preserving order; track seen
    # casefolds in one set instead of rebuilding it on every iteration.
    search_terms = [normalized]
    seen = {normalized.casefold()}
    for variant in variants:
        fold = variant.casefold()
        if fold not in seen:
            seen.add(fold)
            search_terms.append(variant)

    conditions = []
    for index, term in enumerate(search_terms):
        params[f"search_{index}"] = f"%{term}%"
        conditions.extend(f"{field} ILIKE :search_{index}" for field in SEARCHABLE_SQL)

    params["search_exact"] = normalized
    params["search_prefix"] = f"{normalized}%"
    canonical = variants[0] if variants else None
    params["country_search_exact"] = canonical or normalized
    params["country_search_prefix"] = f"{(canonical or normalized)}%"
    return "(" + " OR ".join(conditions) + ")"
def build_search_rank_sql(search: Optional[str]) -> str:
    """Return a SQL CASE expression ranking rows by match quality.

    Exact name/title hits score highest, then prefixes, then country, then
    weaker whole-field matches. Returns the literal ``"0"`` when there is no
    usable search text, so the expression can always be interpolated.
    Relies on the parameters bound by ``build_search_condition``.
    """
    has_text = bool(search) and bool(search.strip())
    if not has_text:
        return "0"
    return """
    CASE
        WHEN name ILIKE :search_exact THEN 700
        WHEN name ILIKE :search_prefix THEN 600
        WHEN title ILIKE :search_exact THEN 500
        WHEN title ILIKE :search_prefix THEN 400
        WHEN metadata->>'country' ILIKE :country_search_exact THEN 380
        WHEN metadata->>'country' ILIKE :country_search_prefix THEN 340
        WHEN source_id ILIKE :search_exact THEN 350
        WHEN source ILIKE :search_exact THEN 300
        WHEN data_type ILIKE :search_exact THEN 250
        WHEN description ILIKE :search_0 THEN 150
        WHEN metadata::text ILIKE :search_0 THEN 100
        WHEN title ILIKE :search_0 THEN 80
        WHEN name ILIKE :search_0 THEN 60
        WHEN source ILIKE :search_0 THEN 40
        WHEN data_type ILIKE :search_0 THEN 30
        WHEN source_id ILIKE :search_0 THEN 20
        ELSE 0
    END
    """
def serialize_collected_row(row) -> dict:
    """Map a collected_data result row to the API response dict.

    Expects the column order: id, source, source_id, data_type, name, title,
    description, metadata (index 7), collected_at, reference_date, is_valid.
    Location/value fields are pulled out of the JSON metadata document.
    """
    meta = row[7]
    collected_at, reference_date = row[8], row[9]
    result = {
        "id": row[0],
        "source": row[1],
        "source_id": row[2],
        "data_type": row[3],
        "name": row[4],
        "title": row[5],
        "description": row[6],
    }
    # Promote commonly-used metadata keys to top-level fields.
    for key in ("country", "city", "latitude", "longitude", "value", "unit"):
        result[key] = get_metadata_field(meta, key)
    result["metadata"] = meta
    # HPC-specific metrics stored in metadata (e.g. TOP500-style records).
    for key in ("cores", "rmax", "rpeak", "power"):
        result[key] = get_metadata_field(meta, key)
    result["collected_at"] = collected_at.isoformat() if collected_at else None
    result["reference_date"] = reference_date.isoformat() if reference_date else None
    result["is_valid"] = row[10]
    return result
@router.get("")
async def list_collected_data(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -27,25 +140,30 @@ async def list_collected_data(
db: AsyncSession = Depends(get_db),
):
"""查询采集的数据列表"""
normalized_country = normalize_country(country) if country else None
source_values = parse_multi_values(source)
data_type_values = parse_multi_values(data_type)
# Build WHERE clause
conditions = []
params = {}
if source:
conditions.append("source = :source")
params["source"] = source
if data_type:
conditions.append("data_type = :data_type")
params["data_type"] = data_type
if country:
conditions.append("country = :country")
params["country"] = country
if search:
conditions.append("(name ILIKE :search OR title ILIKE :search)")
params["search"] = f"%{search}%"
if mode != "history":
conditions.append("COALESCE(is_current, TRUE) = TRUE")
if source_values:
conditions.append(build_in_condition("source", source_values, "source", params))
if data_type_values:
conditions.append(build_in_condition("data_type", data_type_values, "data_type", params))
if normalized_country:
conditions.append(f"{COUNTRY_SQL} = :country")
params["country"] = normalized_country
search_condition = build_search_condition(search, params)
if search_condition:
conditions.append(search_condition)
where_sql = " AND ".join(conditions) if conditions else "1=1"
search_rank_sql = build_search_rank_sql(search)
# Calculate offset
offset = (page - 1) * page_size
@@ -58,11 +176,11 @@ async def list_collected_data(
# Query data
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
metadata, collected_at, reference_date, is_valid,
{search_rank_sql} AS search_rank
FROM collected_data
WHERE {where_sql}
ORDER BY collected_at DESC
ORDER BY search_rank DESC, collected_at DESC
LIMIT :limit OFFSET :offset
""")
params["limit"] = page_size
@@ -73,27 +191,7 @@ async def list_collected_data(
data = []
for row in rows:
data.append(
{
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
)
data.append(serialize_collected_row(row[:11]))
return {
"total": total,
@@ -105,16 +203,19 @@ async def list_collected_data(
@router.get("/summary")
async def get_data_summary(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
"""获取数据汇总统计"""
where_sql = "WHERE COALESCE(is_current, TRUE) = TRUE" if mode != "history" else ""
# By source and data_type
result = await db.execute(
text("""
SELECT source, data_type, COUNT(*) as count
FROM collected_data
""" + where_sql + """
GROUP BY source, data_type
ORDER BY source, data_type
""")
@@ -138,6 +239,7 @@ async def get_data_summary(
text("""
SELECT source, COUNT(*) as count
FROM collected_data
""" + where_sql + """
GROUP BY source
ORDER BY count DESC
""")
@@ -153,6 +255,7 @@ async def get_data_summary(
@router.get("/sources")
async def get_data_sources(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
@@ -160,7 +263,9 @@ async def get_data_sources(
result = await db.execute(
text("""
SELECT DISTINCT source FROM collected_data ORDER BY source
SELECT DISTINCT source FROM collected_data
""" + ("WHERE COALESCE(is_current, TRUE) = TRUE " if mode != "history" else "") + """
ORDER BY source
""")
)
rows = result.fetchall()
@@ -172,6 +277,7 @@ async def get_data_sources(
@router.get("/types")
async def get_data_types(
mode: str = Query("current", description="查询模式: current/history"),
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
):
@@ -179,7 +285,9 @@ async def get_data_types(
result = await db.execute(
text("""
SELECT DISTINCT data_type FROM collected_data ORDER BY data_type
SELECT DISTINCT data_type FROM collected_data
""" + ("WHERE COALESCE(is_current, TRUE) = TRUE " if mode != "history" else "") + """
ORDER BY data_type
""")
)
rows = result.fetchall()
@@ -196,17 +304,8 @@ async def get_countries(
):
"""获取所有国家列表"""
result = await db.execute(
text("""
SELECT DISTINCT country FROM collected_data
WHERE country IS NOT NULL AND country != ''
ORDER BY country
""")
)
rows = result.fetchall()
return {
"countries": [row[0] for row in rows],
"countries": COUNTRY_OPTIONS,
}
@@ -221,7 +320,6 @@ async def get_collected_data(
result = await db.execute(
text("""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE id = :id
@@ -236,25 +334,7 @@ async def get_collected_data(
detail="数据不存在",
)
return {
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
return serialize_collected_row(row)
def build_where_clause(
@@ -263,19 +343,21 @@ def build_where_clause(
"""Build WHERE clause and params for queries"""
conditions = []
params = {}
source_values = parse_multi_values(source)
data_type_values = parse_multi_values(data_type)
if source:
conditions.append("source = :source")
params["source"] = source
if data_type:
conditions.append("data_type = :data_type")
params["data_type"] = data_type
if country:
conditions.append("country = :country")
params["country"] = country
if search:
conditions.append("(name ILIKE :search OR title ILIKE :search)")
params["search"] = f"%{search}%"
if source_values:
conditions.append(build_in_condition("source", source_values, "source", params))
if data_type_values:
conditions.append(build_in_condition("data_type", data_type_values, "data_type", params))
normalized_country = normalize_country(country) if country else None
if normalized_country:
conditions.append(f"{COUNTRY_SQL} = :country")
params["country"] = normalized_country
search_condition = build_search_condition(search, params)
if search_condition:
conditions.append(search_condition)
where_sql = " AND ".join(conditions) if conditions else "1=1"
return where_sql, params
@@ -283,6 +365,7 @@ def build_where_clause(
@router.get("/export/json")
async def export_json(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -294,11 +377,12 @@ async def export_json(
"""导出数据为 JSON 格式"""
where_sql, params = build_where_clause(source, data_type, country, search)
if mode != "history":
where_sql = f"({where_sql}) AND COALESCE(is_current, TRUE) = TRUE"
params["limit"] = limit
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE {where_sql}
@@ -311,27 +395,7 @@ async def export_json(
data = []
for row in rows:
data.append(
{
"id": row[0],
"source": row[1],
"source_id": row[2],
"data_type": row[3],
"name": row[4],
"title": row[5],
"description": row[6],
"country": row[7],
"city": row[8],
"latitude": row[9],
"longitude": row[10],
"value": row[11],
"unit": row[12],
"metadata": row[13],
"collected_at": row[14].isoformat() if row[14] else None,
"reference_date": row[15].isoformat() if row[15] else None,
"is_valid": row[16],
}
)
data.append(serialize_collected_row(row))
json_str = json.dumps({"data": data, "total": len(data)}, ensure_ascii=False, indent=2)
@@ -346,6 +410,7 @@ async def export_json(
@router.get("/export/csv")
async def export_csv(
mode: str = Query("current", description="查询模式: current/history"),
source: Optional[str] = Query(None, description="数据源过滤"),
data_type: Optional[str] = Query(None, description="数据类型过滤"),
country: Optional[str] = Query(None, description="国家过滤"),
@@ -357,11 +422,12 @@ async def export_csv(
"""导出数据为 CSV 格式"""
where_sql, params = build_where_clause(source, data_type, country, search)
if mode != "history":
where_sql = f"({where_sql}) AND COALESCE(is_current, TRUE) = TRUE"
params["limit"] = limit
query = text(f"""
SELECT id, source, source_id, data_type, name, title, description,
country, city, latitude, longitude, value, unit,
metadata, collected_at, reference_date, is_valid
FROM collected_data
WHERE {where_sql}
@@ -409,16 +475,16 @@ async def export_csv(
row[4],
row[5],
row[6],
row[7],
row[8],
row[9],
get_metadata_field(row[7], "country"),
get_metadata_field(row[7], "city"),
get_metadata_field(row[7], "latitude"),
get_metadata_field(row[7], "longitude"),
get_metadata_field(row[7], "value"),
get_metadata_field(row[7], "unit"),
json.dumps(row[7]) if row[7] else "",
row[8].isoformat() if row[8] else "",
row[9].isoformat() if row[9] else "",
row[10],
row[11],
row[12],
json.dumps(row[13]) if row[13] else "",
row[14].isoformat() if row[14] else "",
row[15].isoformat() if row[15] else "",
row[16],
]
)