Refine data management and collection workflows

This commit is contained in:
linkong
2026-03-25 17:19:10 +08:00
parent cc5f16f8a7
commit 020c1d5051
34 changed files with 3341 additions and 947 deletions

View File

@@ -0,0 +1,62 @@
from typing import Any, Dict, Optional
FIELD_ALIASES = {
"country": ("country",),
"city": ("city",),
"latitude": ("latitude",),
"longitude": ("longitude",),
"value": ("value",),
"unit": ("unit",),
"cores": ("cores",),
"rmax": ("rmax", "r_max"),
"rpeak": ("rpeak", "r_peak"),
"power": ("power",),
}
def get_metadata_field(metadata: Optional[Dict[str, Any]], field: str, fallback: Any = None) -> Any:
if isinstance(metadata, dict):
for key in FIELD_ALIASES.get(field, (field,)):
value = metadata.get(key)
if value not in (None, ""):
return value
return fallback
def build_dynamic_metadata(
metadata: Optional[Dict[str, Any]],
*,
country: Any = None,
city: Any = None,
latitude: Any = None,
longitude: Any = None,
value: Any = None,
unit: Any = None,
) -> Dict[str, Any]:
merged = dict(metadata) if isinstance(metadata, dict) else {}
fallbacks = {
"country": country,
"city": city,
"latitude": latitude,
"longitude": longitude,
"value": value,
"unit": unit,
}
for field, fallback in fallbacks.items():
if fallback not in (None, "") and get_metadata_field(merged, field) in (None, ""):
merged[field] = fallback
return merged
def get_record_field(record: Any, field: str) -> Any:
metadata = getattr(record, "extra_data", None) or {}
fallback_attr = field
if field in {"cores", "rmax", "rpeak", "power"}:
fallback = None
else:
fallback = getattr(record, fallback_attr, None)
return get_metadata_field(metadata, field, fallback=fallback)

View File

@@ -0,0 +1,280 @@
import re
from typing import Any, Optional
COUNTRY_ENTRIES = [
("阿富汗", ["Afghanistan", "AF", "AFG"]),
("阿尔巴尼亚", ["Albania", "AL", "ALB"]),
("阿尔及利亚", ["Algeria", "DZ", "DZA"]),
("安道尔", ["Andorra", "AD", "AND"]),
("安哥拉", ["Angola", "AO", "AGO"]),
("安提瓜和巴布达", ["Antigua and Barbuda", "AG", "ATG"]),
("阿根廷", ["Argentina", "AR", "ARG"]),
("亚美尼亚", ["Armenia", "AM", "ARM"]),
("澳大利亚", ["Australia", "AU", "AUS"]),
("奥地利", ["Austria", "AT", "AUT"]),
("阿塞拜疆", ["Azerbaijan", "AZ", "AZE"]),
("巴哈马", ["Bahamas", "BS", "BHS"]),
("巴林", ["Bahrain", "BH", "BHR"]),
("孟加拉国", ["Bangladesh", "BD", "BGD"]),
("巴巴多斯", ["Barbados", "BB", "BRB"]),
("白俄罗斯", ["Belarus", "BY", "BLR"]),
("比利时", ["Belgium", "BE", "BEL"]),
("伯利兹", ["Belize", "BZ", "BLZ"]),
("贝宁", ["Benin", "BJ", "BEN"]),
("不丹", ["Bhutan", "BT", "BTN"]),
("玻利维亚", ["Bolivia", "BO", "BOL", "Bolivia (Plurinational State of)"]),
("波斯尼亚和黑塞哥维那", ["Bosnia and Herzegovina", "BA", "BIH"]),
("博茨瓦纳", ["Botswana", "BW", "BWA"]),
("巴西", ["Brazil", "BR", "BRA"]),
("文莱", ["Brunei", "BN", "BRN", "Brunei Darussalam"]),
("保加利亚", ["Bulgaria", "BG", "BGR"]),
("布基纳法索", ["Burkina Faso", "BF", "BFA"]),
("布隆迪", ["Burundi", "BI", "BDI"]),
("柬埔寨", ["Cambodia", "KH", "KHM"]),
("喀麦隆", ["Cameroon", "CM", "CMR"]),
("加拿大", ["Canada", "CA", "CAN"]),
("佛得角", ["Cape Verde", "CV", "CPV", "Cabo Verde"]),
("中非", ["Central African Republic", "CF", "CAF"]),
("乍得", ["Chad", "TD", "TCD"]),
("智利", ["Chile", "CL", "CHL"]),
("中国", ["China", "CN", "CHN", "Mainland China", "PRC", "People's Republic of China"]),
("中国(香港)", ["Hong Kong", "HK", "HKG", "Hong Kong SAR", "China Hong Kong", "Hong Kong, China"]),
("中国(澳门)", ["Macao", "Macau", "MO", "MAC", "Macao SAR", "China Macao", "Macau, China"]),
("中国(台湾)", ["Taiwan", "TW", "TWN", "Chinese Taipei", "Taiwan, China"]),
("哥伦比亚", ["Colombia", "CO", "COL"]),
("科摩罗", ["Comoros", "KM", "COM"]),
("刚果(布)", ["Republic of the Congo", "Congo", "Congo-Brazzaville", "CG", "COG"]),
("刚果(金)", ["Democratic Republic of the Congo", "DR Congo", "Congo-Kinshasa", "CD", "COD"]),
("哥斯达黎加", ["Costa Rica", "CR", "CRI"]),
("科特迪瓦", ["Cote d'Ivoire", "Côte d'Ivoire", "Ivory Coast", "CI", "CIV"]),
("克罗地亚", ["Croatia", "HR", "HRV"]),
("古巴", ["Cuba", "CU", "CUB"]),
("塞浦路斯", ["Cyprus", "CY", "CYP"]),
("捷克", ["Czech Republic", "Czechia", "CZ", "CZE"]),
("丹麦", ["Denmark", "DK", "DNK"]),
("吉布提", ["Djibouti", "DJ", "DJI"]),
("多米尼克", ["Dominica", "DM", "DMA"]),
("多米尼加", ["Dominican Republic", "DO", "DOM"]),
("厄瓜多尔", ["Ecuador", "EC", "ECU"]),
("埃及", ["Egypt", "EG", "EGY"]),
("萨尔瓦多", ["El Salvador", "SV", "SLV"]),
("赤道几内亚", ["Equatorial Guinea", "GQ", "GNQ"]),
("厄立特里亚", ["Eritrea", "ER", "ERI"]),
("爱沙尼亚", ["Estonia", "EE", "EST"]),
("埃斯瓦蒂尼", ["Eswatini", "SZ", "SWZ", "Swaziland"]),
("埃塞俄比亚", ["Ethiopia", "ET", "ETH"]),
("斐济", ["Fiji", "FJ", "FJI"]),
("芬兰", ["Finland", "FI", "FIN"]),
("法国", ["France", "FR", "FRA"]),
("加蓬", ["Gabon", "GA", "GAB"]),
("冈比亚", ["Gambia", "GM", "GMB"]),
("格鲁吉亚", ["Georgia", "GE", "GEO"]),
("德国", ["Germany", "DE", "DEU"]),
("加纳", ["Ghana", "GH", "GHA"]),
("希腊", ["Greece", "GR", "GRC"]),
("格林纳达", ["Grenada", "GD", "GRD"]),
("危地马拉", ["Guatemala", "GT", "GTM"]),
("几内亚", ["Guinea", "GN", "GIN"]),
("几内亚比绍", ["Guinea-Bissau", "GW", "GNB"]),
("圭亚那", ["Guyana", "GY", "GUY"]),
("海地", ["Haiti", "HT", "HTI"]),
("洪都拉斯", ["Honduras", "HN", "HND"]),
("匈牙利", ["Hungary", "HU", "HUN"]),
("冰岛", ["Iceland", "IS", "ISL"]),
("印度", ["India", "IN", "IND"]),
("印度尼西亚", ["Indonesia", "ID", "IDN"]),
("伊朗", ["Iran", "IR", "IRN", "Iran (Islamic Republic of)"]),
("伊拉克", ["Iraq", "IQ", "IRQ"]),
("爱尔兰", ["Ireland", "IE", "IRL"]),
("以色列", ["Israel", "IL", "ISR"]),
("意大利", ["Italy", "IT", "ITA"]),
("牙买加", ["Jamaica", "JM", "JAM"]),
("日本", ["Japan", "JP", "JPN"]),
("约旦", ["Jordan", "JO", "JOR"]),
("哈萨克斯坦", ["Kazakhstan", "KZ", "KAZ"]),
("肯尼亚", ["Kenya", "KE", "KEN"]),
("基里巴斯", ["Kiribati", "KI", "KIR"]),
("朝鲜", ["North Korea", "Korea, DPRK", "Democratic People's Republic of Korea", "KP", "PRK"]),
("韩国", ["South Korea", "Republic of Korea", "Korea", "KR", "KOR"]),
("科威特", ["Kuwait", "KW", "KWT"]),
("吉尔吉斯斯坦", ["Kyrgyzstan", "KG", "KGZ"]),
("老挝", ["Laos", "Lao PDR", "Lao People's Democratic Republic", "LA", "LAO"]),
("拉脱维亚", ["Latvia", "LV", "LVA"]),
("黎巴嫩", ["Lebanon", "LB", "LBN"]),
("莱索托", ["Lesotho", "LS", "LSO"]),
("利比里亚", ["Liberia", "LR", "LBR"]),
("利比亚", ["Libya", "LY", "LBY"]),
("列支敦士登", ["Liechtenstein", "LI", "LIE"]),
("立陶宛", ["Lithuania", "LT", "LTU"]),
("卢森堡", ["Luxembourg", "LU", "LUX"]),
("马达加斯加", ["Madagascar", "MG", "MDG"]),
("马拉维", ["Malawi", "MW", "MWI"]),
("马来西亚", ["Malaysia", "MY", "MYS"]),
("马尔代夫", ["Maldives", "MV", "MDV"]),
("马里", ["Mali", "ML", "MLI"]),
("马耳他", ["Malta", "MT", "MLT"]),
("马绍尔群岛", ["Marshall Islands", "MH", "MHL"]),
("毛里塔尼亚", ["Mauritania", "MR", "MRT"]),
("毛里求斯", ["Mauritius", "MU", "MUS"]),
("墨西哥", ["Mexico", "MX", "MEX"]),
("密克罗尼西亚", ["Micronesia", "FM", "FSM", "Federated States of Micronesia"]),
("摩尔多瓦", ["Moldova", "MD", "MDA", "Republic of Moldova"]),
("摩纳哥", ["Monaco", "MC", "MCO"]),
("蒙古", ["Mongolia", "MN", "MNG"]),
("黑山", ["Montenegro", "ME", "MNE"]),
("摩洛哥", ["Morocco", "MA", "MAR"]),
("莫桑比克", ["Mozambique", "MZ", "MOZ"]),
("缅甸", ["Myanmar", "MM", "MMR", "Burma"]),
("纳米比亚", ["Namibia", "NA", "NAM"]),
("瑙鲁", ["Nauru", "NR", "NRU"]),
("尼泊尔", ["Nepal", "NP", "NPL"]),
("荷兰", ["Netherlands", "NL", "NLD"]),
("新西兰", ["New Zealand", "NZ", "NZL"]),
("尼加拉瓜", ["Nicaragua", "NI", "NIC"]),
("尼日尔", ["Niger", "NE", "NER"]),
("尼日利亚", ["Nigeria", "NG", "NGA"]),
("北马其顿", ["North Macedonia", "MK", "MKD", "Macedonia"]),
("挪威", ["Norway", "NO", "NOR"]),
("阿曼", ["Oman", "OM", "OMN"]),
("巴基斯坦", ["Pakistan", "PK", "PAK"]),
("帕劳", ["Palau", "PW", "PLW"]),
("巴勒斯坦", ["Palestine", "PS", "PSE", "State of Palestine"]),
("巴拿马", ["Panama", "PA", "PAN"]),
("巴布亚新几内亚", ["Papua New Guinea", "PG", "PNG"]),
("巴拉圭", ["Paraguay", "PY", "PRY"]),
("秘鲁", ["Peru", "PE", "PER"]),
("菲律宾", ["Philippines", "PH", "PHL"]),
("波兰", ["Poland", "PL", "POL"]),
("葡萄牙", ["Portugal", "PT", "PRT"]),
("卡塔尔", ["Qatar", "QA", "QAT"]),
("罗马尼亚", ["Romania", "RO", "ROU"]),
("俄罗斯", ["Russia", "Russian Federation", "RU", "RUS"]),
("卢旺达", ["Rwanda", "RW", "RWA"]),
("圣基茨和尼维斯", ["Saint Kitts and Nevis", "KN", "KNA"]),
("圣卢西亚", ["Saint Lucia", "LC", "LCA"]),
("圣文森特和格林纳丁斯", ["Saint Vincent and the Grenadines", "VC", "VCT"]),
("萨摩亚", ["Samoa", "WS", "WSM"]),
("圣马力诺", ["San Marino", "SM", "SMR"]),
("圣多美和普林西比", ["Sao Tome and Principe", "ST", "STP", "São Tomé and Príncipe"]),
("沙特阿拉伯", ["Saudi Arabia", "SA", "SAU"]),
("塞内加尔", ["Senegal", "SN", "SEN"]),
("塞尔维亚", ["Serbia", "RS", "SRB", "Kosovo", "XK", "XKS", "Republic of Kosovo"]),
("塞舌尔", ["Seychelles", "SC", "SYC"]),
("塞拉利昂", ["Sierra Leone", "SL", "SLE"]),
("新加坡", ["Singapore", "SG", "SGP"]),
("斯洛伐克", ["Slovakia", "SK", "SVK"]),
("斯洛文尼亚", ["Slovenia", "SI", "SVN"]),
("所罗门群岛", ["Solomon Islands", "SB", "SLB"]),
("索马里", ["Somalia", "SO", "SOM"]),
("南非", ["South Africa", "ZA", "ZAF"]),
("南苏丹", ["South Sudan", "SS", "SSD"]),
("西班牙", ["Spain", "ES", "ESP"]),
("斯里兰卡", ["Sri Lanka", "LK", "LKA"]),
("苏丹", ["Sudan", "SD", "SDN"]),
("苏里南", ["Suriname", "SR", "SUR"]),
("瑞典", ["Sweden", "SE", "SWE"]),
("瑞士", ["Switzerland", "CH", "CHE"]),
("叙利亚", ["Syria", "SY", "SYR", "Syrian Arab Republic"]),
("塔吉克斯坦", ["Tajikistan", "TJ", "TJK"]),
("坦桑尼亚", ["Tanzania", "TZ", "TZA", "United Republic of Tanzania"]),
("泰国", ["Thailand", "TH", "THA"]),
("东帝汶", ["Timor-Leste", "East Timor", "TL", "TLS"]),
("多哥", ["Togo", "TG", "TGO"]),
("汤加", ["Tonga", "TO", "TON"]),
("特立尼达和多巴哥", ["Trinidad and Tobago", "TT", "TTO"]),
("突尼斯", ["Tunisia", "TN", "TUN"]),
("土耳其", ["Turkey", "TR", "TUR", "Türkiye"]),
("土库曼斯坦", ["Turkmenistan", "TM", "TKM"]),
("图瓦卢", ["Tuvalu", "TV", "TUV"]),
("乌干达", ["Uganda", "UG", "UGA"]),
("乌克兰", ["Ukraine", "UA", "UKR"]),
("阿联酋", ["United Arab Emirates", "AE", "ARE", "UAE"]),
("英国", ["United Kingdom", "UK", "GB", "GBR", "Great Britain", "Britain", "England"]),
("美国", ["United States", "United States of America", "US", "USA", "U.S.", "U.S.A."]),
("乌拉圭", ["Uruguay", "UY", "URY"]),
("乌兹别克斯坦", ["Uzbekistan", "UZ", "UZB"]),
("瓦努阿图", ["Vanuatu", "VU", "VUT"]),
("梵蒂冈", ["Vatican City", "Holy See", "VA", "VAT"]),
("委内瑞拉", ["Venezuela", "VE", "VEN", "Venezuela (Bolivarian Republic of)"]),
("越南", ["Vietnam", "Viet Nam", "VN", "VNM"]),
("也门", ["Yemen", "YE", "YEM"]),
("赞比亚", ["Zambia", "ZM", "ZMB"]),
("津巴布韦", ["Zimbabwe", "ZW", "ZWE"]),
]
COUNTRY_OPTIONS = [entry[0] for entry in COUNTRY_ENTRIES]
CANONICAL_COUNTRY_SET = set(COUNTRY_OPTIONS)
INVALID_COUNTRY_VALUES = {
"",
"-",
"--",
"unknown",
"n/a",
"na",
"none",
"null",
"global",
"world",
"worldwide",
"xx",
}
NUMERIC_LIKE_PATTERN = re.compile(r"^[\d\s,._%+\-]+$")
COUNTRY_ALIAS_MAP = {}
COUNTRY_VARIANTS_MAP = {}
for canonical, aliases in COUNTRY_ENTRIES:
COUNTRY_ALIAS_MAP[canonical.casefold()] = canonical
variants = [canonical, *aliases]
COUNTRY_VARIANTS_MAP[canonical] = variants
for alias in aliases:
COUNTRY_ALIAS_MAP[alias.casefold()] = canonical
def normalize_country(value: Any) -> Optional[str]:
if value is None:
return None
if not isinstance(value, str):
return None
normalized = re.sub(r"\s+", " ", value.strip())
normalized = normalized.replace("(", "").replace(")", "")
if not normalized:
return None
lowered = normalized.casefold()
if lowered in INVALID_COUNTRY_VALUES:
return None
if NUMERIC_LIKE_PATTERN.fullmatch(normalized):
return None
if normalized in CANONICAL_COUNTRY_SET:
return normalized
return COUNTRY_ALIAS_MAP.get(lowered)
def get_country_search_variants(value: Any) -> list[str]:
canonical = normalize_country(value)
if canonical is None:
return []
variants = []
seen = set()
for item in COUNTRY_VARIANTS_MAP.get(canonical, [canonical]):
if not isinstance(item, str):
continue
normalized = re.sub(r"\s+", " ", item.strip())
if not normalized:
continue
key = normalized.casefold()
if key in seen:
continue
seen.add(key)
variants.append(normalized)
return variants