# Wikipedia 世界杯赛程提取技巧

## 页面结构

2026世界杯页面：`https://en.wikipedia.org/wiki/2026_FIFA_World_Cup`

每个比赛在 `<div class="footballbox">` 块中，包含：

- 比赛编号：`Match (\d+)`
- 日期：`class="fdate"` 标签
- 时间：`<time>` 标签，包含时区信息（如 `1:00 p.m. UTC−6`）
- 主队：`class="fhome"` 标签内的 `>文本<`
- 客队：`class="faway"` 标签内的 `>文本<`

## Python 解析代码

```python
import urllib.request, re, html as html_module

url = "https://en.wikipedia.org/wiki/2026_FIFA_World_Cup"
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req, timeout=15) as resp:
    content = resp.read().decode()

boxes = re.findall(
    r'<div[^>]*class="[^"]*footballbox[^"]*"[^>]*>(.*?)</div>\s*</div>',
    content, re.DOTALL
)

schedule = []
for box in boxes:
    mid = re.search(r'Match (\d+)', box)
    if not mid:
        continue

    date_m = re.search(r'class="fdate"[^>]*>(.*?)</div>', box, re.DOTALL)
    date_str = ''
    if date_m:
        date_str = re.sub(r'<[^>]+>', '', date_m.group(1)).strip()
        date_str = html_module.unescape(date_str)
        dm = re.search(r'(\w+ \d+, \d{4})', date_str)
        date_str = dm.group(1) if dm else date_str

    time_m = re.search(r'<time[^>]*>(.*?)</time>', box, re.DOTALL)
    time_str = ''
    tz_offset = None
    if time_m:
        raw = time_m.group(1)
        raw_clean = re.sub(r'<[^>]+>', '', raw).strip()
        raw_clean = html_module.unescape(raw_clean)
        tm = re.search(r'(\d{1,2}:\d{2}\s*[ap]\.m\.)', raw_clean, re.IGNORECASE)
        time_str = tm.group(1) if tm else raw_clean
        tz_m = re.search(r'UTC([−+-]\d+)', raw)
        if tz_m:
            tz_str = tz_m.group(1).replace('−', '-')
            tz_offset = int(tz_str)

    home_m = re.search(r'class="fhome"[^>]*>(.*?)</th>', box, re.DOTALL)
    home_str = ''
    if home_m:
        texts = re.findall(r'>([^<]+)<', home_m.group(1))
        for t in texts:
            t = t.strip()
            if t and len(t) > 1 and not t.startswith('&#') and not t.startswith('Match'):
                home_str = html_module.unescape(t)
                break

    away_m = re.search(r'class="faway"[^>]*>(.*?)</th>', box, re.DOTALL)
    away_str = ''
    if away_m:
        texts = re.findall(r'>([^<]+)<', away_m.group(1))
        for t in texts:
            t = t.strip()
            if t and len(t) > 1 and not t.startswith('&#') and not t.startswith('Match'):
                away_str = html_module.unescape(t)
                break

    schedule.append({
        'match': int(mid.group(1)),
        'date': date_str,
        'time': time_str,
        'tz': tz_offset,
        'home': home_str,
        'away': away_str
    })

schedule.sort(key=lambda x: x['match'])
```

## 时区转换（→ 北京时间 UTC+8）

| 当地时区 | 换算 | 示例 |
|---------|------|------|
| UTC-6（墨西哥城） | +14小时 | 当地15:00 = 北京次日05:00 |
| UTC-4（多伦多/纽约） | +12小时 | 当地18:00 = 北京次日06:00 |
| UTC-7（洛杉矶） | +15小时 | 当地17:00 = 北京次日08:00 |

## 注意事项

- 用 Python urllib 而不是 curl 抓取 Wikipedia（更可靠）
- 页面很大（~150KB），解析需要几秒钟
- 比赛编号不连续（1, 2, 3, 4, 5, 6, 7, 8, 25, 26...），按编号排序即可