-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstfc_scraper.py
More file actions
150 lines (121 loc) · 5.59 KB
/
stfc_scraper.py
File metadata and controls
150 lines (121 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
STFC player data scraper.
Fetches player info from stfc.pro, stfc.wtf, and stfc.live using metadata extraction.
All three domains are queried the same way and are expected to return identical data.
"""
import re
import logging
from dataclasses import dataclass
from typing import Optional
import requests
from bs4 import BeautifulSoup
# Module-level logger shared by everything in this file; the logger name is
# the fixed string "stfc_scraper" rather than __name__.
log = logging.getLogger("stfc_scraper")
@dataclass
class PlayerData:
    """Player data extracted from stfc.pro/stfc.wtf.

    ``alliance_tag`` and ``rank`` are optional and default to ``None``.
    """

    player_id: str
    username: str
    level: int
    server: int
    alliance_tag: Optional[str] = None  # tag without the surrounding brackets
    rank: Optional[str] = None

    def __repr__(self) -> str:
        """Return a compact repr that omits the optional fields when unset.

        The runtime class name is used instead of a hard-coded
        ``"PlayerData"`` so that subclasses are labelled correctly.
        """
        parts = [
            f"id={self.player_id}",
            f"name={self.username}",
            f"level={self.level}",
            f"server={self.server}",
        ]
        # Falsy values (None or an empty string) are left out entirely.
        if self.alliance_tag:
            parts.append(f"alliance=[{self.alliance_tag}]")
        if self.rank:
            parts.append(f"rank={self.rank}")
        return f"{type(self).__name__}({', '.join(parts)})"
class STFCProScraper:
    """Scrapes player data from stfc.pro, stfc.wtf, or stfc.live.

    The class is a namespace of static methods; it holds no instance state.
    """

    BASE_URL_PRO = "https://stfc.pro/players"
    BASE_URL_WTF = "https://stfc.wtf/players"
    BASE_URL_LIVE = "https://stfc.live/players"
    TIMEOUT = 10  # passed as ``timeout=`` to requests.get

    _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Patterns are compiled once at class creation.
    # IDs use [0-9] instead of \d: in str patterns \d also matches non-ASCII
    # Unicode digits, which must not be accepted as a player ID and then
    # interpolated into a request URL.
    _URL_ID_RE = re.compile(r"stfc\.(?:pro|wtf|live)/players/([0-9]+)")
    _PLAIN_ID_RE = re.compile(r"[0-9]+")
    # Title: "DarthHαywire – Level 77 Player - STFC Statistics"
    _TITLE_RE = re.compile(r"^(.+?)\s+–\s+Level\s+(\d+)")
    # Description: "Stats for DarthHαywire, level 77 of [SITH], server 118."
    _DESC_ALLIANCE_RE = re.compile(r"of\s+\[([^\]]+)\],\s+server\s+(\d+)")
    # Description without alliance: "Stats for ..., level 73, server 172."
    _DESC_SERVER_RE = re.compile(r",\s+server\s+(\d+)")
    _RANK_RE = re.compile(r">(Agent|Operative|Premier|Commodore|Admiral)</span>")

    @staticmethod
    def extract_player_id_from_url(url: str) -> Optional[str]:
        """Extract player ID from a stfc.pro, stfc.wtf, or stfc.live URL.

        Examples:
            https://stfc.pro/players/2659122580 → 2659122580
            https://stfc.wtf/players/2659122580 → 2659122580
            https://stfc.live/players/2659122580 → 2659122580
            2659122580 → 2659122580

        Returns:
            The ID as a string of ASCII digits, or None when ``url`` is not a
            string or contains neither a recognised URL nor a plain ID.
        """
        if not isinstance(url, str):
            return None

        # Full URL form (supports .pro, .wtf, and .live).
        url_match = STFCProScraper._URL_ID_RE.search(url)
        if url_match:
            return url_match.group(1)

        # Plain ID form; surrounding whitespace is tolerated.
        candidate = url.strip()
        if STFCProScraper._PLAIN_ID_RE.fullmatch(candidate):
            return candidate
        return None

    @staticmethod
    def fetch_player_data(player_id: str) -> Optional[PlayerData]:
        """Fetch player data from stfc.pro, stfc.wtf, or stfc.live.

        Tries stfc.pro first, then stfc.wtf, then stfc.live as fallback.

        Args:
            player_id: A numeric player ID. A full profile URL is accepted as
                well; it is normalised with ``extract_player_id_from_url``.

        Returns:
            PlayerData object if successful, None if not found or error
            (including a ``player_id`` that is not a valid ID).
        """
        # Validate before building a URL so arbitrary text never reaches the
        # request path.
        normalized_id = STFCProScraper.extract_player_id_from_url(player_id)
        if normalized_id is None:
            log.warning("Invalid player id %r; not fetching", player_id)
            return None

        base_urls = (
            STFCProScraper.BASE_URL_PRO,
            STFCProScraper.BASE_URL_WTF,
            STFCProScraper.BASE_URL_LIVE,
        )
        for base_url in base_urls:
            url = f"{base_url}/{normalized_id}"
            try:
                response = requests.get(
                    url,
                    timeout=STFCProScraper.TIMEOUT,
                    headers={"User-Agent": STFCProScraper._USER_AGENT},
                )
                response.raise_for_status()
            except requests.RequestException as e:
                log.debug("Failed to fetch %s: %s", url, e)
                continue

            player = STFCProScraper._parse_player_page(normalized_id, base_url, response)
            if player is not None:
                log.info("Successfully fetched player %s from %s", normalized_id, base_url)
                return player

        # All URLs failed
        log.warning(
            "Could not fetch player data for %s from stfc.pro, stfc.wtf, or stfc.live",
            normalized_id,
        )
        return None

    @staticmethod
    def _parse_player_page(player_id: str, base_url: str, response: "requests.Response") -> "Optional[PlayerData]":
        """Parse one fetched profile page; return None if it cannot be parsed."""
        soup = BeautifulSoup(response.content, "html.parser")

        title_tag = soup.find("title")
        meta_desc = soup.find("meta", {"name": "description"})
        if not title_tag or not meta_desc:
            log.debug("Could not find required metadata for player %s at %s", player_id, base_url)
            return None

        # Strip first: the title pattern is anchored at the start of the
        # string and "." does not match a newline, so leading whitespace or
        # line breaks inside <title> would otherwise break the match.
        title = (title_tag.string or "").strip()
        description = meta_desc.get("content", "")

        title_match = STFCProScraper._TITLE_RE.search(title)
        if not title_match:
            log.debug("Could not parse title for player %s: %s", player_id, title)
            return None
        username = title_match.group(1)
        level = int(title_match.group(2))

        alliance_match = STFCProScraper._DESC_ALLIANCE_RE.search(description)
        if alliance_match:
            alliance_tag = alliance_match.group(1)
            server = int(alliance_match.group(2))
        else:
            # Players without an alliance have no "of [TAG]" part.
            server_match = STFCProScraper._DESC_SERVER_RE.search(description)
            if not server_match:
                log.debug("Could not parse alliance/server from description: %s", description)
                return None
            alliance_tag = None
            server = int(server_match.group(1))

        # Rank is not in the metadata; look for it in the HTML body. It stays
        # None when no known rank label is found.
        rank_match = STFCProScraper._RANK_RE.search(response.text)
        rank = rank_match.group(1) if rank_match else None

        return PlayerData(
            player_id=player_id,
            username=username,
            level=level,
            alliance_tag=alliance_tag,
            server=server,
            rank=rank,
        )
def format_player_info(player_data: PlayerData) -> str:
    """Format player data for display.

    Returns two lines: the username wrapped in ``**`` markers, then a
    ``|``-separated summary of level, server and alliance. The alliance
    segment is left out when the player has no alliance tag, so the output
    never shows a literal ``[None]``.
    """
    details = [
        f"Level: {player_data.level}",
        f"Server: {player_data.server}",
    ]
    # Same truthiness rule as PlayerData.__repr__: None or "" means no alliance.
    if player_data.alliance_tag:
        details.append(f"Alliance: [{player_data.alliance_tag}]")
    return f"**{player_data.username}**\n" + " | ".join(details)