STFC-Verifier/stfc_scraper.py at main · ComputerEndProgram/STFC-Verifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
STFC.pro player data scraper.

Fetches player info from stfc.pro, stfc.wtf, or stfc.live using metadata extraction.
All three domains are queried and parsed the same way and are expected to return identical data.
"""

import re
import logging
from dataclasses import dataclass
from typing import Optional
import requests
from bs4 import BeautifulSoup

log = logging.getLogger("stfc_scraper")


@dataclass
class PlayerData:
    """Record describing a single player as scraped from stfc.pro/stfc.wtf.

    The four leading fields are required; ``alliance_tag`` and ``rank``
    default to ``None`` when the corresponding value is not available.
    """
    player_id: str
    username: str
    level: int
    server: int
    alliance_tag: Optional[str] = None
    rank: Optional[str] = None

    def __repr__(self):
        """Return a compact summary, leaving out alliance/rank when they are falsy."""
        # Mandatory fields always appear, in declaration order.
        pieces = [
            f"id={self.player_id}",
            f"name={self.username}",
            f"level={self.level}",
            f"server={self.server}",
        ]
        # Optional fields are appended only when they carry a truthy value.
        if self.alliance_tag:
            pieces.append(f"alliance=[{self.alliance_tag}]")
        if self.rank:
            pieces.append(f"rank={self.rank}")
        return "PlayerData(" + ", ".join(pieces) + ")"


class STFCProScraper:
    """Scrapes player data from stfc.pro, stfc.wtf, or stfc.live.

    The class is a namespace of static methods: one turns user input into a
    player ID, the other downloads the player page and parses its metadata.
    """

    BASE_URL_PRO = "https://stfc.pro/players"
    BASE_URL_WTF = "https://stfc.wtf/players"
    BASE_URL_LIVE = "https://stfc.live/players"
    TIMEOUT = 10
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

    # Host names are case-insensitive and pasted links are often auto-capitalised
    # ("Stfc.pro/..."), so match without regard to case. IDs are ASCII digits only:
    # "\d" would also accept other Unicode digits, which can never form a valid URL.
    _PLAYER_URL_RE = re.compile(r"stfc\.(?:pro|wtf|live)/players/([0-9]+)", re.IGNORECASE)
    _PLAIN_ID_RE = re.compile(r"[0-9]+")

    # Title looks like: "DarthHαywire – Level 77 Player - STFC Statistics"
    _TITLE_RE = re.compile(r"^(.+?)\s+–\s+Level\s+(\d+)")
    # Description looks like: "Stats for DarthHαywire, level 77 of [SITH], server 118."
    _DESC_WITH_ALLIANCE_RE = re.compile(r"of\s+\[([^\]]+)\],\s+server\s+(\d+)")
    # ...or, for a player without an alliance: "Stats for ..., level 73, server 172."
    _DESC_NO_ALLIANCE_RE = re.compile(r",\s+server\s+(\d+)")
    _RANK_RE = re.compile(r">(Agent|Operative|Premier|Commodore|Admiral)</span>")

    @staticmethod
    def extract_player_id_from_url(url: str) -> Optional[str]:
        """Extract player ID from a stfc.pro, stfc.wtf, or stfc.live URL.

        Args:
            url: A player page URL on one of the supported domains (matched
                case-insensitively), or a bare numeric player ID. Surrounding
                whitespace is ignored.

        Returns:
            The player ID as a string of ASCII digits, or None when the input
            is empty or contains neither a supported URL nor a plain ID.

        Examples:
            https://stfc.pro/players/2659122580 → 2659122580
            https://stfc.wtf/players/2659122580 → 2659122580
            https://stfc.live/players/2659122580 → 2659122580
            2659122580 → 2659122580
        """
        if not url:
            return None

        candidate = url.strip()

        # Try to extract from a full URL (supports .pro, .wtf, and .live)
        match = STFCProScraper._PLAYER_URL_RE.search(candidate)
        if match:
            return match.group(1)

        # Otherwise the whole input must be a plain ID
        if STFCProScraper._PLAIN_ID_RE.fullmatch(candidate):
            return candidate

        return None

    @staticmethod
    def fetch_player_data(player_id: str) -> Optional["PlayerData"]:
        """Fetch player data from stfc.pro, stfc.wtf, or stfc.live.

        Tries stfc.pro first, then stfc.wtf, then stfc.live as fallback. A
        site is skipped when the request fails (including HTTP error status)
        or when its page cannot be parsed.

        Args:
            player_id: Player ID appended to each site's ``/players`` URL.

        Returns:
            PlayerData object if successful, None if not found or error.
        """
        sites = (
            STFCProScraper.BASE_URL_PRO,
            STFCProScraper.BASE_URL_WTF,
            STFCProScraper.BASE_URL_LIVE,
        )
        for base_url in sites:
            url = f"{base_url}/{player_id}"

            try:
                response = requests.get(
                    url,
                    timeout=STFCProScraper.TIMEOUT,
                    headers={"User-Agent": STFCProScraper.USER_AGENT},
                )
                response.raise_for_status()
            except requests.RequestException as e:
                log.debug("Failed to fetch %s: %s", url, e)
                continue

            player = STFCProScraper._parse_player_page(player_id, base_url, response)
            if player is None:
                # Page was served but not understood; fall through to the next site.
                continue

            log.info("Successfully fetched player %s from %s", player_id, base_url)
            return player

        # All URLs failed
        log.warning(
            "Could not fetch player data for %s from stfc.pro, stfc.wtf, or stfc.live",
            player_id,
        )
        return None

    @staticmethod
    def _parse_player_page(player_id, base_url, response) -> Optional["PlayerData"]:
        """Build a PlayerData from one downloaded player page.

        Username and level come from the ``<title>`` tag, alliance and server
        from the ``description`` meta tag, and rank from a ``<span>`` in the
        page body. Returns None (after a debug log) when the title or
        description is missing or does not match the expected format; a
        missing rank is not an error.
        """
        soup = BeautifulSoup(response.content, "html.parser")

        title_tag = soup.find("title")
        meta_desc = soup.find("meta", {"name": "description"})
        if title_tag is None or meta_desc is None:
            log.debug("Could not find required metadata for player %s at %s", player_id, base_url)
            return None

        # Strip so that whitespace/newlines around the title text neither defeat
        # the "^" anchor nor end up inside the username.
        title = (title_tag.string or "").strip()
        description = meta_desc.get("content", "")

        title_match = STFCProScraper._TITLE_RE.search(title)
        if not title_match:
            log.debug("Could not parse title for player %s: %s", player_id, title)
            return None

        username = title_match.group(1)
        level = int(title_match.group(2))

        # Prefer the form that carries an alliance tag; fall back to the plain form.
        desc_match = STFCProScraper._DESC_WITH_ALLIANCE_RE.search(description)
        if desc_match:
            alliance_tag = desc_match.group(1)
            server = int(desc_match.group(2))
        else:
            desc_match = STFCProScraper._DESC_NO_ALLIANCE_RE.search(description)
            if not desc_match:
                log.debug("Could not parse alliance/server from description: %s", description)
                return None
            alliance_tag = None
            server = int(desc_match.group(1))

        # Rank is optional: taken from the first matching <span> in the raw HTML.
        rank_match = STFCProScraper._RANK_RE.search(response.text)
        rank = rank_match.group(1) if rank_match else None

        return PlayerData(
            player_id=player_id,
            username=username,
            level=level,
            alliance_tag=alliance_tag,
            server=server,
            rank=rank,
        )


def format_player_info(player_data: "PlayerData") -> str:
    """Format player data for display.

    Args:
        player_data: The player to describe; its ``username``, ``level``,
            ``server`` and ``alliance_tag`` attributes are read.

    Returns:
        Two lines of text: the username wrapped in ``**``, then level, server
        and alliance separated by ``|``. The alliance is shown as ``[TAG]``,
        or as ``None`` when the player has no alliance tag.
    """
    # Only bracket a real tag: bracketing a missing one would print "[None]",
    # which reads like an alliance that is literally tagged "None".
    if player_data.alliance_tag:
        alliance = f"[{player_data.alliance_tag}]"
    else:
        alliance = "None"

    return (
        f"**{player_data.username}**\n"
        f"Level: {player_data.level} | Server: {player_data.server} | Alliance: {alliance}"
    )