left4me/l4d2web/services/steam_workshop.py
mwiegand c6b41429ee
feat(l4d2-web): steam workshop API client and downloader
Adds l4d2web/services/steam_workshop.py: parse_workshop_input (single ID,
URL, or multi-line batch), resolve_collection (HTTPS POST to
GetCollectionDetails), fetch_metadata_batch (HTTPS POST to
GetPublishedFileDetails with consumer_app_id == 550 enforcement that
raises WorkshopValidationError in add-mode and silently skips in
refresh-mode), download_to_cache (atomic + idempotent on mtime+size),
and refresh_all (ThreadPoolExecutor with per-item error collection).

Adds requests as an explicit dependency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 16:37:39 +02:00


"""Steam Workshop API client + downloader.
Pure HTTP/file logic — no DB writes, no Flask, no job-worker integration.
Used by the workshop overlay builder and the admin refresh job.
Endpoints:
- GetCollectionDetails: resolve a collection ID to its child item IDs.
- GetPublishedFileDetails: batch-fetch metadata for items, including a public
file_url for the .vpk.
Both endpoints accept anonymous POSTs; no Steam Web API key required.
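
Typical flow (illustrative sketch; the URL and cache path below are made-up
examples, not values used anywhere in the app):

    ids = parse_workshop_input(
        "https://steamcommunity.com/sharedfiles/filedetails/?id=123456789"
    )
    metas = fetch_metadata_batch(ids, mode="add")
    report = refresh_all(metas, Path("/tmp/workshop-cache"))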
"""
from __future__ import annotations
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Iterable, Literal
import requests
# HTTPS only (decision 16). The reference downloader uses HTTP — we don't.
GET_PUBLISHED_FILE_DETAILS_URL = (
"https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
)
GET_COLLECTION_DETAILS_URL = (
"https://api.steampowered.com/ISteamRemoteStorage/GetCollectionDetails/v1/"
)
L4D2_APP_ID = 550
REQUEST_TIMEOUT_SECONDS = 30
DOWNLOAD_CHUNK_BYTES = 1_048_576
_NUMERIC_ID_RE = re.compile(r"^\d+$")
_URL_ID_RE = re.compile(r"^https?://([a-z0-9.-]*\.)?steamcommunity\.com/.*[?&]id=(\d+)", re.IGNORECASE)
_BARE_URL_ID_RE = re.compile(r"^([a-z0-9.-]*\.)?steamcommunity\.com/.*[?&]id=(\d+)", re.IGNORECASE)
_session_local = threading.local()
def _session() -> requests.Session:
"""Per-thread session for connection reuse without cross-thread leakage."""
sess = getattr(_session_local, "session", None)
if sess is None:
sess = requests.Session()
_session_local.session = sess
return sess
class WorkshopValidationError(ValueError):
"""Raised during user-add when an item fails a fixed precondition
(e.g. consumer_app_id != 550)."""
@dataclass(slots=True)
class WorkshopMetadata:
steam_id: str
title: str
filename: str
file_url: str
file_size: int
time_updated: int
preview_url: str
consumer_app_id: int
result: int
@dataclass(slots=True)
class RefreshReport:
downloaded: int = 0
skipped: int = 0
errors: int = 0
per_item_errors: dict[str, str] = field(default_factory=dict)
def parse_workshop_input(raw: str) -> list[str]:
"""Parse a single ID, a single workshop URL, or a multi-line / whitespace-
separated batch of either. Returns deduplicated digit-only IDs in order.
    Raises ValueError if any token cannot be parsed as an ID or workshop URL."""
if not raw or not raw.strip():
raise ValueError("input is empty")
tokens: list[str] = []
for token in re.split(r"\s+", raw.strip()):
if not token:
continue
tokens.append(_extract_id(token))
seen: set[str] = set()
deduped: list[str] = []
for tok in tokens:
if tok not in seen:
seen.add(tok)
deduped.append(tok)
return deduped
def _extract_id(token: str) -> str:
if _NUMERIC_ID_RE.fullmatch(token):
return token
m = _URL_ID_RE.match(token)
if m:
return m.group(2)
m = _BARE_URL_ID_RE.match(token)
if m:
return m.group(2)
raise ValueError(f"could not parse a Steam workshop id from: {token!r}")
def resolve_collection(collection_id: str) -> list[str]:
"""POST GetCollectionDetails for one collection; return its non-collection
child publishedfileids in order. Nested collections (filetype != 0) are
skipped."""
if not _NUMERIC_ID_RE.fullmatch(collection_id):
raise ValueError("collection_id must be digits only")
response = _session().post(
GET_COLLECTION_DETAILS_URL,
data={
"collectioncount": 1,
"publishedfileids[0]": collection_id,
},
timeout=REQUEST_TIMEOUT_SECONDS,
)
response.raise_for_status()
payload = response.json()
children: list[str] = []
for collection in payload.get("response", {}).get("collectiondetails", []):
for child in collection.get("children", []):
if child.get("filetype", 0) != 0:
continue # nested collection, skip
child_id = child.get("publishedfileid")
if child_id is not None:
children.append(str(child_id))
return children
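
# Illustrative call (the collection ID is made up): resolve_collection("111222333444")
# returns the child item IDs in the order Steam reports them, with any nested
# collections dropped; the resulting IDs can then be passed to fetch_metadata_batch.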
def fetch_metadata_batch(
steam_ids: list[str], *, mode: Literal["add", "refresh"]
) -> list[WorkshopMetadata]:
"""One POST to GetPublishedFileDetails covering all ids.
In `mode="add"`, any non-L4D2 (`consumer_app_id != 550`) raises
WorkshopValidationError so the user-add request fails cleanly.
In `mode="refresh"`, non-L4D2 entries are skipped from the result.
Items with `result != 1` are returned as-is (the caller persists the result
code into `WorkshopItem.last_error`).
"""
if not steam_ids:
return []
for sid in steam_ids:
if not _NUMERIC_ID_RE.fullmatch(sid):
raise ValueError(f"steam id must be digits only: {sid!r}")
payload: dict[str, str | int] = {"itemcount": len(steam_ids)}
for index, sid in enumerate(steam_ids):
payload[f"publishedfileids[{index}]"] = sid
response = _session().post(
GET_PUBLISHED_FILE_DETAILS_URL,
data=payload,
timeout=REQUEST_TIMEOUT_SECONDS,
)
response.raise_for_status()
body = response.json()
metas: list[WorkshopMetadata] = []
for entry in body.get("response", {}).get("publishedfiledetails", []):
meta = WorkshopMetadata(
steam_id=str(entry.get("publishedfileid", "")),
title=str(entry.get("title", "") or ""),
filename=str(entry.get("filename", "") or ""),
file_url=str(entry.get("file_url", "") or ""),
file_size=int(entry.get("file_size") or 0),
time_updated=int(entry.get("time_updated") or 0),
preview_url=str(entry.get("preview_url", "") or ""),
consumer_app_id=int(entry.get("consumer_app_id") or 0),
result=int(entry.get("result") or 0),
)
# consumer_app_id is only meaningful when the lookup itself succeeded.
if meta.result == 1 and meta.consumer_app_id != L4D2_APP_ID:
if mode == "add":
raise WorkshopValidationError(
f"item {meta.steam_id} is not a Left 4 Dead 2 workshop "
f"item (consumer_app_id={meta.consumer_app_id})"
)
# refresh mode: drop the entry silently from the batch
continue
metas.append(meta)
return metas
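
# Illustrative behaviour with made-up IDs: for a batch that mixes one L4D2 item
# and one item from another game,
#   fetch_metadata_batch([l4d2_id, other_id], mode="add")      raises WorkshopValidationError
#   fetch_metadata_batch([l4d2_id, other_id], mode="refresh")  returns only the L4D2 entry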
def download_to_cache(
meta: WorkshopMetadata,
cache_root: Path,
*,
on_progress: Callable[[int, int], None] | None = None,
should_cancel: Callable[[], bool] | None = None,
) -> Path:
"""Download `meta.file_url` to `cache_root/{steam_id}.vpk`.
Atomic via `*.partial` + `os.replace`. Idempotent: a no-op when the
existing file's `(mtime, size)` already matches `(time_updated, file_size)`.
Sets `os.utime(target, (time_updated, time_updated))` so the next run
short-circuits.
"""
if not _NUMERIC_ID_RE.fullmatch(meta.steam_id):
raise ValueError("meta.steam_id must be digits only")
cache_root.mkdir(parents=True, exist_ok=True)
target = cache_root / f"{meta.steam_id}.vpk"
if (
target.exists()
and int(target.stat().st_mtime) == int(meta.time_updated)
and int(target.stat().st_size) == int(meta.file_size)
):
return target
if not meta.file_url:
raise ValueError(f"item {meta.steam_id} has no file_url; cannot download")
partial = target.with_suffix(target.suffix + ".partial")
response = _session().get(meta.file_url, stream=True, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
written = 0
try:
with open(partial, "wb") as f:
for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
if should_cancel is not None and should_cancel():
raise InterruptedError("download cancelled")
if not chunk:
continue
f.write(chunk)
written += len(chunk)
if on_progress is not None:
on_progress(written, int(meta.file_size))
os.replace(partial, target)
except BaseException:
partial.unlink(missing_ok=True)
raise
os.utime(target, (meta.time_updated, meta.time_updated))
return target
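
# Illustrative outcome (ID and path are made up): downloading item 123456789 into
# Path("/srv/workshop-cache") leaves /srv/workshop-cache/123456789.vpk with its
# mtime set to time_updated, so an unchanged item short-circuits on the next call;
# a failed or cancelled attempt leaves no 123456789.vpk.partial behind.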
def refresh_all(
metas: Iterable[WorkshopMetadata],
cache_root: Path,
*,
executor_workers: int = 8,
should_cancel: Callable[[], bool] | None = None,
) -> RefreshReport:
"""Download (or skip-as-cached) every metadata item using a thread pool.
Per-item errors are collected; sibling items continue."""
metas_list = list(metas)
report = RefreshReport()
if not metas_list:
return report
cache_root.mkdir(parents=True, exist_ok=True)
with ThreadPoolExecutor(max_workers=executor_workers) as executor:
futures = {}
        for meta in metas_list:
            if should_cancel is not None and should_cancel():
                break
            # Count an up-to-date cached file as skipped instead of re-submitting it;
            # this mirrors the (mtime, size) short-circuit inside download_to_cache.
            target = cache_root / f"{meta.steam_id}.vpk"
            if (
                target.exists()
                and int(target.stat().st_mtime) == int(meta.time_updated)
                and int(target.stat().st_size) == int(meta.file_size)
            ):
                report.skipped += 1
                continue
            future = executor.submit(
                download_to_cache,
                meta,
                cache_root,
                should_cancel=should_cancel,
            )
            futures[future] = meta
for future in as_completed(futures):
meta = futures[future]
try:
future.result()
except Exception as exc:
report.errors += 1
report.per_item_errors[meta.steam_id] = str(exc)
continue
report.downloaded += 1
return report
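

if __name__ == "__main__":
    # Manual smoke test, not used by the web app or the job worker: parse IDs/URLs
    # from the command line, fetch their metadata, and download them into a local
    # ./workshop-cache directory (an arbitrary path chosen for this sketch).
    import sys

    ids = parse_workshop_input(" ".join(sys.argv[1:]))
    report = refresh_all(fetch_metadata_batch(ids, mode="add"), Path("workshop-cache"))
    print(f"downloaded={report.downloaded} skipped={report.skipped} errors={report.errors}")
    for steam_id, message in report.per_item_errors.items():
        print(f"  {steam_id}: {message}")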