feat(l4d2-web): steam workshop API client and downloader

Adds l4d2web/services/steam_workshop.py: parse_workshop_input (single ID,
URL, or multi-line batch), resolve_collection (HTTPS POST to
GetCollectionDetails), fetch_metadata_batch (HTTPS POST to
GetPublishedFileDetails with consumer_app_id == 550 enforcement that
raises WorkshopValidationError in add-mode and silently skips in
refresh-mode), download_to_cache (atomic + idempotent on mtime+size),
and refresh_all (ThreadPoolExecutor with per-item error collection).

Adds requests as an explicit dependency.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
mwiegand 2026-05-07 16:37:39 +02:00
parent 2543a05c12
commit c6b41429ee
No known key found for this signature in database
3 changed files with 608 additions and 0 deletions

View file

@ -14,6 +14,7 @@ dependencies = [
"alembic>=1.13",
"PyYAML>=6.0",
"gunicorn>=22.0",
"requests>=2.31",
]
[tool.setuptools]

View file

@ -0,0 +1,295 @@
"""Steam Workshop API client + downloader.
Pure HTTP/file logic — no DB writes, no Flask, no job-worker integration.
Used by the workshop overlay builder and the admin refresh job.
Endpoints:
- GetCollectionDetails: resolve a collection ID to its child item IDs.
- GetPublishedFileDetails: batch-fetch metadata for items, including a public
file_url for the .vpk.
Both endpoints accept anonymous POSTs; no Steam Web API key required.
"""
from __future__ import annotations
import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Iterable, Literal
import requests
# HTTPS only (decision 16). The reference downloader uses HTTP — we don't.
GET_PUBLISHED_FILE_DETAILS_URL = (
    "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
)
GET_COLLECTION_DETAILS_URL = (
    "https://api.steampowered.com/ISteamRemoteStorage/GetCollectionDetails/v1/"
)
# Steam app id for Left 4 Dead 2; used to reject foreign workshop items.
L4D2_APP_ID = 550
# Single timeout (seconds) for API POSTs and the .vpk GET alike.
REQUEST_TIMEOUT_SECONDS = 30
# 1 MiB streaming chunk size for .vpk downloads.
DOWNLOAD_CHUNK_BYTES = 1_048_576
# A bare workshop id: digits only.
_NUMERIC_ID_RE = re.compile(r"^\d+$")
# Full steamcommunity.com URL carrying ?id= / &id=; group(2) is the id.
_URL_ID_RE = re.compile(r"^https?://([a-z0-9.-]*\.)?steamcommunity\.com/.*[?&]id=(\d+)", re.IGNORECASE)
# Same pattern but scheme-less ("steamcommunity.com/..."); group(2) is the id.
_BARE_URL_ID_RE = re.compile(r"^([a-z0-9.-]*\.)?steamcommunity\.com/.*[?&]id=(\d+)", re.IGNORECASE)
# Thread-local storage backing _session(): one Session per worker thread.
_session_local = threading.local()
def _session() -> requests.Session:
    """Return this thread's `requests.Session`, creating it on first use.

    One session per thread keeps connection pooling while avoiding
    sharing a single Session object across threads.
    """
    existing = getattr(_session_local, "session", None)
    if existing is not None:
        return existing
    fresh = requests.Session()
    _session_local.session = fresh
    return fresh
class WorkshopValidationError(ValueError):
    """Precondition failure during a user-initiated add (for example a
    `consumer_app_id` other than 550). Subclasses ValueError so generic
    input-validation handlers catch it as well."""
@dataclass(slots=True)
class WorkshopMetadata:
    """One workshop item's metadata as returned by GetPublishedFileDetails."""

    steam_id: str  # publishedfileid, digits only
    title: str  # human-readable workshop title
    filename: str  # original upload filename (typically a .vpk)
    file_url: str  # public download URL; empty string when the API omits it
    file_size: int  # size in bytes; 0 when absent from the response
    time_updated: int  # unix timestamp of the item's last workshop update
    preview_url: str  # thumbnail URL; may be empty
    consumer_app_id: int  # owning game's app id; 550 == Left 4 Dead 2
    result: int  # Steam result code from the lookup; 1 == success
@dataclass(slots=True)
class RefreshReport:
    """Aggregate outcome of a refresh_all run."""

    downloaded: int = 0  # items that completed download_to_cache (incl. cached no-ops)
    skipped: int = 0  # reserved; not currently incremented anywhere in this module
    errors: int = 0  # items whose download raised
    per_item_errors: dict[str, str] = field(default_factory=dict)  # steam_id -> error text
def parse_workshop_input(raw: str) -> list[str]:
    """Parse a single ID, a single workshop URL, or a multi-line / whitespace-
    separated batch of either. Returns deduplicated digit-only IDs in order.
    Raises ValueError on garbage."""
    if not raw or not raw.strip():
        raise ValueError("input is empty")
    ids = [_extract_id(tok) for tok in re.split(r"\s+", raw.strip()) if tok]
    # dicts preserve insertion order, so this deduplicates while keeping
    # first-occurrence order in one pass.
    return list(dict.fromkeys(ids))
def _extract_id(token: str) -> str:
    """Return the digit-only workshop id contained in *token*.

    Accepts a bare numeric id, a full steamcommunity.com URL, or a
    scheme-less steamcommunity.com URL; raises ValueError otherwise.
    """
    if _NUMERIC_ID_RE.fullmatch(token):
        return token
    # Try the full-URL form first, then the scheme-less variant; both
    # capture the id in group 2.
    for pattern in (_URL_ID_RE, _BARE_URL_ID_RE):
        match = pattern.match(token)
        if match is not None:
            return match.group(2)
    raise ValueError(f"could not parse a Steam workshop id from: {token!r}")
def resolve_collection(collection_id: str) -> list[str]:
    """POST GetCollectionDetails for one collection; return its non-collection
    child publishedfileids in order. Nested collections (filetype != 0) are
    skipped."""
    if not _NUMERIC_ID_RE.fullmatch(collection_id):
        raise ValueError("collection_id must be digits only")
    form = {
        "collectioncount": 1,
        "publishedfileids[0]": collection_id,
    }
    resp = _session().post(
        GET_COLLECTION_DETAILS_URL,
        data=form,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    resp.raise_for_status()
    details = resp.json().get("response", {}).get("collectiondetails", [])
    child_ids: list[str] = []
    for detail in details:
        for child in detail.get("children", []):
            # filetype != 0 marks a nested collection, not a downloadable item.
            if child.get("filetype", 0) != 0:
                continue
            raw_id = child.get("publishedfileid")
            if raw_id is not None:
                child_ids.append(str(raw_id))
    return child_ids
def fetch_metadata_batch(
    steam_ids: list[str], *, mode: Literal["add", "refresh"]
) -> list[WorkshopMetadata]:
    """One POST to GetPublishedFileDetails covering all ids.

    In `mode="add"`, any non-L4D2 (`consumer_app_id != 550`) raises
    WorkshopValidationError so the user-add request fails cleanly.
    In `mode="refresh"`, non-L4D2 entries are skipped from the result.
    Items with `result != 1` are returned as-is (the caller persists the
    result code into `WorkshopItem.last_error`).
    """
    if not steam_ids:
        return []
    # Validate all ids up front; fail on the first malformed one.
    invalid = next((s for s in steam_ids if not _NUMERIC_ID_RE.fullmatch(s)), None)
    if invalid is not None:
        raise ValueError(f"steam id must be digits only: {invalid!r}")
    form: dict[str, str | int] = {"itemcount": len(steam_ids)}
    form.update({f"publishedfileids[{i}]": sid for i, sid in enumerate(steam_ids)})
    resp = _session().post(
        GET_PUBLISHED_FILE_DETAILS_URL,
        data=form,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    resp.raise_for_status()
    details = resp.json().get("response", {}).get("publishedfiledetails", [])
    results: list[WorkshopMetadata] = []
    for entry in details:
        meta = WorkshopMetadata(
            steam_id=str(entry.get("publishedfileid", "")),
            title=str(entry.get("title", "") or ""),
            filename=str(entry.get("filename", "") or ""),
            file_url=str(entry.get("file_url", "") or ""),
            file_size=int(entry.get("file_size") or 0),
            time_updated=int(entry.get("time_updated") or 0),
            preview_url=str(entry.get("preview_url", "") or ""),
            consumer_app_id=int(entry.get("consumer_app_id") or 0),
            result=int(entry.get("result") or 0),
        )
        # consumer_app_id is only meaningful when the lookup itself succeeded.
        if meta.result == 1 and meta.consumer_app_id != L4D2_APP_ID:
            if mode == "add":
                raise WorkshopValidationError(
                    f"item {meta.steam_id} is not a Left 4 Dead 2 workshop "
                    f"item (consumer_app_id={meta.consumer_app_id})"
                )
            # refresh mode: drop the entry silently from the batch.
            continue
        results.append(meta)
    return results
def download_to_cache(
    meta: WorkshopMetadata,
    cache_root: Path,
    *,
    on_progress: Callable[[int, int], None] | None = None,
    should_cancel: Callable[[], bool] | None = None,
) -> Path:
    """Download `meta.file_url` to `cache_root/{steam_id}.vpk`.

    Atomic via `*.partial` + `os.replace`. Idempotent: a no-op when the
    existing file's `(mtime, size)` already matches
    `(time_updated, file_size)`. The partial file's mtime is stamped to
    `time_updated` *before* the rename, so the cached-marker and the
    content become visible atomically and the next run short-circuits.

    Raises ValueError for a non-numeric id or a missing `file_url`, and
    InterruptedError when `should_cancel` fires mid-download; HTTP errors
    propagate from `raise_for_status`.
    """
    if not _NUMERIC_ID_RE.fullmatch(meta.steam_id):
        raise ValueError("meta.steam_id must be digits only")
    cache_root.mkdir(parents=True, exist_ok=True)
    target = cache_root / f"{meta.steam_id}.vpk"
    if target.exists():
        # Single stat() call (the original issued two) used for both checks.
        stat = target.stat()
        if (
            int(stat.st_mtime) == int(meta.time_updated)
            and int(stat.st_size) == int(meta.file_size)
        ):
            return target
    if not meta.file_url:
        raise ValueError(f"item {meta.steam_id} has no file_url; cannot download")
    partial = target.with_suffix(target.suffix + ".partial")
    response = _session().get(meta.file_url, stream=True, timeout=REQUEST_TIMEOUT_SECONDS)
    written = 0
    try:
        # `with response` releases the streamed connection back to the pool
        # on every exit path (the original leaked it).
        with response:
            response.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
                    if should_cancel is not None and should_cancel():
                        raise InterruptedError("download cancelled")
                    if not chunk:
                        continue
                    f.write(chunk)
                    written += len(chunk)
                    if on_progress is not None:
                        on_progress(written, int(meta.file_size))
        # Stamp the cache marker on the partial, then publish both with one
        # atomic rename; a crash can no longer separate content from marker.
        os.utime(partial, (meta.time_updated, meta.time_updated))
        os.replace(partial, target)
    except BaseException:
        partial.unlink(missing_ok=True)
        raise
    return target
def refresh_all(
    metas: Iterable[WorkshopMetadata],
    cache_root: Path,
    *,
    executor_workers: int = 8,
    should_cancel: Callable[[], bool] | None = None,
) -> RefreshReport:
    """Download (or skip-as-cached) every metadata item using a thread pool.

    Per-item errors are collected into the report; sibling items continue.
    """
    items = list(metas)
    report = RefreshReport()
    if not items:
        return report
    cache_root.mkdir(parents=True, exist_ok=True)
    with ThreadPoolExecutor(max_workers=executor_workers) as pool:
        pending: dict = {}
        for item in items:
            # Stop submitting new work once cancellation is requested;
            # already-submitted downloads also observe should_cancel.
            if should_cancel is not None and should_cancel():
                break
            fut = pool.submit(
                download_to_cache,
                item,
                cache_root,
                should_cancel=should_cancel,
            )
            pending[fut] = item
        for fut in as_completed(pending):
            item = pending[fut]
            try:
                fut.result()
            except Exception as exc:
                report.errors += 1
                report.per_item_errors[item.steam_id] = str(exc)
            else:
                report.downloaded += 1
    return report

View file

@ -0,0 +1,312 @@
"""Tests for the Steam Workshop API client and downloader."""
from __future__ import annotations
import os
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from l4d2web.services import steam_workshop
def test_parse_workshop_input_single_numeric() -> None:
    """A bare digit string parses to a one-element list."""
    ids = steam_workshop.parse_workshop_input("12345")
    assert ids == ["12345"]
def test_parse_workshop_input_single_url() -> None:
    """A full sharedfiles URL yields the id from its query string."""
    ids = steam_workshop.parse_workshop_input(
        "https://steamcommunity.com/sharedfiles/filedetails/?id=98765"
    )
    assert ids == ["98765"]
def test_parse_workshop_input_workshop_url_variant() -> None:
    """A scheme-less /workshop/ URL is accepted as well."""
    ids = steam_workshop.parse_workshop_input(
        "steamcommunity.com/workshop/filedetails/?id=42"
    )
    assert ids == ["42"]
def test_parse_workshop_input_multiline_batch() -> None:
    """Mixed ids and URLs across multiple lines all parse, in order."""
    raw = """
    12345
    https://steamcommunity.com/sharedfiles/filedetails/?id=67890
    99999
    """
    expected = ["12345", "67890", "99999"]
    assert steam_workshop.parse_workshop_input(raw) == expected
def test_parse_workshop_input_deduplicates_preserving_order() -> None:
    """Duplicates collapse to the first occurrence; order is kept."""
    ids = steam_workshop.parse_workshop_input("100\n200\n100\n300")
    assert ids == ["100", "200", "300"]
def test_parse_workshop_input_rejects_garbage() -> None:
    """Non-numeric, non-URL input raises ValueError."""
    with pytest.raises(ValueError):
        steam_workshop.parse_workshop_input("not-a-number")
def test_parse_workshop_input_rejects_empty() -> None:
    """Empty input is a ValueError, not an empty list."""
    with pytest.raises(ValueError):
        steam_workshop.parse_workshop_input("")
def test_parse_workshop_input_rejects_non_steam_url() -> None:
    """URLs on non-steamcommunity hosts are rejected even with an id param."""
    with pytest.raises(ValueError):
        steam_workshop.parse_workshop_input("https://example.com/?id=12345")
def test_endpoints_are_https() -> None:
    """Both Steam API endpoints must use HTTPS against the official host."""
    for url in (
        steam_workshop.GET_PUBLISHED_FILE_DETAILS_URL,
        steam_workshop.GET_COLLECTION_DETAILS_URL,
    ):
        assert url.startswith("https://")
    assert "api.steampowered.com" in steam_workshop.GET_PUBLISHED_FILE_DETAILS_URL
def test_resolve_collection_returns_child_ids() -> None:
    """Non-collection children come back in order; nested collections drop."""
    api_body = {
        "response": {
            "collectiondetails": [
                {
                    "publishedfileid": "555",
                    "result": 1,
                    "children": [
                        {"publishedfileid": "1001", "filetype": 0},
                        {"publishedfileid": "1002", "filetype": 0},
                        {"publishedfileid": "9999", "filetype": 1},  # nested collection — skip
                    ],
                }
            ]
        }
    }
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.json.return_value = api_body
    session = MagicMock(post=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        assert steam_workshop.resolve_collection("555") == ["1001", "1002"]
def test_fetch_metadata_batch_parses_published_file_details() -> None:
    """Every field of a successful entry maps into WorkshopMetadata."""
    entry = {
        "publishedfileid": "1001",
        "result": 1,
        "consumer_app_id": 550,
        "title": "Map A",
        "filename": "map_a.vpk",
        "file_url": "https://steamusercontent.com/abc/map_a.vpk",
        "file_size": "1024",  # API returns sizes as strings; must coerce to int
        "time_updated": 1700000000,
        "preview_url": "https://steamuserimages.com/preview_a.jpg",
    }
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.json.return_value = {"response": {"publishedfiledetails": [entry]}}
    session = MagicMock(post=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        metas = steam_workshop.fetch_metadata_batch(["1001"], mode="add")
    assert len(metas) == 1
    meta = metas[0]
    assert meta.steam_id == "1001"
    assert meta.title == "Map A"
    assert meta.filename == "map_a.vpk"
    assert meta.file_url == "https://steamusercontent.com/abc/map_a.vpk"
    assert meta.file_size == 1024
    assert meta.time_updated == 1700000000
    assert meta.preview_url == "https://steamuserimages.com/preview_a.jpg"
    assert meta.consumer_app_id == 550
    assert meta.result == 1
def test_fetch_metadata_batch_rejects_non_l4d2_in_add_mode() -> None:
    """Add mode must raise on a successful lookup for a non-L4D2 item."""
    entry = {
        "publishedfileid": "1001",
        "result": 1,
        "consumer_app_id": 440,  # TF2
        "title": "Other",
        "filename": "x.vpk",
        "file_url": "https://example.com/x.vpk",
        "file_size": "0",
        "time_updated": 0,
    }
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.json.return_value = {"response": {"publishedfiledetails": [entry]}}
    session = MagicMock(post=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        with pytest.raises(steam_workshop.WorkshopValidationError):
            steam_workshop.fetch_metadata_batch(["1001"], mode="add")
def test_fetch_metadata_batch_skips_non_l4d2_in_refresh_mode() -> None:
    """Refresh mode silently drops non-L4D2 items but keeps L4D2 ones."""
    tf2_entry = {
        "publishedfileid": "1001",
        "result": 1,
        "consumer_app_id": 440,
        "title": "Other",
        "filename": "x.vpk",
        "file_url": "https://example.com/x.vpk",
        "file_size": "0",
        "time_updated": 0,
    }
    l4d2_entry = {
        "publishedfileid": "1002",
        "result": 1,
        "consumer_app_id": 550,
        "title": "Good",
        "filename": "g.vpk",
        "file_url": "https://example.com/g.vpk",
        "file_size": "100",
        "time_updated": 1,
    }
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.json.return_value = {
        "response": {"publishedfiledetails": [tf2_entry, l4d2_entry]}
    }
    session = MagicMock(post=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        metas = steam_workshop.fetch_metadata_batch(["1001", "1002"], mode="refresh")
    # The non-L4D2 item is dropped; the L4D2 item is kept.
    assert [m.steam_id for m in metas] == ["1002"]
def test_fetch_metadata_batch_captures_result_failure() -> None:
    """Entries with a failing result code are kept for the caller to record."""
    failed_entry = {
        "publishedfileid": "999",
        "result": 9,  # not found / hidden / etc.
    }
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.json.return_value = {"response": {"publishedfiledetails": [failed_entry]}}
    session = MagicMock(post=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        metas = steam_workshop.fetch_metadata_batch(["999"], mode="refresh")
    # Item is kept but marked with the failing result; consumer app id never validated.
    assert len(metas) == 1
    assert metas[0].result == 9
def test_download_to_cache_writes_atomically_and_sets_mtime(tmp_path: Path) -> None:
    """A fresh download lands at {id}.vpk with mtime == time_updated."""
    cache_root = tmp_path / "workshop_cache"
    cache_root.mkdir()
    item = steam_workshop.WorkshopMetadata(
        steam_id="1001",
        title="A",
        filename="a.vpk",
        file_url="https://example.com/a.vpk",
        file_size=11,
        time_updated=1700000000,
        preview_url="",
        consumer_app_id=550,
        result=1,
    )
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.iter_content.return_value = [b"hello world"]
    session = MagicMock(get=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        result_path = steam_workshop.download_to_cache(item, cache_root)
    assert result_path == cache_root / "1001.vpk"
    assert result_path.read_bytes() == b"hello world"
    assert int(result_path.stat().st_mtime) == 1700000000
    # No leftover .partial file.
    assert not (cache_root / "1001.vpk.partial").exists()
def test_download_to_cache_is_idempotent(tmp_path: Path) -> None:
    """When (mtime, size) already match, no HTTP request is made at all."""
    cache_root = tmp_path / "workshop_cache"
    cache_root.mkdir()
    cached = cache_root / "1001.vpk"
    cached.write_bytes(b"existing")
    os.utime(cached, (1700000000, 1700000000))
    item = steam_workshop.WorkshopMetadata(
        steam_id="1001",
        title="A",
        filename="a.vpk",
        file_url="https://example.com/a.vpk",
        file_size=8,  # matches existing
        time_updated=1700000000,  # matches existing mtime
        preview_url="",
        consumer_app_id=550,
        result=1,
    )
    session = MagicMock()
    with patch.object(steam_workshop, "_session", return_value=session):
        steam_workshop.download_to_cache(item, cache_root)
    session.get.assert_not_called()
def test_download_to_cache_redownloads_when_mtime_or_size_differ(tmp_path: Path) -> None:
    """A stale cache entry (mismatched mtime/size) is replaced by a new download."""
    cache_root = tmp_path / "workshop_cache"
    cache_root.mkdir()
    stale = cache_root / "1001.vpk"
    stale.write_bytes(b"old")
    os.utime(stale, (1500000000, 1500000000))
    item = steam_workshop.WorkshopMetadata(
        steam_id="1001",
        title="A",
        filename="a.vpk",
        file_url="https://example.com/a.vpk",
        file_size=11,
        time_updated=1700000000,
        preview_url="",
        consumer_app_id=550,
        result=1,
    )
    resp = MagicMock(status_code=200)
    resp.raise_for_status = MagicMock()
    resp.iter_content.return_value = [b"hello world"]
    session = MagicMock(get=MagicMock(return_value=resp))
    with patch.object(steam_workshop, "_session", return_value=session):
        steam_workshop.download_to_cache(item, cache_root)
    assert stale.read_bytes() == b"hello world"
    assert int(stale.stat().st_mtime) == 1700000000
def test_refresh_all_uses_thread_pool_and_collects_errors(tmp_path: Path) -> None:
    """One failing item is reported per-id; siblings still count as downloaded."""
    cache_root = tmp_path / "workshop_cache"
    cache_root.mkdir()
    items = []
    for i in (1, 2, 3):
        items.append(
            steam_workshop.WorkshopMetadata(
                steam_id=str(i),
                title=f"M{i}",
                filename=f"m{i}.vpk",
                file_url=f"https://example.com/m{i}.vpk",
                file_size=5,
                time_updated=1700000000,
                preview_url="",
                consumer_app_id=550,
                result=1,
            )
        )
    def fake_download(meta, cache_root_arg, **kwargs):
        if meta.steam_id == "2":
            raise RuntimeError("simulated download failure")
        return cache_root_arg / f"{meta.steam_id}.vpk"
    with patch.object(steam_workshop, "download_to_cache", side_effect=fake_download):
        report = steam_workshop.refresh_all(items, cache_root, executor_workers=4)
    assert report.downloaded == 2
    assert report.errors == 1
    assert "2" in report.per_item_errors