adding caching to presque gratos

This commit is contained in:
Gaël
2025-09-29 09:36:47 +02:00
parent 595c11eeb5
commit d08a598fd7
4 changed files with 258 additions and 0 deletions

162
backfill.py Normal file
View File

@@ -0,0 +1,162 @@
# backfill.py
from __future__ import annotations
import os, re, sys, html
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
# Reuse your existing GhostAdmin client (same headers/base/proxy behavior)
# Adjust the import path if your Ghost client lives elsewhere.
from presquegratos import GhostAdmin
from storage import Storage
from keys import xgp_key, egs_key, psplus_key
# ---------------- Ghost helpers (reusing your admin client) ----------------
def ghost_list_posts(ghost: GhostAdmin, page: int = 1) -> Dict:
    """Fetch one page of posts from the Ghost Admin API.

    Only ``limit``, ``page``, ``order`` and ``formats`` are sent. Per the
    original author's note, ``filter`` and ``fields`` are deliberately left
    out to avoid HTTP 400 responses behind the ``__bot`` proxy.

    Args:
        ghost: Admin client whose ``base`` URL and ``_headers()`` are reused.
        page: Page number to request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    endpoint = ghost.base + "posts/"
    query = {
        "limit": "50",
        "page": str(page),
        "order": "published_at DESC",
        # IMPORTANT: ask for the rendered body; backfill() parses "html".
        "formats": "lexical,html",
    }
    response = requests.get(
        endpoint,
        headers=ghost._headers(),
        params=query,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
def list_recap_posts(ghost: GhostAdmin, hard_limit: int = 2000) -> List[Dict]:
    """Collect the weekly recap posts by paging through the Ghost post list.

    Pages are fetched with ``ghost_list_posts`` and filtered client-side on
    the title prefix ``"Récap hebdo"`` (the original author filters locally to
    be robust to proxy quirks).

    Args:
        ghost: Admin client forwarded to ``ghost_list_posts``.
        hard_limit: Maximum number of recap posts to return. The cap is
            enforced exactly; a value ``<= 0`` returns an empty list without
            issuing any request.

    Returns:
        Matching post dicts in the order the API returned them, at most
        ``hard_limit`` of them.
    """
    # Must match the "limit" that ghost_list_posts requests: a shorter page
    # is taken to be the last one.
    page_size = 50

    posts: List[Dict] = []
    page = 1
    while len(posts) < hard_limit:
        data = ghost_list_posts(ghost, page=page)
        batch = data.get("posts", [])
        if not batch:
            break
        for p in batch:
            title = (p.get("title") or "").strip()
            if not title.startswith("Récap hebdo"):
                continue
            posts.append(p)
            # Stop in the middle of a page so the cap is never overshot.
            if len(posts) >= hard_limit:
                return posts
        if len(batch) < page_size:
            break
        page += 1
    return posts
# ---------------- Parsing helpers (unchanged) ----------------
# Earlier variant without an upper bound on the captured id, kept for reference:
#MS_STORE_RE = re.compile(r"(?:microsoft|xbox)\.com/.*/store/.*/([0-9A-Z]{12,})", re.I)
# xbox.com / microsoft.com URL containing "/store/"; group 1 captures 12 to 16
# letters/digits after a "/" (case-insensitive). parse_xgp stores group 1 as
# the item's "productId".
MS_STORE_RE = re.compile(r"(?:xbox|microsoft)\.com/.*/store/.*/([0-9A-Z]{12,16})", re.I)
# Epic Games Store product URL; group 1 captures the slug that follows "/p/".
# NOTE(review): the pattern requires the literal "epicgames.com/store/", so a
# URL shaped like "store.epicgames.com/<locale>/p/<slug>" does not match —
# confirm this is intended.
EPIC_RE = re.compile(r"epicgames\.com/store/.*/p/([\w\-]+)", re.I)
# Any URL containing "blog.playstation.com/" (case-insensitive).
PSBLOG_RE = re.compile(r"blog\.playstation\.com/.*", re.I)
def clean_text(s: str) -> str:
    """Decode HTML entities in *s* and normalise its whitespace.

    Every run of whitespace is collapsed to a single space and the result is
    stripped at both ends. Falsy input (``None`` or ``""``) yields ``""``.
    """
    decoded = html.unescape(s or "")
    collapsed = re.sub(r"\s+", " ", decoded)
    return collapsed.strip()
def extract_sections(soup: BeautifulSoup) -> Dict[str, BeautifulSoup]:
    """Group a recap post's content nodes by platform section.

    Walks the ``h2``/``h3``/``h4``/``p``/``ul``/``ol``/``div``/``section``
    nodes of *soup*. A heading whose lower-cased text contains a known marker
    opens a section; every following non-heading node is appended to a fresh
    ``<div>`` created for that section.

    Recognised markers, tested in this order:
        * ``"game pass"``          -> ``"xgp"``
        * ``"egs"`` or ``"epic"``  -> ``"egs"``
        * ``"ps plus"`` or ``"ps+"`` -> ``"psplus"``

    A heading that matches none of them is skipped and does not close the
    section that is currently open. Nodes seen before the first recognised
    heading are ignored. If the same section key appears twice, the later
    wrapper replaces the earlier one in the returned mapping.

    NOTE(review): the source this was reconstructed from had lost its
    indentation; this assumes ``continue`` applied to every heading, matched
    or not — confirm against the original file.

    Returns:
        Mapping of section key to the wrapper ``<div>`` holding its nodes.
    """
    heading_tags = ("h2", "h3", "h4")
    # Ordered rules: the first key with a matching marker wins.
    section_rules = (
        ("xgp", ("game pass",)),
        ("egs", ("egs", "epic")),
        ("psplus", ("ps plus", "ps+")),
    )

    sections: Dict[str, BeautifulSoup] = {}
    bucket = None  # wrapper <div> of the section currently being filled
    for node in soup.find_all(["h2", "h3", "h4", "p", "ul", "ol", "div", "section"]):
        if node.name not in heading_tags:
            if bucket is not None:
                bucket.append(node)
            continue

        heading = clean_text(node.get_text()).lower()
        matched_key = next(
            (
                key
                for key, markers in section_rules
                if any(marker in heading for marker in markers)
            ),
            None,
        )
        if matched_key is not None:
            bucket = soup.new_tag("div")
            sections[matched_key] = bucket
    return sections
def parse_xgp(section: BeautifulSoup) -> List[Dict]:
    """Extract the Game Pass entries linked from *section*.

    Every ``<a href=...>`` that either matches ``MS_STORE_RE`` or has
    non-empty link text yields ``{"title": ..., "productId": ...}``, where
    ``productId`` is the regex capture or ``None``. Entries are de-duplicated
    on ``xgp_key`` and the first occurrence is kept, in document order.
    """
    unique: List[Dict] = []
    known_keys = set()
    for anchor in section.find_all("a", href=True):
        match = MS_STORE_RE.search(anchor["href"])
        label = clean_text(anchor.get_text())
        if not (match or label):
            # Neither a store link nor a titled link: nothing to key on.
            continue
        entry = {
            "title": label,
            "productId": match.group(1) if match else None,
        }
        dedup_key = xgp_key(entry)
        if dedup_key in known_keys:
            continue
        known_keys.add(dedup_key)
        unique.append(entry)
    return unique
def parse_egs(section: BeautifulSoup) -> List[Dict]:
    """Extract the Epic Games Store entries linked from *section*.

    Only anchors whose ``href`` matches ``EPIC_RE`` are considered. The title
    is the link text, falling back to the anchor's ``title`` attribute;
    ``start`` is always the empty string here. Entries are de-duplicated on
    ``egs_key`` and the first occurrence is kept, in document order.
    """
    unique: List[Dict] = []
    known_keys = set()
    for anchor in section.find_all("a", href=True):
        if EPIC_RE.search(anchor["href"]) is None:
            continue
        label = clean_text(anchor.get_text())
        if not label:
            label = clean_text(anchor.get("title"))
        entry = {"title": label, "start": ""}
        dedup_key = egs_key(entry)
        if dedup_key in known_keys:
            continue
        known_keys.add(dedup_key)
        unique.append(entry)
    return unique
def parse_psplus(section: BeautifulSoup, post_title: str) -> Optional[Dict]:
    """Build the PS Plus record for one recap post.

    Args:
        section: Parsed PS Plus section; the first anchor whose ``href``
            matches ``PSBLOG_RE`` supplies the URL.
        post_title: Title of the recap post. The first ``dd-mm-yyyy`` or
            ``dd/mm/yyyy`` date found in it supplies the date. ``None`` is
            treated as an empty title.

    Returns:
        ``{"url": <href or "">, "date": <"YYYY-MM-DD" or "">}``. A dict is
        always returned, even when neither field could be found.
    """
    link = section.find("a", href=PSBLOG_RE)
    url = link["href"] if link else ""

    # run_weekly formats the title dates with "%d/%m/%Y", so "/" has to be
    # accepted in addition to "-". The backreference requires both separators
    # to be the same character.
    match = re.search(
        r"(?P<day>\d{2})(?P<sep>[-/])(?P<month>\d{2})(?P=sep)(?P<year>\d{4})",
        post_title or "",
    )
    iso = ""
    if match:
        iso = f"{match.group('year')}-{match.group('month')}-{match.group('day')}"
    return {"url": url, "date": iso}
# ---------------- Main backfill ----------------
def backfill():
    """Seed the dedup store from the recap posts already published on Ghost.

    Reads ``GHOST_ADMIN_URL`` and ``GHOST_ADMIN_KEY`` from the environment,
    lists the recap posts, parses each post's HTML and records every Xbox
    Game Pass key in ``Storage`` together with the id of the post it was
    found in. Each key is recorded once, for the first post that contains it
    (posts are requested newest first).

    EGS and PS Plus backfill is currently disabled (see the commented-out
    code below), so their totals stay at 0 in the final summary.
    """
    # Same environment variables as the main script; the URL is normalised to
    # end with exactly one "/".
    ghost = GhostAdmin(
        admin_url=os.environ.get("GHOST_ADMIN_URL", "").rstrip("/") + "/",
        admin_key=os.environ.get("GHOST_ADMIN_KEY", ""),
    )
    store = Storage()
    posts = list_recap_posts(ghost)
    print(f"Found {len(posts)} recap posts.")

    total_xgp = total_egs = total_ps = 0
    # A set gives O(1) membership tests across all posts.
    recorded_keys = set()
    for p in posts:
        pid = p["id"]
        title = p.get("title") or ""
        html_body = p.get("html") or ""
        soup = BeautifulSoup(html_body, "html.parser")
        sections = extract_sections(soup)

        # A post without a Game Pass section contributes no items.
        xgp_items = parse_xgp(sections["xgp"]) if "xgp" in sections else []
        for it in xgp_items:
            key = xgp_key(it)
            if key not in recorded_keys:
                store.remember("xgp", key, pid)
                total_xgp += 1
                recorded_keys.add(key)

        # Disabled for now:
        # for it in parse_egs(sections.get("egs", BeautifulSoup("", "html.parser"))):
        #     store.remember("egs", egs_key(it), pid); total_egs += 1
        # if "psplus" in sections:
        #     item = parse_psplus(sections["psplus"], title)
        #     store.remember("psplus", psplus_key(item), pid); total_ps += 1
        print(f"Backfilled from: {title}")
    print(f"Done. Inserted ~ XGP:{total_xgp} | EGS:{total_egs} | PS+:{total_ps}")


if __name__ == "__main__":
    backfill()

20
keys.py Normal file
View File

@@ -0,0 +1,20 @@
# keys.py (or inline in your main)
def xgp_key(item) -> str:
    """Return the dedup key for an Xbox Game Pass item.

    The stripped ``productId`` is preferred: ``"item:xgp:<productId>"``.
    When it is missing or blank, the key falls back to the stripped,
    lower-cased title: ``"item:xgp:title:<title>"``.
    """
    product_id = (item.get("productId") or "").strip()
    if not product_id:
        normalized_title = (item.get("title") or "").strip().lower()
        return f"item:xgp:title:{normalized_title}"
    return f"item:xgp:{product_id}"
def egs_key(item) -> str:
    """Return the dedup key for an Epic Games Store item.

    The key is ``"item:egs:<title>|<start>"`` built from the stripped
    ``title`` and ``start`` values (``start`` is expected to be an ISO or
    ``YYYY-MM-DD`` string). Missing or ``None`` values become empty strings;
    the title's case is preserved.
    """
    def field(name):
        return (item.get(name) or "").strip()

    title = field("title")
    start = field("start")
    return f"item:egs:{title}|{start}"
def psplus_key(item) -> str:
    """Return the dedup key for a PS Plus announcement.

    The key is ``"item:psplus:<url>|<date>"`` built from the stripped ``url``
    and ``date`` values; missing or ``None`` values become empty strings.
    """
    def field(name):
        return (item.get(name) or "").strip()

    url = field("url")
    date = field("date")
    return f"item:psplus:{url}|{date}"

View File

@@ -21,10 +21,13 @@ from bs4 import BeautifulSoup
from logging.handlers import RotatingFileHandler from logging.handlers import RotatingFileHandler
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
import zoneinfo import zoneinfo
from storage import Storage
from keys import xgp_key
LOG = logging.getLogger("bot_weekly") LOG = logging.getLogger("bot_weekly")
LOG_PATTERN = logging.Formatter("%(asctime)s:%(levelname)s: [%(filename)s] %(message)s") LOG_PATTERN = logging.Formatter("%(asctime)s:%(levelname)s: [%(filename)s] %(message)s")
def setuplogger(): def setuplogger():
stream_handler = logging.StreamHandler() stream_handler = logging.StreamHandler()
stream_handler.setFormatter(LOG_PATTERN) stream_handler.setFormatter(LOG_PATTERN)
@@ -505,6 +508,7 @@ def build_html(egs: List[Dict[str, Any]],
parts.append("</div>") parts.append("</div>")
# --- Xbox Game Pass # --- Xbox Game Pass
parts.append("<h3>🟩 Xbox Game Pass — Récemment ajoutés</h3>") parts.append("<h3>🟩 Xbox Game Pass — Récemment ajoutés</h3>")
if not xgp: if not xgp:
parts.append("<p>Pas d'entrées détectées.</p>") parts.append("<p>Pas d'entrées détectées.</p>")
@@ -568,6 +572,23 @@ async def run_weekly():
LOG.warning("XGP fetch failed: %s", e) LOG.warning("XGP fetch failed: %s", e)
# Build HTML # Build HTML
store = Storage()
def keep_new(items, key_fn):
fresh = []
for it in items:
k = key_fn(it)
if not store.seen(k):
it["_dedup_key"] = k
fresh.append(it)
return fresh
# Fetch your three sources as you already do:
xgp_items = keep_new(xgp_items, xgp_key)
html_body, feature = build_html(egs_items, psplus_data, xgp_items) html_body, feature = build_html(egs_items, psplus_data, xgp_items)
# Title (FR) # Title (FR)
start = (datetime.now(TZ) - timedelta(days=6)).strftime("%d/%m/%Y") start = (datetime.now(TZ) - timedelta(days=6)).strftime("%d/%m/%Y")

55
storage.py Normal file
View File

@@ -0,0 +1,55 @@
# storage.py
from __future__ import annotations
import sqlite3, pathlib, datetime as dt
from typing import Optional, Iterable, Tuple
import os
DB_PATH = "/data/published.db"  # bind-mount ./data:/data in docker

_SCHEMA = """
PRAGMA journal_mode = WAL;
CREATE TABLE IF NOT EXISTS published_items(
platform TEXT NOT NULL, -- e.g. xgp | egs | psplus
key TEXT PRIMARY KEY, -- your dedupe key (see below)
first_seen_utc TEXT NOT NULL, -- ISO-8601
last_post_id TEXT -- Ghost post id that recorded it
);
CREATE INDEX IF NOT EXISTS idx_platform ON published_items(platform);
"""


class Storage:
    """SQLite-backed record of items that have already been published.

    Each row of ``published_items`` holds a dedup key, its platform, the UTC
    time it was first recorded and the id of the last Ghost post that
    recorded it. Instances can be used as context managers; the connection
    is closed on exit.
    """

    def __init__(self, db_path: str = DB_PATH):
        """Open (creating if necessary) the database and ensure the schema.

        Args:
            db_path: Location of the SQLite file. It is used as given when
                the file exists or when its parent directory exists, so a
                first run creates the database there. Only when neither
                exists is the path replaced by ``$DB_FILE_FALLBACK`` or the
                built-in development default.
        """
        db_path = self._resolve_db_path(db_path)
        pathlib.Path(db_path).parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(db_path)
        self.conn.execute("PRAGMA foreign_keys = ON;")
        # The schema is executed statement by statement; blank fragments left
        # over from the split are skipped.
        for stmt in _SCHEMA.split(";"):
            if stmt.strip():
                self.conn.execute(stmt)

    @staticmethod
    def _resolve_db_path(db_path: str) -> str:
        """Pick the database file to open (see ``__init__``)."""
        if os.path.isfile(db_path):
            return db_path
        parent = os.path.dirname(db_path)
        if not parent or os.path.isdir(parent):
            # The file is simply not created yet; the location is usable.
            return db_path
        fallback = os.environ.get(
            "DB_FILE_FALLBACK", r"f:\workspace\Substack_JV\data\published.db"
        )
        print(fallback)
        return fallback

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        Same format as the deprecated ``datetime.utcnow().isoformat()``, so
        newly written rows match existing ones.
        """
        return dt.datetime.now(dt.timezone.utc).replace(tzinfo=None).isoformat()

    def __enter__(self) -> "Storage":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    def seen(self, key: str) -> bool:
        """Return True if *key* has already been recorded."""
        cur = self.conn.execute("SELECT 1 FROM published_items WHERE key=?", (key,))
        return cur.fetchone() is not None

    def remember(self, platform: str, key: str, post_id: Optional[str]):
        """Record *key*, keeping its original ``first_seen_utc`` if present.

        A truthy *post_id* always overwrites ``last_post_id``; a falsy one
        leaves the stored value untouched. Changes are committed on success
        and rolled back if a statement fails.
        """
        with self.conn:
            self.conn.execute(
                "INSERT OR IGNORE INTO published_items(platform,key,first_seen_utc,last_post_id) VALUES(?,?,?,?)",
                (platform, key, self._utc_now_iso(), post_id),
            )
            if post_id:
                self.conn.execute(
                    "UPDATE published_items SET last_post_id=? WHERE key=?",
                    (post_id, key),
                )

    def bulk_remember(self, platform: str, pairs: Iterable[Tuple[str, Optional[str]]]):
        """Record many ``(key, post_id)`` pairs in a single transaction.

        Follows the same rules as ``remember``: existing rows keep their
        ``first_seen_utc`` and a truthy post id refreshes ``last_post_id``.
        All new rows of one call share the same timestamp.
        """
        entries = list(pairs)  # *pairs* may be a one-shot iterator
        now = self._utc_now_iso()
        with self.conn:
            self.conn.executemany(
                "INSERT OR IGNORE INTO published_items(platform,key,first_seen_utc,last_post_id) VALUES(?,?,?,?)",
                [(platform, k, now, pid) for (k, pid) in entries],
            )
            self.conn.executemany(
                "UPDATE published_items SET last_post_id=? WHERE key=?",
                [(pid, k) for (k, pid) in entries if pid],
            )