From a0f988ed326d315a1c224def27096202af7378f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABl?= Date: Sun, 7 Sep 2025 16:07:03 +0200 Subject: [PATCH] fixing yt + crashes + failsafe --- Dockerfile | 2 +- post_rss_to_ghost.py | 353 +++++++++++++++++++++++++++++-------------- xboxsyde.py | 23 +++ 3 files changed, 262 insertions(+), 116 deletions(-) create mode 100644 xboxsyde.py diff --git a/Dockerfile b/Dockerfile index 568ca97..537cd9d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8 +FROM python:3.9 RUN apt-get update && apt-get install -y git RUN git clone http://gitea.zep.best/zep/Substack_JV.git /app diff --git a/post_rss_to_ghost.py b/post_rss_to_ghost.py index 4cfec87..125f873 100644 --- a/post_rss_to_ghost.py +++ b/post_rss_to_ghost.py @@ -1,18 +1,50 @@ import asyncio import argparse -import datetime +import datetime as dt import html -import io import logging import os import random import re import time from logging.handlers import RotatingFileHandler -from typing import Optional +from typing import Optional, List + import feedparser import requests -import jwt +import jwt +import zoneinfo # Python 3.9+ +from urllib.parse import urlparse, parse_qs + +# ------------- YouTube helpers ------------- + +YOUTUBE_EMBED_TMPL = ( + '
' + '
' +) + +def extract_youtube_id(url: str) -> Optional[str]: + try: + u = urlparse(url) + host = u.netloc.lower() + if host.endswith("youtube.com"): + if u.path == "/watch": + return parse_qs(u.query).get("v", [None])[0] + m = re.match(r"^/(shorts/|live/)?([A-Za-z0-9_-]{6,})", u.path) + if m: + return m.group(2) + if host == "youtu.be": + slug = u.path.strip("/").split("/")[0] + return slug or None + except Exception: + return None + return None + +# ------------- Logging ------------- LOG = logging.getLogger("bot") LOG_PATTERN = logging.Formatter("%(asctime)s:%(levelname)s: [%(filename)s] %(message)s") @@ -22,19 +54,21 @@ def setuplogger(): stream_handler.setFormatter(LOG_PATTERN) stream_handler.setLevel(logging.DEBUG) - file_handler = RotatingFileHandler("bot.log", "a", 1000000, 1) + file_handler = RotatingFileHandler("bot.log", "a", 1_000_000, 1) file_handler.setFormatter(LOG_PATTERN) LOG.setLevel(logging.DEBUG) LOG.addHandler(stream_handler) LOG.addHandler(file_handler) +# ------------- Model ------------- + class RSSfeed: - def __init__(self, url, yt=False): + def __init__(self, url: str, yt: bool = False): self.url = url self.youtube = yt -# ---------- Ghost Admin API client ---------- +# ------------- Ghost Admin API client ------------- class GhostAdmin: def __init__(self, admin_url: str, admin_key: str, accept_version: str = "v6.0"): @@ -56,6 +90,20 @@ class GhostAdmin: "Content-Type": "application/json", } + def latest_published_date(self, tz_name: str = "Europe/Brussels"): + """ + Date (aware) de la dernière publication (status=published), ou None. + """ + url = self.base + "posts/?limit=1&order=published_at%20desc&fields=published_at" + resp = requests.get(url, headers=self._headers(), timeout=20) + resp.raise_for_status() + posts = resp.json().get("posts", []) + if not posts or not posts[0].get("published_at"): + return None + # ISO 8601 → aware UTC → converti tz locale + dtu = dt.datetime.fromisoformat(posts[0]["published_at"].replace("Z", "+00:00")) + return dtu.astimezone(zoneinfo.ZoneInfo(tz_name)) + def get_newsletters(self): url = self.base + "newsletters/" resp = requests.get(url, headers=self._headers(), timeout=20) @@ -66,22 +114,16 @@ class GhostAdmin: def pick_newsletter_slug(self, preferred_slug: Optional[str]) -> str: if preferred_slug: return preferred_slug - # Fallback: choose the first active newsletter (favor default if present) nls = self.get_newsletters() if not nls: raise RuntimeError("No newsletters configured in Ghost (Settings → Newsletters).") - # try 'status=active' first actives = [n for n in nls if n.get("status") == "active"] - # prefer default one if flagged for n in actives: if n.get("is_default"): return n.get("slug") return (actives or nls)[0].get("slug") def create_post_html(self, title: str, html_content: str, status: str = "draft", feature_image: Optional[str] = None): - """ - Create a post with HTML source; optionally set feature_image (absolute URL). - """ url = self.base + "posts/?source=html" post = {"title": title, "html": html_content, "status": status} if feature_image: @@ -92,9 +134,6 @@ class GhostAdmin: return resp.json()["posts"][0] def publish_post(self, post_id: str, updated_at: str, newsletter_slug: Optional[str], email_segment: Optional[str]): - """ - Publish + (if newsletter provided) send email to the chosen audience. - """ slug = self.pick_newsletter_slug(newsletter_slug) params = [f"newsletter={requests.utils.quote(slug)}"] if email_segment: @@ -105,188 +144,265 @@ class GhostAdmin: if resp.status_code >= 400: raise RuntimeError(f"Ghost publish error {resp.status_code}: {resp.text}") return resp.json()["posts"][0] -# ---------- Your task logic (ported from Substack) ---------- + +# ------------- Task orchestration ------------- class GhostTask: - def __init__(self, feeds, admin_url, admin_key, newsletter_slug=None, email_segment=None): + def __init__(self, feeds: List[RSSfeed], admin_url: str, admin_key: str, newsletter_slug: Optional[str] = None, email_segment: Optional[str] = None): self.ghost = GhostAdmin(admin_url, admin_key) self.feeds = feeds self.newsletter_slug = newsletter_slug self.email_segment = email_segment for feed in self.feeds: - LOG.info("Adding feed " + feed.url) + LOG.info("Adding feed %s", feed.url) - def get_fr_date(self): - import datetime + # --- startup immediate run if not yet published today + + def _published_today(self) -> bool: + tz = zoneinfo.ZoneInfo("Europe/Brussels") + last = self.ghost.latest_published_date("Europe/Brussels") + if not last: + return False + return last.date() == dt.datetime.now(tz).date() + + async def maybe_run_today(self): + if not self._published_today(): + LOG.info("Aucune newsletter publiée aujourd'hui → génération immédiate.") + await self.daily_task() + else: + LOG.info("Déjà publié aujourd'hui, on attend la prochaine fenêtre.") + + # --- utils + + @staticmethod + def _fr_date_today() -> str: months = { - 'January':'Janvier','February':'Février','March':'Mars','April':'Avril','May':'Mai','June':'Juin', - 'July':'Juillet','August':'Août','September':'Septembre','October':'Octobre','November':'Novembre','December':'Décembre' + 'January': 'Janvier', 'February': 'Février', 'March': 'Mars', 'April': 'Avril', + 'May': 'Mai', 'June': 'Juin', 'July': 'Juillet', 'August': 'Août', + 'September': 'Septembre', 'October': 'Octobre', 'November': 'Novembre', 'December': 'Décembre' } - today = datetime.datetime.now() + today = dt.datetime.now() formatted = today.strftime("%d %B %Y") for en, fr in months.items(): formatted = formatted.replace(en, fr) return formatted - def _build_html_roundup(self, items, feeds): + @staticmethod + def _safe_get(url: str, timeout: int = 20) -> Optional[bytes]: + try: + r = requests.get(url, timeout=timeout, headers={"User-Agent": "ghost-bot/1.0"}) + r.raise_for_status() + return r.content + except Exception as e: + LOG.warning("Flux indisponible: %s (%s)", url, e) + return None + + @staticmethod + def _entry_datetime(entry) -> Optional[dt.datetime]: """ - Build HTML and capture the first encountered image URL (for feature_image). - Returns (html_string, first_image_url_or_None). + Tente de récupérer une datetime aware (UTC) pour un item feedparser. """ - parts = [] - parts.append(f"

Les news du {self.get_fr_date()}

") + # Try common fields first + if getattr(entry, "published", None): + try: + # YouTube (ISO) e.g. 2025-09-05T10:20:33+00:00 + return dt.datetime.fromisoformat(entry.published.replace("Z", "+00:00")).astimezone(dt.timezone.utc) + except Exception: + pass + try: + # RFC822 e.g. Fri, 05 Sep 2025 10:20:33 +0000 + return dt.datetime.strptime(entry.published.replace('GMT', '+0000'), + '%a, %d %b %Y %H:%M:%S %z').astimezone(dt.timezone.utc) + except Exception: + pass + if getattr(entry, "updated_parsed", None): + try: + return dt.datetime.fromtimestamp(time.mktime(entry.updated_parsed), tz=dt.timezone.utc) + except Exception: + pass + return None + + # --- HTML builder + + def _build_html_roundup(self, items: List[dict], feeds: List[RSSfeed]): + """ + Construit le HTML et retourne (html, feature_image_url_ou_None). + - YouTube: iframe + miniature cliquable (fallback email-safe) + - Images: collecte la première pour feature_image + """ + parts: List[str] = [] + parts.append(f"

Les news du {self._fr_date_today()}

") first_image: Optional[str] = None for post in items: - title = post.get("title", "") - linkURL = post.get("link", "") + title = post.get("title", "") or "" + linkURL = post.get("link", "") or "" parts.append(f'

{html.escape(title)}

') - if "yt_videoid" in post: - videoId = post["yt_videoid"] - parts.append(f'

https://www.youtube.com/watch?v={videoId}

') - parts.append(f'

{html.escape(linkURL)}

') + # --- YouTube embed / fallback + vid = post.get("yt_videoid") or extract_youtube_id(linkURL) + if vid: + # iframe (web) + thumbnail (email-safe) + lien + thumb = f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg" + parts.append(YOUTUBE_EMBED_TMPL.format(vid=vid)) + parts.append(f'

Voir sur YouTube

') + parts.append(f'

YouTube thumbnail

') + if not first_image: + first_image = thumb else: + # --- Texte + lien ftext = "" - if "summary" in post: + if "summary" in post and post["summary"]: ftext = html.unescape(post["summary"]) ftext = re.sub("<[^<]+?>", "", ftext) ftext = re.sub(r"L’article .* est apparu en premier sur .*", "", ftext) if ftext: parts.append(f"

{html.escape(ftext)}

") if linkURL: - parts.append(f'

{html.escape(linkURL)}

') + esc = html.escape(linkURL) + parts.append(f'

{esc}

') - # Attach images in the body; remember the first one for feature_image - if "links" in post: - for link in post["links"]: - if link.get("type") in ("image/jpg","image/jpeg","image/png","image/webp"): - imgUrl = link.get("href") - if imgUrl: - if not first_image: - first_image = imgUrl - parts.append(f'
') + # --- Images dans le contenu + for link in post.get("links", []) or []: + if link.get("type") in ("image/jpg", "image/jpeg", "image/png", "image/webp"): + imgUrl = link.get("href") + if imgUrl: + if not first_image: + first_image = imgUrl + parts.append(f'
') - # Sources + # --- Sources parts.append("

Sources

") for feed in feeds: - parts.append(f'

{html.escape(feed.url)}

') + esc = html.escape(feed.url) + parts.append(f'

{esc}

') parts.append('

Abonnez-vous pour recevoir chaque jour les news et soutenir mon travail.

') return "\n".join(parts), first_image - def format_duration(self, seconds): + @staticmethod + def _format_duration(seconds: float) -> str: + seconds = int(seconds) days, seconds = divmod(seconds, 86400) hours, seconds = divmod(seconds, 3600) minutes, seconds = divmod(seconds, 60) parts = [] - if days > 0: parts.append(f"{days} days") - if hours > 0: parts.append(f"{hours} hours") - if minutes > 0: parts.append(f"{minutes} minutes") - if seconds > 0: parts.append(f"{seconds} seconds") + if days: parts.append(f"{days} days") + if hours: parts.append(f"{hours} hours") + if minutes: parts.append(f"{minutes} minutes") + if seconds: parts.append(f"{seconds} seconds") return ", ".join(parts) if parts else "0 seconds" - async def run_daily_at_6_am(self): + async def run_daily_at_6_05(self): while True: - now = datetime.datetime.now() - next_run = (now + datetime.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) + now = dt.datetime.now() + next_run = (now + dt.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) sleep_seconds = (next_run - now).total_seconds() while sleep_seconds > 0: - LOG.info(f"Waiting for {self.format_duration(sleep_seconds)} for next scan") + LOG.info("Waiting for %s for next scan", self._format_duration(sleep_seconds)) await asyncio.sleep(min(sleep_seconds, 5 * 60)) - now = datetime.datetime.now() + now = dt.datetime.now() sleep_seconds = (next_run - now).total_seconds() LOG.info("Going to run the daily task") await self.daily_task() async def daily_task(self): + # Log newsletters (debug) + try: + nls = self.ghost.get_newsletters() + LOG.info("Newsletters: %s", ", ".join(f"{n.get('name')}[{n.get('slug')}]" for n in nls)) + except Exception as e: + LOG.warning("Unable to list newsletters: %s", e) - nls = self.ghost.get_newsletters() - print("Newsletters:") - for n in nls: - print(f"- title={n.get('name')} slug={n.get('slug')} status={n.get('status')} default={n.get('is_default')}") + title_post = "Les news du " + self._fr_date_today() + LOG.info("Running daily task : %s", title_post) - title_post = "Les news du " + self.get_fr_date() - LOG.info("Running daily task : " + str(title_post)) - - # Re-read feeds (unchanged) + # (Re)charge les feeds feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt") if not os.path.isfile(feeds_file): - feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", "x:\\substack\\feeds.txt") - self.feeds = [] - with open(feeds_file) as f: + feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", r"c:\workspace\Substack_JV\feeds.txt") + feeds: List[RSSfeed] = [] + with open(feeds_file, encoding="utf-8") as f: lines = [line.strip() for line in f if line.strip()] for line in lines: - self.feeds.append(RSSfeed(line, "youtube" in line)) + feeds.append(RSSfeed(line, "youtube" in line.lower())) + self.feeds = feeds - yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace( - hour=6, minute=0, second=0, microsecond=0 - ) - datetime.timedelta(days=1) - - all_news_posts = [] + # Fenêtre: depuis hier 06:00 UTC + yesterday_6am_utc = dt.datetime.now(dt.timezone.utc).replace(hour=6, minute=0, second=0, microsecond=0) - dt.timedelta(days=1) + + all_news_posts: List[dict] = [] for feed in self.feeds: - LOG.info("Scanning feed " + feed.url) - html_text = requests.get(feed.url, timeout=30).text - newsFeed = feedparser.parse(html_text) + LOG.info("Scanning feed %s", feed.url) + content = self._safe_get(feed.url, timeout=30) + if not content: + continue + fp = feedparser.parse(content) - if feed.youtube: - new_posts = [e for e in newsFeed.entries if datetime.datetime.fromisoformat(e.published) > yesterday_6am] - else: - try: - new_posts = [e for e in newsFeed.entries - if datetime.datetime.strptime(e.published.replace('GMT', '+0000'), - '%a, %d %b %Y %H:%M:%S %z') > yesterday_6am] - except Exception: - new_posts = [e for e in newsFeed.entries - if datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed)).replace( - tzinfo=datetime.timezone.utc) > yesterday_6am] + # Sélection des items récents + new_entries = [] + for e in fp.entries: + dte = self._entry_datetime(e) + if dte and dte > yesterday_6am_utc: + new_entries.append(e) + # Filtrage ad-hoc filtered = [] - for e in new_posts: - linkURL = e.get("link", "") + for e in new_entries: + linkURL = e.get("link", "") or "" if "actugaming" in linkURL and ("puzzle-" in linkURL or "guide-" in linkURL): continue + # enrich YouTube id if applicable + if feed.youtube and linkURL: + vid = extract_youtube_id(linkURL) + if vid: + e["yt_videoid"] = vid filtered.append(e) + all_news_posts.extend(filtered) + if not all_news_posts: + LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas aujourd'hui.") + return + random.shuffle(all_news_posts) roundup_html, feature_image = self._build_html_roundup(all_news_posts, self.feeds) - # 1) Create as draft WITH feature_image if we found one + # 1) Create draft (with feature image if any) created = self.ghost.create_post_html(title_post, roundup_html, status="draft", feature_image=feature_image) - # 2) Publish AND SEND EMAIL (always) + # 2) Publish + send email published = self.ghost.publish_post( post_id=created["id"], updated_at=created["updated_at"], - newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), # may be None -> auto-pick - email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), # may be None -> send to all + newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), + email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), ) + LOG.info("Published post: %s (emailed via newsletter)", published.get("url")) - LOG.info(f"Published post: {published.get('url')} (emailed via newsletter)") - - def debug_list_newsletters(admin_url, admin_key): - g = GhostAdmin(admin_url, admin_key) - nls = g.get_newsletters() - print("Newsletters:") - for n in nls: - print(f"- title={n.get('name')} slug={n.get('slug')} status={n.get('status')} default={n.get('is_default')}") -# ---------------- main ---------------- +# ------------- main ------------- async def main(): setuplogger() - # Feeds initial pass (kept for parity with your original script) - feeds = [] + + parser = argparse.ArgumentParser() + parser.add_argument("--run-once", action="store_true", help="Run immediately once then exit") + args = parser.parse_args() + + # Feeds init (list may be reloaded inside task) + feeds: List[RSSfeed] = [] feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt") if not os.path.isfile(feeds_file): feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", r"c:\workspace\Substack_JV\feeds.txt") - with open(feeds_file) as f: - lines = [line.strip() for line in f if line.strip()] - for line in lines: - feeds.append(RSSfeed(line, "youtube" in line)) + with open(feeds_file, encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + feeds.append(RSSfeed(line, "youtube" in line.lower())) - admin_url = os.environ["GHOST_ADMIN_URL"] - admin_key = os.environ["GHOST_ADMIN_KEY"] + admin_url = os.environ["GHOST_ADMIN_URL"] # e.g. https://ghostadmin.zep.best/ghost/api/admin/ + admin_key = os.environ["GHOST_ADMIN_KEY"] # integration_id:secret_hex task = GhostTask( feeds=feeds, @@ -297,9 +413,16 @@ async def main(): ) LOG.info("Starting bot") - await task.run_daily_at_6_am() - # Or just run once: - #await task.daily_task() + + if args.run-once: + await task.daily_task() + return + + # Démarrage: publier l'édition du jour si elle n'existe pas encore + await task.maybe_run_today() + + # Planification quotidienne à 06:05 Europe/Brussels (via heure locale du conteneur) + await task.run_daily_at_6_05() if __name__ == "__main__": asyncio.run(main()) diff --git a/xboxsyde.py b/xboxsyde.py new file mode 100644 index 0000000..612a96c --- /dev/null +++ b/xboxsyde.py @@ -0,0 +1,23 @@ +import feedparser +import io +import html +import datetime +import requests +import time + +url = r'https://www.xboxygen.com/spip.php?page=backend' + +html_text = requests.get(url).text +news = feedparser.parse(html_text) + +yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace(hour=6, minute=0, second=0, microsecond=0) - datetime.timedelta(days=1) + +try: + new_posts = [entry for entry in news.entries if datetime.datetime.strptime(entry.published.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z') > yesterday_6am] + +except: + new_posts = [entry for entry in news.entries if datetime.datetime.fromtimestamp(time.mktime(entry.updated_parsed)).replace(tzinfo=datetime.timezone.utc) > yesterday_6am] + #else if + #entry.updated.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z' + +print(new_posts) \ No newline at end of file