diff --git a/Dockerfile b/Dockerfile index 568ca97..537cd9d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.8 +FROM python:3.9 RUN apt-get update && apt-get install -y git RUN git clone http://gitea.zep.best/zep/Substack_JV.git /app diff --git a/post_rss_to_ghost.py b/post_rss_to_ghost.py index 4cfec87..125f873 100644 --- a/post_rss_to_ghost.py +++ b/post_rss_to_ghost.py @@ -1,18 +1,50 @@ import asyncio import argparse -import datetime +import datetime as dt import html -import io import logging import os import random import re import time from logging.handlers import RotatingFileHandler -from typing import Optional +from typing import Optional, List + import feedparser import requests -import jwt +import jwt +import zoneinfo # Python 3.9+ +from urllib.parse import urlparse, parse_qs + +# ------------- YouTube helpers ------------- + +YOUTUBE_EMBED_TMPL = ( + '
https://www.youtube.com/watch?v={videoId}
') - parts.append(f'') + # --- YouTube embed / fallback + vid = post.get("yt_videoid") or extract_youtube_id(linkURL) + if vid: + # iframe (web) + thumbnail (email-safe) + lien + thumb = f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg" + parts.append(YOUTUBE_EMBED_TMPL.format(vid=vid)) + parts.append(f'') + parts.append(f'') + if not first_image: + first_image = thumb else: + # --- Texte + lien ftext = "" - if "summary" in post: + if "summary" in post and post["summary"]: ftext = html.unescape(post["summary"]) ftext = re.sub("<[^<]+?>", "", ftext) ftext = re.sub(r"L’article .* est apparu en premier sur .*", "", ftext) if ftext: parts.append(f"{html.escape(ftext)}
") if linkURL: - parts.append(f'') + esc = html.escape(linkURL) + parts.append(f'') - # Attach images in the body; remember the first one for feature_image - if "links" in post: - for link in post["links"]: - if link.get("type") in ("image/jpg","image/jpeg","image/png","image/webp"): - imgUrl = link.get("href") - if imgUrl: - if not first_image: - first_image = imgUrl - parts.append(f'Abonnez-vous pour recevoir chaque jour les news et soutenir mon travail.
') return "\n".join(parts), first_image - def format_duration(self, seconds): + @staticmethod + def _format_duration(seconds: float) -> str: + seconds = int(seconds) days, seconds = divmod(seconds, 86400) hours, seconds = divmod(seconds, 3600) minutes, seconds = divmod(seconds, 60) parts = [] - if days > 0: parts.append(f"{days} days") - if hours > 0: parts.append(f"{hours} hours") - if minutes > 0: parts.append(f"{minutes} minutes") - if seconds > 0: parts.append(f"{seconds} seconds") + if days: parts.append(f"{days} days") + if hours: parts.append(f"{hours} hours") + if minutes: parts.append(f"{minutes} minutes") + if seconds: parts.append(f"{seconds} seconds") return ", ".join(parts) if parts else "0 seconds" - async def run_daily_at_6_am(self): + async def run_daily_at_6_05(self): while True: - now = datetime.datetime.now() - next_run = (now + datetime.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) + now = dt.datetime.now() + next_run = (now + dt.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) sleep_seconds = (next_run - now).total_seconds() while sleep_seconds > 0: - LOG.info(f"Waiting for {self.format_duration(sleep_seconds)} for next scan") + LOG.info("Waiting for %s for next scan", self._format_duration(sleep_seconds)) await asyncio.sleep(min(sleep_seconds, 5 * 60)) - now = datetime.datetime.now() + now = dt.datetime.now() sleep_seconds = (next_run - now).total_seconds() LOG.info("Going to run the daily task") await self.daily_task() async def daily_task(self): + # Log newsletters (debug) + try: + nls = self.ghost.get_newsletters() + LOG.info("Newsletters: %s", ", ".join(f"{n.get('name')}[{n.get('slug')}]" for n in nls)) + except Exception as e: + LOG.warning("Unable to list newsletters: %s", e) - nls = self.ghost.get_newsletters() - print("Newsletters:") - for n in nls: - print(f"- title={n.get('name')} slug={n.get('slug')} status={n.get('status')} default={n.get('is_default')}") + title_post = "Les news du " + 
self._fr_date_today() + LOG.info("Running daily task : %s", title_post) - title_post = "Les news du " + self.get_fr_date() - LOG.info("Running daily task : " + str(title_post)) - - # Re-read feeds (unchanged) + # (Re)charge les feeds feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt") if not os.path.isfile(feeds_file): - feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", "x:\\substack\\feeds.txt") - self.feeds = [] - with open(feeds_file) as f: + feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", r"c:\workspace\Substack_JV\feeds.txt") + feeds: List[RSSfeed] = [] + with open(feeds_file, encoding="utf-8") as f: lines = [line.strip() for line in f if line.strip()] for line in lines: - self.feeds.append(RSSfeed(line, "youtube" in line)) + feeds.append(RSSfeed(line, "youtube" in line.lower())) + self.feeds = feeds - yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace( - hour=6, minute=0, second=0, microsecond=0 - ) - datetime.timedelta(days=1) - - all_news_posts = [] + # Fenêtre: depuis hier 06:00 UTC + yesterday_6am_utc = dt.datetime.now(dt.timezone.utc).replace(hour=6, minute=0, second=0, microsecond=0) - dt.timedelta(days=1) + + all_news_posts: List[dict] = [] for feed in self.feeds: - LOG.info("Scanning feed " + feed.url) - html_text = requests.get(feed.url, timeout=30).text - newsFeed = feedparser.parse(html_text) + LOG.info("Scanning feed %s", feed.url) + content = self._safe_get(feed.url, timeout=30) + if not content: + continue + fp = feedparser.parse(content) - if feed.youtube: - new_posts = [e for e in newsFeed.entries if datetime.datetime.fromisoformat(e.published) > yesterday_6am] - else: - try: - new_posts = [e for e in newsFeed.entries - if datetime.datetime.strptime(e.published.replace('GMT', '+0000'), - '%a, %d %b %Y %H:%M:%S %z') > yesterday_6am] - except Exception: - new_posts = [e for e in newsFeed.entries - if datetime.datetime.fromtimestamp(time.mktime(e.updated_parsed)).replace( - tzinfo=datetime.timezone.utc) > 
yesterday_6am] + # Sélection des items récents + new_entries = [] + for e in fp.entries: + dte = self._entry_datetime(e) + if dte and dte > yesterday_6am_utc: + new_entries.append(e) + # Filtrage ad-hoc filtered = [] - for e in new_posts: - linkURL = e.get("link", "") + for e in new_entries: + linkURL = e.get("link", "") or "" if "actugaming" in linkURL and ("puzzle-" in linkURL or "guide-" in linkURL): continue + # enrich YouTube id if applicable + if feed.youtube and linkURL: + vid = extract_youtube_id(linkURL) + if vid: + e["yt_videoid"] = vid filtered.append(e) + all_news_posts.extend(filtered) + if not all_news_posts: + LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas aujourd'hui.") + return + random.shuffle(all_news_posts) roundup_html, feature_image = self._build_html_roundup(all_news_posts, self.feeds) - # 1) Create as draft WITH feature_image if we found one + # 1) Create draft (with feature image if any) created = self.ghost.create_post_html(title_post, roundup_html, status="draft", feature_image=feature_image) - # 2) Publish AND SEND EMAIL (always) + # 2) Publish + send email published = self.ghost.publish_post( post_id=created["id"], updated_at=created["updated_at"], - newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), # may be None -> auto-pick - email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), # may be None -> send to all + newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), + email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), ) + LOG.info("Published post: %s (emailed via newsletter)", published.get("url")) - LOG.info(f"Published post: {published.get('url')} (emailed via newsletter)") - - def debug_list_newsletters(admin_url, admin_key): - g = GhostAdmin(admin_url, admin_key) - nls = g.get_newsletters() - print("Newsletters:") - for n in nls: - print(f"- title={n.get('name')} slug={n.get('slug')} status={n.get('status')} default={n.get('is_default')}") -# ---------------- main ---------------- +# ------------- main 
# -------------
async def main():
    """Configure logging, load the feed list and start the publishing bot.

    Command line:
        --run-once  run ``daily_task`` immediately, then exit.

    Environment:
        FEEDS_FILE / FEEDS_FILE_FALLBACK  path of the feed list (one URL per line).
        GHOST_ADMIN_URL, GHOST_ADMIN_KEY  required; a missing one raises KeyError.
    """
    setuplogger()

    parser = argparse.ArgumentParser()
    parser.add_argument("--run-once", action="store_true", help="Run immediately once then exit")
    args = parser.parse_args()

    # Initial feed list (the task may reload it later).
    feeds: List[RSSfeed] = []
    feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt")
    if not os.path.isfile(feeds_file):
        feeds_file = os.environ.get("FEEDS_FILE_FALLBACK", r"c:\workspace\Substack_JV\feeds.txt")
    with open(feeds_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # A feed is flagged as YouTube when its URL mentions "youtube".
            feeds.append(RSSfeed(line, "youtube" in line.lower()))

    admin_url = os.environ["GHOST_ADMIN_URL"]  # e.g. https://ghostadmin.zep.best/ghost/api/admin/
    admin_key = os.environ["GHOST_ADMIN_KEY"]  # integration_id:secret_hex

    task = GhostTask(
        feeds=feeds,
        # NOTE(review): the patch hunk ends here (next hunk: @@ -297,9 +413,16 @@);
        # the remaining GhostTask arguments are outside the visible source and
        # must be kept as they are in the real file.
    )

    LOG.info("Starting bot")

    # argparse stores "--run-once" under the attribute name "run_once";
    # "args.run-once" would be parsed as the subtraction "args.run - once".
    if args.run_once:
        await task.daily_task()
        return

    # On startup: per the original comment this publishes today's edition if it
    # does not exist yet (maybe_run_today is defined elsewhere -- confirm).
    await task.maybe_run_today()

    # Daily schedule at 06:05, container-local time (expected Europe/Brussels).
    await task.run_daily_at_6_05()


if __name__ == "__main__":
    asyncio.run(main())


# ------------- xboxsyde.py (separate new file added by the same patch) -------------
# Stand-alone check script: print the Xboxygen feed entries newer than
# yesterday 06:00 UTC.
import feedparser
import io
import html
import calendar
import datetime
import requests
import time

url = r'https://www.xboxygen.com/spip.php?page=backend'

# A timeout keeps the script from hanging forever on a dead server.
html_text = requests.get(url, timeout=30).text
news = feedparser.parse(html_text)

yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace(
    hour=6, minute=0, second=0, microsecond=0
) - datetime.timedelta(days=1)

try:
    # Preferred path: parse the RFC 822 style "published" string directly.
    new_posts = [
        entry
        for entry in news.entries
        if datetime.datetime.strptime(
            entry.published.replace('GMT', '+0000'), '%a, %d %b %Y %H:%M:%S %z'
        ) > yesterday_6am
    ]
except (AttributeError, ValueError):
    # "published" is missing (AttributeError) or in another format (ValueError):
    # fall back to the struct_time pre-parsed by feedparser. feedparser documents
    # these *_parsed values as UTC, so convert with calendar.timegm; time.mktime
    # would interpret them as local time and is off around DST changes.
    new_posts = [
        entry
        for entry in news.entries
        if datetime.datetime.fromtimestamp(
            calendar.timegm(entry.updated_parsed), tz=datetime.timezone.utc
        ) > yesterday_6am
    ]

print(new_posts)