diff --git a/env.bat b/env.bat new file mode 100644 index 0000000..80e3b86 --- /dev/null +++ b/env.bat @@ -0,0 +1,7 @@ +set GHOST_ADMIN_KEY=68bad0e13546e700012dd65d:116a81b7e189d3b3d3b86082f97ef65daedb06498a3f1f902b8e0c08d095dc19 +set GHOST_ADMIN_URL=https://ghostadmin.zep.best/ghost/api/admin/__bot/FF4440EBA737506D397C170A8422109C357AA7582F10938B7C5F11D6B652F5D4 +set GHOST_EMAIL_SEGMENT=status:free +set GHOST_NEWSLETTER_SLUG=default-newsletter +set GHOST_CONTENT_URL=https://ghost.zep.best +set DB_FILE_FALLBACK=f:\workspace\Substack_JV\data\published.db +set MISTRAL_API_KEY=tQJHvYlmwz1ihKxOhXS3FmDNTRhBh6b3 \ No newline at end of file diff --git a/post_rss_to_ghost.py b/post_rss_to_ghost.py index 058a92f..6504fd4 100644 --- a/post_rss_to_ghost.py +++ b/post_rss_to_ghost.py @@ -2,18 +2,67 @@ import asyncio import argparse import datetime as dt import html +import json import logging import os -import random import re import time from logging.handlers import RotatingFileHandler -from typing import Optional, List +from typing import Optional, List, Dict import feedparser import requests import jwt import zoneinfo # Python 3.9+ -from urllib.parse import urlparse, parse_qs +from urllib.parse import urlparse, parse_qs, urljoin + +# ------------- Web Crawler for Images ------------- + +def extract_image_from_url(url: str, timeout: int = 10) -> Optional[str]: + """ + Fetch a webpage and extract the best image (og:image, twitter:image, or first large image). + Returns the image URL or None. + """ + try: + resp = requests.get( + url, + timeout=timeout, + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml", + }, + allow_redirects=True, + ) + resp.raise_for_status() + html_content = resp.text + + # Try OpenGraph image first (most reliable) + og_match = re.search(r']+property=["\']og:image["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE) + if not og_match: + og_match = re.search(r']+content=["\']([^"\']+)["\'][^>]+property=["\']og:image["\']', html_content, re.IGNORECASE) + if og_match: + img_url = og_match.group(1) + return urljoin(url, img_url) + + # Try Twitter card image + tw_match = re.search(r']+name=["\']twitter:image["\'][^>]+content=["\']([^"\']+)["\']', html_content, re.IGNORECASE) + if not tw_match: + tw_match = re.search(r']+content=["\']([^"\']+)["\'][^>]+name=["\']twitter:image["\']', html_content, re.IGNORECASE) + if tw_match: + img_url = tw_match.group(1) + return urljoin(url, img_url) + + # Fallback: look for article/main image + article_img = re.search(r']*>.*?]+src=["\']([^"\']+)["\']', html_content, re.IGNORECASE | re.DOTALL) + if article_img: + img_url = article_img.group(1) + # Skip tiny images, icons, avatars + if not any(skip in img_url.lower() for skip in ['avatar', 'icon', 'logo', 'emoji', '1x1', 'pixel']): + return urljoin(url, img_url) + + return None + except Exception as e: + LOG.debug("Failed to extract image from %s: %s", url, e) + return None # ------------- YouTube helpers ------------- @@ -30,11 +79,11 @@ def fetch_youtube_oembed_html(youtube_url: str, timeout: int = 10) -> Optional[s ) resp.raise_for_status() data = resp.json() - html = data.get("html") - if not html: + html_content = data.get("html") + if not html_content: return None # Wrap in Ghost embed card container; do NOT alter the iframe attributes. - return f'
{html}
' + return f'
{html_content}
' except Exception: return None @@ -82,6 +131,239 @@ class RSSfeed: self.url = url self.youtube = yt +# ------------- Mistral AI Client ------------- + +class MistralClient: + """Client for Mistral AI API to filter and group news items.""" + + def __init__(self, api_key: str, model: str = "mistral-small-latest"): + self.api_key = api_key + self.model = model + self.base_url = "https://api.mistral.ai/v1/chat/completions" + + def _call_api(self, messages: List[Dict], temperature: float = 0.3) -> Optional[str]: + """Make a call to the Mistral API.""" + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + payload = { + "model": self.model, + "messages": messages, + "temperature": temperature, + "response_format": {"type": "json_object"}, + } + try: + resp = requests.post(self.base_url, headers=headers, json=payload, timeout=120) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"] + except Exception as e: + LOG.error("Mistral API error: %s", e) + return None + + def filter_news_items(self, items: List[dict], dry_run: bool = False) -> List[dict]: + """ + Filter out non-news items (tips, walkthroughs, guides, tutorials). + Returns only actual game news items. + """ + if not items: + return [] + + # Prepare items for analysis + items_for_analysis = [] + for i, item in enumerate(items): + items_for_analysis.append({ + "id": i, + "title": item.get("title", ""), + "link": item.get("link", ""), + "summary": (item.get("summary", "") or "")[:300], # Truncate for API + }) + + # Split into batches to avoid token limits + batch_size = 50 + filtered_ids = set() + + for batch_start in range(0, len(items_for_analysis), batch_size): + batch = items_for_analysis[batch_start:batch_start + batch_size] + + prompt = f"""Tu analyses des articles de sites de jeux vidéo. Tu dois identifier UNIQUEMENT les articles à EXCLURE. + +EXCLURE UNIQUEMENT si le titre contient EXPLICITEMENT UN de ces mots-clés: +- "guide" (le mot exact) +- "soluce" (le mot exact) +- "astuce" (le mot exact) +- "solution" (le mot exact, pas "résolution") +- "code promo" +- "bon plan" +- "-20%" ou "-30%" etc (réductions) +- "tuto" ou "tutoriel" +- "comment faire" +- "how to" + +NE JAMAIS EXCLURE: +- "Early Access" = news de sortie anticipée, À GARDER +- "Test" ou "Review" = critique, À GARDER +- "Partie Rapide" = émission/podcast, À GARDER +- Tout article de news, annonce, sortie, preview +- Tout article d'opinion, éditorial, récap +- Tout le reste qui ne contient pas les mots-clés d'exclusion ci-dessus + +Articles à analyser: +{json.dumps(batch, ensure_ascii=False, indent=2)} + +Retourne un JSON avec "exclude_ids" contenant UNIQUEMENT les IDs des articles guides/soluces/promos. +Si aucun article ne correspond aux critères d'exclusion, retourne {{"exclude_ids": []}} + +Sois TRÈS conservateur - en cas de doute, NE PAS exclure.""" + + messages = [{"role": "user", "content": prompt}] + response = self._call_api(messages) + + if response: + try: + result = json.loads(response) + excluded_ids = set(result.get("exclude_ids", [])) + # Keep all items NOT in excluded_ids + batch_ids = {item["id"] for item in batch} + kept_ids = batch_ids - excluded_ids + filtered_ids.update(kept_ids) + except json.JSONDecodeError: + LOG.warning("Failed to parse Mistral response for filtering") + # Fallback: include all items from this batch + filtered_ids.update(item["id"] for item in batch) + else: + # Fallback: include all items from this batch + filtered_ids.update(item["id"] for item in batch) + + # Log filtered out items in dry-run mode + if dry_run: + excluded_ids = set(range(len(items))) - filtered_ids + if excluded_ids: + LOG.debug("=== FILTERED OUT (non-news) ===") + for i in sorted(excluded_ids): + LOG.debug(" [EXCLUDED] %s", items[i].get("title", "No title")) + LOG.debug("=== KEPT (news) ===") + for i in sorted(filtered_ids): + if i < len(items): + LOG.debug(" [KEPT] %s", items[i].get("title", "No title")) + + return [items[i] for i in sorted(filtered_ids) if i < len(items)] + + def group_similar_items(self, items: List[dict]) -> List[Dict]: + """ + Group news items by category (News, Tests/Reviews, Previews, etc.) + with sub-groups by game/topic within each category. + Returns a list of categories, each with sub-groups containing items. + """ + if not items: + return [] + + # Prepare items for analysis + items_for_analysis = [] + for i, item in enumerate(items): + items_for_analysis.append({ + "id": i, + "title": item.get("title", ""), + "link": item.get("link", ""), + }) + + prompt = f"""Organise ces articles de jeux vidéo en CATÉGORIES et SOUS-GROUPES. + +Articles à organiser: +{json.dumps(items_for_analysis, ensure_ascii=False, indent=2)} + +CATÉGORIES (utilise ces noms exacts): +1. "Actualités" - News, annonces, sorties, mises à jour, industrie +2. "Tests & Critiques" - Reviews, tests, avis, notes +3. "Aperçus & Previews" - Previews, impressions, démos, hands-on +4. "Vidéos" - Trailers, gameplay vidéos, podcasts +5. "Autres" - Le reste + +RÈGLES DE GROUPEMENT (TRÈS IMPORTANT): +- Groupe par FRANCHISE ou SÉRIE (ex: tous les "Final Fantasy" ensemble, même FF7, FF16, FF XIV) +- Groupe par UNIVERS (ex: "Warhammer 40K" = Space Marine + Dawn of War + Darktide) +- Groupe par ÉVÉNEMENT (ex: "Nintendo Direct", "State of Play", "Game Awards") +- N'utilise JAMAIS de noms de sites web comme groupes (pas "NoFrag", "JeuxOnline", etc.) + +EXEMPLES DE GROUPEMENTS CORRECTS: +- "Final Fantasy VII Rebirth sur Switch 2" + "Final Fantasy VII Remake Intergrade en tête" → groupe "Final Fantasy" +- "Techmarine dans Space Marine 2" + "Dawn of War 4 gameplay Ork" → groupe "Warhammer 40K" +- "GTA 6 trailer" + "GTA 6 date de sortie" → groupe "GTA 6" +- "Nintendo Direct annoncé" + "Zelda dans le Nintendo Direct" → groupe "Nintendo Direct" + +Retourne ce JSON: +{{ + "categories": [ + {{ + "name": "Actualités", + "subgroups": [ + {{"title": "Final Fantasy", "item_ids": [0, 3, 7]}}, + {{"title": "Warhammer 40K", "item_ids": [1, 2]}}, + {{"title": "Steam", "item_ids": [5]}} + ] + }} + ] +}} + +IMPORTANT: Chaque article dans UN SEUL sous-groupe. Titre = nom de franchise/série/univers, PAS nom de site.""" + + messages = [{"role": "user", "content": prompt}] + response = self._call_api(messages, temperature=0.2) + + if response: + try: + result = json.loads(response) + categories = [] + used_ids = set() + + for cat_data in result.get("categories", []): + cat_name = cat_data.get("name", "Autres") + subgroups = [] + + for sg_data in cat_data.get("subgroups", []): + sg_title = sg_data.get("title", "Divers") + item_ids = sg_data.get("item_ids", []) + + # Filter to valid, unused IDs + valid_ids = [i for i in item_ids if i < len(items) and i not in used_ids] + if valid_ids: + used_ids.update(valid_ids) + subgroups.append({ + "title": sg_title, + "items": [items[i] for i in valid_ids] + }) + + if subgroups: + categories.append({ + "name": cat_name, + "subgroups": subgroups + }) + + # Add any ungrouped items + ungrouped = [items[i] for i in range(len(items)) if i not in used_ids] + if ungrouped: + # Find or create "Autres" category + autres_cat = next((c for c in categories if c["name"] == "Autres"), None) + if autres_cat: + autres_cat["subgroups"].append({"title": "Divers", "items": ungrouped}) + else: + categories.append({ + "name": "Autres", + "subgroups": [{"title": "Divers", "items": ungrouped}] + }) + + return categories + + except json.JSONDecodeError: + LOG.warning("Failed to parse Mistral response for grouping") + + # Fallback: return all items in a single category/subgroup + return [{ + "name": "Actualités de la semaine", + "subgroups": [{"title": "Toutes les news", "items": items}] + }] + + # ------------- Ghost Admin API client ------------- class GhostAdmin: @@ -163,41 +445,63 @@ class GhostAdmin: # ------------- Task orchestration ------------- class GhostTask: - def __init__(self, feeds: List[RSSfeed], admin_url: str, admin_key: str, newsletter_slug: Optional[str] = None, email_segment: Optional[str] = None): + def __init__(self, feeds: List[RSSfeed], admin_url: str, admin_key: str, + mistral_api_key: Optional[str] = None, + newsletter_slug: Optional[str] = None, email_segment: Optional[str] = None, + dry_run: bool = False): self.ghost = GhostAdmin(admin_url, admin_key) self.feeds = feeds self.newsletter_slug = newsletter_slug self.email_segment = email_segment + self.mistral = MistralClient(mistral_api_key) if mistral_api_key else None + self.dry_run = dry_run for feed in self.feeds: LOG.info("Adding feed %s", feed.url) - # --- startup immediate run if not yet published today + # --- startup immediate run if not yet published this week - def _published_today(self) -> bool: + def _published_this_week(self) -> bool: + """Check if we already published this week (since last Saturday 12:00).""" tz = zoneinfo.ZoneInfo("Europe/Brussels") last = self.ghost.latest_published_date("Europe/Brussels") if not last: return False - return last.date() == dt.datetime.now(tz).date() + + now = dt.datetime.now(tz) + # Find last Saturday at 12:00 + days_since_saturday = (now.weekday() - 5) % 7 # Saturday = 5 + last_saturday = (now - dt.timedelta(days=days_since_saturday)).replace( + hour=12, minute=0, second=0, microsecond=0 + ) + + return last >= last_saturday - async def maybe_run_today(self): - if not self._published_today(): - LOG.info("Aucune newsletter publiée aujourd'hui -> génération immédiate.") - await self.daily_task() + async def maybe_run_this_week(self): + if not self._published_this_week(): + LOG.info("Aucune newsletter publiée cette semaine -> génération immédiate.") + await self.weekly_task() else: - LOG.info("Déjà publié aujourd'hui, on attend la prochaine fenêtre.") + LOG.info("Déjà publié cette semaine, on attend la prochaine fenêtre.") # --- utils @staticmethod - def _fr_date_today() -> str: + def _fr_week_range() -> str: + """Returns a French formatted date range for the past week.""" months = { 'January': 'Janvier', 'February': 'Février', 'March': 'Mars', 'April': 'Avril', 'May': 'Mai', 'June': 'Juin', 'July': 'Juillet', 'August': 'Août', 'September': 'Septembre', 'October': 'Octobre', 'November': 'Novembre', 'December': 'Décembre' } today = dt.datetime.now() - formatted = today.strftime("%d %B %Y") + week_ago = today - dt.timedelta(days=7) + + # Format: "24 - 31 Janvier 2025" or "28 Janvier - 4 Février 2025" + if week_ago.month == today.month: + formatted = f"{week_ago.day} - {today.strftime('%d %B %Y')}" + else: + formatted = f"{week_ago.strftime('%d %B')} - {today.strftime('%d %B %Y')}" + for en, fr in months.items(): formatted = formatted.replace(en, fr) return formatted @@ -237,70 +541,179 @@ class GhostTask: pass return None - # --- HTML builder + # --- HTML builder for grouped content - def _build_html_roundup(self, items: List[dict], feeds: List[RSSfeed]): + def _build_html_roundup_grouped(self, categories: List[Dict], feeds: List[RSSfeed]): """ - Construit le HTML et retourne (html, feature_image_url_ou_None). - - YouTube: iframe + miniature cliquable (fallback email-safe) - - Images: collecte la première pour feature_image + Construit le HTML avec des catégories et sous-groupes thématiques. + Inclut un résumé et une table des matières en haut. + Retourne (html, feature_image_url_ou_None). """ parts: List[str] = [] - #parts.append(f"

Les news du {self._fr_date_today()}

") first_image: Optional[str] = None + + # --- Build Summary Section --- + parts.append('

✨ En bref cette semaine

') + parts.append('
    ') + + for cat in categories: + cat_name = cat.get("name", "Actualités") + subgroups = cat.get("subgroups", []) + + # Get top subgroups with more than 1 item (by item count) for summary + multi_item_subgroups = [sg for sg in subgroups if len(sg.get("items", [])) > 1] + sorted_subgroups = sorted(multi_item_subgroups, key=lambda sg: len(sg.get("items", [])), reverse=True) + top_subgroups = sorted_subgroups[:5] # Max 5 highlights per category + + total_items = sum(len(sg.get("items", [])) for sg in subgroups) + if top_subgroups: + highlights = ", ".join(sg.get("title", "Divers") for sg in top_subgroups) + total_items = sum(len(sg.get("items", [])) for sg in subgroups) + parts.append(f'
  • {html.escape(cat_name)}: {html.escape(highlights)} ({total_items} articles)
  • ') + elif total_items > 0: + parts.append(f'
  • {html.escape(cat_name)}: {total_items} articles
  • ') + + parts.append('
') + parts.append('
') + + # --- Build Table of Contents --- + # parts.append('

📋 Sommaire

') + # parts.append('
    ') + + # for cat in categories: + # cat_name = cat.get("name", "Actualités") + # cat_anchor = self._make_anchor(cat_name) + # subgroups = cat.get("subgroups", []) + # total_items = sum(len(sg.get("items", [])) for sg in subgroups) + + # parts.append(f'
  • {html.escape(cat_name)} ({total_items} articles)') + + # if len(subgroups) > 1 or (len(subgroups) == 1 and len(subgroups[0].get("items", [])) > 1): + # parts.append('
      ') + # for sg in subgroups: + # sg_title = sg.get("title", "Divers") + # sg_anchor = self._make_anchor(f"{cat_name}-{sg_title}") + # item_count = len(sg.get("items", [])) + # parts.append(f'
    • {html.escape(sg_title)} ({item_count})
    • ') + # parts.append('
    ') + + # parts.append('
  • ') + + # parts.append('
') + # parts.append('
') + + # --- Build Content by Category --- + for cat in categories: + cat_name = cat.get("name", "Actualités") + cat_anchor = self._make_anchor(cat_name) + subgroups = cat.get("subgroups", []) + + if not subgroups: + continue + + # Category header with emoji + cat_emoji = { + "Actualités": "📰", + "Tests & Critiques": "⭐", + "Aperçus & Previews": "👁️", + "Vidéos": "🎬", + "Autres": "📁" + }.get(cat_name, "📌") + + parts.append(f'

{cat_emoji} {html.escape(cat_name)}

') + + for sg in subgroups: + sg_title = sg.get("title", "Divers") + sg_anchor = self._make_anchor(f"{cat_name}-{sg_title}") + items = sg.get("items", []) + + if not items: + continue + + # Sub-group header (only if more than 1 item in subgroup) + if len(items) > 1: + parts.append(f'

{html.escape(sg_title)}

') + + for post in items: + title = post.get("title", "") or "" + linkURL = post.get("link", "") or "" + parts.append(f'

{html.escape(title)}

') - for post in items: - title = post.get("title", "") or "" - linkURL = post.get("link", "") or "" - parts.append(f'

{html.escape(title)}

') + # --- YouTube embed / fallback + vid = post.get("yt_videoid") or extract_youtube_id(linkURL) + if vid: + watch_url = f"https://www.youtube.com/watch?v={vid}" - # --- YouTube embed / fallback - vid = post.get("yt_videoid") or extract_youtube_id(linkURL) - if vid: - watch_url = f"https://www.youtube.com/watch?v={vid}" + # Try provider HTML via oEmbed (as Ghost does) + embed_html = fetch_youtube_oembed_html(watch_url, timeout=10) + if embed_html: + parts.append(embed_html) + else: + # Fallback: leave the plain URL on its own line so Ghost may still auto-embed + parts.append(f'\n

{watch_url}

\n') - # Try provider HTML via oEmbed (as Ghost does) - embed_html = fetch_youtube_oembed_html(watch_url, timeout=10) - if embed_html: - parts.append(embed_html) - else: - # Fallback: leave the plain URL on its own line so Ghost may still auto-embed - parts.append(f'\n

{watch_url}

\n') + # Minimal fallback link (non-intrusive for email/web) + parts.append(f'

Voir sur YouTube

') + else: + # --- Texte + lien + ftext = "" + if "summary" in post and post["summary"]: + ftext = html.unescape(post["summary"]) + ftext = re.sub("<[^<]+?>", "", ftext) + ftext = re.sub(r"L'article .* est apparu en premier sur .*", "", ftext) + if ftext: + parts.append(f"

{html.escape(ftext)}

") + if linkURL: + esc = html.escape(linkURL) + parts.append(f'

{esc}

') - # Minimal fallback link (non-intrusive for email/web) - parts.append(f'

Voir sur YouTube

') - else: - # --- Texte + lien - ftext = "" - if "summary" in post and post["summary"]: - ftext = html.unescape(post["summary"]) - ftext = re.sub("<[^<]+?>", "", ftext) - ftext = re.sub(r"L’article .* est apparu en premier sur .*", "", ftext) - if ftext: - parts.append(f"

{html.escape(ftext)}

") - if linkURL: - esc = html.escape(linkURL) - parts.append(f'

{esc}

') - - # --- Images dans le contenu - for link in post.get("links", []) or []: - if link.get("type") in ("image/jpg", "image/jpeg", "image/png", "image/webp"): - imgUrl = link.get("href") - if imgUrl: - imgUrl = imgUrl.replace("/250x250/", "/990x320/") - if not first_image: - first_image = imgUrl - parts.append(f'
') + # --- Images: first try RSS metadata, then crawl the page + found_image = False + for link in post.get("links", []) or []: + if link.get("type") in ("image/jpg", "image/jpeg", "image/png", "image/webp"): + imgUrl = link.get("href") + if imgUrl: + imgUrl = imgUrl.replace("/250x250/", "/990x320/") + if not first_image: + first_image = imgUrl + parts.append(f'
') + found_image = True + + # If no image from RSS, try to extract from the article page + if not found_image and linkURL: + crawled_img = extract_image_from_url(linkURL, timeout=8) + if crawled_img: + if not first_image: + first_image = crawled_img + parts.append(f'
') + + parts.append('
') # --- Sources - parts.append("

Sources

") + parts.append("

📚 Sources

") for feed in feeds: esc = html.escape(feed.url) parts.append(f'

{esc}

') - parts.append('

Abonnez-vous pour recevoir chaque jour les news et soutenir mon travail.

') + parts.append('

Abonnez-vous pour recevoir chaque semaine les news et soutenir mon travail.

') return "\n".join(parts), first_image + @staticmethod + def _make_anchor(text: str) -> str: + """Convert text to a valid HTML anchor ID.""" + # Remove accents and special chars, lowercase, replace spaces with dashes + anchor = text.lower() + anchor = re.sub(r'[àáâãäå]', 'a', anchor) + anchor = re.sub(r'[èéêë]', 'e', anchor) + anchor = re.sub(r'[ìíîï]', 'i', anchor) + anchor = re.sub(r'[òóôõö]', 'o', anchor) + anchor = re.sub(r'[ùúûü]', 'u', anchor) + anchor = re.sub(r'[ýÿ]', 'y', anchor) + anchor = re.sub(r'[ç]', 'c', anchor) + anchor = re.sub(r'[^a-z0-9\s-]', '', anchor) + anchor = re.sub(r'\s+', '-', anchor.strip()) + return anchor or "section" + @staticmethod def _format_duration(seconds: float) -> str: seconds = int(seconds) @@ -314,20 +727,32 @@ class GhostTask: if seconds: parts.append(f"{seconds} seconds") return ", ".join(parts) if parts else "0 seconds" - async def run_daily_at_6_05(self): + async def run_weekly_on_saturday(self): + """Run every Saturday at 12:00 (noon).""" while True: now = dt.datetime.now() - next_run = (now + dt.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) + + # Calculate next Saturday at 12:00 + days_until_saturday = (5 - now.weekday()) % 7 # Saturday = 5 + if days_until_saturday == 0 and now.hour >= 12: + days_until_saturday = 7 # Already past Saturday 12:00, wait for next week + + next_run = (now + dt.timedelta(days=days_until_saturday)).replace( + hour=12, minute=0, second=0, microsecond=0 + ) + sleep_seconds = (next_run - now).total_seconds() while sleep_seconds > 0: - LOG.info("Waiting for %s for next scan", self._format_duration(sleep_seconds)) + LOG.info("Waiting for %s for next scan (Saturday noon)", self._format_duration(sleep_seconds)) await asyncio.sleep(min(sleep_seconds, 5 * 60)) now = dt.datetime.now() sleep_seconds = (next_run - now).total_seconds() - LOG.info("Going to run the daily task") - await self.daily_task() + + LOG.info("Going to run the weekly task") + await self.weekly_task() - async def daily_task(self): + async def weekly_task(self): + """Main weekly task: collect, filter, group, and publish.""" # Log newsletters (debug) try: nls = self.ghost.get_newsletters() @@ -335,8 +760,8 @@ class GhostTask: except Exception as e: LOG.warning("Unable to list newsletters: %s", e) - title_post = "Les news du " + self._fr_date_today() - LOG.info("Running daily task : %s", title_post) + title_post = "Les news de la semaine du " + self._fr_week_range() + LOG.info("Running weekly task : %s", title_post) # (Re)charge les feeds feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt") @@ -349,8 +774,10 @@ class GhostTask: feeds.append(RSSfeed(line, "youtube" in line.lower())) self.feeds = feeds - # Fenêtre: depuis hier 06:00 UTC - yesterday_6am_utc = dt.datetime.now(dt.timezone.utc).replace(hour=6, minute=0, second=0, microsecond=0) - dt.timedelta(days=1) + # Fenêtre: depuis 7 jours à 06:00 UTC + week_ago_6am_utc = dt.datetime.now(dt.timezone.utc).replace( + hour=6, minute=0, second=0, microsecond=0 + ) - dt.timedelta(days=7) all_news_posts: List[dict] = [] for feed in self.feeds: @@ -360,14 +787,14 @@ class GhostTask: continue fp = feedparser.parse(content) - # Sélection des items récents + # Sélection des items de la semaine new_entries = [] for e in fp.entries: dte = self._entry_datetime(e) - if dte and dte > yesterday_6am_utc: + if dte and dte > week_ago_6am_utc: new_entries.append(e) - # Filtrage ad-hoc + # Basic URL-based filtering (keep existing logic) filtered = [] for e in new_entries: linkURL = e.get("link", "") or "" @@ -383,16 +810,55 @@ class GhostTask: all_news_posts.extend(filtered) if not all_news_posts: - LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas aujourd'hui.") + LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas cette semaine.") return - random.shuffle(all_news_posts) - roundup_html, feature_image = self._build_html_roundup(all_news_posts, self.feeds) + LOG.info("Collected %d items from feeds", len(all_news_posts)) + + # Use Mistral AI for filtering and grouping if available + if self.mistral: + LOG.info("Using Mistral AI to filter non-news content...") + filtered_posts = self.mistral.filter_news_items(all_news_posts, dry_run=self.dry_run) + LOG.info("After filtering: %d items (removed %d)", + len(filtered_posts), len(all_news_posts) - len(filtered_posts)) + + if filtered_posts: + LOG.info("Using Mistral AI to group items by category...") + categories = self.mistral.group_similar_items(filtered_posts) + total_cats = len(categories) + total_subgroups = sum(len(cat.get("subgroups", [])) for cat in categories) + LOG.info("Created %d categories with %d sub-groups", total_cats, total_subgroups) + else: + categories = [] + else: + LOG.warning("No Mistral API key configured, skipping AI filtering/grouping") + # Fallback: single category with all items + categories = [{ + "name": "Actualités de la semaine", + "subgroups": [{"title": "Toutes les news", "items": all_news_posts}] + }] + + if not categories or all( + len(sg.get("items", [])) == 0 + for cat in categories + for sg in cat.get("subgroups", []) + ): + LOG.warning("No news items after filtering. Skipping this week.") + return + + roundup_html, feature_image = self._build_html_roundup_grouped(categories, self.feeds) # 1) Create draft (with feature image if any) created = self.ghost.create_post_html(title_post, roundup_html, status="draft", feature_image=feature_image) + LOG.info("Created draft post: %s (id: %s)", created.get("title"), created.get("id")) - # 2) Publish + send email + # 2) Publish + send email (unless dry-run mode) + if self.dry_run: + LOG.info("DRY-RUN MODE: Post created as draft but NOT published. URL: %s", + created.get("url", "N/A")) + LOG.info("DRY-RUN MODE: Review the draft in Ghost admin, then publish manually if satisfied.") + return + published = self.ghost.publish_post( post_id=created["id"], updated_at=created["updated_at"], @@ -408,6 +874,8 @@ async def main(): parser = argparse.ArgumentParser() parser.add_argument("--runonce", action="store_true", help="Run now and exit (no scheduler)") + parser.add_argument("--dry-run", action="store_true", dest="dry_run", + help="Run immediately, create draft but do NOT publish (for testing)") args = parser.parse_args() # Feeds init (list may be reloaded inside task) @@ -424,26 +892,37 @@ async def main(): admin_url = os.environ["GHOST_ADMIN_URL"] # e.g. https://ghostadmin.zep.best/ghost/api/admin/ admin_key = os.environ["GHOST_ADMIN_KEY"] # integration_id:secret_hex + mistral_api_key = os.environ.get("MISTRAL_API_KEY") # Optional: for AI filtering/grouping + + if not mistral_api_key: + LOG.warning("MISTRAL_API_KEY not set. AI filtering and grouping will be disabled.") task = GhostTask( feeds=feeds, admin_url=admin_url, admin_key=admin_key, + mistral_api_key=mistral_api_key, newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), + dry_run=args.dry_run, ) - LOG.info("Starting bot") + LOG.info("Starting bot (weekly mode%s)", " - DRY RUN" if args.dry_run else "") if args.runonce: - await task.daily_task() + await task.weekly_task() return - # Démarrage: publier l'édition du jour si elle n'existe pas encore - await task.maybe_run_today() + if args.dry_run: + LOG.info("DRY-RUN: Running weekly task immediately (will create draft only)") + await task.weekly_task() + return - # Planification quotidienne à 06:05 Europe/Brussels (via heure locale du conteneur) - await task.run_daily_at_6_05() + # Démarrage: publier l'édition de la semaine si elle n'existe pas encore + await task.maybe_run_this_week() + + # Planification hebdomadaire le samedi à 12:00 Europe/Brussels + await task.run_weekly_on_saturday() if __name__ == "__main__": asyncio.run(main())