Les news du {self._fr_date_today()}
") first_image: Optional[str] = None + + # --- Build Summary Section --- + parts.append('✨ En bref cette semaine
') + parts.append('- ')
+
+ for cat in categories:
+ cat_name = cat.get("name", "Actualités")
+ subgroups = cat.get("subgroups", [])
+
+ # Get top subgroups with more than 1 item (by item count) for summary
+ multi_item_subgroups = [sg for sg in subgroups if len(sg.get("items", [])) > 1]
+ sorted_subgroups = sorted(multi_item_subgroups, key=lambda sg: len(sg.get("items", [])), reverse=True)
+ top_subgroups = sorted_subgroups[:5] # Max 5 highlights per category
+
+ total_items = sum(len(sg.get("items", [])) for sg in subgroups)
+ if top_subgroups:
+ highlights = ", ".join(sg.get("title", "Divers") for sg in top_subgroups)
+ total_items = sum(len(sg.get("items", [])) for sg in subgroups)
+ parts.append(f'
- {html.escape(cat_name)}: {html.escape(highlights)} ({total_items} articles) ') + elif total_items > 0: + parts.append(f'
- {html.escape(cat_name)}: {total_items} articles ') + + parts.append('
') + + # --- Build Table of Contents --- + # parts.append('
📋 Sommaire
') + # parts.append('- ')
+
+ # for cat in categories:
+ # cat_name = cat.get("name", "Actualités")
+ # cat_anchor = self._make_anchor(cat_name)
+ # subgroups = cat.get("subgroups", [])
+ # total_items = sum(len(sg.get("items", [])) for sg in subgroups)
+
+ # parts.append(f'
- {html.escape(cat_name)} ({total_items} articles)')
+
+ # if len(subgroups) > 1 or (len(subgroups) == 1 and len(subgroups[0].get("items", [])) > 1):
+ # parts.append('
- ')
+ # for sg in subgroups:
+ # sg_title = sg.get("title", "Divers")
+ # sg_anchor = self._make_anchor(f"{cat_name}-{sg_title}")
+ # item_count = len(sg.get("items", []))
+ # parts.append(f'
- {html.escape(sg_title)} ({item_count}) ') + # parts.append('
')
+
+ # parts.append('
') + + # --- Build Content by Category --- + for cat in categories: + cat_name = cat.get("name", "Actualités") + cat_anchor = self._make_anchor(cat_name) + subgroups = cat.get("subgroups", []) + + if not subgroups: + continue + + # Category header with emoji + cat_emoji = { + "Actualités": "📰", + "Tests & Critiques": "⭐", + "Aperçus & Previews": "👁️", + "Vidéos": "🎬", + "Autres": "📁" + }.get(cat_name, "📌") + + parts.append(f'
{cat_emoji} {html.escape(cat_name)}
') + + for sg in subgroups: + sg_title = sg.get("title", "Divers") + sg_anchor = self._make_anchor(f"{cat_name}-{sg_title}") + items = sg.get("items", []) + + if not items: + continue + + # Sub-group header (only if more than 1 item in subgroup) + if len(items) > 1: + parts.append(f'{html.escape(sg_title)}
') + + for post in items: + title = post.get("title", "") or "" + linkURL = post.get("link", "") or "" + parts.append(f'{html.escape(title)}
') - for post in items: - title = post.get("title", "") or "" - linkURL = post.get("link", "") or "" - parts.append(f'{html.escape(title)}
') + # --- YouTube embed / fallback + vid = post.get("yt_videoid") or extract_youtube_id(linkURL) + if vid: + watch_url = f"https://www.youtube.com/watch?v={vid}" - # --- YouTube embed / fallback - vid = post.get("yt_videoid") or extract_youtube_id(linkURL) - if vid: - watch_url = f"https://www.youtube.com/watch?v={vid}" + # Try provider HTML via oEmbed (as Ghost does) + embed_html = fetch_youtube_oembed_html(watch_url, timeout=10) + if embed_html: + parts.append(embed_html) + else: + # Fallback: leave the plain URL on its own line so Ghost may still auto-embed + parts.append(f'\n{watch_url}
\n') - # Try provider HTML via oEmbed (as Ghost does) - embed_html = fetch_youtube_oembed_html(watch_url, timeout=10) - if embed_html: - parts.append(embed_html) - else: - # Fallback: leave the plain URL on its own line so Ghost may still auto-embed - parts.append(f'\n{watch_url}
\n') + # Minimal fallback link (non-intrusive for email/web) + parts.append(f'') + else: + # --- Texte + lien + ftext = "" + if "summary" in post and post["summary"]: + ftext = html.unescape(post["summary"]) + ftext = re.sub("<[^<]+?>", "", ftext) + ftext = re.sub(r"L'article .* est apparu en premier sur .*", "", ftext) + if ftext: + parts.append(f"{html.escape(ftext)}
") + if linkURL: + esc = html.escape(linkURL) + parts.append(f'') - # Minimal fallback link (non-intrusive for email/web) - parts.append(f'') - else: - # --- Texte + lien - ftext = "" - if "summary" in post and post["summary"]: - ftext = html.unescape(post["summary"]) - ftext = re.sub("<[^<]+?>", "", ftext) - ftext = re.sub(r"L’article .* est apparu en premier sur .*", "", ftext) - if ftext: - parts.append(f"{html.escape(ftext)}
") - if linkURL: - esc = html.escape(linkURL) - parts.append(f'') - - # --- Images dans le contenu - for link in post.get("links", []) or []: - if link.get("type") in ("image/jpg", "image/jpeg", "image/png", "image/webp"): - imgUrl = link.get("href") - if imgUrl: - imgUrl = imgUrl.replace("/250x250/", "/990x320/") - if not first_image: - first_image = imgUrl - parts.append(f'') # --- Sources - parts.append("
Sources
") + parts.append("📚 Sources
") for feed in feeds: esc = html.escape(feed.url) parts.append(f'') - parts.append('Abonnez-vous pour recevoir chaque jour les news et soutenir mon travail.
') + parts.append('Abonnez-vous pour recevoir chaque semaine les news et soutenir mon travail.
') return "\n".join(parts), first_image + @staticmethod + def _make_anchor(text: str) -> str: + """Convert text to a valid HTML anchor ID.""" + # Remove accents and special chars, lowercase, replace spaces with dashes + anchor = text.lower() + anchor = re.sub(r'[àáâãäå]', 'a', anchor) + anchor = re.sub(r'[èéêë]', 'e', anchor) + anchor = re.sub(r'[ìíîï]', 'i', anchor) + anchor = re.sub(r'[òóôõö]', 'o', anchor) + anchor = re.sub(r'[ùúûü]', 'u', anchor) + anchor = re.sub(r'[ýÿ]', 'y', anchor) + anchor = re.sub(r'[ç]', 'c', anchor) + anchor = re.sub(r'[^a-z0-9\s-]', '', anchor) + anchor = re.sub(r'\s+', '-', anchor.strip()) + return anchor or "section" + @staticmethod def _format_duration(seconds: float) -> str: seconds = int(seconds) @@ -314,20 +727,32 @@ class GhostTask: if seconds: parts.append(f"{seconds} seconds") return ", ".join(parts) if parts else "0 seconds" - async def run_daily_at_6_05(self): + async def run_weekly_on_saturday(self): + """Run every Saturday at 12:00 (noon).""" while True: now = dt.datetime.now() - next_run = (now + dt.timedelta(days=1)).replace(hour=6, minute=5, second=0, microsecond=0) + + # Calculate next Saturday at 12:00 + days_until_saturday = (5 - now.weekday()) % 7 # Saturday = 5 + if days_until_saturday == 0 and now.hour >= 12: + days_until_saturday = 7 # Already past Saturday 12:00, wait for next week + + next_run = (now + dt.timedelta(days=days_until_saturday)).replace( + hour=12, minute=0, second=0, microsecond=0 + ) + sleep_seconds = (next_run - now).total_seconds() while sleep_seconds > 0: - LOG.info("Waiting for %s for next scan", self._format_duration(sleep_seconds)) + LOG.info("Waiting for %s for next scan (Saturday noon)", self._format_duration(sleep_seconds)) await asyncio.sleep(min(sleep_seconds, 5 * 60)) now = dt.datetime.now() sleep_seconds = (next_run - now).total_seconds() - LOG.info("Going to run the daily task") - await self.daily_task() + + LOG.info("Going to run the weekly task") + await self.weekly_task() - async def daily_task(self): + async def weekly_task(self): + """Main weekly task: collect, filter, group, and publish.""" # Log newsletters (debug) try: nls = self.ghost.get_newsletters() @@ -335,8 +760,8 @@ class GhostTask: except Exception as e: LOG.warning("Unable to list newsletters: %s", e) - title_post = "Les news du " + self._fr_date_today() - LOG.info("Running daily task : %s", title_post) + title_post = "Les news de la semaine du " + self._fr_week_range() + LOG.info("Running weekly task : %s", title_post) # (Re)charge les feeds feeds_file = os.environ.get("FEEDS_FILE", "/data/feeds.txt") @@ -349,8 +774,10 @@ class GhostTask: feeds.append(RSSfeed(line, "youtube" in line.lower())) self.feeds = feeds - # Fenêtre: depuis hier 06:00 UTC - yesterday_6am_utc = dt.datetime.now(dt.timezone.utc).replace(hour=6, minute=0, second=0, microsecond=0) - dt.timedelta(days=1) + # Fenêtre: depuis 7 jours à 06:00 UTC + week_ago_6am_utc = dt.datetime.now(dt.timezone.utc).replace( + hour=6, minute=0, second=0, microsecond=0 + ) - dt.timedelta(days=7) all_news_posts: List[dict] = [] for feed in self.feeds: @@ -360,14 +787,14 @@ class GhostTask: continue fp = feedparser.parse(content) - # Sélection des items récents + # Sélection des items de la semaine new_entries = [] for e in fp.entries: dte = self._entry_datetime(e) - if dte and dte > yesterday_6am_utc: + if dte and dte > week_ago_6am_utc: new_entries.append(e) - # Filtrage ad-hoc + # Basic URL-based filtering (keep existing logic) filtered = [] for e in new_entries: linkURL = e.get("link", "") or "" @@ -383,16 +810,55 @@ class GhostTask: all_news_posts.extend(filtered) if not all_news_posts: - LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas aujourd'hui.") + LOG.warning("Aucun item récupéré (flux down ?). On n'envoie pas cette semaine.") return - random.shuffle(all_news_posts) - roundup_html, feature_image = self._build_html_roundup(all_news_posts, self.feeds) + LOG.info("Collected %d items from feeds", len(all_news_posts)) + + # Use Mistral AI for filtering and grouping if available + if self.mistral: + LOG.info("Using Mistral AI to filter non-news content...") + filtered_posts = self.mistral.filter_news_items(all_news_posts, dry_run=self.dry_run) + LOG.info("After filtering: %d items (removed %d)", + len(filtered_posts), len(all_news_posts) - len(filtered_posts)) + + if filtered_posts: + LOG.info("Using Mistral AI to group items by category...") + categories = self.mistral.group_similar_items(filtered_posts) + total_cats = len(categories) + total_subgroups = sum(len(cat.get("subgroups", [])) for cat in categories) + LOG.info("Created %d categories with %d sub-groups", total_cats, total_subgroups) + else: + categories = [] + else: + LOG.warning("No Mistral API key configured, skipping AI filtering/grouping") + # Fallback: single category with all items + categories = [{ + "name": "Actualités de la semaine", + "subgroups": [{"title": "Toutes les news", "items": all_news_posts}] + }] + + if not categories or all( + len(sg.get("items", [])) == 0 + for cat in categories + for sg in cat.get("subgroups", []) + ): + LOG.warning("No news items after filtering. Skipping this week.") + return + + roundup_html, feature_image = self._build_html_roundup_grouped(categories, self.feeds) # 1) Create draft (with feature image if any) created = self.ghost.create_post_html(title_post, roundup_html, status="draft", feature_image=feature_image) + LOG.info("Created draft post: %s (id: %s)", created.get("title"), created.get("id")) - # 2) Publish + send email + # 2) Publish + send email (unless dry-run mode) + if self.dry_run: + LOG.info("DRY-RUN MODE: Post created as draft but NOT published. URL: %s", + created.get("url", "N/A")) + LOG.info("DRY-RUN MODE: Review the draft in Ghost admin, then publish manually if satisfied.") + return + published = self.ghost.publish_post( post_id=created["id"], updated_at=created["updated_at"], @@ -408,6 +874,8 @@ async def main(): parser = argparse.ArgumentParser() parser.add_argument("--runonce", action="store_true", help="Run now and exit (no scheduler)") + parser.add_argument("--dry-run", action="store_true", dest="dry_run", + help="Run immediately, create draft but do NOT publish (for testing)") args = parser.parse_args() # Feeds init (list may be reloaded inside task) @@ -424,26 +892,37 @@ async def main(): admin_url = os.environ["GHOST_ADMIN_URL"] # e.g. https://ghostadmin.zep.best/ghost/api/admin/ admin_key = os.environ["GHOST_ADMIN_KEY"] # integration_id:secret_hex + mistral_api_key = os.environ.get("MISTRAL_API_KEY") # Optional: for AI filtering/grouping + + if not mistral_api_key: + LOG.warning("MISTRAL_API_KEY not set. AI filtering and grouping will be disabled.") task = GhostTask( feeds=feeds, admin_url=admin_url, admin_key=admin_key, + mistral_api_key=mistral_api_key, newsletter_slug=os.environ.get("GHOST_NEWSLETTER_SLUG"), email_segment=os.environ.get("GHOST_EMAIL_SEGMENT"), + dry_run=args.dry_run, ) - LOG.info("Starting bot") + LOG.info("Starting bot (weekly mode%s)", " - DRY RUN" if args.dry_run else "") if args.runonce: - await task.daily_task() + await task.weekly_task() return - # Démarrage: publier l'édition du jour si elle n'existe pas encore - await task.maybe_run_today() + if args.dry_run: + LOG.info("DRY-RUN: Running weekly task immediately (will create draft only)") + await task.weekly_task() + return - # Planification quotidienne à 06:05 Europe/Brussels (via heure locale du conteneur) - await task.run_daily_at_6_05() + # Démarrage: publier l'édition de la semaine si elle n'existe pas encore + await task.maybe_run_this_week() + + # Planification hebdomadaire le samedi à 12:00 Europe/Brussels + await task.run_weekly_on_saturday() if __name__ == "__main__": asyncio.run(main())