"""Daily Substack digest bot.

Once a day the bot reads a list of RSS/Atom feed URLs, collects the entries
published since 06:00 UTC of the previous day, and publishes them as a single
Substack post titled "Les news du <date>".
"""

import argparse
import asyncio
import datetime
import html
import io
import logging
import os
import random
import re
from logging.handlers import RotatingFileHandler

import feedparser
import pyvirtualdisplay
import requests
from substack import Api
from substack.post import Post

LOG = logging.getLogger('bot')
LOG_PATTERN = logging.Formatter('%(asctime)s:%(levelname)s: [%(filename)s] %(message)s')

# Feed list locations, tried in order; the first existing file wins.
_FEEDS_CANDIDATES = ('/data/feeds.txt', 'feeds.txt', r'x:\substack\feeds.txt')
_COOKIES_PRIMARY = '/data/cookies.json'
_COOKIES_FALLBACK = 'cookies.json'

# Local wall-clock time of the daily run, and how often the wait loop wakes up.
_RUN_HOUR = 6
_RUN_MINUTE = 5
_POLL_INTERVAL_SECONDS = 5 * 60

# Upper bound for a single feed download so one dead server cannot hang the bot.
_HTTP_TIMEOUT_SECONDS = 30

_MONTHS_FR = (
    'Janvier', 'Février', 'Mars', 'Avril', 'Mai', 'Juin',
    'Juillet', 'Août', 'Septembre', 'Octobre', 'Novembre', 'Décembre',
)

_RFC822_FORMAT = '%a, %d %b %Y %H:%M:%S %z'
_TAG_RE = re.compile(r'<[^<]+?>')
# Boilerplate footer that some feeds append to every summary.
_FOOTER_RE = re.compile(r"L’article .* est apparu en premier sur .*")
# "image/jpg" is what some feeds send; "image/jpeg" is the registered MIME type.
_JPEG_TYPES = frozenset({"image/jpg", "image/jpeg"})

_DEFAULT_LOGIN = "gael.honorez@gmail.com"
_DEFAULT_ACCOUNT = "https://aggregateurjvfr.substack.com"


def setuplogger():
    """Attach a rotating file handler (``bot.log``) and a console handler to LOG.

    Calling it more than once is harmless: handlers are only attached the
    first time, so log lines are never duplicated.
    """
    logger = logging.getLogger("bot")
    logger.setLevel(logging.DEBUG)
    if logger.handlers:
        return

    # UTF-8 is forced because log messages contain accented French titles.
    file_handler = RotatingFileHandler("bot.log", "a", 1000000, 1, encoding="utf-8")
    file_handler.setFormatter(LOG_PATTERN)
    logger.addHandler(file_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(LOG_PATTERN)
    stream_handler.setLevel(logging.DEBUG)
    logger.addHandler(stream_handler)


class RSSfeed():
    """A feed URL plus a flag telling whether it is a YouTube feed."""

    def __init__(self, url, yt=False):
        self.url = url
        self.youtube = yt

    def __repr__(self):
        return f"{type(self).__name__}(url={self.url!r}, yt={self.youtube!r})"


def _load_feeds(candidates=_FEEDS_CANDIDATES):
    """Read the feed list from the first existing file in *candidates*.

    The file holds one URL per line; blank lines are ignored. A feed is
    flagged as YouTube when its URL contains the substring "youtube".

    Raises:
        FileNotFoundError: if none of the candidate files exists.
    """
    for path in candidates:
        if os.path.isfile(path):
            break
    else:
        raise FileNotFoundError(
            "no feed list found (tried: " + ", ".join(candidates) + ")"
        )

    # utf-8-sig transparently drops a BOM left by some Windows editors.
    with open(path, encoding="utf-8-sig") as handle:
        urls = [line.strip() for line in handle]
    return [RSSfeed(url, "youtube" in url) for url in urls if url]


def _entry_published_utc(entry):
    """Return the publication time of a feed entry as an aware UTC datetime.

    feedparser's ``published_parsed`` (a UTC ``struct_time``) is preferred
    because it copes with every date format feedparser understands. The raw
    ``published`` string is only parsed as a fallback.

    Returns:
        An aware ``datetime``, or ``None`` when the entry carries no usable date.
    """
    parsed = entry.get("published_parsed")
    if parsed is not None:
        try:
            return datetime.datetime(*parsed[:6], tzinfo=datetime.timezone.utc)
        except (TypeError, ValueError):
            pass  # fall through to the raw string

    raw = entry.get("published")
    if not raw:
        return None

    parsers = (
        datetime.datetime.fromisoformat,
        lambda text: datetime.datetime.strptime(
            text.replace('GMT', '+0000'), _RFC822_FORMAT
        ),
    )
    for parse in parsers:
        try:
            value = parse(raw)
        except ValueError:
            continue
        if value.tzinfo is None:
            # A naive value cannot be compared with the aware cutoff;
            # NOTE(review): UTC is assumed here - confirm for your feeds.
            value = value.replace(tzinfo=datetime.timezone.utc)
        return value
    return None


def _clean_summary(summary):
    """Turn an HTML summary into plain text without the feed's footer line."""
    text = html.unescape(summary)
    text = _TAG_RE.sub('', text)
    return _FOOTER_RE.sub('', text)


def _link_paragraph(url):
    """Build the post item for a paragraph whose text is *url*, linked to itself."""
    return {
        'type': 'paragraph',
        'content': [{'content': url, 'marks': [{'type': "link", 'href': url}]}],
    }


class SubStackTask:
    """Builds and publishes the daily digest on one Substack publication."""

    def __init__(self, login, password, cookies_path, account, feeds):
        """Connect to Substack and remember the feeds to scan.

        Args:
            login: e-mail address passed to the Substack ``Api``.
            password: password passed to the Substack ``Api``.
            cookies_path: cookie file path passed to the Substack ``Api``.
            account: URL of the publication to post to.
            feeds: list of ``RSSfeed`` objects.
        """
        self.api = Api(
            email=login,
            password=password,
            cookies_path=cookies_path,
            publication_url=account,
        )
        self.user_id = self.api.get_user_id()
        self.feeds = feeds
        for feed in self.feeds:
            LOG.info("Adding feed %s", feed.url)

    def format_duration(self, seconds: float) -> str:
        """Render a duration in seconds as e.g. ``"1 days, 2 hours, 5 seconds"``.

        The total is rounded to whole seconds *before* being split into
        units, so a value such as 59.7 becomes "1 minutes" instead of
        "60 seconds". Negative durations are reported as "0 seconds".
        """
        remaining = max(0, round(seconds))
        days, remaining = divmod(remaining, 86400)
        hours, remaining = divmod(remaining, 3600)
        minutes, secs = divmod(remaining, 60)

        units = (
            (days, "days"),
            (hours, "hours"),
            (minutes, "minutes"),
            (secs, "seconds"),
        )
        parts = [f"{value} {label}" for value, label in units if value > 0]
        return ', '.join(parts) if parts else '0 seconds'

    def get_fr_date(self) -> str:
        """Return today's local date in French, e.g. ``"05 Février 2024"``.

        The month name comes from a fixed table rather than ``strftime``,
        whose ``%B`` output depends on the process locale.
        """
        today = datetime.datetime.now()
        return f"{today.day:02d} {_MONTHS_FR[today.month - 1]} {today.year}"

    @staticmethod
    def _next_run_after(now):
        """Return the first 06:05 (same tzinfo as *now*) strictly after *now*."""
        candidate = now.replace(
            hour=_RUN_HOUR, minute=_RUN_MINUTE, second=0, microsecond=0
        )
        if candidate <= now:
            candidate += datetime.timedelta(days=1)
        return candidate

    async def run_daily_at_6_am(self):
        """Run ``daily_task`` every day at 06:05 local time, forever.

        The wait is sliced into short sleeps and the remaining time is
        recomputed from the clock after each one. A failing daily task is
        logged and does not stop the schedule.
        """
        while True:
            next_run = self._next_run_after(datetime.datetime.now())

            while True:
                remaining = (next_run - datetime.datetime.now()).total_seconds()
                if remaining <= 0:
                    break
                LOG.info(
                    "Waiting for %s for next scan", self.format_duration(remaining)
                )
                await asyncio.sleep(min(remaining, _POLL_INTERVAL_SECONDS))

            LOG.info("Going to run the daily task")
            try:
                await self.daily_task()
            except Exception:
                # Top-level boundary: one bad day must not kill the bot.
                LOG.exception("Daily task failed; next attempt at the next scheduled run")

    def _reload_feeds(self):
        """Re-read the feed list; keep the previous list if it cannot be read."""
        try:
            self.feeds = _load_feeds()
        except OSError:
            LOG.exception("Could not reload the feed list; keeping the previous one")

    def _collect_recent_entries(self, cutoff):
        """Download every feed and return the entries published after *cutoff*.

        A feed that cannot be downloaded is logged and skipped. Entries
        without a usable publication date are ignored.
        """
        collected = []
        for feed in self.feeds:
            LOG.info("Scanning feed %s", feed.url)
            try:
                response = requests.get(feed.url, timeout=_HTTP_TIMEOUT_SECONDS)
                response.raise_for_status()
            except requests.RequestException:
                LOG.exception("Could not fetch feed %s; skipping it", feed.url)
                continue

            # Raw bytes let feedparser honour the encoding declared in the XML
            # instead of whatever requests guessed from the HTTP headers.
            parsed_feed = feedparser.parse(response.content)
            for entry in parsed_feed.entries:
                published = _entry_published_utc(entry)
                if published is not None and published > cutoff:
                    collected.append(entry)
        return collected

    def _append_entry(self, sub_stack_post, entry):
        """Append one feed entry to the post.

        YouTube entries get a heading, the embedded video and a link. Other
        entries get a heading, their cleaned summary, a link and any JPEG
        attachments; they are skipped when the summary is empty. Every
        rendered entry is followed by a horizontal rule.

        Returns:
            True when something was added to the post, False otherwise.
        """
        link_url = entry.get("link")
        title = entry.get("title")
        if not link_url or not title:
            LOG.warning("Skipping an entry without link or title")
            return False

        LOG.info("Posting %s", title)
        heading = {"type": "heading", "level": 3, "content": title}

        if "yt_videoid" in entry:
            sub_stack_post.add(heading)
            sub_stack_post.add({"type": "youtube2", "src": entry["yt_videoid"]})
            sub_stack_post.add(_link_paragraph(link_url))
        else:
            text = _clean_summary(entry.get("summary", ""))
            if not text.strip():
                return False
            sub_stack_post.add(heading)
            sub_stack_post.add({"type": "paragraph", "content": text})
            sub_stack_post.add(_link_paragraph(link_url))
            for link in entry.get("links", []):
                if link.get("type") in _JPEG_TYPES and link.get("href"):
                    sub_stack_post.add({'type': 'captionedImage', 'src': link["href"]})

        sub_stack_post.add({"type": "horizontal_rule"})
        return True

    async def daily_task(self):
        """Build today's digest from the feeds and publish it.

        The feed list is re-read on every run so that it can be edited
        without restarting the bot. Nothing is published when no entry
        could be rendered.
        """
        title_post = "Les news du " + self.get_fr_date()
        LOG.info("Running daily task : %s", title_post)

        self._reload_feeds()

        # NOTE(review): the cutoff is 06:00 *UTC* yesterday while the schedule
        # uses local time - confirm the resulting window is the intended one.
        yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace(
            hour=6, minute=0, second=0, microsecond=0
        ) - datetime.timedelta(days=1)

        entries = self._collect_recent_entries(yesterday_6am)
        random.shuffle(entries)

        sub_stack_post = Post(title=title_post, subtitle="", user_id=self.user_id)
        rendered = 0
        for entry in entries:
            if self._append_entry(sub_stack_post, entry):
                rendered += 1

        if rendered == 0:
            LOG.warning("No recent entries found; nothing to publish today")
            return

        sub_stack_post.add({"type": "heading", "level": 3, "content": "Sources"})
        for feed in self.feeds:
            sub_stack_post.add(_link_paragraph(feed.url))

        sub_stack_post.add({
            "type": "subscribeWidget",
            "message": "Abonnez-vous gratuitement pour recevoir chaque jour les news dans votre e-mail et soutenir mon travail.",
        })

        draft = self.api.post_draft(sub_stack_post.get_draft())
        draft_id = draft.get("id")
        self.api.prepublish_draft(draft_id)
        self.api.publish_draft(draft_id)


async def main(login, password, account):
    """Configure logging, load the feeds and run the daily scheduler forever.

    Args:
        login: Substack account e-mail.
        password: Substack account password.
        account: URL of the publication to post to.

    Raises:
        FileNotFoundError: if no feed list file can be found at start-up.
    """
    setuplogger()
    if not password:
        # presumably the cookie file can authenticate on its own - TODO confirm
        LOG.warning("No password supplied; authentication relies on the cookie file")

    feeds = _load_feeds()
    cookies_path = (
        _COOKIES_PRIMARY if os.path.isfile(_COOKIES_PRIMARY) else _COOKIES_FALLBACK
    )

    task = SubStackTask(login, password, cookies_path, account, feeds)
    LOG.info("Starting bot")
    await task.run_daily_at_6_am()


def _parse_args(argv=None):
    """Read login, password and account from the command line or environment.

    Each option defaults to the matching ``SUBSTACK_*`` environment
    variable. The password is deliberately never given a built-in default:
    secrets do not belong in source control.
    """
    parser = argparse.ArgumentParser(description="Daily Substack RSS digest bot")
    parser.add_argument(
        "--login",
        default=os.environ.get("SUBSTACK_LOGIN", _DEFAULT_LOGIN),
        help="Substack account e-mail (env: SUBSTACK_LOGIN)",
    )
    parser.add_argument(
        "--password",
        default=os.environ.get("SUBSTACK_PASSWORD"),
        help="Substack account password (env: SUBSTACK_PASSWORD)",
    )
    parser.add_argument(
        "--account",
        default=os.environ.get("SUBSTACK_ACCOUNT", _DEFAULT_ACCOUNT),
        help="publication URL (env: SUBSTACK_ACCOUNT)",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    _args = _parse_args()
    asyncio.run(main(_args.login, _args.password, _args.account))