# Substack_JV/Post_RSS_on_SubStack.py

import asyncio
import argparse
import requests
import feedparser
import io
import html
import datetime
import logging
import os
import re
from logging.handlers import RotatingFileHandler
import random

import pyvirtualdisplay


from substack import Api
from substack.post import Post

# Application-wide logger, looked up under the name 'bot'.
LOG = logging.getLogger('bot')
# Record layout shared by the log handlers: timestamp, level, source file name, message.
LOG_PATTERN = logging.Formatter('%(asctime)s:%(levelname)s: [%(filename)s] %(message)s')

def setuplogger():
    """Attach a rotating file handler and a stream handler to the ``bot`` logger.

    Records of level DEBUG and above are appended to ``bot.log`` in the
    current working directory (rotated at 1,000,000 bytes, one backup kept)
    and also sent to a ``logging.StreamHandler``. Both handlers format
    records with ``LOG_PATTERN``.

    The function is idempotent: when the ``bot`` logger already has handlers
    it is left untouched, so a second call does not duplicate every log line.
    """
    logger = logging.getLogger("bot")
    if logger.handlers:
        # Already configured; adding the handlers again would double the output.
        return

    logger.setLevel(logging.DEBUG)

    # UTF-8 is forced because logged feed titles can contain characters that
    # the platform's default encoding is unable to represent.
    file_handler = RotatingFileHandler(
        "bot.log", mode="a", maxBytes=1000000, backupCount=1, encoding="utf-8"
    )
    file_handler.setFormatter(LOG_PATTERN)
    logger.addHandler(file_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(LOG_PATTERN)
    stream_handler.setLevel(logging.DEBUG)
    logger.addHandler(stream_handler)

class RSSfeed:
    """One feed to poll: its URL and a flag telling whether it is a YouTube feed."""

    def __init__(self, url, yt=False):
        # The ``yt`` argument is exposed to callers as the ``youtube`` attribute.
        self.youtube = yt
        self.url = url

class SubStackTask:
    def __init__(self, login, password, cookies_path, account, feeds):
        """Create the Substack API client and remember the feeds to scan.

        Args:
            login: Passed to ``Api`` as ``email``.
            password: Passed to ``Api`` as ``password``.
            cookies_path: Passed to ``Api`` as ``cookies_path``.
            account: Passed to ``Api`` as ``publication_url``.
            feeds: Iterable of objects with a ``url`` attribute; stored on
                ``self.feeds`` and logged one by one.
        """
        client = Api(
            email=login,
            password=password,
            cookies_path=cookies_path,
            publication_url=account,
        )
        self.api = client
        # The user id is fetched once here and reused when posts are created.
        self.user_id = client.get_user_id()

        self.feeds = feeds
        for source in feeds:
            LOG.info("Adding feed " + source.url)


    def format_duration(self, seconds):
        """Render a duration given in seconds as human-readable text.

        Args:
            seconds: Duration in seconds (int or float). Negative values are
                treated as zero.

        Returns:
            A comma-separated string such as ``"1 day, 2 hours, 5 minutes"``.
            Units whose count is zero are omitted; ``"0 seconds"`` is returned
            when the rounded duration is zero.
        """
        # Round once, up front. Rounding every unit after the split could
        # produce impossible values such as "60 seconds" for an input of 59.7.
        remaining = max(0, round(seconds))

        parts = []
        for unit, size in (("day", 86400), ("hour", 3600), ("minute", 60), ("second", 1)):
            count, remaining = divmod(remaining, size)
            if count:
                # Singular for exactly one unit, plural otherwise.
                parts.append(f"{count} {unit}" + ("" if count == 1 else "s"))

        return ', '.join(parts) if parts else '0 seconds'

    def get_fr_date(self):
        # Mapping of English month names to French
        months_en_to_fr = {
            'January': 'Janvier', 'February': 'Février', 'March': 'Mars',
            'April': 'Avril', 'May': 'Mai', 'June': 'Juin',
            'July': 'Juillet', 'August': 'Août', 'September': 'Septembre',
            'October': 'Octobre', 'November': 'Novembre', 'December': 'Décembre'
        }
        today = datetime.datetime.now()
        formatted_date = today.strftime("%d %B %Y")
            # Replace the English month with the French month
        for en, fr in months_en_to_fr.items():
            formatted_date = formatted_date.replace(en, fr)
        return formatted_date

    async def run_daily_at_6_am(self):
        """Run ``daily_task`` once a day at 06:05 local time, forever.

        The coroutine never returns. While waiting it logs the remaining time
        and sleeps in slices of at most five minutes. An exception raised by
        ``daily_task`` is logged and the schedule carries on with the next
        day's run instead of terminating the loop.
        """
        while True:
            now = datetime.datetime.now()
            # Aim for the next upcoming 06:05: today's if it is still ahead,
            # otherwise tomorrow's. Unconditionally adding a day would skip
            # today's run whenever the bot starts between midnight and 06:05.
            next_run = now.replace(hour=6, minute=5, second=0, microsecond=0)
            if next_run <= now:
                next_run += datetime.timedelta(days=1)
            sleep_seconds = (next_run - now).total_seconds()

            while sleep_seconds > 0:
                formatted_duration = self.format_duration(sleep_seconds)
                LOG.info(f"Waiting for {formatted_duration} for next scan")

                # Short slices keep the countdown visible in the log and
                # re-read the wall clock after every sleep.
                await asyncio.sleep(min(sleep_seconds, 5 * 60))
                sleep_seconds = (next_run - datetime.datetime.now()).total_seconds()

            LOG.info("Going to run the daily task")
            try:
                await self.daily_task()
            except Exception:
                # A single failed run must not stop the scheduler for good.
                LOG.exception("Daily task failed; waiting for the next scheduled run")


    async def daily_task(self):
        """Build the daily digest from the configured feeds and publish it.

        The feed list is re-read from disk, every entry published after
        06:00 UTC of the previous day is collected, the entries are shuffled
        and rendered into a single Substack post, followed by a "Sources"
        section and a subscribe widget. The post is then created as a draft,
        pre-published and published.

        Nothing is published when no entry could be rendered.

        NOTE: feeds are fetched with the blocking ``requests`` library, so the
        event loop is blocked while the feeds are downloaded.
        """
        title_post = "Les news du " + self.get_fr_date()
        LOG.info("Running daily task : " + str(title_post))

        self.feeds = self._load_feeds()

        yesterday_6am = datetime.datetime.now(datetime.timezone.utc).replace(
            hour=6, minute=0, second=0, microsecond=0
        ) - datetime.timedelta(days=1)

        all_news_posts = self._collect_recent_entries(yesterday_6am)
        random.shuffle(all_news_posts)

        sub_stack_post = Post(
            title=title_post,
            subtitle="",
            user_id=self.user_id
        )

        rendered = 0
        for entry in all_news_posts:
            # The separator is only added after an entry that produced content,
            # so skipped entries no longer leave stacked horizontal rules.
            if self._append_entry(sub_stack_post, entry):
                sub_stack_post.add({"type": "horizontal_rule"})
                rendered += 1

        if rendered == 0:
            LOG.info("No new entry to publish, skipping today's post")
            return

        sub_stack_post.add({"type": "heading", "level": 3, "content": "Sources"})
        for feed in self.feeds:
            sub_stack_post.add(self._link_paragraph(feed.url))

        sub_stack_post.add({"type":"subscribeWidget", "message":"Abonnez-vous gratuitement pour recevoir chaque jour les news dans votre e-mail et soutenir mon travail."})

        draft = self.api.post_draft(sub_stack_post.get_draft())
        draft_id = draft.get("id")
        self.api.prepublish_draft(draft_id)
        self.api.publish_draft(draft_id)

    def _load_feeds(self):
        """Read the feed list file and return one ``RSSfeed`` per non-blank line."""
        path = r'/data/feeds.txt'
        if not os.path.isfile(path):
            # NOTE(review): main() falls back to 'feeds.txt' instead of this
            # Windows path; confirm which fallback is intended.
            path = r'x:\substack\feeds.txt'

        with open(path) as file:
            urls = [line.strip() for line in file]

        # Blank lines would otherwise become feeds with an empty URL.
        return [RSSfeed(url, "youtube" in url) for url in urls if url]

    def _collect_recent_entries(self, since):
        """Return the entries of all feeds published strictly after *since*."""
        recent = []
        for feed in self.feeds:
            LOG.info("Scanning feed " + feed.url)
            try:
                response = requests.get(feed.url, timeout=30)
                response.raise_for_status()
            except requests.RequestException:
                # One unreachable feed must not abort the whole digest.
                LOG.exception("Could not fetch feed " + feed.url)
                continue

            for entry in feedparser.parse(response.text).entries:
                published = self._published_at(entry)
                if published is not None and published > since:
                    recent.append(entry)
        return recent

    @staticmethod
    def _published_at(entry):
        """Return the entry's publication time as an aware UTC datetime, or None."""
        # feedparser normalises every date format it understands into a UTC
        # time tuple, which avoids one hand-written parser per feed flavour.
        parsed = entry.get("published_parsed")
        if not parsed:
            return None
        try:
            return datetime.datetime(*parsed[:6], tzinfo=datetime.timezone.utc)
        except ValueError:
            return None

    @staticmethod
    def _link_paragraph(url):
        """Return a paragraph node showing *url* as a clickable link."""
        return {'type': 'paragraph', 'content': [
            {'content': url, 'marks': [{'type': "link", 'href': url}]}]}

    def _append_entry(self, sub_stack_post, entry):
        """Render one feed entry into the post; return True if content was added."""
        link_url = entry.get("link")
        title = entry.get("title")
        if not link_url or not title:
            return False

        LOG.info("Posting  " + str(title))

        if "yt_videoid" in entry:
            sub_stack_post.add({"type": "heading", "level": 3, "content": title})
            sub_stack_post.add({"type": "youtube2", "src": entry["yt_videoid"]})
            sub_stack_post.add(self._link_paragraph(link_url))
            return True

        text = html.unescape(entry.get("summary", ""))
        # Drop HTML tags, then the "first appeared on" footer some feeds append.
        text = re.sub('<[^<]+?>', '', text)
        text = re.sub(r"L’article .* est apparu en premier sur .*", '', text).strip()
        if text == "":
            return False

        sub_stack_post.add({"type": "heading", "level": 3, "content": title})
        sub_stack_post.add({"type": "paragraph", "content": text})
        sub_stack_post.add(self._link_paragraph(link_url))

        for link in entry.get("links", []):
            # Not every link carries a "type"; use .get() to avoid a KeyError.
            if link.get("type") == "image/jpg" and link.get("href"):
                sub_stack_post.add({'type': 'captionedImage', 'src': link["href"]})
        return True

async def main(login, password, account):
    """Configure logging, build the Substack task and start the daily schedule.

    Args:
        login: Substack login passed to ``SubStackTask``.
        password: Substack password passed to ``SubStackTask``.
        account: Publication URL passed to ``SubStackTask``.

    The feed list and the cookie file are taken from ``/data`` when present
    there, otherwise from the current working directory. This coroutine only
    returns if the scheduler loop exits.
    """
    setuplogger()

    feeds_path = r'/data/feeds.txt'
    if not os.path.isfile(feeds_path):
        feeds_path = r'feeds.txt'

    cookies_path = r'/data/cookies.json'
    if not os.path.isfile(cookies_path):
        cookies_path = r'cookies.json'

    with open(feeds_path) as file:
        urls = [line.strip() for line in file]

    # Blank lines would otherwise become feeds with an empty URL.
    feeds = [RSSfeed(url, "youtube" in url) for url in urls if url]

    task = SubStackTask(login, password, cookies_path, account, feeds)

    LOG.info("Starting bot")
    await task.run_daily_at_6_am()


if __name__ == "__main__":
    # Credentials must not live in the source file: they are read from the
    # environment (or, for the login and publication URL, the command line).
    # Any password that was previously committed here should be considered
    # compromised and rotated.
    parser = argparse.ArgumentParser(
        description="Publish a daily digest of RSS feeds on Substack."
    )
    parser.add_argument(
        "--login",
        default=os.environ.get("SUBSTACK_LOGIN"),
        help="Substack login e-mail (default: $SUBSTACK_LOGIN)",
    )
    parser.add_argument(
        "--account",
        default=os.environ.get(
            "SUBSTACK_PUBLICATION_URL", "https://aggregateurjvfr.substack.com"
        ),
        help="Publication URL (default: $SUBSTACK_PUBLICATION_URL)",
    )
    args = parser.parse_args()

    # The password is only accepted through the environment so that it does
    # not show up in process listings or shell history.
    secret = os.environ.get("SUBSTACK_PASSWORD")
    if not args.login or not secret:
        parser.error("SUBSTACK_LOGIN (or --login) and SUBSTACK_PASSWORD must be set")

    asyncio.run(main(args.login, secret, args.account))