feedgenerator

A simple tool to create various feeds
git clone https://git.ortlepp.eu/feedgenerator.git/

commit 07f68b1eb79e4b82e9dee5258b40b75bf97936d5
parent 54286df7eba527d3c225a567f58b96b0d591e18d
Author: Thorsten Ortlepp <post@ortlepp.eu>
Date:   Tue, 22 Feb 2022 21:44:09 +0100

Implemented new feed 'Deutschlandfunk'

Diffstat:
M .gitignore         |   2 ++
M README             |   2 ++
M __main__.py        |  10 +++++++---
M common.py          |  18 ++++++++++++++++++
A deutschlandfunk.py | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,4 @@
 __pycache__
 antenne_muenster.xml
+deutschlandfunk.sqlite
+deutschlandfunk.xml
diff --git a/README b/README
@@ -7,6 +7,8 @@ A simple tool to create various feeds.
 
 Feeds feedgenerator can create
 ------------------------------
 - Antenne Münster: Create a feed from the news website of a local radio station
+- Deutschlandfunk: Create a feed from the news website of the German nationwide
+  public broadcasting service
 
 Requirements
diff --git a/__main__.py b/__main__.py
@@ -1,12 +1,16 @@
 import sys
 from antenne_muenster import AntenneMuensterFeed
-
-OUTPUT_DIR = "."
+from deutschlandfunk import DeutschlandfunkFeed
+from common import Config
 
 returncodes = []
+config = Config()
 
 am_feed = AntenneMuensterFeed()
-returncodes.append(am_feed.create_feed(OUTPUT_DIR + "/antenne_muenster.xml", 20))
+returncodes.append(am_feed.create_feed(config.get_workdir() + "/antenne_muenster.xml", 20))
+
+dlf_feed = DeutschlandfunkFeed()
+returncodes.append(dlf_feed.create_feed(config.get_workdir() + "/deutschlandfunk.xml", 25))
 
 if False in returncodes:
     sys.exit(1)
diff --git a/common.py b/common.py
@@ -2,6 +2,7 @@
 
 import hashlib
 import html
+import pathlib
 
 try:
     import zoneinfo
@@ -9,6 +10,23 @@ except ImportError:
     from backports import zoneinfo
 
 
+class Config:
+
+    __WORK_DIR = ""
+
+    def __init__(self):
+        self.__WORK_DIR = str(pathlib.Path(__file__).parent.resolve())
+        if self.__WORK_DIR.endswith(".pyz"):
+            self.__WORK_DIR = str(pathlib.Path(__file__).parent.resolve().parent)
+
+    def get_workdir(self):
+        return self.__WORK_DIR
+
+    def get_database(self):
+        return self.__WORK_DIR + "/feedgenerator.sqlite"
+
+
+
 class FeedItem:
 
     def __init__(self, title, date, author, content, url):
diff --git a/deutschlandfunk.py b/deutschlandfunk.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+
+import requests
+import sqlite3
+import datetime
+import re
+from bs4 import BeautifulSoup
+from common import AtomFeed, FeedItem, Config
+
+
+class DeutschlandfunkFeed:
+
+    FEED_NAME = "Deutschlandfunk Nachrichten"
+    FEED_AUTHOR = "Deutschlandfunk"
+    FEED_BASEURL = "https://www.deutschlandfunk.de/"
+    FEED_ICON = "https://www.deutschlandfunk.de/static/img/deutschlandfunk/icons/favicon.ico"
+    FEED_LOGO = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Deutschlandfunk_Logo_2017.svg/500px-Deutschlandfunk_Logo_2017.svg.png"
+    FEED_URL = "https://www.deutschlandfunk.de/nachrichten-100.rss"
+
+    SQLITE_SQL_CREATE = "CREATE TABLE IF NOT EXISTS deutschlandfunk (key TEXT, title TEXT, content TEXT, link TEXT, created INTEGER, export INTEGER)"
+    SQLITE_SQL_CHECK = "SELECT COUNT(*) FROM deutschlandfunk WHERE key = ?"
+    SQLITE_SQL_INSERT = "INSERT INTO deutschlandfunk (key, title, content, link, created, export) VALUES (?, ?, ?, ?, ?, ?)"
+    SQLITE_SQL_CLEAN = "DELETE FROM deutschlandfunk WHERE created < ?"
+    SQLITE_SQL_GET = "SELECT title, content, link, created FROM deutschlandfunk WHERE export = 1 ORDER BY created DESC"
+
+    UNWANTED_TOPICS = ["Wetter", "Fußball-Bundesliga"]
+
+
+    def __read_article_content(self, link):
+        try:
+            request = requests.get(link)
+
+            if request.status_code == 200:
+                html = BeautifulSoup(request.text, "html.parser")
+                article = html.select_one(".b-article").children
+                content = ""
+                topic = ""
+
+                for element in article:
+                    if element.name == "header":
+                        content += "<p><strong>" + element.findNext("p").text + "</strong></p>"
+                        topic = element.findNext("h2").findNext("span").text
+                    if element.name == "div":
+                        for subelement in element.findNext("section"):
+                            if subelement.name == "div":
+                                text = str(subelement)
+                                text = re.sub("<div class=\".*?\">", "", text)
+                                text = text.replace("</div>", "")
+                                if not text.startswith("Diese Nachricht wurde am"):
+                                    content += "<p>" + text + "</p>"
+
+                return [topic, content]
+
+            else:
+                return ["Error", "Reading article failed, HTTP status code was " + str(request.status_code)]
+        except:
+            return ["Error", "Reading article failed"]
+
+
+    def __read_feed(self):
+        now = int(str(datetime.datetime.now().timestamp()).split('.')[0])
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_CREATE)
+            connection.commit()
+
+            # Delete db content older than 5 days
+            threshold = now - 432000
+            cursor.execute(self.SQLITE_SQL_CLEAN, (threshold,))
+            connection.commit()
+
+            request = requests.get(self.FEED_URL)
+
+            if request.status_code == 200:
+                feed = BeautifulSoup(request.text, "html.parser")
+                articles = feed.findAll('item')
+
+                for article in articles:
+                    title = article.title.text
+                    key = title.strip().lower()
+                    link = self.FEED_BASEURL + article.guid.text + ".html"
+
+                    count = 0
+                    cursor.execute(self.SQLITE_SQL_CHECK, (key,))
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        count = row[0]
+
+                    if count == 0:
+                        content = self.__read_article_content(link)
+                        export = 1
+
+                        if len(content[0]) > 0:
+                            title = content[0] + ": " + title
+
+                        if content[0] in self.UNWANTED_TOPICS:
+                            export = 0
+
+                        cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content[1], link, now, export))
+                        connection.commit()
+
+            else:
+                title = "Reading feed failed"
+                key = title.strip().lower()
+                content = "HTTP status code was " + str(request.status_code)
+                cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+                connection.commit()
+
+        except:
+            title = "Reading feed failed"
+            key = title.strip().lower()
+            content = "Error while fetching the feed"
+            cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+            connection.commit()
+
+        finally:
+            connection.commit()
+            connection.close()
+
+
+    def create_feed(self, feedfile, maxitems):
+        self.__read_feed()
+
+        feed = AtomFeed(self.FEED_NAME, self.FEED_AUTHOR, self.FEED_BASEURL, datetime.datetime.now(), self.FEED_ICON, self.FEED_LOGO)
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_GET)
+            rows = cursor.fetchall()
+            added = 0
+
+            for row in rows:
+                feed.add_item(FeedItem(row[0], datetime.datetime.fromtimestamp(int(row[3])), self.FEED_AUTHOR, row[1], row[2]))
+                added = added + 1
+                if added >= maxitems:
+                    break
+
+        except:
+            error_title = "Feed creation failed"
+            error_content = "<p>Error while creating the feed</p>"
+            feed.add_item(FeedItem(error_title, datetime.datetime.now(), self.FEED_AUTHOR, error_content, self.FEED_BASEURL))
+        finally:
+            connection.commit()
+            connection.close()
+
+        return feed.write_feed(feedfile)
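
For reference, a minimal usage sketch of the new feed class, mirroring the wiring in the updated __main__.py above. It is illustrative and not part of the commit; the assumption that create_feed() returns True on success is inferred from the "False in returncodes" check in __main__.py:

    from deutschlandfunk import DeutschlandfunkFeed
    from common import Config

    config = Config()

    # create_feed() fetches the RSS feed, caches article contents in
    # feedgenerator.sqlite next to the script (or next to the .pyz archive)
    # and writes an Atom feed with at most 25 items.
    dlf_feed = DeutschlandfunkFeed()
    ok = dlf_feed.create_feed(config.get_workdir() + "/deutschlandfunk.xml", 25)

    if not ok:
        print("Creating the Deutschlandfunk feed failed")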