feedgenerator

A simple tool to create various feeds
git clone https://git.ortlepp.eu/feedgenerator.git/

commit 0b18e3ad0c394d1e36595da35c8d7d01df39dc71
parent 5b42190ccea1ad8f7ab3f78561ca9a48ca19156e
Author: Thorsten Ortlepp <post@ortlepp.eu>
Date:   Wed, 23 Mar 2022 23:11:33 +0100

Implemented new feed 'Heise Online'
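
The new feed plugs into the existing runner the same way as the other feeds. As orientation before the diff below, this is roughly how the module is driven after this commit (a sketch, not the literal runner: Config.get_workdir() comes from common.py, which is not shown here, and the boolean result convention is inferred from the "if False in returncodes" check in __main__.py):

from heise import HeiseFeed
from common import Config

config = Config()
heise_feed = HeiseFeed()

# Write at most 20 items to <workdir>/heise.xml; create_feed() apparently
# returns a boolean that __main__.py collects into returncodes.
if not heise_feed.create_feed(config.get_workdir() + "/heise.xml", 20):
    print("feed generation failed")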

Diffstat:
M __main__.py | 4 ++++
A heise.py | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 168 insertions(+), 0 deletions(-)

diff --git a/__main__.py b/__main__.py
@@ -1,6 +1,7 @@
 import sys
 from antenne_muenster import AntenneMuensterFeed
 from deutschlandfunk import DeutschlandfunkFeed
+from heise import HeiseFeed
 from common import Config
 
 returncodes = []
@@ -12,5 +13,8 @@ returncodes.append(am_feed.create_feed(config.get_workdir() + "/antenne_muenster
 dlf_feed = DeutschlandfunkFeed()
 returncodes.append(dlf_feed.create_feed(config.get_workdir() + "/deutschlandfunk.xml", 25))
 
+heise_feed = HeiseFeed()
+returncodes.append(heise_feed.create_feed(config.get_workdir() + "/heise.xml", 20))
+
 if False in returncodes:
     sys.exit(1)
diff --git a/heise.py b/heise.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+
+import requests
+import sqlite3
+import datetime
+import feedparser
+from bs4 import BeautifulSoup
+from common import AtomFeed, FeedItem, Config
+
+class HeiseFeed:
+
+    FEED_NAME = "Heise Online"
+    FEED_AUTHOR = "Heise Online"
+    FEED_BASEURL = "https://www.heise.de/"
+    FEED_ICON = "https://www.heise.de/icons/ho/favicon/favicon-16x16.png"
+    FEED_LOGO = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Heise_online.svg/320px-Heise_online.svg.png"
+
+    FEED_URL_NETZPOLITIK = "https://www.heise.de/rss/heise-Rubrik-Netzpolitik.rdf"
+    FEED_URL_IT = "https://www.heise.de/rss/heise-Rubrik-IT.rdf"
+
+    SQLITE_SQL_CREATE = "CREATE TABLE IF NOT EXISTS heise (key TEXT, title TEXT, content TEXT, link TEXT, created INTEGER, export INTEGER)"
+    SQLITE_SQL_CHECK = "SELECT COUNT(*) FROM heise WHERE key = ? or link = ?"
+    SQLITE_SQL_INSERT = "INSERT INTO heise (key, title, content, link, created, export) VALUES (?, ?, ?, ?, ?, ?)"
+    SQLITE_SQL_CLEAN = "DELETE FROM heise WHERE created < ?"
+    SQLITE_SQL_GET = "SELECT title, content, link, created FROM heise WHERE export = 1 ORDER BY created DESC"
+
+    IGNORE_TITLE = ["heise-Angebot", "Die Produktwerker", "Anzeige:", "c't uplink"]
+    IGNORE_URL = ["www.techstage.de"]
+
+
+    def __is_on_ignore_list(self, string, ignore_list):
+        if len(string) == 0:
+            return False
+        for item in ignore_list:
+            if item in string:
+                return True
+        return False
+
+
+    def __read_article_content(self, link):
+        desired_elements = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "a-code", "pre"]
+
+        try:
+            request = requests.get(link + "?seite=all")
+
+            if request.status_code == 200:
+                html = BeautifulSoup(request.text, "html.parser")
+                content = ""
+
+                try:
+                    intro = html.find("p", {"class": "a-article-header__lead"}).text.strip()
+                    content = content + "<p><strong>" + intro + "</strong></p>"
+                except:
+                    pass
+
+                article = html.find("div", {"class": "article-content"})
+
+                for element in article:
+                    if element.name in desired_elements:
+                        content = content + str(element)
+
+                return content
+
+            else:
+                return "Reading article failed, HTTP status code was " + str(request.status_code)
+        except:
+            return "Reading article failed"
+
+
+    def __read_feed(self, link):
+        now = int(datetime.datetime.now().timestamp())
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_CREATE)
+            connection.commit()
+
+            # Delete db content older than 5 days
+            threshold = now - 432000
+            cursor.execute(self.SQLITE_SQL_CLEAN, (threshold,))
+            connection.commit()
+
+            request = requests.get(link)
+
+            if request.status_code == 200:
+                feed = feedparser.parse(request.text)
+
+                for entry in feed.entries:
+                    title = entry.title
+                    key = title.strip().lower()
+                    link = entry.link
+                    index = entry.link.rfind("?")
+                    if index > 0:
+                        link = link[0:index]
+
+                    if self.__is_on_ignore_list(title, self.IGNORE_TITLE):
+                        continue
+                    if self.__is_on_ignore_list(link, self.IGNORE_URL):
+                        continue
+
+                    count = 0
+                    cursor.execute(self.SQLITE_SQL_CHECK, (key, link))
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        count = row[0]
+
+                    if count == 0:
+                        content = self.__read_article_content(link)
+                        export = 1
+
+                        cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, link, now, export))
+                        connection.commit()
+
+            else:
+                title = "Reading feed failed"
+                key = title.strip().lower()
+                content = "HTTP status code was " + str(request.status_code)
+                cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+                connection.commit()
+
+        except:
+            title = "Reading feed failed"
+            key = title.strip().lower()
+            content = "Error while fetching the feed"
+            cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+            connection.commit()
+
+        finally:
+            connection.commit()
+            connection.close()
+
+
+    def create_feed(self, feedfile, maxitems):
+        self.__read_feed(self.FEED_URL_NETZPOLITIK)
+        self.__read_feed(self.FEED_URL_IT)
+
+        feed = AtomFeed(self.FEED_NAME, self.FEED_AUTHOR, self.FEED_BASEURL, datetime.datetime.now(), self.FEED_ICON, self.FEED_LOGO)
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_GET)
+            rows = cursor.fetchall()
+            added = 0
+
+            for row in rows:
+                feed.add_item(FeedItem(row[0], datetime.datetime.fromtimestamp(int(row[3])), self.FEED_AUTHOR, row[1], row[2]))
+                added = added + 1
+                if added >= maxitems:
+                    break
+
+        except:
+            error_title = "Feed creation failed"
+            error_content = "<p>Error while creating the feed</p>"
+            feed.add_item(FeedItem(error_title, datetime.datetime.now(), self.FEED_AUTHOR, error_content, self.FEED_BASEURL))
+        finally:
+            connection.commit()
+            connection.close()
+
+        return feed.write_feed(feedfile)
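
The interesting part of the new module is the SQLite-backed deduplication: every entry is keyed by its lowercased title and its query-stripped link, rows older than five days are purged on each run, and only previously unseen entries trigger a full article fetch. A minimal standalone sketch of that scheme (table name, column names, SQL strings, and the five-day window are taken from the diff above; the database file name and sample entry are made up for this example):

# Standalone sketch of the dedup scheme used by heise.py.
import sqlite3
import time

SQL_CREATE = "CREATE TABLE IF NOT EXISTS heise (key TEXT, title TEXT, content TEXT, link TEXT, created INTEGER, export INTEGER)"
SQL_CHECK = "SELECT COUNT(*) FROM heise WHERE key = ? or link = ?"
SQL_INSERT = "INSERT INTO heise (key, title, content, link, created, export) VALUES (?, ?, ?, ?, ?, ?)"
SQL_CLEAN = "DELETE FROM heise WHERE created < ?"

def store_if_new(cursor, title, link, content):
    # Entries are keyed by the lowercased title; links are compared with
    # the query string already stripped (as in __read_feed above).
    key = title.strip().lower()
    cursor.execute(SQL_CHECK, (key, link))
    if cursor.fetchone()[0] == 0:
        cursor.execute(SQL_INSERT, (key, title, content, link, int(time.time()), 1))
        return True
    return False

connection = sqlite3.connect("feed.db")  # hypothetical database file
cursor = connection.cursor()
cursor.execute(SQL_CREATE)
cursor.execute(SQL_CLEAN, (int(time.time()) - 432000,))  # drop rows older than 5 days

store_if_new(cursor, "Example article", "https://www.heise.de/example", "<p>...</p>")
connection.commit()
connection.close()

Because both category feeds write into the same table and the check matches on the normalized title or the stripped link, an article that appears in both the Netzpolitik and the IT feed is fetched and exported only once.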