commit 0b18e3ad0c394d1e36595da35c8d7d01df39dc71
parent 5b42190ccea1ad8f7ab3f78561ca9a48ca19156e
Author: Thorsten Ortlepp <post@ortlepp.eu>
Date: Wed, 23 Mar 2022 23:11:33 +0100
Implemented new feed 'Heise Online'
Diffstat:
M __main__.py |   4 ++++
A heise.py    | 173 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 177 insertions(+), 0 deletions(-)
diff --git a/__main__.py b/__main__.py
@@ -1,6 +1,7 @@
 import sys
 from antenne_muenster import AntenneMuensterFeed
 from deutschlandfunk import DeutschlandfunkFeed
+from heise import HeiseFeed
 from common import Config
 
 returncodes = []
@@ -12,5 +13,8 @@ returncodes.append(am_feed.create_feed(config.get_workdir() + "/antenne_muenster
 dlf_feed = DeutschlandfunkFeed()
 returncodes.append(dlf_feed.create_feed(config.get_workdir() + "/deutschlandfunk.xml", 25))
 
+heise_feed = HeiseFeed()
+returncodes.append(heise_feed.create_feed(config.get_workdir() + "/heise.xml", 20))
+
 if False in returncodes:
     sys.exit(1)
diff --git a/heise.py b/heise.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+import requests
+import sqlite3
+import datetime
+import feedparser
+from bs4 import BeautifulSoup
+from common import AtomFeed, FeedItem, Config
+
+class HeiseFeed:
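+    """Builds an Atom feed from the Heise Online RSS feeds, caching fetched articles in a SQLite database."""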
+
+    FEED_NAME = "Heise Online"
+    FEED_AUTHOR = "Heise Online"
+    FEED_BASEURL = "https://www.heise.de/"
+    FEED_ICON = "https://www.heise.de/icons/ho/favicon/favicon-16x16.png"
+    FEED_LOGO = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Heise_online.svg/320px-Heise_online.svg.png"
+
+    FEED_URL_NETZPOLITIK = "https://www.heise.de/rss/heise-Rubrik-Netzpolitik.rdf"
+    FEED_URL_IT = "https://www.heise.de/rss/heise-Rubrik-IT.rdf"
+
+    SQLITE_SQL_CREATE = "CREATE TABLE IF NOT EXISTS heise (key TEXT, title TEXT, content TEXT, link TEXT, created INTEGER, export INTEGER)"
+    SQLITE_SQL_CHECK = "SELECT COUNT(*) FROM heise WHERE key = ? OR link = ?"
+    SQLITE_SQL_INSERT = "INSERT INTO heise (key, title, content, link, created, export) VALUES (?, ?, ?, ?, ?, ?)"
+    SQLITE_SQL_CLEAN = "DELETE FROM heise WHERE created < ?"
+    SQLITE_SQL_GET = "SELECT title, content, link, created FROM heise WHERE export = 1 ORDER BY created DESC"
+
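+    # Feed entries whose title or link contains one of these strings are skipped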
+    IGNORE_TITLE = ["heise-Angebot", "Die Produktwerker", "Anzeige:", "c't uplink"]
+    IGNORE_URL = ["www.techstage.de"]
+
+
+    def __is_on_ignore_list(self, string, patterns):
+        if len(string) == 0:
+            return False
+        for pattern in patterns:
+            if pattern in string:
+                return True
+        return False
+
+
+    def __read_article_content(self, link):
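+        """Downloads the article behind the given link and returns its content as an HTML string."""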
+        desired_elements = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "ul", "a-code", "pre"]
+
+        try:
+            request = requests.get(link + "?seite=all")
+
+            if request.status_code == 200:
+                html = BeautifulSoup(request.text, "html.parser")
+                content = ""
+
+                try:
+                    intro = html.find("p", {"class": "a-article-header__lead"}).text.strip()
+                    content = content + "<p><strong>" + intro + "</strong></p>"
+                except AttributeError:
+                    # Article has no lead paragraph, continue without it
+                    pass
+
+                article = html.find("div", {"class": "article-content"})
+
+                if article is not None:
+                    for element in article:
+                        if element.name in desired_elements:
+                            content = content + str(element)
+
+                return content
+
+            else:
+                return "Reading article failed, HTTP status code was " + str(request.status_code)
+        except Exception:
+            return "Reading article failed"
+
+
+    def __read_feed(self, link):
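+        """Reads the RSS feed behind the given link and stores new articles in the database."""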
+        now = int(datetime.datetime.now().timestamp())
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_CREATE)
+            connection.commit()
+
+            # Delete db content older than 5 days
+            threshold = now - 432000
+            cursor.execute(self.SQLITE_SQL_CLEAN, (threshold,))
+            connection.commit()
+
+            request = requests.get(link)
+
+            if request.status_code == 200:
+                feed = feedparser.parse(request.text)
+
+                for entry in feed.entries:
+                    title = entry.title
+                    key = title.strip().lower()
+                    link = entry.link
+                    # Strip query parameters from the link
+                    index = link.rfind("?")
+                    if index > 0:
+                        link = link[0:index]
+
+                    if self.__is_on_ignore_list(title, self.IGNORE_TITLE):
+                        continue
+                    if self.__is_on_ignore_list(link, self.IGNORE_URL):
+                        continue
+
+                    # Skip articles that are already in the database
+                    count = 0
+                    cursor.execute(self.SQLITE_SQL_CHECK, (key, link))
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        count = row[0]
+
+                    if count == 0:
+                        content = self.__read_article_content(link)
+                        export = 1
+
+                        cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, link, now, export))
+                        connection.commit()
+
+            else:
+                title = "Reading feed failed"
+                key = title.strip().lower()
+                content = "HTTP status code was " + str(request.status_code)
+                cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+                connection.commit()
+
+        except Exception:
+            title = "Reading feed failed"
+            key = title.strip().lower()
+            content = "Error while fetching the feed"
+            cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now, 1))
+            connection.commit()
+
+        finally:
+            connection.commit()
+            connection.close()
+
+
+    def create_feed(self, feedfile, maxitems):
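+        """Creates the Atom feed with at most maxitems entries and writes it to feedfile."""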
+        self.__read_feed(self.FEED_URL_NETZPOLITIK)
+        self.__read_feed(self.FEED_URL_IT)
+
+        feed = AtomFeed(self.FEED_NAME, self.FEED_AUTHOR, self.FEED_BASEURL, datetime.datetime.now(), self.FEED_ICON, self.FEED_LOGO)
+        config = Config()
+        connection = sqlite3.connect(config.get_database())
+
+        try:
+            cursor = connection.cursor()
+            cursor.execute(self.SQLITE_SQL_GET)
+            rows = cursor.fetchall()
+            added = 0
+
+            for row in rows:
+                feed.add_item(FeedItem(row[0], datetime.datetime.fromtimestamp(int(row[3])), self.FEED_AUTHOR, row[1], row[2]))
+                added = added + 1
+                if added >= maxitems:
+                    break
+
+        except Exception:
+            error_title = "Feed creation failed"
+            error_content = "<p>Error while creating the feed</p>"
+            feed.add_item(FeedItem(error_title, datetime.datetime.now(), self.FEED_AUTHOR, error_content, self.FEED_BASEURL))
+        finally:
+            connection.commit()
+            connection.close()
+
+        return feed.write_feed(feedfile)