feedgenerator

A simple tool to create various feeds
git clone https://git.ortlepp.eu/feedgenerator.git/
Log | Files | Refs | README | LICENSE

deutschlandfunk.py (5989B)


      1 # -*- coding: utf-8 -*-
      2 
      3 import requests
      4 import sqlite3
      5 import datetime
      6 import re
      7 import feedparser
      8 from bs4 import BeautifulSoup
      9 from common import AtomFeed, FeedItem, Config
     10 
     11 
     12 class DeutschlandfunkFeed:
     13 
     14     FEED_NAME = "Deutschlandfunk Nachrichten"
     15     FEED_AUTHOR = "Deutschlandfunk"
     16     FEED_BASEURL = "https://www.deutschlandfunk.de/"
     17     FEED_ICON = "https://www.deutschlandfunk.de/static/img/deutschlandfunk/icons/favicon.ico"
     18     FEED_LOGO = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d0/Deutschlandfunk_Logo_2017.svg/500px-Deutschlandfunk_Logo_2017.svg.png"
     19     FEED_URL = "https://www.deutschlandfunk.de/nachrichten-100.rss"
     20 
     21     SQLITE_SQL_CREATE = "CREATE TABLE IF NOT EXISTS deutschlandfunk (key TEXT, title TEXT, content TEXT, link TEXT, created INTEGER, export INTEGER)"
     22     SQLITE_SQL_CHECK = "SELECT COUNT(*) FROM deutschlandfunk WHERE key = ?"
     23     SQLITE_SQL_INSERT = "INSERT INTO deutschlandfunk (key, title, content, link, created, export) VALUES (?, ?, ?, ?, ?, ?)"
     24     SQLITE_SQL_CLEAN = "DELETE FROM deutschlandfunk WHERE created < ?"
     25     SQLITE_SQL_GET = "SELECT title, content, link, created FROM deutschlandfunk WHERE export = 1 ORDER BY created DESC"
     26 
     27     UNWANTED_TOPICS = ["Wetter", "Fußball-Bundesliga", "Fußball", "Fußball Europa League", "DFB-Pokal", "Lottozahlen", "Lotto", "Bundesliga", "American Football", "Formel 1", "Champions League", "Tennis"]
     28 
     29 
     30     def __read_article_content(self, link):
     31         try:
     32             request = requests.get(link)
     33 
     34             if request.status_code == 200:
     35                 html = BeautifulSoup(request.text, "html.parser")
     36                 article = html.select_one(".b-article").children
     37                 content = ""
     38                 topic = ""
     39 
     40                 for element in article:
     41                     if element.name == "header":
     42                         content += "<p><strong>" + element.findNext("p").text + "</strong></p>"
     43                         topic = element.findNext("h2").findNext("span").text
     44                     if element.name == "div":
     45                         for subelement in element.findNext("section"):
     46                             if subelement.name == "div":
     47                                 text = str(subelement)
     48                                 text = re.sub("<div class=\".*?\">", "", text)
     49                                 text = text.replace("</div>", "")
     50                                 if not text.startswith("Diese Nachricht wurde am"):
     51                                     content += "<p>" + text + "</p>"
     52 
     53                 return [topic, content]
     54 
     55             else:
     56                 return ["Error", "Reading article failed, HTTP status code was " + str(request.status_code)]
     57         except:
     58             return ["Error", "Reading article failed"]
     59 
     60 
     61     def __read_feed(self):
     62         now = int(str(datetime.datetime.now().timestamp()).split('.')[0])
     63         config = Config()
     64         connection = sqlite3.connect(config.get_database())
     65 
     66         try:
     67             cursor = connection.cursor()
     68             cursor.execute(self.SQLITE_SQL_CREATE)
     69             connection.commit()
     70 
     71             # Delete db content older than 5 days
     72             threshold = now - 432000
     73             cursor.execute(self.SQLITE_SQL_CLEAN, (threshold,))
     74             connection.commit()
     75 
     76             request = requests.get(self.FEED_URL)
     77 
     78             if request.status_code == 200:
     79                 feed = feedparser.parse(request.text)
     80 
     81                 for entry in feed.entries:
     82                     title = entry.title
     83                     key = title.strip().lower()
     84                     link = entry.link
     85 
     86                     count = 0
     87                     cursor.execute(self.SQLITE_SQL_CHECK, (key,))
     88                     rows = cursor.fetchall()
     89 
     90                     for row in rows:
     91                         count = row[0]
     92 
     93                     if count == 0:
     94                         content = self.__read_article_content(link)
     95                         export = 1
     96 
     97                         if content[0] in self.UNWANTED_TOPICS:
     98                             export = 0
     99 
    100                         cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content[1], link, now, export))
    101                         connection.commit()
    102 
    103             else:
    104                 title = "Reading feed failed"
    105                 key = title.strip().lower()
    106                 content = "HTTP status code was " + str(request.status_code)
    107                 cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now))
    108                 connection.commit()
    109 
    110         except:
    111             title = "Reading feed failed"
    112             key = title.strip().lower()
    113             content = "Error while fetching the feed"
    114             cursor.execute(self.SQLITE_SQL_INSERT, (key, title, content, self.FEED_BASEURL, now))
    115             connection.commit()
    116 
    117         finally:
    118             connection.commit()
    119             connection.close()
    120 
    121 
    122     def create_feed(self, feedfile, maxitems):
    123         self.__read_feed()
    124 
    125         feed = AtomFeed(self.FEED_NAME, self.FEED_AUTHOR, self.FEED_BASEURL, datetime.datetime.now(), self.FEED_ICON, self.FEED_LOGO)
    126         config = Config()
    127         connection = sqlite3.connect(config.get_database())
    128 
    129         try:
    130             cursor = connection.cursor()
    131             cursor.execute(self.SQLITE_SQL_GET)
    132             rows = cursor.fetchall()
    133             added = 0
    134 
    135             for row in rows:
    136                 feed.add_item(FeedItem(row[0], datetime.datetime.fromtimestamp(int(row[3])), self.FEED_AUTHOR, row[1], row[2]))
    137                 added = added + 1
    138                 if added >= maxitems:
    139                     break
    140 
    141         except:
    142             error_title = "Feed creation failed"
    143             error_content = "<p>Error while creating the feed</p>"
    144             feed.add_item(FeedItem(error_title, datetime.datetime.now(), self.FEED_AUTHOR, error_content, self.WEBSITE_URL))
    145         finally:
    146             connection.commit()
    147             connection.close()
    148 
    149         return feed.write_feed(feedfile)