import datetime as dt
import re
import sys
import time

from collections import namedtuple

import bs4
import click
import feedparser
import requests
# Account name the bot posts under; used to recognize our own submissions
# when scanning the raddle archive page.
BOT_NAME = 'Feedbot'


# Source feed (reddit) and target site (raddle) endpoints.
URL_FEED = 'https://www.reddit.com/r/COMPLETEANARCHY/new.rss'
URL_ARCHIVE = 'https://raddle.me/f/COMPLETEANARCHY'
URL_LOGIN = 'https://raddle.me/login'
URL_LOGIN_CHECK = 'https://raddle.me/login_check'
URL_FORUM_SUBMIT = 'https://raddle.me/submit/COMPLETEANARCHY'
URL_SUBMIT = 'https://raddle.me/submit'


# Lightweight records for posts scraped from each site.  The typenames now
# match the variable names — previously both were 'Post', which made the
# reprs of the two record kinds indistinguishable when printed.
RedditPost = namedtuple('RedditPost', 'author link title url date')
RaddlePost = namedtuple('RaddlePost', 'author link title')


def parse_reddit_feed(source):
    """Yield a RedditPost for every entry in the given reddit RSS feed.

    *source* is anything feedparser accepts (bytes, a string, or a URL).
    """
    parsed = feedparser.parse(source)
    for entry in parsed['entries']:
        # Author names arrive as "/u/<name>"; keep only the last segment.
        author_name = entry['authors'][0]['name'].split('/')[-1]

        # The real target URL is embedded in the entry's HTML body, just
        # before the literal "[link]" anchor text.
        body_html = entry['content'][0]['value']
        target = re.search(r'<span><a href="(https://.*)">\[link\]', body_html).group(1)

        yield RedditPost(author_name,
                         target,
                         entry['title'],
                         entry['link'],
                         entry['updated'])
# CSS selectors into a raddle forum listing: each <article> is a submission.
AUTHOR_SELECTOR = 'p > a'
INFO_SELECTOR = 'h1 > a'

def parse_raddle_bot_posts(source):
    """Yield a RaddlePost for every submission on the page made by the bot.

    *source* is the HTML of a raddle forum listing; submissions by other
    authors are skipped.
    """
    soup = get_soup(source)

    for article in soup('article'):
        author = article.select_one(AUTHOR_SELECTOR).text
        # Idiomatic comparison (was the awkward `if not author == BOT_NAME`).
        if author != BOT_NAME:
            continue
        info_tag = article.select_one(INFO_SELECTOR)
        link = info_tag['href']
        title = info_tag.text

        yield RaddlePost(author, link, title)


def get_soup(source):
    """Parse *source* (HTML bytes or str) using the stdlib html.parser backend."""
    soup = bs4.BeautifulSoup(source, 'html.parser')
    return soup


def filter_new_posts(archive, feed):
    """Yield posts from *feed* that are from today and not already archived.

    *archive* is an iterable of RaddlePost tuples already present on
    raddle; *feed* is an iterable of RedditPost tuples from reddit.
    """
    # Materialize once into a set: membership is tested for every feed
    # post, and namedtuples hash like plain tuples, so each lookup is
    # O(1) instead of the previous list's O(n).
    archived = set(archive)

    for post in feed:
        if not is_new_post(post):
            continue
        # The archive records posts under the bot's own author name.
        if (BOT_NAME, post.link, post.title) in archived:
            print("Skipping:", post)
            print('-'*50)
            continue
        yield post
def is_new_post(post):
    """Return True if *post* was last updated today (by calendar date).

    NOTE(review): the feed timestamp likely carries a UTC offset while
    ``today`` is the local date — behavior preserved as-is, but worth
    confirming around midnight.
    """
    today = dt.date.today()
    # fromisoformat() on Python < 3.11 rejects a trailing 'Z' designator,
    # which some feeds emit; normalize it to an explicit UTC offset.
    post_date = dt.datetime.fromisoformat(post.date.replace('Z', '+00:00'))

    return (today.year == post_date.year
            and today.month == post_date.month
            and today.day == post_date.day)
class Submitter:
    """Log in to raddle and submit posts.

    Subclass this to provide network I/O: a subclass must implement
    ``login()`` and ``submit(posts)``, and keep ``self.source`` set to
    the body of the most recent response (it is scraped for the CSRF
    token and the forum id).
    """
    def __init__(self, username, password, forum):
        self.username = username
        self.password = password
        self.forum = forum
        self._forum_id = None  # resolved lazily by the forum_id property

    def __call__(self, posts):
        print("Logging in..")
        self.login()
        print("Submitting..")
        self.submit(posts)

    def login(self):
        raise NotImplementedError

    def submit(self, posts):
        # Declared abstract here for symmetry with login(): __call__
        # invokes both, so subclasses must provide each of them.
        raise NotImplementedError

    @property
    def csrf_token(self):
        """CSRF token scraped from the most recently fetched page."""
        return get_csrf_token(self.source)

    @property
    def login_data(self):
        """Form payload for the raddle login_check endpoint."""
        return {'_csrf_token': self.csrf_token,
                '_username': self.username,
                '_password': self.password }

    def make_submission_data(self, post):
        """Form payload submitting *post* as a link to the configured forum."""
        return {
            'submission[url]': post.link,
            'submission[title]': post.title,
            'submission[body]': f'author: {post.author}\n\nurl: {post.url}',
            'submission[forum]': self.forum_id,
            'submission[submit]': None,
            'submission[email]': None,
            'submission[_token]': self.csrf_token
            }

    @property
    def forum_id(self):
        """Numeric forum id, parsed once from the submit page then cached."""
        if self._forum_id is None:
            self._forum_id = parse_forum_id(self.forum, self.source)
        return self._forum_id


def get_csrf_token(source):
    """Extract the hidden CSRF token value embedded in *source* (response bytes)."""
    match = re.search(r'.*_token]?" value="(.*)"', source.decode())
    return match.group(1)


def parse_forum_id(forum, source):
    """Return the <option> value whose text equals *forum*, case-insensitively.

    Scans the forum <select> element on the submit page; returns None
    implicitly when no option matches.
    """
    select_tag = get_soup(source).select_one('label + select')
    wanted = forum.lower()
    for option in select_tag('option'):
        if option.text.lower() == wanted:
            return option['value']


class HttpSubmitter(Submitter):
    """Add http I/O on top of Submitter."""
    def __init__(self, *args, dry_run=False, **kwargs):
        # With dry_run=True everything is fetched/printed but nothing posted.
        self.dry_run = dry_run
        # Cookie store handed to requests; a dict (not a list) is what the
        # `cookies=` parameter expects before the first Set-Cookie arrives.
        self.cookies = {}
        super().__init__(*args, **kwargs)

    def login(self):
        # GET the login page first so csrf_token/login_data can be scraped
        # from self.source, then POST the credentials.
        self.get(URL_LOGIN)
        self.post(URL_LOGIN_CHECK, self.login_data)

    def submit(self, posts):
        for post in posts:
            # Each submission needs a fresh CSRF token and forum id, taken
            # from the submit form page.
            self.get(URL_FORUM_SUBMIT)
            data = self.make_submission_data(post)
            print('Posting:', post)
            print('-'*50)
            if self.dry_run:
                continue
            self.post(URL_SUBMIT, data)

    def get(self, url):
        response = requests.get(url, cookies=self.cookies, allow_redirects=False)
        self.source = response.content
        if response.cookies:
            self.cookies = response.cookies

    def post(self, url, data):
        response = requests.post(url,
                                 data,
                                 cookies=self.cookies,
                                 allow_redirects=False)

        # raddle answers a successful form POST with a redirect; anything
        # else means login or submission failed.  Explicit check instead of
        # the previous bare `assert`, which disappears under `python -O`.
        if response.status_code != 302:
            raise RuntimeError(
                f'Unexpected status {response.status_code} from POST {url}')
        self.source = response.content
        if response.cookies:
            self.cookies = response.cookies
@click.command()
@click.option('--name', default=None,
              help='The raddle bot account username.')
@click.option('--password', default=None,
              help='The raddle bot account password.')
@click.option('--dry-run', default=False, is_flag=True,
              help='Do not actually post anything, useful for debugging.')
@click.option('--last', default=10, help='Post only the last n posts.')
@click.option('--interval', default=3600, type=click.INT,
              help='Interval at which the bot will look for new posts.')
def main(name, password, dry_run, last, interval):
    """Poll reddit for new posts and mirror them to raddle, forever."""
    if name is None or password is None:
        print("Need to provide username and password.")
        sys.exit(1)

    submitter = HttpSubmitter(name, password, 'completeanarchy', dry_run=dry_run)

    while True:
        # Mirror only the `last` most recent posts each cycle.
        posts = get_new_posts()[-last:]
        submitter(posts)
        time.sleep(interval)

def get_new_posts():
    """Fetch the reddit feed and the raddle archive; return posts to mirror.

    Retries the feed fetch every half second until it returns 200, then
    drops posts already present in the raddle archive or not from today.
    """
    # flush=True so the progress dots actually appear while retrying.
    print('Fetching feed', end='', flush=True)
    feed_response = requests.get(URL_FEED)
    while feed_response.status_code != 200:
        print('.', end='', flush=True)
        time.sleep(.5)
        feed_response = requests.get(URL_FEED)
    print()

    new_posts = parse_reddit_feed(feed_response.content)
    # Oldest-first, capped at the 15 most recent entries.
    new_posts = list(reversed(list(new_posts)))[-15:]

    # An empty feed means reddit served something unexpected; fail loudly.
    # (Was a bare `assert`, which disappears under `python -O`.)
    if not new_posts:
        raise RuntimeError('Reddit feed contained no entries')

    print('Fetching archive..')
    archive_response = requests.get(URL_ARCHIVE)
    archive_posts = parse_raddle_bot_posts(archive_response.content)

    posts = filter_new_posts(archive_posts, new_posts)

    return list(posts)
if __name__ == "__main__":
    main()