Commit edb6db74 authored by Jed Simson's avatar Jed Simson

Initial Commit

parents
*.py[co]
*.DS_*
# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
# Installer logs
pip-log.txt
# Unit test / coverage reports
.coverage
.tox
#Translations
*.mo
#Mr Developer
venv
*.pyc
build
from flask import Flask
from routes import main
def create_app():
    """Build the Flask application and attach the `main` blueprint to it."""
    flask_app = Flask(__name__)
    flask_app.register_blueprint(main)
    return flask_app
if __name__ == "__main__":
    # Local import: only this development entry point reads the environment.
    import os

    app = create_app()
    # The server listens on every interface (0.0.0.0). With debug=True the
    # Werkzeug interactive debugger would let anyone who can reach the port
    # execute arbitrary Python, so debugging is opt-in: set FLASK_DEBUG=1.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5050, debug=debug)
click==6.6
decorator==4.0.10
Flask==0.11.1
itsdangerous==0.24
Jinja2==2.8
MarkupSafe==0.23
praw==3.5.0
pytz==2013b0
requests==2.10.0
six==1.10.0
timestring==1.6.2
update-checker==0.11
Werkzeug==0.11.10
from flask import Blueprint, request, jsonify, render_template
from slider import Slider
import time
# Blueprint that groups the routes declared in this module.
main = Blueprint('main', __name__)
# One Slider instance, created at import time and shared by every request.
slider = Slider()
@main.route('/')
def index():
    """Return the service name, description and version as JSON."""
    # The first key was previously spelled 'name:' (a stray colon inside the
    # string literal), which made the field awkward to read from clients.
    return jsonify({'name': 'Reddit Slider',
                    'description': 'A simple program to allow greater filtering options for Reddit submissions.',
                    'version': 1.0})
@main.route('/search/')
def search():
    """Serve the HTML search form."""
    page = render_template('search.html')
    return page
@main.route('/search/<subreddit>/')
def subreddit(subreddit):
    """Return submissions from `subreddit` as JSON.

    Query-string parameters:
        range: time range passed to the slider as its `dimension`.
        delta: time offset passed to the slider as its `delta`.
        sort:  sort order (defaults to 'hot').
        limit: maximum number of submissions (defaults to 100).

    On success the response carries the submissions, their count, the
    echoed filters and the elapsed time in seconds. Any ValueError raised
    while parsing the parameters or fetching the submissions is reported
    as ``{'error': ..., 'time': ...}`` with status 500.
    """
    dimension = request.args.get('range', None)
    delta = request.args.get('delta', None)
    sort = request.args.get('sort', 'hot')
    raw_limit = request.args.get('limit', 100)

    t0 = time.time()

    def elapsed():
        # Seconds since the request started, rounded like the success path.
        return round(time.time() - t0, 4)

    # A non-numeric `limit` used to raise outside the try block and escape
    # as an unhandled exception; report it in the same JSON error shape
    # the client already understands.
    try:
        limit = int(raw_limit)
    except ValueError:
        message = 'Invalid limit parameter `{}`... Please use a whole number.'.format(raw_limit)
        return jsonify({'error': message, 'time': elapsed()}), 500

    try:
        submissions = slider.get_subreddit_posts(subreddit, dimension, delta, sort, limit)
    except ValueError as e:
        return jsonify({'error': str(e), 'time': elapsed()}), 500

    return jsonify({'time': elapsed(),
                    'sort': sort,
                    'range': dimension,
                    'delta': delta,
                    'submissions': submissions,
                    'count': len(submissions)})
from praw import Reddit
from praw.errors import HTTPException
import time
class Search:
    """Find a subreddit's submissions created inside a timestamp range.

    Queries go through PRAW's `Reddit.search` with the `cloudsearch`
    syntax and a `timestamp:X..Y` query string.
    """

    def __init__(self):
        self.version = 0.1
        self.sorts = ['hot', 'new', 'controversial', 'top']

        # For conversion between broken reddit timestamps and unix timestamps
        self.REDDIT_TIMESTAMP_OFFSET = 28800

        # Minimum number of seconds to sleep during errors
        self.BACKOFF_START = 4

        # Number of attempts `generate` makes at its single query before
        # letting the HTTP error propagate to the caller
        self.MAX_ATTEMPTS = 3

        # When making timestamp:X..Y queries, reddit misses submissions
        # inside X..Y range, but they can be found inside Y..Z range
        # It is not clear what the value of Z should be, but it seems
        # like the difference is usually about ~1 hour or less
        # To be sure, let's set the workaround offset to 2 hours
        self.OUT_OF_ORDER_OFFSET = 7200

    def _check_sort(self, sort):
        """Raise ValueError unless `sort` is one of `self.sorts`."""
        if sort not in self.sorts:
            raise ValueError('Invalid sort parameter `{}`... Please use one of {}'
                             .format(sort, self.sorts))

    def _check_timestamps(self, low_timestamp, high_timestamp):
        """Raise ValueError when either end of the range is None."""
        if low_timestamp is None or high_timestamp is None:
            raise ValueError('Timestamp must be given...')

    def _widen_range(self, low_timestamp, high_timestamp):
        """Return `(lowest, highest, query_low, query_high)`.

        `lowest`/`highest` are the requested bounds shifted by
        REDDIT_TIMESTAMP_OFFSET and are what results get filtered against.
        `query_low`/`query_high` are those bounds padded outwards by
        OUT_OF_ORDER_OFFSET and are what gets sent in the query.
        """
        lowest = low_timestamp + self.REDDIT_TIMESTAMP_OFFSET
        highest = high_timestamp + self.REDDIT_TIMESTAMP_OFFSET
        return (lowest, highest,
                lowest - self.OUT_OF_ORDER_OFFSET,
                highest + self.OUT_OF_ORDER_OFFSET)

    def _session(self):
        """Create the reddit session used for a search."""
        return Reddit('Reddit Slider by /u/oracular_demon - v{}'.format(self.version))

    def generate(self, subreddit, low_timestamp, high_timestamp,
                 limit=100, sort='hot'):
        """Return an iterable of submissions created inside the range.

        Up to `limit` results are requested in a single query. When
        `limit` is above 1000 the work is handed to `_generate`, which
        walks the range window by window instead.

        Raises:
            ValueError: `sort` is not in `self.sorts`, or a timestamp is
                None (for `limit` > 1000 the timestamp check only runs
                once the returned generator is iterated).
            HTTPException: the query still fails after `MAX_ATTEMPTS`
                attempts.
        """
        self._check_sort(sort)

        if limit > 1000:
            # If we're retrieving more than 1000 posts, it is likely things
            # will get pretty slow as the rate limiting catches up with us.
            # TODO: Inform the user and add option to override.
            return self._generate(subreddit, low_timestamp, high_timestamp,
                                  limit=limit, sort=sort)

        self._check_timestamps(low_timestamp, high_timestamp)
        lowest, highest, query_low, query_high = self._widen_range(
            low_timestamp, high_timestamp)

        r = self._session()
        query = 'timestamp:{}..{}'.format(query_low, query_high)

        # Previously an HTTP error was swallowed after one sleep and the
        # code went on to read `results`, which was never assigned. Retry
        # with exponential backoff and re-raise once attempts run out.
        backoff = self.BACKOFF_START
        attempt = 0
        while True:
            attempt += 1
            try:
                results = list(r.search(query,
                                        subreddit=subreddit,
                                        limit=limit,
                                        syntax='cloudsearch',
                                        sort=sort))
                break
            except HTTPException:
                if attempt >= self.MAX_ATTEMPTS:
                    raise
                time.sleep(backoff)
                backoff *= 2

        # The query range was padded, so drop anything outside the range
        # that was actually asked for.
        search_results = [s for s in results
                          if lowest <= s.created <= highest]

        # Give results as a generator
        return (submission for submission in search_results)

    def _generate(self, subreddit, low_timestamp, high_timestamp,
                  limit=100, sort='hot'):
        """Yield at most `limit` submissions, scanning window by window.

        The range is walked from `high_timestamp` down to `low_timestamp`
        in windows that start at one hour. A window that returns 100 or
        more results is shrunk and queried again; a window that yields
        fewer than 50 in-range results makes the next window larger.
        HTTP errors are retried with a doubling sleep until they succeed.

        Raises (on first iteration):
            ValueError: `sort` is not in `self.sorts`, or a timestamp is
                None.
        """
        self._check_sort(sort)
        self._check_timestamps(low_timestamp, high_timestamp)
        lowest, highest, low_timestamp, high_timestamp = self._widen_range(
            low_timestamp, high_timestamp)

        r = self._session()

        window_size = 60 * 60
        search_limit = 100
        min_search_results_in_window = 50
        window_adjustment_ratio = 1.25
        backoff = self.BACKOFF_START

        processed_submissions = 0
        prev_win_increased = False
        prev_win_decreased = False

        # Stop as soon as the user-specified limit is reached instead of
        # continuing to query older windows whose results would be discarded.
        while high_timestamp >= low_timestamp and processed_submissions < limit:
            t1 = max(high_timestamp - window_size, low_timestamp)
            t2 = high_timestamp
            query = 'timestamp:{}..{}'.format(t1, t2)
            try:
                results = list(r.search(query,
                                        subreddit=subreddit,
                                        limit=search_limit,
                                        syntax='cloudsearch',
                                        sort=sort))
            except HTTPException:
                time.sleep(backoff)
                backoff *= 2
                continue

            # A successful request ends the error streak.
            backoff = self.BACKOFF_START

            # `window_size > 1` keeps the window from collapsing to zero,
            # which would re-issue the identical query forever.
            if len(results) >= search_limit and window_size > 1:
                # Decrease the window size as we've got too many results
                power = 2 if prev_win_decreased else 1
                window_size = max(
                    1, int(window_size / window_adjustment_ratio ** power))
                prev_win_decreased = True
                # Since it is possible that there are more submissions
                # in the current window, we have to re-do the request
                # with reduced window (i.e. don't yield results yet)
                continue
            prev_win_decreased = False

            in_range = [s for s in results
                        if lowest <= s.created <= highest]

            # Only hand out as many submissions as are still allowed.
            # The loop condition guarantees `remaining` is positive.
            remaining = limit - processed_submissions
            allowed = in_range[:remaining]
            for submission in allowed:
                yield submission
            processed_submissions += len(allowed)

            high_timestamp -= (window_size + 1)

            if len(in_range) < min_search_results_in_window:
                power = 2 if prev_win_increased else 1
                window_size = int(window_size * window_adjustment_ratio ** power)
                prev_win_increased = True
            else:
                prev_win_increased = False
import praw
import time
import os
from datetime import datetime
import timestring
from search import Search
# Module-level Search instance used by Slider.get_subreddit_posts.
search = Search()
class Slider:
    """Turn a textual time filter into a timestamp-bounded subreddit search."""

    def __init__(self):
        self.sorts = ['hot', 'new', 'controversial', 'top',
                      'top_from_week', None]
        self._refresh_today()

    def _refresh_today(self):
        """Recompute the `today*` attributes for the current UTC day."""
        self.today = timestring.Range('today', tz='UTC')
        self.today_beginning = int(self.today.start.to_unixtime())
        self.today_end = int(self.today.end.to_unixtime())

    def get_all_posts(self, filter):
        """Not implemented yet."""
        pass

    def get_subreddit_posts(self, name, dimension, delta, sort, limit):
        """Fetch submissions from subreddit `name` within a time range.

        Args:
            name: subreddit to search.
            dimension: text handed to `timestring.Range` to get the start
                and end of the range, or a falsy value.
            delta: value added to the current time to get the start of a
                range that ends now, or a falsy value.
                NOTE(review): the accepted format is whatever
                `timestring.Date.__add__` supports - confirm.
            sort: sort order forwarded to the search.
            limit: maximum number of submissions forwarded to the search.

        With neither `dimension` nor `delta`, the current UTC day is used.

        Returns:
            A list of dicts with keys 'info', 'timestamp', 'href', 'date'.

        Raises:
            ValueError: both `dimension` and `delta` were given, or
                timestring rejected the range. The search itself may
                raise ValueError as well.
        """
        subreddit = name

        # Can't have a dimension and a delta
        # i.e. 'tuesday to wednesday 30 minutes ago' doesn't make sense
        if dimension and delta:
            raise ValueError("Can't filter by range `and` delta... Please use only one filter.")

        try:
            if dimension:
                window = timestring.Range(dimension, tz='UTC')
                start = int(window.start.to_unixtime())
                end = int(window.end.to_unixtime())
            elif delta:
                now = timestring.Date('now', tz='UTC')
                window = timestring.Range(now + delta, end=now)
                start = int(window.start.to_unixtime())
                end = int(window.end.to_unixtime())
            else:
                # If neither a dimension or a delta is provided, use the
                # default range, i.e. start of the current day -> end of
                # the current day.
                start, end = self.todays_dimensions
        except timestring.TimestringInvalid:
            raise ValueError('Invalid range given...')

        print('Fetching {} `{}` submissions from /r/{} between {} and {}...'.format(limit, sort, subreddit, start, end))

        generator = search.generate(subreddit,
                                    start,
                                    end,
                                    sort=sort,
                                    limit=limit)

        submissions = [{'info': str(s),
                        'timestamp': int(s.created_utc),
                        'href': s.permalink,
                        'date': timestring.Date(int(s.created_utc), tz='UTC')
                                .format('%Y/%m/%d %H:%M:%S (%Z)')}
                       for s in generator]

        return submissions

    @property
    def todays_dimensions(self):
        """Return `(start, end)` unix timestamps of the current UTC day."""
        # "today" used to be computed only once in __init__, so an instance
        # that outlives midnight kept serving the day it was created on.
        # Refresh on every access instead.
        self._refresh_today()
        return self.today_beginning, self.today_end
This diff is collapsed.
This diff is collapsed.
<!DOCTYPE HTML>
<!-- Search form for Reddit Slider. The opening <html> tag was missing even
     though the document is closed with </html>. -->
<html>
<head>
    <script
        src="https://code.jquery.com/jquery-3.0.0.min.js"
        integrity="sha256-JmvOoLtYsmqlsWxa7mDSLMwa6dZ9rrIdtrrVYRnDRH0="
        crossorigin="anonymous"></script>
    <link rel="stylesheet" href="{{ url_for('static', filename='bootstrap.min.css') }}">
</head>
<body>
    <div class="container">
        <div class="content">
            <br>
            <h1>Reddit Slider</h1>
            <hr>
            <p>Please enter your query using the options below...</p>
            <div id="error" class="alert alert-danger" style="display:none;"></div>
            <form class="form-inline">
                <fieldset class="form-group">
                    <label for="subreddit"><strong>Subreddit</strong></label>
                    <div class="input-group">
                        <span class="input-group-addon" id="basic-addon1">/r/</span>
                        <input type="text" id="subreddit" class="form-control" placeholder="programming">
                    </div>
                </fieldset>
                <fieldset class="form-group">
                    <label for="range"><strong>Range</strong></label>
                    <input type="text" id="range" class="form-control" placeholder="Today">
                </fieldset>
                <fieldset class="form-group">
                    <label for="sort"><strong>Sort</strong></label>
                    <!-- Each value must match a sort name the server accepts;
                         "New" and "Controversial" used to submit "top". -->
                    <select class="form-control" id="sort">
                        <option value="hot" selected>Hot</option>
                        <option value="top">Top</option>
                        <option value="new">New</option>
                        <option value="controversial">Controversial</option>
                    </select>
                </fieldset>
                <fieldset class="form-group">
                    <label for="limit"><strong>Limit</strong></label>
                    <input type="number" class="form-control" id="limit" placeholder="100">
                </fieldset>
                <button id="search-btn" class="btn btn-primary-outline" type="submit">Search</button>
            </form>
            <br>
            <div id="info" class="alert alert-info" style="display:none;"></div>
            <hr>
            <div class="list-group" id="results">
                <div id="loading" style="display:none;">Loading...</div>
            </div>
        </div>
    </div>
</body>
<script type="text/javascript">
$('#search-btn').click(function(event) {
event.preventDefault();
var subreddit = $('#subreddit').val() || 'programming';
var range = $('#range').val() || 'today';
var sort = $('#sort').val() || 'hot';
var limit = $('#limit').val() || '100';
var range = range.split(' ').join('+');
if (subreddit == null) {
$('#error').text('Please provide a subreddit');
$('#error').show();
return;
}
var url = '/search/' + subreddit + '/?range=' + range + '&sort=' + sort + '&limit=' + limit;
$('#loading').show();
$('#info').hide();
$('.list-group-item').remove();
$.ajax({
type: 'GET',
url: url,
dataType: 'json',
contentType: 'application/json',
success: function(data) {
$('#loading').hide();
var count = data['count'];
var time = data['time'];
$('#info').text(count + ' results fetched in ' + time + ' seconds...')
$('#info').show();
var submissions = data['submissions'];
for (i=0; i<submissions.length; i++) {
var submission = submissions[i];
$('#results').append('<li class="list-group-item"><a href="' + submission['href'] + '">' + submission['info'] + '</a></li>');
}
$('html,body').animate({scrollTop: $('#info').offset().top}, 'slow');
},
error: function(data) {
var error = data.responseJSON['error'];
$('#loading').hide();
$('#results').append('<p class="result">No results...</p>');
$('#error').text(error);
$('#error').show();
}
});
});
</script>
</html>
import praw
from praw.helpers import submissions_between
from search import Search
import time
# Bounds of the 24-hour window both benchmarks search. Read as plain epoch
# seconds they are 2016-06-20 12:00:00 UTC and 2016-06-21 12:00:00 UTC.
LOW = 1466424000
HIGH = 1466510400
def time_praw():
    """Time PRAW's `submissions_between` on /r/askreddit over LOW..HIGH.

    Prints how many submissions were found and the elapsed seconds, then
    returns the submissions as a list.
    """
    reddit = praw.Reddit('/u/oracular_demon tests')

    started = time.time()
    found = list(submissions_between(reddit, 'askreddit', verbosity=0,
                                     lowest_timestamp=LOW,
                                     highest_timestamp=HIGH))
    finished = time.time()

    print('\n{} submissions found between {} and {}.'.format(len(found), LOW, HIGH))
    print('Time elapsed: {} seconds'.format(finished - started))
    return found
def time_search():
    """Time `Search.generate` on /r/askreddit over LOW..HIGH.

    Uses sort 'new' with a limit of 500. Prints how many submissions were
    found and the elapsed seconds, then returns the submissions as a list.
    """
    searcher = Search()

    started = time.time()
    found = list(searcher.generate('askreddit', LOW, HIGH, sort='new', limit=500))
    finished = time.time()

    print('\n{} submissions found between {} and {}.'.format(len(found), LOW, HIGH))
    print('Time elapsed: {} seconds'.format(finished - started))
    return found
if __name__ == '__main__':
    # Run both benchmarks, then compare their results position by position.
    print('Testing submissions between using PRAW API Helper...')
    submissions1 = time_praw()

    print('\nTesting submissions between using my search API...')
    submissions2 = time_search()

    # zip() stops at the shorter list, so only the common prefix is compared
    for i, (from_praw, from_search) in enumerate(zip(submissions1, submissions2)):
        print('{}.\t {} == {} => {}'.format(i, from_praw.id, from_search.id,
                                           from_praw.id == from_search.id))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment