# Commit e20c522e authored by Alberto Cammozzo
#
# filename graphing
#
# parent 7f065890
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Alberto Cammozzo
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import os
import sys
import getopt
import codecs
import bson
import datetime
import validators
from pymongo import MongoClient
from argparse import ArgumentParser
import numpy as np
from numpy import fft
import pandas as pd
import matplotlib.pyplot as plt
def buildDict(i, newsp, time):
    """Increment the article counter for `newsp` on day `time`.

    i     -- nested dict {newspaper: {date: article_count}}, mutated in place
    newsp -- newspaper name (outer key)
    time  -- publication date (inner key)
    """
    # setdefault + get collapses the original nested if/else into the
    # standard "ensure bucket, bump counter" idiom
    bucket = i.setdefault(newsp, {})
    bucket[time] = bucket.get(time, 0) + 1
def fourierExtrapolation(x, n_predict):
    """Extrapolate signal `x` by `n_predict` samples using its Fourier series.

    Detrends `x` linearly, keeps the lowest-frequency harmonics of the FFT,
    and rebuilds the signal over the extended time axis (trend re-added).
    Based on https://gist.github.com/tartakynov/83f3cd8f44208a1856ce

    x         -- 1-D numpy array of samples
    n_predict -- number of samples to extrapolate beyond the end of x
    Returns (error, signal): error is True (with an empty signal) when the
    linear fit fails, otherwise False with the restored+extrapolated array.
    """
    n = x.size
    n_harm = 10  # number of harmonics in model
    t = np.arange(0, n)
    try:
        p = np.polyfit(t, x, 1)  # find linear trend in x
    except Exception as e:
        print(f"np.polyfit error: {e} with ({t} {x} 1)")
        return (True, [])
    x_notrend = x - p[0] * t          # detrended x
    x_freqdom = fft.fft(x_notrend)    # detrended x in frequency domain
    f = fft.fftfreq(n)                # frequencies
    # sort indexes by absolute frequency, lower -> higher, so the slice below
    # keeps the n_harm lowest-frequency harmonics (the original had lost this
    # sort: two redundant range(n) assignments with no ordering applied)
    indexes = sorted(range(n), key=lambda i: np.absolute(f[i]))
    t = np.arange(0, n + n_predict)
    restored_sig = np.zeros(t.size)
    for i in indexes[:1 + n_harm * 2]:
        ampli = np.absolute(x_freqdom[i]) / n  # amplitude
        phase = np.angle(x_freqdom[i])         # phase
        restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)
    return (False, restored_sig + p[0] * t)
def plotall(x,days,minart,t,filedir):
    """Plot the daily article-count series for one source and save it as SVG.

    Draws weekday/weekend scatter, an fft extrapolation, a rolling average
    with ±1..4 sigma bands, below-band alert markers and a low-collection
    marker, then saves `timeseries_{t}_since_{first day}.svg` into `filedir`.

    x       -- pandas Series of daily article counts indexed by date (NaN = no data)
    days    -- rolling-average window length in days
    minart  -- workday article count below which a point is flagged
    t       -- source (newspaper) name, used in the title and file name
    filedir -- output directory for the .svg file
    Returns the generated file name (without directory).
    """
    # various plots with alerts
    import matplotlib.dates as mdates
    import statsmodels.api as sm  # NOTE(review): imported but never used in this function
    year = mdates.YearLocator(month=1)
    month = mdates.MonthLocator(interval=1)
    year_format = mdates.DateFormatter('%Y')
    month_format = mdates.DateFormatter('%m')
    # working vectors setup, labels: dday is the date part of the earliest index entry
    (dday,dtime)=str(min(x.index)).split(' ')
    label=t+" since "+dday
    # fig setup
    fig,ax = plt.subplots(1,1)
    ax.xaxis.grid(True, which = 'minor')
    ax.xaxis.set_minor_locator(month)
    ax.xaxis.set_major_locator(year)
    ax.xaxis.set_major_formatter(year_format)
    ax.xaxis.set_minor_formatter(month_format)
    # NOTE(review): this opens a SECOND figure — the locator/formatter setup on
    # `ax` above applies to the first one, while all plotting below (and the
    # final gcf()/savefig) lands on this new current figure; confirm intended
    plt.figure(figsize=(20.5, 10.5))
    plt.ylabel("number of daily articles")
    plt.ylim(bottom=0)
    plt.ylim(top=x.max())
    plt.title(label)
    # colors
    cwe='#ffbf00' # weekend
    cwd='#bfff00' # weekday
    fft='#ff00bf' # fft extrapolation (NOTE: shadows the module-level `fft` import inside this function; only a colour string here)
    alr='#ff0040' # alert
    avr='#99ccff' # rolling average
    zer='#ff4000' # zero articles
    std='#bdccdb' # standard dev
    # plot workday/weekend series (dayofweek: 0=Mon .. 6=Sun)
    wd=x.where(x.index.dayofweek <5) #weekdays
    we=x.where(x.index.dayofweek >4) #weekends
    wd.plot(label="workday article", marker='.', color=cwd, alpha=0.6, linestyle='None', subplots=True)
    we.plot(label="weekend article", marker='.', color=cwe, alpha=0.6, linestyle='None', subplots=True)
    # fft extrapolation of the 0-filled series (no future prediction, fit only)
    n_predict = 0 # prediction days
    (error,extrapolation) = fourierExtrapolation(x.fillna(0).values, n_predict)
    if error:
        print(f"skipping {t} because fourierExtrapolation returned error on {x.fillna(0)}")
    else:
        xe=pd.Series(extrapolation[0:len(x.index)],index=x.index)
        xe.plot(label = 'fft extrapolation', linestyle="-", linewidth=3, color=fft, alpha=0.6, subplots=True)
    # average
    # 0 filled series: nan or 0 articles
    m=x.fillna(0).rolling(days).mean()
    labelavg=f"{str(days)} days rolling average "
    m.plot(label=labelavg, color=avr, linewidth=3, alpha=0.3, subplots=True)
    # std sigma bands around the rolling average, plus below-band alert markers
    for sigma in [1,2,3,4]:
        s=x.fillna(0).rolling(days).std()
        path_deviation = sigma * s
        under_line = (m -(s *sigma))
        over_line = (m +(s *sigma))
        labelsig=f"±{str(sigma)}σ of rolling average"
        plt.fill_between(path_deviation.index, under_line, over_line, label=labelsig, color=std, alpha=0.08)
        # alerts: only points BELOW the band are flagged (upper excursions are not)
        # a=x.where(((x < (m - s)) | (x > (m + s)) ) , np.nan)
        a=x.where(((x < (m - (s *sigma)))) , np.nan)
        a.plot(label="<"+labelsig ,marker='X', color=alr, alpha=0.7, linestyle='None', subplots=True)
    # zero or low collection on workdays only
    z=pd.DataFrame()
    z=x.where((x<minart) & (x.index.dayofweek < 5), np.nan )
    z.plot(label="<"+str(minart)+" articles in workdays" ,marker='|', color=zer, alpha=0.7, linestyle='None', subplots=True)
    # plot figure & save, replacing any previous file of the same name
    plt.figlegend(loc="upper right")
    fig= plt.gcf()
    filename=f"timeseries_{t}_since_{dday}.svg"
    path=filedir+"/"+filename
    if os.path.exists(path):
        os.remove(path)
    fig.savefig(path)
    plt.close('all')
    return(filename)
#######
# MAIN
#######
# parse commandline
parser = ArgumentParser()
parser.add_argument("dbname", type=str, help='database name')
parser.add_argument("-s", "--server", dest="server", help="mongodb server address:port")
parser.add_argument("-f", "--fromyear", dest="fromyear", help="start from year")
# help text fixed: it was a copy-paste of --fromyear's "start from year"
parser.add_argument("-t", "--toyear", dest="toyear", help="up to year (inclusive)")
parser.add_argument("-D", "--outdir", dest="outdir", help="output directory for graph files")
parser.add_argument("-v", "--verbose", action="store_true", help="be chatty")
(args) = parser.parse_args()
# parameters
filedir = "out/" + args.dbname
rolling = 90   # rolling average window (days)
minart = 10    # threshold of minimum articles for plotting
numyears = [2, 1]  # plot last n years
datelimit = datetime.datetime(2007, 1, 1)  # needed to overcome double index error (bug) in resampling
# connect to DB
dbname = args.dbname
if args.server:
    server = args.server
else:
    server = "127.0.0.1:20017"
# build datetime query: half-open range [fromyear-01-01, (toyear+1)-01-01)
# (the original used $lt toyear-12-31, which wrongly excluded Dec 31 itself)
q = {}
if args.fromyear:
    q = {'datesPublished': {'$gte': datetime.datetime(int(args.fromyear), 1, 1)}}
if args.toyear:
    q = {'datesPublished': {'$lt': datetime.datetime(int(args.toyear) + 1, 1, 1)}}
if args.toyear and args.fromyear:
    q = {'$and': [
        {'datesPublished': {'$gte': datetime.datetime(int(args.fromyear), 1, 1)}},
        {'datesPublished': {'$lt': datetime.datetime(int(args.toyear) + 1, 1, 1)}}
        ]
    }
# connect to DB
client = MongoClient(server)
db = client[dbname]
now = datetime.datetime.now()
articleDB = db["article"]
# collect articles (cursor renamed from `all`, which shadowed the builtin)
try:
    cursor = articleDB.find(q, no_cursor_timeout=False)
except Exception as e:
    print(f"error connecting to db {server}/{dbname}: {e}")
    exit()
data = {}
count = 0
for item in cursor:
    if 'sources' in item.keys() and 'datesPublished' in item.keys():
        # source processing: sources[0] is "proto|newspaper|feed"
        try:
            (proto, newsp, feed) = item['sources'][0].split('|', 2)
            # the original called feed.replace(...) without binding the result
            # (strings are immutable, so it was a no-op) — keep the value
            feed = feed.replace(':', '|').replace(' ', '_')
        except Exception as e:
            continue
        # datesPublished processing: keep only plausible dates (after the
        # datelimit cutoff, not in the future)
        ts = []
        for d in item['datesPublished']:
            if isinstance(d, datetime.date):
                if ((d < now) and (d > datelimit)):
                    ts.append(d)
        # exclude nearDuplicates
        # NOTE(review): `indexed` is computed but never used below — confirm intent
        if "redisIndexed" in item.keys():
            if "dontIndex" in item.keys():
                indexed = False
            else:
                indexed = True
        else:
            indexed = "NA"
        # output: count the article under its earliest publication date
        if len(ts) > 0:
            ts.sort()
            first = ts[0]
            last = ts[-1]
            span = (last - first).days
            src = item['sources'][0]
            buildDict(data, newsp, first.date())
            count += 1
if args.verbose: print(f"#got {count} items for {args.dbname} and query {q}")
# convert dict in a dataframe
# missing data as NaN, parsing errors as NaT
ddf = pd.DataFrame(data)
ddf.index = pd.to_datetime(ddf.index, yearfirst=True, errors="coerce")
ddf.index.name = "date"
ddf.sort_index(inplace=True)
# plotting, build HTML index page
index = "<html> <head><title>TIPS Project - quality assessment</title></head>\n"
# stray zero-width space removed from the stylesheet string
index += "<style>.tab {border-collapse:collapse;}\n </style>\n"
index += f"<h1>{args.dbname} articles</h1>\n"
index += f"<p>from {ddf.index[0].date()} to {ddf.index[-1].date()}</p>\n"
index += f"<table cellspacing=\"0\" cellpadding=\"0\" border=\"0\" style=\"width: 100%\"><tr>"
index += f"<th></th>"
# one empty header cell per "last n years" column
for i in numyears:
    index += f"<th></th>"
index += "</tr>\n"  # close the header row (was left unclosed)
# one table row per source: full-history plot, then one plot per "last n years"
for t in ddf.keys():
    index += f"<tr><td>{t}</td>"
    # upper bound: clamp the last observation to now
    lastd = ddf[t].last_valid_index()
    if (lastd > now):
        lastd = pd.to_datetime(now)
    # lower bound: clamp to 2007-01-01 (see datelimit above)
    firstd = ddf[t].first_valid_index()
    if (firstd < datetime.datetime(2007, 1, 1)):
        firstd = pd.to_datetime(datetime.datetime(2007, 1, 1))
    if args.verbose: print(f"selecting {t} [{firstd} {lastd}]")
    try:
        s = pd.Series(ddf[t], index=pd.date_range(start=firstd, end=lastd, freq='D'))
    except Exception as e:
        index += "<td>N/A</td>"
        if args.verbose: print(f"skipping {t}: {e}")
        continue
    # tighten the bounds to the actual valid data range, then rebuild
    firstd = s.first_valid_index()
    lastd = s.last_valid_index()
    if args.verbose: print(f"adjusting {t} [{firstd} {lastd}]")
    try:
        series = pd.Series(s, index=pd.date_range(start=firstd, end=lastd, freq='D'))
    except Exception as e:
        index += "<td>N/A</td>"
        if args.verbose: print(f"skipping {t}: {e}")
        continue
    filename = plotall(series, rolling, minart, t, filedir)
    index += f"<td>"
    # link the generated SVG: the scraped source had the placeholder "(unknown)"
    # here while `filename` went unused — restored to the computed file name
    index += f"<a href=\"{filename}\"><img alt=\"{t} [{firstd} {lastd}]\" style=\"width:100%;\" src=\"{filename}\"></a>"
    index += f"</td>"
    for years in numyears:
        # upper bound: need at least `years` worth of data
        if ((lastd.date() - firstd.date()).days < (365 * years)):
            index += "<td>N/A</td>"
            if args.verbose: print(f"skipping {t}: no data")
        else:
            firstd = lastd - datetime.timedelta(days=years * 365)
            # NOTE(review): this try only guards a verbose print; looks like a
            # leftover — the Series construction below it is unguarded. Confirm.
            try:
                if args.verbose: print(f"selecting {t} [{firstd} {lastd}]")
            except Exception as e:
                index += "<td>N/A</td>"
                if args.verbose: print(f"skipping {t}: {e}")
                continue
            s = pd.Series(ddf[t], index=pd.date_range(start=firstd, end=lastd, freq='D'))
            firstd = s.first_valid_index()
            lastd = s.last_valid_index()
            try:
                series = pd.Series(s, index=pd.date_range(start=firstd, end=lastd, freq='D'))
            except Exception as e:
                index += "<td>N/A</td>"
                if args.verbose: print(f"skipping {t}: {e}")
                continue
            if args.verbose: print(f"adjusting {t} [{firstd} {lastd}]")
            filename = plotall(series, rolling, minart, t, filedir)
            index += f"<td>"
            index += f"<a href=\"{filename}\"><img alt=\"{t} [{firstd} {lastd}]\" style=\"width:100%;\" src=\"{filename}\"></a>"
            index += f"</td>"
    index += "</tr>\n"
# build aggregate tables by year, month, day of week
byyear = ddf.groupby(ddf.index.year).sum()
bymonth = ddf.groupby(ddf.index.month).sum()
bydow = ddf.groupby(lambda x: x.dayofweek, axis=0).sum()
# close the per-source table; the stray "</HTML>" that used to be emitted here
# (before more content was appended) now goes at the very end of the document
index += "</table>\n"
index += "<p><h2>Aggregate tables and plots</h2></p><table>\n"
index += "<th></th><th></th>\n"
tables = [byyear, bymonth, bydow]
n = ["year", "month", "day of week"]
for i in range(len(tables)):
    index += f"<tr><td><h2>By {n[i]}</h2>"
    index += tables[i].astype(int).to_html()
    index += "</td><td>"
    plt.figure(figsize=(20.5, 10.5))
    plot = tables[i].plot.bar()
    fig = plot.get_figure()
    filename = filedir + "/" + str(n[i]) + ".png"
    fig.savefig(filename)
    index += f"<a href=\"{n[i]}.png\"><img alt=\"{n[i]}\" style=\"width:100%;\" src=\"{n[i]}.png\"></a>"
    index += "</td></tr>"  # was the malformed "<td></tr>"
index += "</table>\n</html>\n"  # close the aggregate table and the document
# Write down index: open(..., "w") truncates any existing file, so the old
# explicit os.remove + "w+" dance was redundant; the context manager also
# guarantees the file is closed even if the write fails
index_name = "out/" + args.dbname + "/index.html"
with open(index_name, "w") as f:
    f.write(index)