Commit 3cff359f authored by maxigas's avatar maxigas

Get talks from the CCC website via iCalendar and parse them into a big CSV:...

Get talks from the CCC website via iCalendar and parse them into a big CSV: Works but needs some tuning
parents
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/python3
from sh import torify, mv
import os

# Year of the first congress and the serial number of the latest one;
# together they bound the range of years that is attempted below.
first = 1984
serial = 31
urlhead = "https://events.ccc.de/congress/"
urltail = "/Fahrplan/"
# Candidate schedule file names; each one is tried for every year.
names = ["schedule.ics", "schedule.en.ics", "fahrplan.ics"]

# Walk the years backwards, from first + serial down to first + 2.
for y in range(first + serial, first + 1, -1):
    year = str(y)
    # Becomes True as soon as at least one candidate file was fetched.
    win = False
    for name in names:
        url = urlhead + year + urltail + name
        target = year + name
        try:
            torify("wget", "-O", target, url)
            win = True
        except Exception:
            # Best effort: a missing schedule is expected for many years.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            # wget -O creates the output file before the transfer, so a
            # failed request leaves an empty file behind; remove it so it
            # is not mistaken for a real schedule later.
            try:
                if os.path.getsize(target) == 0:
                    os.remove(target)
            except OSError:
                pass
    if win:
        print("*", year, "OK")
    else:
        print("-", year, "MISSING")
#!/usr/bin/python3
import csv, os, codecs
import vobject, chardet
# OUTPUT we want:
# congress, title, abstract, link, tags
# SOURCE terminology:
# VID, SUMMARY, DESCRIPTION, URL, n/a
# Column order of the CSV output.
fieldnames = ["congress", "title", "abstract", "link", "tags"]

# How to read a vobject?
# vobject.readComponents(icalstream).next().vevent.dtstart.value

# First year:
y = 1984
# Last serial number:
s = 33
# Calendar year mapped to congress label: 1984 is "1C3", 1985 is "2C3", ...
congresses = {
    year: "{}C3".format(number)
    for number, year in enumerate(range(y, y + s), start=1)
}
# Every entry of the current directory whose name contains ".ics".
icsfilenames = list(filter(lambda entry: ".ics" in entry, os.listdir(".")))
# Name of the CSV file the talks are written to.
o = "cccongresstalks.csv"
def slurp(filename):
    """Read *filename* and return its content as a list of text lines.

    The encoding is guessed by ``chardet`` from the raw bytes and printed.
    The bytes are then decoded in memory, so the file is read only once.
    Lines keep their line terminators.

    When ``chardet`` reports no encoding (it returns ``None``, e.g. for an
    empty file), UTF-8 with replacement characters is used instead of
    silently depending on the locale's default encoding.
    """
    with open(filename, 'rb') as f:
        raw = f.read()
    encoding = chardet.detect(raw)['encoding']
    print(encoding)
    if encoding is None:
        return raw.decode('utf-8', errors='replace').splitlines(keepends=True)
    return raw.decode(encoding).splitlines(keepends=True)
def vobjectstreams(inputfile):
    """Accepts a list of lines, returns a list of VEVENT strings.

    Lines are collected from a line containing "BEGIN:VEVENT" up to and
    including the next line containing "END:VEVENT"; each such run is
    concatenated into one string. Lines outside of an event are dropped.
    """
    streams = []
    pending = []
    collecting = False
    for line in inputfile:
        if "BEGIN:VEVENT" in line:
            collecting = True
            pending.append(line)
            continue
        if "END:VEVENT" in line:
            collecting = False
            pending.append(line)
            # The event is complete: emit it and start over.
            streams.append("".join(pending))
            pending = []
            continue
        if collecting:
            pending.append(line)
    return streams
def fixlines(badlines):
    """Return *badlines* with continuation lines folded into their owner.

    - Lines containing "ATTENDEE" are dropped.
    - A line is kept as a line of its own when it contains ":" or ";" and
      the text before the first ":" or the first ";" is upper case.
    - Any other line is treated as a continuation: the previous kept line
      is stripped of surrounding whitespace and the line is appended to it.

    A continuation that has no previous kept line (for example a blank
    first line) is kept as its own entry instead of raising IndexError.
    """
    goodlines = []
    for line in badlines:
        if "ATTENDEE" in line:
            continue
        has_separator = ":" in line or ";" in line
        starts_property = (line.split(":")[0].isupper()
                           or line.split(";")[0].isupper())
        if (has_separator and starts_property) or not goodlines:
            goodlines.append(line)
        else:
            goodlines[-1] = goodlines[-1].strip() + line
    return goodlines
def vobjects(vobjectstreams):
    """Accepts a list of vobject streams, returns a list of vobjects.

    Each stream is parsed with ``vobject.readComponents`` and its first
    component is kept. A stream that fails to parse is reported on stdout
    together with its text and skipped, so one bad event does not abort
    the whole file.
    """
    vs = []
    for v in vobjectstreams:
        try:
            vs.append(list(vobject.readComponents(v))[0])
        except Exception:
            # Exception rather than a bare except, so that KeyboardInterrupt
            # and SystemExit still stop the program.
            print("ERROR PARSING vobject:")
            print(v)
    return vs
# Only for testing, delete later
def dtstartvalues(vobjects):
    """Return the ``dtstart.value`` of every object in *vobjects*, in order."""
    starts = []
    for component in vobjects:
        starts.append(component.dtstart.value)
    return starts
def talks(vobjects):
    """Turn the parsed events of one schedule file into CSV row dicts.

    The congress year is taken from the fifth "/"-separated segment of the
    first event's URL and translated to a congress label through the
    module-level ``congresses`` mapping; every row of the file gets that
    same label.

    Each row has the keys "congress", "title", "abstract", "link" and
    "tags". The abstract is "N/F" when an event has no description, and
    "tags" is always an empty string.

    Returns an empty list when *vobjects* is empty, instead of raising
    IndexError on the first-event lookup.
    """
    events = list(vobjects)
    if not events:
        return []
    year = int(events[0].url.value.split("/")[4])
    congress = congresses[year]
    rows = []
    for event in events:
        if hasattr(event, 'description'):
            abstract = event.description.value
        else:
            abstract = "N/F"
        rows.append({"congress": congress,
                     "title": event.summary.value,
                     "abstract": abstract,
                     "link": event.url.value,
                     "tags": ""})
    return rows
def _writer(csvfile, fieldnames):
writer = csv.DictWriter(csvfile,
fieldnames=fieldnames,
delimiter='|',
quoting=csv.QUOTE_ALL)
return writer
def save(filename, fieldnames, rows):
    """Append *rows* (dicts keyed by *fieldnames*) to the CSV *filename*.

    The file is opened with ``newline=''`` as the csv module requires, so
    the writer's own line endings are not translated a second time and
    newlines embedded in quoted fields are written unchanged.
    """
    with open(filename, 'a', newline='') as csvfile:
        _writer(csvfile, fieldnames).writerows(rows)
def new(filename, fieldnames):
    """Create (or truncate) the CSV *filename* and write its header row.

    The file is opened with ``newline=''`` as the csv module requires, so
    the writer's own line endings are not translated a second time.
    """
    with open(filename, 'w', newline='') as csvfile:
        _writer(csvfile, fieldnames).writeheader()
# CALLS
# Start from a fresh CSV holding only the header row, then append the talks
# extracted from each schedule file in turn.
new(o, fieldnames)
for icsfilename in icsfilenames:
    print(icsfilename)
    # Pipeline: raw lines, folded lines, VEVENT strings, parsed events, rows.
    lines = fixlines(slurp(icsfilename))
    events = vobjects(vobjectstreams(lines))
    save(o, fieldnames, talks(events))
# print(congresses)
# OUTPUT we want:
# congress, title, abstract, link, tags
# SOURCE terminology:
# VID, SUMMARY, DESCRIPTION, URL, n/a
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
* 2002
* 2003
* 2004
* 2005
* 2006
* 2007
* 2008
* 2009
* 2010
* 2011
* 2012
* 2013
* 2014
* 2015
* 2016 ???
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment