Commit 6758b7cd authored by jnanar's avatar jnanar

Fix: the score was incorrectly used as an input feature for the determination. Added a new notebook to analyze the diaries without it.

parent 2f6ceea2
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -93,7 +93,7 @@ class LinuxFR:
"""
# We append the new lines to CSV
fieldnames = ['title', 'author', 'url', 'score', 'content', 'quality_content', 'count', 'datetime',
'author_url', 'author_previous_scores', 'birthday']
'author_url', 'author_previous_scores', 'birthday', 'n_comments', 'comments_scores']
with open(self.filename, 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='£', quotechar='µ')
writer.writeheader()
......@@ -169,7 +169,6 @@ def extract_user_data(base_url='', url=''):
content_scores = soup.find_all("figure", class_="score")
for entry in content_scores:
entry = entry.contents
score = float(entry[0])
list_scores.append(score)
......@@ -177,7 +176,8 @@ def extract_user_data(base_url='', url=''):
return list_scores, user_birthday
def extract_data(soup=None, div_class=None, figure_class=None, meta_content=None, datetime=None):
def extract_data(soup=None, div_class=None, figure_class=None, meta_content=None, datetime=None,
comments=False):
"""
Extract data from the diary page
:param datetime:
......@@ -194,6 +194,7 @@ def extract_data(soup=None, div_class=None, figure_class=None, meta_content=None
if figure_class:
content = soup.find("figure", class_=figure_class)
if meta_content:
# Find name: meta_content = "author"
content = soup.find("meta", {"name": meta_content})['content']
a = 0
if datetime:
......@@ -214,6 +215,26 @@ def extract_data(soup=None, div_class=None, figure_class=None, meta_content=None
return datetime
content_string = str(content)
if comments:
n_comments = 0
# itemprops = soup.find_all("meta", {"itemprop": 'interactionCount'})
# for item in itemprops:
# if 'UserComments:' in item.attrs['content']:
# n_comments = item.attrs['content']
# n_comments = n_comments.replace('UserComments:', '')
# n_comments = float(n_comments)
comments= soup.find_all("span", class_="score")
scores = []
for comment in comments:
score = comment.contents
score = float(score[0])
scores.append(score)
n_comments = len(scores)
return n_comments, scores
try:
content_string = html2text.html2text(content_string)
except TypeError:
......@@ -234,8 +255,9 @@ def collect_diaries(base_url='https://linuxfr.org', diaries_urls=[], known_urls=
diaries = []
for url in diaries_urls:
i = 1
if url not in known_urls:
logger.info("Extract data from {}".format(url))
logger.info("{} Extract data from {}".format(i,url))
path = base_url + url
bs = BeautifulSoup(get_soup(path), "lxml")
content_string = extract_data(soup=bs, div_class='content entry-content')
......@@ -248,11 +270,13 @@ def collect_diaries(base_url='https://linuxfr.org', diaries_urls=[], known_urls=
datetime = extract_data(soup=bs, datetime=True)
author_url = url.split("/journaux/")[0]
author_previous_scores, birthday = extract_user_data(base_url=base_url, url=author_url)
n_comments, comments_scores= extract_data(soup=bs, comments=True)
diary_dict = {'url': url, 'content': content_string, 'score': score,
'title': title, 'author': author, 'quality_content': '', 'count': len(content_string),
'datetime': datetime, 'author_url': base_url+author_url, 'author_previous_scores': author_previous_scores,
"birthday":birthday}
"birthday":birthday, 'n_comments': n_comments, 'comments_scores': comments_scores}
diaries.append(diary_dict)
i += 1
return diaries
......@@ -260,7 +284,7 @@ def launcher():
base_url = 'https://linuxfr.org'
diaries_urls = []
lf = LinuxFR(base_url)
lf.set_filename('out_of_sample_complete2.csv')
lf.set_filename('linuxfr_total.csv')
parse_without_content = True
lf.open_csv()
known_urls = lf.get_urls()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment