Commit e8a6f6d6 authored by jnanar's avatar jnanar

Better notebook and associated scripts

parent 51d4c575
......@@ -60,10 +60,14 @@ class LinuxfrPredictor:
:param feed: the feed from feedparser
:return: a dictionary containing the information
"""
boundaries = [-50, 0, 50]
scores = {
'https://linuxfr.org/users/lawless/journaux/l-equipe-ubuntu-desktop-aimerait-avoir-vos-commentaires': 14,
'https://linuxfr.org/users/rydroid/journaux/sortie-de-replicant-6-0': 15,
'https://linuxfr.org/users/eaufroide/journaux/retour-d-experience-yunohost': 22,
'https://linuxfr.org/users/patrick32/journaux/rem-on-saura-peut-etre-faire-le-cafe-et-pas-vous-ficher-dehors':-7,
'https://linuxfr.org/users/lawless/journaux/l-equipe-ubuntu-desktop-aimerait-avoir-vos-commentaires': 18,
'https://linuxfr.org/users/rydroid/journaux/sortie-de-replicant-6-0': 21,
'https://linuxfr.org/users/eaufroide/journaux/retour-d-experience-yunohost': 23,
'https://linuxfr.org/users/sebastien_p/journaux/nouvelles-distributions-a-venir-sous-windows-10': 31,
'https://linuxfr.org/users/faya/journaux/2-bookmarks-securite-windows': 30,
'https://linuxfr.org/users/fantome_asthmatique/journaux/tres_hs-moyen-de-gamme-sur-lemonde-fr-nan-mais-je-reve': 4,
......@@ -86,7 +90,18 @@ class LinuxfrPredictor:
param_feed['score'] = scores[feed['link']]
except KeyError:
param_feed['score'] = 0
param_feed['quality_content'] = True
if param_feed['score'] < min(boundaries):
param_feed['quality_content'] = 'Magnificient Troll'
if param_feed['score'] > max(boundaries):
param_feed['quality_content'] = 'Quality Troll'
if 0 < param_feed['score'] < max(boundaries):
param_feed['quality_content'] = 'Average Troll'
if 0 > param_feed['score'] > min(boundaries):
param_feed['quality_content'] = 'Great Troll'
return param_feed
def open_csv(self):
......
This diff is collapsed.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# # @author: Arnaud Joset
#
# This file is part of scikit-learn_for_linuxfr.
#
# scikit-learn_for_linuxfr is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scikit-learn_for_linuxfr is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with scikit-learn_for_linuxfr. If not, see <http://www.gnu.org/licenses/>.
#
from bs4 import BeautifulSoup
import urllib3
import logging
# Module-level logger for the parser; DEBUG so scraping progress is visible.
logging.basicConfig()
logger = logging.getLogger("linuxfr_parser")
logger.setLevel("DEBUG")
class LinuxFr:
    """Wrapper around one parsed LinuxFr diary page.

    The ``bs`` attribute holds a BeautifulSoup document for the page;
    the ``get_*`` methods extract individual fields from it.  Most of
    them are still placeholders.
    """

    def __init__(self):
        # Bug fix: the original assigned to a *local* variable ``bs``,
        # so ``self.bs`` never existed until a caller set it externally.
        self.bs = None

    def get_content(self):
        # TODO: extract the diary body from self.bs (placeholder).
        return None

    def get_score(self):
        # TODO: extract the diary score from self.bs (placeholder).
        return None

    def get_author(self):
        # TODO: extract the diary author from self.bs (placeholder).
        return None

    def get_title(self):
        """Return the page title text, or None when no <title> is present.

        Bug fix: ``self.bs.title.name`` is always the literal tag name
        ``"title"``; ``.string`` is the actual title text.
        """
        try:
            title = self.bs.title.string
        except AttributeError:
            return None
        return title
def get_soup(path):
    """Fetch *path* over HTTPS with certificate checking and return the raw HTML bytes."""
    ca_certs = "/etc/ssl/certs/ca-certificates.crt"  # Or wherever it lives.
    pool = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                               ca_certs=ca_certs, )
    response = pool.request('GET', path)
    payload = response.data
    response.close()
    return payload
def parse_journals_list(base_url, count):
    """Collect the unique diary links found on one listing page.

    :param base_url: site root, e.g. https://linuxfr.org
    :param count: 1-based page number of the /journaux listing
    :return: list of diary URL paths, in order of first appearance
    """
    path = base_url + '/journaux?page=' + str(count)
    listing_html = get_soup(path)
    document = BeautifulSoup(listing_html, "lxml")
    found = []
    # Diary entries are the anchors pointing under /users/ whose text is
    # the "Lire la suite" (read more) link.
    for anchor in document.find_all('a'):
        href = anchor.attrs['href']
        if '/users/' in href and 'Lire la suite' in anchor.contents:
            try:
                if href not in found:
                    found.append(href)
            except TypeError:
                pass
    return found
def collect_diaries(base_url='https://linuxfr.org', diaries_urls=''):
    """Download and parse each diary, bundling the extracted fields.

    :param base_url: site root prepended to every diary URL path
    :param diaries_urls: iterable of diary URL paths (the '' default
        iterates nothing, yielding an empty result — kept for
        backward compatibility)
    :return: list of dicts with keys url/instance/content/author/score/title
    """
    lf_instances = []
    for url in diaries_urls:
        path = base_url + url
        lf = LinuxFr()
        lf.bs = BeautifulSoup(get_soup(path), "lxml")
        content = lf.get_content()
        # Bug fix: the author was fetched with get_content(); use get_author()
        # and actually include the value in the result dict.
        author = lf.get_author()
        score = lf.get_score()
        title = lf.get_title()
        diary_dict = {'url': url, 'instance': lf, 'content': content,
                      'author': author, 'score': score, 'title': title}
        lf_instances.append(diary_dict)
    return lf_instances
def prepare_csv(lf_instance):
    """Serialize the collected diaries to CSV (placeholder, not implemented yet)."""
    pass
def launcher():
    """Scrape the first diaries listing page and run the collection pipeline."""
    root = 'https://linuxfr.org'
    collected = []
    # range(1, 2) → only page 1 for now; widen the range to scrape more pages.
    for page in range(1, 2):
        logger.info("Page : {}".format(page))
        collected.extend(parse_journals_list(base_url=root, count=page))
    logger.info("{} diaries ".format(len(collected)))
    prepare_csv(collect_diaries(base_url=root, diaries_urls=collected))
# Script entry point: run the full scrape when executed directly.
if __name__ == "__main__":
    launcher()
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# # @author: Arnaud Joset
#
# This file is part of scikit-learn_for_linuxfr.
#
# scikit-learn_for_linuxfr is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scikit-learn_for_linuxfr is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with scikit-learn_for_linuxfr. If not, see <http://www.gnu.org/licenses/>.
#
# Thanks to Damien Accorsi for the nice presentation of Scrapy and the code:
# http://lebouquetin.github.io/scrapy-presentation-pyuggre-01-2015/#/section-1/page-1
# https://github.com/lebouquetin/lebouquetin.github.io/tree/master/scrapy-presentation-pyuggre-01-2015
import logging
import scrapy
# Module-level logger for the scraper; DEBUG so crawl progress is visible.
logging.basicConfig()
logger = logging.getLogger("linuxfr_scraper")
logger.setLevel("DEBUG")
class LinuxfrJournalSpider(scrapy.Spider):
    """Scrapy spider that walks the LinuxFr diaries listing pages."""

    # Spider identifier used by `scrapy crawl`; inherited from the tutorial
    # this was adapted from — consider renaming to something like "journals".
    name = "quotes"

    def start_requests(self):
        """Queue the first two listing pages for download."""
        urls = [
            'https://linuxfr.org/journaux?page=1',
            'https://linuxfr.org/journaux?page=2',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item per diary entry found on a listing page.

        Bug fix: the original body was a syntactically invalid fragment
        (orphan dict entries referencing an undefined ``quote``); the
        extracted fields are now wrapped in a yielded dict built from the
        ``journal`` selector.
        """
        for journal in response.css("div.figures"):
            yield {
                'author': journal.css('small.author::text').extract_first(),
                'tags': journal.css('div.tags a.tag::text').extract(),
            }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment