linuxfr_scraper.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @author: Arnaud Joset
#
#  This file is part of scikit-learn_for_linuxfr.
#
# scikit-learn_for_linuxfr is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# scikit-learn_for_linuxfr is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with scikit-learn_for_linuxfr.  If not, see <http://www.gnu.org/licenses/>.
#
# Thanks to Damien Accorsi for the nice presentation of Scrapy and the code:
# http://lebouquetin.github.io/scrapy-presentation-pyuggre-01-2015/#/section-1/page-1
# https://github.com/lebouquetin/lebouquetin.github.io/tree/master/scrapy-presentation-pyuggre-01-2015

import logging
import scrapy

logging.basicConfig()
logger = logging.getLogger("linuxfr_scraper")
logger.setLevel(logging.DEBUG)
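
# Note: when launched through the scrapy CLI, Scrapy installs its own log
# handlers as well; inside a Scrapy project the equivalent knob is the
# LOG_LEVEL setting.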

class LinuxfrJournalSpider(scrapy.Spider):
    name = "linuxfr_journals"

    def start_requests(self):
        urls = [
            'https://linuxfr.org/journaux?page=1',
            'https://linuxfr.org/journaux?page=2',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
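
    # Design note: since the URL list here is static, a shorter Scrapy idiom
    # would be a start_urls class attribute, which lets the default
    # start_requests and parse wiring take over:
    #
    #     start_urls = [
    #         'https://linuxfr.org/journaux?page=1',
    #         'https://linuxfr.org/journaux?page=2',
    #     ]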

    def parse(self, response):
        for journal in response.css("div.figures"):
            # NOTE: the selectors below come from the Scrapy "quotes" tutorial
            # and are placeholders; adapt them to LinuxFr's actual markup.
            yield {
                'author': journal.css('small.author::text').extract_first(),
                'tags': journal.css('div.tags a.tag::text').extract(),
            }
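
# To try the spider without creating a full Scrapy project (assuming Scrapy is
# installed), run the file directly and export the yielded items:
#
#     scrapy runspider linuxfr_scraper.py -o journals.json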