Commit d711d166 authored by Lionel Maes's avatar Lionel Maes
Browse files

add some post treatment tools for extractors

parent ce3666d0
......@@ -5,6 +5,6 @@ An instance of coaldigger is running online at since January 2014.
The two processes are:
*, which scans one or several news indexes (e.g. Google News) looking for article links, then looks for patterns in each of those articles to extract specific content (e.g. title, header, body, images), then saves the content in the database and downloads the images.
*, which apply filters on the downloaded content and images to deduce more data from it (e.g. average colours from images - which is the only filter currently running)
*, which applies filters on the downloaded content and images to deduce more data from them (e.g. average colours from images - the only filter currently running)
![Grabber sequence diagram](sequence.svg)
......@@ -7,7 +7,8 @@ import datetime
import locale
from cssselect import HTMLTranslator
import cssselect
from lxml.etree import fromstring
#from lxml.etree import fromstring
from lxml import etree
import re
from logger import logger
from urlparse import urlparse
......@@ -35,6 +36,7 @@ class CoalExtractor(object):
for result in src.xpath(expression):
try:"data extracted before post-treatment: %s" % etree.tostring(result))
res = eval(self.treatment).strip()
if(res != ''):
......@@ -55,6 +57,19 @@ class CoalExtractor(object):
def rel2absURL(self, value, rootURL):
    """Turn a relative link into an absolute one by prefixing ``rootURL``.

    Every leading ``.`` and ``/`` character is removed from ``value``
    before concatenation. ``str.lstrip`` treats its argument as a set of
    characters, so ``"./a"``, ``"/a"``, ``"../a"`` and ``"//a"`` all
    reduce to ``"a"``.

    No separator is inserted between the two parts: ``rootURL`` is used
    exactly as given.

    :param value: relative URL or path extracted from a document.
    :param rootURL: prefix to put in front of the cleaned path.
    :returns: ``rootURL`` followed by the cleaned ``value``.
    """
    relative_part = value.lstrip("./")
    return rootURL + relative_part
def contentWithoutElements(self, value, pattern):
    """Return the text of ``value`` after discarding elements matching ``pattern``.

    ``pattern`` is parsed with ``cssselect.parse`` (so it may be a
    comma-separated group of CSS selectors); each selector is translated
    to XPath and evaluated against ``value``.

    :param value: element exposing ``xpath()`` and ``text_content()``
        (presumably an ``lxml.html`` element -- confirm against callers).
    :param pattern: CSS selector (or selector group) of the elements to drop.
    :returns: the result of ``value.text_content()`` once the matching
        elements have been removed.
    """
    # The translator is stateless here; build it once instead of per selector.
    translator = HTMLTranslator()
    for selector in cssselect.parse(pattern):
        expression = translator.selector_to_xpath(selector, translate_pseudo_elements=True)
        logger.debug('trying to launch this function %s', selector)
        for elem in value.xpath(expression):
            # NOTE(review): the body of this loop was missing from the
            # reviewed source. drop_tree() is assumed because it removes
            # the element and its children while keeping the tail text;
            # it modifies ``value`` in place. Confirm against the
            # repository version.
            elem.drop_tree()
    return value.text_content()
def lastFromSrcSet(self, value):
    """Return the last comma-separated candidate of a ``srcset`` value.

    The piece is returned exactly as it appears after the final comma:
    surrounding whitespace and any descriptor (e.g. ``" b.jpg 2x"``) are
    left untouched. A value without a comma is returned unchanged.

    :param value: raw ``srcset`` attribute string.
    :returns: the substring following the last ``,`` in ``value``.
    """
    return value.split(',')[-1]
def removeQueryString(self, value):
    """Return ``value`` without its query string, fragment and path parameters.

    The URL is rebuilt from its parsed components rather than by
    concatenating ``scheme + "://" + netloc + path``. That concatenation
    produced malformed results such as ``"://host/path"`` or
    ``":///path"`` for protocol-relative and relative URLs. Absolute URLs
    give the same result as before.

    :param value: URL to clean.
    :returns: the URL reduced to scheme, network location and path.
    """
    parts = urlparse(value)
    return parts._replace(params='', query='', fragment='').geturl()
......@@ -62,9 +77,11 @@ class CoalExtractor(object):
def regex(self, value, regex):
    """Return the first match of the pattern ``regex`` inside ``value``.

    :param value: text to search.
    :param regex: pattern, either as UTF-8 encoded bytes or as text.
    :returns: the first item of ``re.findall`` -- the matched text, the
        single group, or a tuple of groups, depending on how many capture
        groups the pattern has.
    :raises IndexError: when the pattern does not match anywhere.
    """
    # Only byte strings need decoding. Calling .decode() on a text pattern
    # either fails on non-ASCII characters or is not available at all.
    if isinstance(regex, bytes):
        regex = regex.decode('utf-8')
    matches = re.compile(regex).findall(value)
    return matches[0]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment