Commit f3285bc3 authored by Elger Jonker

Allow xls/xlsx/odf files to be imported via the internet.


Former-commit-id: 65bbc939
parent d84523b9
......@@ -18,9 +18,11 @@ from leaflet.admin import LeafletGeoAdminMixin
import failmap.scanners.scanner.http as scanner_http
from failmap import types
from failmap.app.models import Job
from failmap.celery import PRIO_HIGH
from failmap.celery import PRIO_HIGH, app
from failmap.map.report import OrganizationRating, UrlRating
from failmap.organizations.models import Coordinate, Organization, OrganizationType, Promise, Url
from failmap.organizations.models import (Coordinate, Dataset, Organization, OrganizationType,
Promise, Url)
from failmap.organizations.sources.excel import import_datasets
from failmap.scanners.admin import UrlIp
from failmap.scanners.models import Endpoint, EndpointGenericScan, TlsQualysScan, UrlGenericScan
from failmap.scanners.scanner import dns, dnssec, onboard, plain_http, security_headers, tls_qualys
......@@ -630,3 +632,35 @@ class PromiseAdmin(ImportExportModelAdmin, admin.ModelAdmin):
'description': PROMISE_DESCRIPTION,
}),
)
@admin.register(Dataset)
# todo: how to show a form / allowing uploads?
class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
    """Admin screen for Dataset records, with an action that queues them for import."""

    list_display = ('source', 'is_imported', 'imported_on')
    search_fields = ('source', )
    list_filter = ('is_imported', 'imported_on')
    fields = ('source', 'is_imported', 'imported_on')

    # Action names are appended below, right after each action is defined.
    actions = []

    # todo: perhaps a type should be added, and that defines what importer is used here...
    # Then we also need the options to be expanded with options from the database.
    def import_(self, request, queryset):
        """Queue an import for every selected dataset and notify the admin user."""
        for dataset in queryset:
            # The importer expects the source wrapped in a list under the 'url' option.
            import_step = import_datasets.si(url=[dataset.source])

            # ok, it's not smart to say something is imported before it has been verified to be imported.
            finished_step = dataset_import_finished.si(dataset)

            # Pipe the import task into the task that flags the dataset as imported.
            (import_step | finished_step).apply_async()

        self.message_user(request, "Import started, will run in parallel.")
    import_.short_description = "+ Import"
    actions.append('import_')
@app.task(queue='storage')
def dataset_import_finished(dataset):
    """Flag ``dataset`` as imported, stamp it with the current UTC time and save it."""
    finished_at = datetime.now(tz=pytz.utc)

    dataset.is_imported = True
    dataset.imported_on = finished_at
    dataset.save()
......@@ -2,13 +2,14 @@ import logging
from django.core.management.base import BaseCommand
from failmap.organizations.sources import dutch_government
from failmap.organizations.sources import dutch_government, excel
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
'excel': excel
}
......@@ -19,6 +20,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('importer', nargs=1, help='The importer you want to use.', choices=importers)
parser.add_argument('url', nargs='*', help='URL of a file to download (fox excel).', default='')
super().add_arguments(parser)
def handle(self, *args, **options):
......@@ -29,7 +31,7 @@ class Command(BaseCommand):
return
importer_module = importers[options['importer'][0]]
importer_module.import_datasets()
importer_module.import_datasets(**options)
except KeyboardInterrupt:
log.info("Received keyboard interrupt. Stopped.")
# Generated by Django 2.1.3 on 2018-12-11 14:35
from django.db import migrations, models
class Migration(migrations.Migration):
    # Creates the database table for the ``Dataset`` model.

    # Runs after migration 0042 of the ``organizations`` app.
    dependencies = [
        ('organizations', '0042_auto_20181210_1318'),
    ]

    operations = [
        migrations.CreateModel(
            name='Dataset',
            fields=[
                # Auto-created integer primary key.
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # URL the dataset is stored under.
                ('source', models.URLField()),
                # Defaults to False for newly created rows.
                ('is_imported', models.BooleanField(default=False)),
                # Optional timestamp: may be left blank and stored as NULL.
                ('imported_on', models.DateTimeField(blank=True, null=True)),
            ],
        ),
    ]
......@@ -449,3 +449,13 @@ class Promise(models.Model):
class Meta:
verbose_name = _('promise')
verbose_name_plural = _('promises')
class Dataset(models.Model):
    """
    A URL data source that can be downloaded and imported into the system.

    The rows act as a memory of what has been imported; anything listed here can be imported again.
    Imports use the generic/excel importer.
    """

    # URL of the file to download.
    source = models.URLField()

    # Whether the dataset has been marked as imported; new records start as not imported.
    is_imported = models.BooleanField(default=False)

    # Moment the dataset was marked as imported; empty until then.
    imported_on = models.DateTimeField(blank=True, null=True)
......@@ -42,12 +42,12 @@ def get_data(dataset, download_function):
if is_cached(filename):
log.debug('Getting cached file for: %s' % dataset['url'])
return read_data(filename)
return filename
download_function(dataset['url'], filename_to_save=filename)
# simply reads and returns the raw data
return read_data(filename)
return filename
def generic_dataset_import(datasets, parser_function, download_function):
......@@ -56,7 +56,7 @@ def generic_dataset_import(datasets, parser_function, download_function):
for index, dataset in enumerate(datasets):
log.info('Importing dataset (%s/%s): %s' % (index+1, len(datasets), dataset))
data = get_data(dataset, download_function)
data = get_data(dataset=dataset, download_function=download_function)
# the parser has to do whatever it takes to parse the data: unzip, read arbitrary nonsense structures and so on
organizations = parser_function(dataset, data)
......@@ -83,13 +83,15 @@ def is_cached(filename):
def url_to_filename(url: str):
    """
    Translate a URL into a stable local file path inside DOWNLOAD_DIRECTORY.

    The file name is the MD5 hex digest of the URL, followed by the extension found in the path part of
    the URL (for example ".xlsx"). The query string and fragment are ignored when looking for the
    extension. When the path has no extension, nothing is appended.

    The download directory is created when it does not exist yet.

    :param url: the URL of the file that will be downloaded
    :return: path (as string) where the downloaded file should be stored
    """
    # Imported locally so this function does not depend on extra module-level imports.
    from os.path import splitext
    from urllib.parse import urlparse

    url_text = "%s" % url

    # keep the extension as some importers do magic with that.
    # Taking max() of the dot-separated pieces returns the alphabetically largest piece instead of the last
    # one (for "https://example.com/a.csv" that is "https://example"), so read the extension from the path.
    extension = splitext(urlparse(url_text).path)[1]

    m = hashlib.md5()
    m.update(url_text.encode('utf-8'))

    # make sure the directory for processing files exists
    makedirs(DOWNLOAD_DIRECTORY, exist_ok=True)

    return DOWNLOAD_DIRECTORY + m.hexdigest() + extension
def check_environment():
......
"""
Importer for Dutch governmental organizations, using open data.
Example:
failmap import_organizations dutch_government
Warning: this is XML, set aside your intuition about programming.
"""
......@@ -10,7 +13,7 @@ from os import rename
import requests
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
from failmap.organizations.sources import generic_dataset_import, print_progress_bar, read_data
log = logging.getLogger(__package__)
......@@ -50,7 +53,8 @@ namespaces = {
}
def parse_data(dataset, data):
def parse_data(dataset, filename):
data = read_data(filename)
# this is some kind of XML format. for which an XSD is available.
# for each document another namespace version is available, which makes it harder.
# how can we identify the correct namespace for p correctly automatically?
......@@ -146,5 +150,5 @@ def download(url, filename_to_save):
return filename_to_save
def import_datasets():
def import_datasets(**options):
generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
"""
Example:
failmap import_organizations excel https://example.com/example.xlsx
Mind the format! See parse_data
"""
import logging
from os import rename
import iso3166
import pyexcel as p
import requests
from failmap.celery import app
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
log = logging.getLogger(__package__)
# todo: these datasets have to come from a table in the admin. That will give a nice UI.
datasets = []
def parse_data(dataset, filename):
    """
    Read a spreadsheet and convert every row into an organization dictionary.

    The Excel file should contain one tab. The tab contains the following columns:

    Mandatory:
    Name: organization name, using the abbreviation in parenthesis: Awesome Company (AC)
    Address: organization address
    Countrycode: organization country, two letter ISO country code.
    Layer: layer, for example: government, municipality, finance, etc. Will be auto-created.
    Websites (csv): websites (comma separated list of urls)

    Optional:
    Hint: other positional data (used as geocoding hint)
    Lat: optional: latitude, float formatted as "1.0022" (more precision is better)
    Lng: optional: long, float formatted as "1.234" (more precision is better)

    :param dataset: the dataset description this file belongs to; stored on every returned organization
    :param filename: path of the spreadsheet to read; the first row is used as column names
    :return: list of dictionaries with the keys name, address, geocoding_hint, websites, country, layer,
             lat, lng and dataset
    :raises ValueError: when validate_record rejects a row
    :raises KeyError: when a mandatory column is absent from a row
    """

    # spreadsheet is the best / easiest.
    # csv, ods, xls, xlsx and xlsm files
    found_organizations = []

    log.debug('Loading excel data from %s', filename)

    try:
        sheet = p.get_sheet(file_name=filename, name_columns_by_row=0)

        for record in sheet.to_records():
            validate_record(record)

            found_organizations.append(
                {
                    'name': record['Name'],
                    'address': record['Address'],
                    'geocoding_hint': record.get('Hint', ''),
                    'websites': record['Websites (csv)'],
                    'country': record['Countrycode'],
                    'layer': record['Layer'],
                    'lat': record.get('Lat', ''),
                    'lng': record.get('Lng', ''),
                    'dataset': dataset
                }
            )
    finally:
        # Release pyexcel's resources even when a row is rejected halfway through the file.
        p.free_resources()

    # debug_organizations(found_organizations)

    return found_organizations
def validate_record(record):
    """
    Check that a spreadsheet row contains all mandatory columns and a known country code.

    :param record: mapping of column name to cell value for a single row
    :return: None
    :raises ValueError: when a mandatory column is missing or empty, or when the country code is unknown
    """
    # Checked in this order; the first missing or empty column is the one that gets reported.
    mandatory_columns = ('Name', 'Address', 'Websites (csv)', 'Countrycode', 'Layer')

    for column in mandatory_columns:
        if not record.get(column, ''):
            # The exception has to be raised: only constructing it lets invalid rows through unnoticed.
            raise ValueError('Missing "%s" column or column was empty.' % column)

    if record['Countrycode'] not in iso3166.countries_by_alpha2:
        raise ValueError('Countrycode is not a valid 3166 country code.')
def download(url, filename_to_save):
    """
    Download a file over HTTP(S) and store it on disk.

    The response is streamed and written in chunks of 1024 bytes. The request uses a timeout of
    10 seconds for connecting and 10 seconds for reading.

    :param url: location of the file to download
    :param filename_to_save: path the downloaded content is written to
    :return: filename_to_save
    :raises requests.HTTPError: when the server answers with an error status (via raise_for_status)
    """
    # post / get / credentials / protocol, whatever...
    # Using the response as a context manager closes the streamed connection, also on errors.
    with requests.get(url, stream=True, timeout=(10, 10)) as response:
        response.raise_for_status()

        # The file is written directly to its final location, so no rename is needed afterwards.
        with open(filename_to_save, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                print_progress_bar(1, 100, ' download')
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    return filename_to_save
@app.task(queue='storage')
def import_datasets(**options):
    """
    Download and import a single spreadsheet from a URL.

    :param options: keyword options; 'url' must hold a non-empty sequence of URLs. Only the first URL
                    is imported.
    :return: None
    :raises ValueError: when no URL was supplied (option absent or empty)
    """
    # .get() makes an absent 'url' option end in the descriptive error below instead of a KeyError.
    urls = options.get('url')
    if not urls:
        raise ValueError('Please supply an URL for a dataset to download.')

    # Named differently from the module-level 'datasets' list to avoid shadowing it.
    requested_datasets = [
        {'url': urls[0],
         'description': 'Randomly uploaded file.'},
    ]

    generic_dataset_import(datasets=requested_datasets, parser_function=parse_data, download_function=download)
# https://stackoverflow.com/questions/115983/how-can-i-add-an-empty-directory-to-a-git-repository#932982
# Ignore everything in this directory
*
# Except this file
!.gitignore
\ No newline at end of file
......@@ -930,6 +930,7 @@ JET_SIDE_MENU_ITEMS = [ # A list of application or custom item dicts
{'name': 'promise'},
{'name': 'coordinate'},
{'name': 'organizationtype'},
{'name': 'dataset'},
], 'permissions': ['admin']},
# todo: sort scan moment to show latest first.
......
......@@ -137,3 +137,9 @@ markdown # explanations for urls and organizations from the admin
# geocoding organizations from python
googlemaps
# allowing generic import of these types of files:
django-excel
pyexcel-xls
pyexcel-xlsx
pyexcel-ods
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment