Commit f3285bc3 authored by Elger Jonker

Allow xls/xlsx/ods files to be imported via the internet.


Former-commit-id: 65bbc939
parent d84523b9
@@ -18,9 +18,11 @@ from leaflet.admin import LeafletGeoAdminMixin
 import failmap.scanners.scanner.http as scanner_http
 from failmap import types
 from failmap.app.models import Job
-from failmap.celery import PRIO_HIGH
+from failmap.celery import PRIO_HIGH, app
 from failmap.map.report import OrganizationRating, UrlRating
-from failmap.organizations.models import Coordinate, Organization, OrganizationType, Promise, Url
+from failmap.organizations.models import (Coordinate, Dataset, Organization, OrganizationType,
+                                          Promise, Url)
+from failmap.organizations.sources.excel import import_datasets
 from failmap.scanners.admin import UrlIp
 from failmap.scanners.models import Endpoint, EndpointGenericScan, TlsQualysScan, UrlGenericScan
 from failmap.scanners.scanner import dns, dnssec, onboard, plain_http, security_headers, tls_qualys
@@ -630,3 +632,35 @@ class PromiseAdmin(ImportExportModelAdmin, admin.ModelAdmin):
         'description': PROMISE_DESCRIPTION,
     }),
 )
+
+
+@admin.register(Dataset)
+# todo: how to show a form / allowing uploads?
+class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
+    list_display = ('source', 'is_imported', 'imported_on')
+    search_fields = ('source', )
+    list_filter = ('is_imported', 'imported_on')
+    fields = ('source', 'is_imported', 'imported_on')
+
+    actions = []
+
+    # todo: perhaps a type should be added, and that defines what importer is used here...
+    # Then we also need the options to be expanded with options from the database.
+    def import_(self, request, queryset):
+        for dataset in queryset:
+            options = {'url': [dataset.source]}
+
+            # it's not smart to mark something as imported before the import has been verified,
+            # so dataset_import_finished sets the flag only after the import task completes.
+            (import_datasets.si(**options)
+             | dataset_import_finished.si(dataset)).apply_async()
+
+        self.message_user(request, "Import started, will run in parallel.")
+    import_.short_description = "+ Import"
+    actions.append('import_')
+
+
+@app.task(queue='storage')
+def dataset_import_finished(dataset):
+    dataset.is_imported = True
+    dataset.imported_on = datetime.now(pytz.utc)
+    dataset.save()
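
The action above chains two immutable Celery signatures with `|`: the import runs first, and only when it finishes does dataset_import_finished flip the is_imported flag on the storage queue. A minimal sketch of that chain pattern, with hypothetical task names and broker (assuming any configured Celery app):

    from celery import Celery

    app = Celery(broker='redis://localhost:6379/0')  # assumption: any reachable broker works

    @app.task
    def do_import(url):
        # hypothetical stand-in for import_datasets
        print('importing %s' % url)

    @app.task(queue='storage')
    def mark_finished(dataset_id):
        # hypothetical stand-in for dataset_import_finished
        print('marking dataset %s as imported' % dataset_id)

    # .si() builds an immutable signature: mark_finished ignores do_import's
    # return value and only receives the arguments given here.
    (do_import.si('https://example.com/example.xlsx') | mark_finished.si(1)).apply_async()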
@@ -2,13 +2,14 @@ import logging
 from django.core.management.base import BaseCommand
 
-from failmap.organizations.sources import dutch_government
+from failmap.organizations.sources import dutch_government, excel
 
 log = logging.getLogger(__package__)
 
 importers = {
     'dutch_government': dutch_government,
+    'excel': excel
 }
@@ -19,6 +20,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('importer', nargs=1, help='The importer you want to use.', choices=importers)
+        parser.add_argument('url', nargs='*', help='URL of a file to download (for excel).', default='')
         super().add_arguments(parser)
 
     def handle(self, *args, **options):
@@ -29,7 +31,7 @@
             return
 
         importer_module = importers[options['importer'][0]]
-        importer_module.import_datasets()
+        importer_module.import_datasets(**options)
     except KeyboardInterrupt:
         log.info("Received keyboard interrupt. Stopped.")
# Generated by Django 2.1.3 on 2018-12-11 14:35

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('organizations', '0042_auto_20181210_1318'),
    ]

    operations = [
        migrations.CreateModel(
            name='Dataset',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('source', models.URLField()),
                ('is_imported', models.BooleanField(default=False)),
                ('imported_on', models.DateTimeField(blank=True, null=True)),
            ],
        ),
    ]
@@ -449,3 +449,13 @@ class Promise(models.Model):
     class Meta:
         verbose_name = _('promise')
         verbose_name_plural = _('promises')
+
+
+class Dataset(models.Model):
+    """
+    Allows you to define URL data sources to download and import into the system. This acts as a record of
+    what has been imported, and entries here can be re-imported. It uses the generic/excel importer.
+    """
+    source = models.URLField()
+    is_imported = models.BooleanField(default=False)
+    imported_on = models.DateTimeField(blank=True, null=True)
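
A short sketch of how such a row would be created and inspected from the Django shell, assuming only the model above:

    from failmap.organizations.models import Dataset

    dataset = Dataset.objects.create(source='https://example.com/example.xlsx')
    print(Dataset.objects.filter(is_imported=False).count())  # datasets still waiting for an import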
@@ -42,12 +42,12 @@ def get_data(dataset, download_function):
     if is_cached(filename):
         log.debug('Getting cached file for: %s' % dataset['url'])
-        return read_data(filename)
+        return filename
 
     download_function(dataset['url'], filename_to_save=filename)
 
-    # simply reads and returns the raw data
-    return read_data(filename)
+    # simply returns the filename; the parser decides how to read the file
+    return filename
 
 
 def generic_dataset_import(datasets, parser_function, download_function):
@@ -56,7 +56,7 @@ def generic_dataset_import(datasets, parser_function, download_function):
     for index, dataset in enumerate(datasets):
         log.info('Importing dataset (%s/%s): %s' % (index+1, len(datasets), dataset))
 
-        data = get_data(dataset, download_function)
+        data = get_data(dataset=dataset, download_function=download_function)
 
         # the parser has to do whatever it takes to parse the data: unzip, read arbitrary nonsense structures and so on
         organizations = parser_function(dataset, data)
@@ -83,13 +83,15 @@ def is_cached(filename):
 def url_to_filename(url: str):
+    # keep the extension, as some importers do magic with that
     m = hashlib.md5()
     m.update(("%s" % url).encode('utf-8'))
 
     # make sure the directory for processing files exists
     makedirs(DOWNLOAD_DIRECTORY, exist_ok=True)
 
-    return DOWNLOAD_DIRECTORY + m.hexdigest()
+    return DOWNLOAD_DIRECTORY + m.hexdigest() + '.' + url.split('.')[-1]
 def check_environment():
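
The cached filename is now the md5 hash of the URL plus the URL's extension, so importers such as pyexcel can detect the file type from the suffix. A worked example, assuming DOWNLOAD_DIRECTORY is '/tmp/downloads/':

    import hashlib

    url = 'https://example.com/example.xlsx'
    m = hashlib.md5()
    m.update(url.encode('utf-8'))
    # prints '/tmp/downloads/<32 hex chars>.xlsx'
    print('/tmp/downloads/' + m.hexdigest() + '.' + url.split('.')[-1])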
...
""" """
Importer for Dutch governmental organizations, using open data. Importer for Dutch governmental organizations, using open data.
Example:
failmap import_organizations dutch_government
Warning: this is XML, set aside your intuition about programming. Warning: this is XML, set aside your intuition about programming.
""" """
@@ -10,7 +13,7 @@ from os import rename
 import requests
 
-from failmap.organizations.sources import generic_dataset_import, print_progress_bar
+from failmap.organizations.sources import generic_dataset_import, print_progress_bar, read_data
 
 log = logging.getLogger(__package__)
@@ -50,7 +53,8 @@ namespaces = {
 }
 
 
-def parse_data(dataset, data):
+def parse_data(dataset, filename):
+    data = read_data(filename)
     # this is some kind of XML format, for which an XSD is available.
     # for each document another namespace version is available, which makes it harder.
     # how can we automatically identify the correct namespace for p?
@@ -146,5 +150,5 @@ def download(url, filename_to_save):
     return filename_to_save
 
 
-def import_datasets():
+def import_datasets(**options):
     generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
"""
Example:
failmap import_organizations excel https://example.com/example.xlsx
Mind the format! See parse_data
"""
import logging
from os import rename
import iso3166
import pyexcel as p
import requests
from failmap.celery import app
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
log = logging.getLogger(__package__)
# todo: these datasets have to come from a table in the admin. That will give a nice UI.
datasets = []
def parse_data(dataset, filename):
    """
    The Excel file should contain one tab, with the following columns:

    Mandatory:
    Name: organization name, with the abbreviation in parentheses: Awesome Company (AC)
    Address: organization address
    Countrycode: organization country, two letter ISO 3166 country code
    Layer: layer, for example: government, municipality, finance, etc. Will be auto-created.
    Websites (csv): websites (comma separated list of urls)

    Optional:
    Hint: other positional data (used as geocoding hint)
    Lat: latitude, float formatted as "1.0022" (more precision is better)
    Lng: longitude, float formatted as "1.234" (more precision is better)

    :param dataset:
    :param filename:
    :return:
    """
    # a spreadsheet is the best / easiest format for this;
    # pyexcel reads csv, ods, xls, xlsx and xlsm files.
    found_organizations = []

    log.debug('Loading excel data from %s' % filename)
    sheet = p.get_sheet(file_name=filename, name_columns_by_row=0)
    records = sheet.to_records()

    for record in records:
        validate_record(record)

        found_organizations.append(
            {
                'name': record['Name'],
                'address': record['Address'],
                'geocoding_hint': record.get('Hint', ''),
                'websites': record['Websites (csv)'],
                'country': record['Countrycode'],
                'layer': record['Layer'],
                'lat': record.get('Lat', ''),
                'lng': record.get('Lng', ''),
                'dataset': dataset
            }
        )

    p.free_resources()

    # debug_organizations(found_organizations)

    return found_organizations
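
Given the column layout described in the docstring, a one-row sheet parses into one organization dict. A self-contained sketch using pyexcel's in-memory array source (the column names match what parse_data expects; the organization data is made up for illustration):

    import pyexcel as p

    # one header row plus one record, matching the mandatory columns
    sheet = p.get_sheet(array=[
        ['Name', 'Address', 'Countrycode', 'Layer', 'Websites (csv)'],
        ['Awesome Company (AC)', 'Example street 1, Amsterdam', 'NL', 'government', 'https://example.nl'],
    ], name_columns_by_row=0)

    for record in sheet.to_records():
        print(record['Name'], record['Countrycode'])  # Awesome Company (AC) NL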
def validate_record(record):
    if not record.get('Name', ''):
        raise ValueError('Missing "Name" column or column was empty.')

    if not record.get('Address', ''):
        raise ValueError('Missing "Address" column or column was empty.')

    if not record.get('Websites (csv)', ''):
        raise ValueError('Missing "Websites (csv)" column or column was empty.')

    if not record.get('Countrycode', ''):
        raise ValueError('Missing "Countrycode" column or column was empty.')

    if not record.get('Layer', ''):
        raise ValueError('Missing "Layer" column or column was empty.')

    if record['Countrycode'] not in iso3166.countries_by_alpha2:
        raise ValueError('Countrycode is not a valid ISO 3166 country code.')
def download(url, filename_to_save):
    # plain GET for now; POST, credentials or other protocols can be added when needed.
    response = requests.get(url, stream=True, timeout=(10, 10))
    response.raise_for_status()

    with open(filename_to_save, 'wb') as f:
        filename = f.name
        i = 0
        for chunk in response.iter_content(chunk_size=1024):
            i += 1
            print_progress_bar(1, 100, ' download')
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)

    rename(filename, filename_to_save)

    return filename_to_save
@app.task(queue='storage')
def import_datasets(**options):
    if not options['url']:
        raise ValueError('Please supply a URL of a dataset to download.')

    datasets = [
        {'url': options['url'][0],
         'description': 'Randomly uploaded file.'},
    ]

    generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
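
Since import_datasets is a Celery task, the two entry points in this commit reach it differently: the management command calls it directly with **options, while the admin action dispatches it asynchronously. The asynchronous dispatch, mirroring the options dict the admin action builds:

    from failmap.organizations.sources.excel import import_datasets

    # same shape as the admin action's options: {'url': [dataset.source]}
    import_datasets.si(url=['https://example.com/example.xlsx']).apply_async()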
# https://stackoverflow.com/questions/115983/how-can-i-add-an-empty-directory-to-a-git-repository#932982
# Ignore everything in this directory
*
# Except this file
!.gitignore
\ No newline at end of file
@@ -930,6 +930,7 @@ JET_SIDE_MENU_ITEMS = [  # A list of application or custom item dicts
             {'name': 'promise'},
             {'name': 'coordinate'},
             {'name': 'organizationtype'},
+            {'name': 'dataset'},
         ], 'permissions': ['admin']},
 
     # todo: sort scan moment to show latest first.
...
@@ -136,4 +136,10 @@ spectra # fancy colors for teams
 markdown # explanations for urls and organizations from the admin
 
 # geocoding organizations from python
-googlemaps
\ No newline at end of file
+googlemaps
+
+# allowing generic import of these types of files:
+django-excel
+pyexcel-xls
+pyexcel-xlsx
+pyexcel-ods