Commit a3b5bd01 authored by Elger Jonker's avatar Elger Jonker

Move dataset imports to the admin, supporting types and kwargs; less code, more flexibility.


Former-commit-id: c9f70045
parent e1255a85
import logging
from datetime import datetime
from json import loads
import nested_admin
import pytz
......@@ -20,9 +21,9 @@ from failmap import types
from failmap.app.models import Job
from failmap.celery import PRIO_HIGH, app
from failmap.map.report import OrganizationRating, UrlRating
from failmap.organizations.datasources import dutch_government, excel
from failmap.organizations.models import (Coordinate, Dataset, Organization, OrganizationType,
Promise, Url)
from failmap.organizations.sources.excel import import_datasets
from failmap.scanners.admin import UrlIp
from failmap.scanners.models import Endpoint, EndpointGenericScan, TlsQualysScan, UrlGenericScan
from failmap.scanners.scanner import dns, dnssec, onboard, plain_http, security_headers, tls_qualys
......@@ -634,13 +635,27 @@ class PromiseAdmin(ImportExportModelAdmin, admin.ModelAdmin):
)
class DatasetForm(forms.ModelForm):
    """Admin form for Dataset: validates that the 'kwargs' field holds JSON."""

    def clean_kwargs(self):
        """Validate and return the 'kwargs' value.

        Empty values are normalized to '{}' so downstream json.loads() calls
        (see DatasetAdmin.import_) never crash on '' or None — the model
        declares the field blank=True/null=True.

        :raises forms.ValidationError: when the value is not parseable JSON.
        """
        value = self.cleaned_data['kwargs']
        if not value:
            return '{}'
        try:
            # parse only to validate; the string itself is what gets stored
            loads(value)
        except ValueError as exc:
            raise forms.ValidationError(
                _('Unable to parse JSON: %s') % exc,
            )
        return value
@admin.register(Dataset)
# todo: how to show a form / allowing uploads?
class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
list_display = ('source', 'is_imported', 'imported_on')
list_display = ('source', 'type', 'is_imported', 'imported_on')
search_fields = ('source', )
list_filter = ('is_imported', 'imported_on')
fields = ('source', 'is_imported', 'imported_on')
fields = ('source', 'type', 'kwargs', 'is_imported', 'imported_on')
actions = []
......@@ -649,15 +664,30 @@ class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
def import_(self, request, queryset):
    """Queue an asynchronous import chain for every selected dataset.

    Each dataset is downloaded/parsed by the importer module matching its
    'type' field; dataset_import_finished marks it imported afterwards.
    """
    # maps Dataset.type to the importer module; blank type defaults to excel
    importers = {
        'excel': excel,
        'dutch_government': dutch_government,
        '': excel
    }

    for dataset in queryset:
        kwargs = {'url': dataset.source}
        # kwargs may be blank/NULL in the database: fall back to no extras
        extra_kwargs = loads(dataset.kwargs) if dataset.kwargs else {}
        kwargs = {**kwargs, **extra_kwargs}

        # ok, it's not smart to say something is imported before it has been verified to be imported.
        # type may be NULL on old rows; treat missing/unknown as the default
        importer = importers.get(dataset.type or '', excel)
        (importer.import_datasets.si(**kwargs)
         | dataset_import_finished.si(dataset)).apply_async()
    self.message_user(request, "Import started, will run in parallel.")
import_.short_description = "+ Import"
actions.append('import_')
form = DatasetForm
save_as = True
save_on_top = True
preserve_filters = True
@app.task(queue='storage')
def dataset_import_finished(dataset):
......
......@@ -5,54 +5,19 @@ Example:
failmap import_organizations dutch_government
Warning: this is XML, set aside your intuition about programming.
https://almanak-redactie.overheid.nl/archive/
"""
import logging
import xml.etree.ElementTree as ET
from os import rename
import requests
from failmap.organizations.sources import generic_dataset_import, print_progress_bar, read_data
from failmap.celery import app
from failmap.organizations.datasources import (download_http_get_no_credentials,
generic_dataset_import, read_data)
log = logging.getLogger(__package__)
LAYER = 'government'
COUNTRY = 'NL'
# https://almanak-redactie.overheid.nl/archive/
# the xml plural / single are to help parsing, they don't need to be in your specification.
datasets = [
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_ministeries.xml',
'description': 'Dutch ministries', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_gemeenschappelijke_regelingen.xml',
'description': 'Gemeenschappelijke Regelingen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'gemeenschappelijkeRegelingen', 'xml_single': 'gemeenschappelijkeRegeling'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_organisaties.xml',
'description': 'Organisaties', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_rechterlijke_macht.xml',
'description': 'Rechterlijke macht', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_waterschappen.xml',
'description': 'Waterschappen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_zelfstandige_bestuursorganen.xml',
'description': 'Zelfstandige bestuursorganen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'zelfstandigeBestuursorganen', 'xml_single': 'zelfstandigBestuursorgaan'},
]
namespaces = {
'p': 'https://almanak.overheid.nl/static/schema/oo/export/2.4.3',
}
def parse_data(dataset, filename):
data = read_data(filename)
......@@ -68,7 +33,7 @@ def parse_data(dataset, filename):
# of course this doesn't work out the box, so how do we autoregister a namespace?
ET.register_namespace('p', ns)
# so just fake / overwrite the namespaces variable
namespaces['p'] = ns
namespaces = {'p': ns}
organizations = root.find('p:%s' % dataset['xml_plural'], namespaces)
......@@ -77,7 +42,7 @@ def parse_data(dataset, filename):
for organization in organizations.iterfind('p:%s' % dataset['xml_single'], namespaces):
name = emulate_get(organization, 'p:naam', namespaces)
if not name:
# gemeenschappelijke regelingen...
# gemeenschappelijke regelingen has a title, not a name.
name = emulate_get(organization, 'p:titel', namespaces)
abbreviation = emulate_get(organization, 'p:afkorting', namespaces)
......@@ -128,28 +93,8 @@ def emulate_get(xml, element, namespaces):
return ""
def download(url, filename_to_save):
    """Stream *url* to *filename_to_save* and return that path.

    :param url: location of the (government XML) dataset.
    :param filename_to_save: destination path on disk.
    :raises requests.HTTPError: on a non-2xx response.
    """
    # https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
    # post / get / credentials / protocol, whatever...
    # generous timeouts: these exports can be very slow to generate
    response = requests.get(url, stream=True, timeout=(1200, 1200))
    response.raise_for_status()

    # content-length may be absent (chunked responses); then skip the bar
    total = int(response.headers.get('content-length', 0))
    received = 0
    with open(filename_to_save, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                received += len(chunk)
                if total:
                    # was hard-coded at (1, 100): the bar never advanced
                    print_progress_bar(received, total, ' download')

    return filename_to_save
def import_datasets(**options):
    """Run the shared import pipeline over every configured dataset."""
    generic_dataset_import(
        datasets=datasets,
        parser_function=parse_data,
        download_function=download,
    )
@app.task(queue='storage')
def import_datasets(**dataset):
    # Celery task: feed a single dataset description (url, xml_plural, ...)
    # into the shared import pipeline using this module's XML parser.
    generic_dataset_import(dataset=dataset,
                           parser_function=parse_data,
                           download_function=download_http_get_no_credentials)
......@@ -6,18 +6,16 @@ Mind the format! See parse_data
"""
import logging
from os import rename
import iso3166
import pyexcel as p
import requests
from failmap.celery import app
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
from failmap.organizations.datasources import (download_http_get_no_credentials,
generic_dataset_import)
log = logging.getLogger(__package__)
# todo: these datasets have to come from a table in the admin. That will give a nice UI.
datasets = []
......@@ -54,12 +52,16 @@ def parse_data(dataset, filename):
validate_record(record)
sites = record['Websites (csv)'].strip().split(',')
sites = [x.strip() for x in sites]
# todo: column numbers might still be easier for people that enter data?
found_organizations.append(
{
'name': record['Name'],
'address': record['Address'],
'geocoding_hint': record.get('Hint', ''),
'websites': record['Websites (csv)'],
'websites': sites,
'country': record['Countrycode'],
'layer': record['Layer'],
'lat': record.get('Lat', ''),
......@@ -96,33 +98,8 @@ def validate_record(record):
raise ValueError('Countrycode is not a valid 3166 country code.')
def download(url, filename_to_save):
    """Stream *url* to *filename_to_save* and return that path.

    :param url: location of the spreadsheet to download.
    :param filename_to_save: destination path on disk.
    :raises requests.HTTPError: on a non-2xx response.
    """
    # post / get / credentials / protocol, whatever...
    response = requests.get(url, stream=True, timeout=(10, 10))
    response.raise_for_status()

    # content-length may be absent (chunked responses); then skip the bar
    total = int(response.headers.get('content-length', 0))
    received = 0
    with open(filename_to_save, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                received += len(chunk)
                if total:
                    # was hard-coded at (1, 100): the bar never advanced
                    print_progress_bar(received, total, ' download')

    return filename_to_save
@app.task(queue='storage')
def import_datasets(**options):
    """Download and import a spreadsheet dataset from the supplied URL.

    :param options: must contain 'url', a non-empty list whose first item
        is the dataset URL.
    :raises ValueError: when no url was supplied.
    """
    # .get avoids a confusing KeyError when 'url' was not passed at all
    if not options.get('url'):
        raise ValueError('Please supply an URL for a dataset to download.')

    datasets = [
        {'url': options['url'][0],
         'description': 'Randomly uploaded file.'},
    ]

    generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
def import_datasets(**dataset):
    # Thin wrapper: feed a single dataset description (url, parser options)
    # into the shared import pipeline using this module's spreadsheet parser.
    generic_dataset_import(dataset=dataset,
                          parser_function=parse_data,
                          download_function=download_http_get_no_credentials)
import logging
from django.core.management.base import BaseCommand
from failmap.organizations.sources import dutch_government, excel
log = logging.getLogger(__package__)
importers = {
# failmap import_organizations dutch_government
'dutch_government': dutch_government,
# failmap import_organizations excel https://example.com/example.xlsx
'excel': excel
}
class Command(BaseCommand):
    """Import organizations using one of the registered importers.

    Usage:
        failmap import_organizations dutch_government
        failmap import_organizations excel https://example.com/example.xlsx
    """

    def add_arguments(self, parser):
        # argparse 'choices' already restricts the importer to known keys
        parser.add_argument('importer', nargs=1, help='The importer you want to use.', choices=importers)
        # typo fixed: "fox excel" -> "for excel"
        parser.add_argument('url', nargs='*', help='URL of a file to download (for excel).', default='')
        super().add_arguments(parser)

    def handle(self, *args, **options):
        try:
            # defensive re-check; argparse choices normally guarantees this
            if options['importer'][0] not in importers:
                print("Importer does not exist. Please specify a valid importer from this list: %s " % importers.keys())
                return

            importer_module = importers[options['importer'][0]]
            importer_module.import_datasets(**options)
        except KeyboardInterrupt:
            log.info("Received keyboard interrupt. Stopped.")
......@@ -3,16 +3,10 @@ import logging
from django.core.management.base import BaseCommand
from failmap.organizations.models import Url
from failmap.organizations.sources import dutch_government
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
}
class Command(BaseCommand):
"""
Specify an importer and you'll be getting all organizations you'll ever dream of
......
......@@ -4,16 +4,10 @@ from django.core.management.base import BaseCommand
from django.db.models import Count
from failmap.organizations.models import Url
from failmap.organizations.sources import dutch_government
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
}
class Command(BaseCommand):
"""
Specify an importer and you'll be getting all organizations you'll ever dream of
......
# Generated by Django 2.1.3 on 2018-12-13 08:05
from django.db import migrations, models
class Migration(migrations.Migration):
    # Adds the 'kwargs' and 'type' fields to Dataset: 'type' selects the
    # importer module, 'kwargs' holds importer-specific options as JSON text.
    # Auto-generated by Django; do not restyle.

    dependencies = [
        ('organizations', '0043_dataset'),
    ]

    operations = [
        migrations.AddField(
            model_name='dataset',
            name='kwargs',
            field=models.CharField(
                blank=True, help_text='A dictionary with extra options for the parser to handle the dataset. This is different per parser.This field is highly coupled with the code of the parser.', max_length=5000, null=True),
        ),
        migrations.AddField(
            model_name='dataset',
            name='type',
            field=models.CharField(
                blank=True, help_text='To determine what importer is needed: xls, xlsx, json, dutch_government.', max_length=255, null=True),
        ),
    ]
# Generated by Django 2.1.3 on 2018-12-13 08:08
from django.db import migrations, models
class Migration(migrations.Migration):
    # Widens Dataset.kwargs from CharField to TextField (JSON can get long).
    # Auto-generated by Django; do not restyle.

    dependencies = [
        ('organizations', '0044_auto_20181213_0805'),
    ]

    operations = [
        migrations.AlterField(
            model_name='dataset',
            name='kwargs',
            field=models.TextField(
                blank=True, help_text='A dictionary with extra options for the parser to handle the dataset. This is different per parser.This field is highly coupled with the code of the parser.', max_length=5000, null=True),
        ),
    ]
# Generated by Django 2.1.3 on 2018-12-13 08:54
from django.db import migrations, models
class Migration(migrations.Migration):
    # Gives Dataset.kwargs a '{}' default and clarified help text, so new
    # rows always contain parseable JSON. Auto-generated; do not restyle.

    dependencies = [
        ('organizations', '0045_auto_20181213_0808'),
    ]

    operations = [
        migrations.AlterField(
            model_name='dataset',
            name='kwargs',
            field=models.TextField(
                blank=True, default='{}', help_text='A JSON / dictionary with extra options for the parser to handle the dataset. This is different per parser. This field is highly coupled with the code of the parser.', max_length=5000, null=True),
        ),
    ]
......@@ -459,3 +459,17 @@ class Dataset(models.Model):
# URL the dataset is downloaded from.
source = models.URLField()
# Set after the import task chain completes — see DatasetAdmin.import_.
is_imported = models.BooleanField(default=False,)
imported_on = models.DateTimeField(blank=True, null=True)
# Selects the importer module for this dataset (e.g. 'excel',
# 'dutch_government'); blank falls back to the default importer.
type = models.CharField(
    max_length=255,
    blank=True,
    null=True,
    help_text="To determine what importer is needed: xls, xlsx, json, dutch_government."
)
# JSON string of extra keyword arguments merged into the importer call.
# Tightly coupled to the chosen parser's expected options.
kwargs = models.TextField(
    max_length=5000,
    blank=True,
    null=True,
    help_text="A JSON / dictionary with extra options for the parser to handle the dataset. "
              "This is different per parser. This field is highly coupled with the code of the parser.",
    default='{}'
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment