Verified Commit c9f70045 authored by Elger Jonker

Moving dataset imports to admin, supporting types and kwargs; less code, more flex

parent 9eb79940
import logging
from datetime import datetime
from json import loads
import nested_admin
import pytz
@@ -20,9 +21,9 @@ from failmap import types
from failmap.app.models import Job
from failmap.celery import PRIO_HIGH, app
from failmap.map.report import OrganizationRating, UrlRating
from failmap.organizations.datasources import dutch_government, excel
from failmap.organizations.models import (Coordinate, Dataset, Organization, OrganizationType,
Promise, Url)
from failmap.organizations.sources.excel import import_datasets
from failmap.scanners.admin import UrlIp
from failmap.scanners.models import Endpoint, EndpointGenericScan, TlsQualysScan, UrlGenericScan
from failmap.scanners.scanner import dns, dnssec, onboard, plain_http, security_headers, tls_qualys
@@ -634,13 +635,27 @@ class PromiseAdmin(ImportExportModelAdmin, admin.ModelAdmin):
)
class DatasetForm(forms.ModelForm):
def clean_kwargs(self):
value = self.cleaned_data['kwargs']
try:
loads(value)
except ValueError as exc:
raise forms.ValidationError(
_('Unable to parse JSON: %s') % exc,
)
return value
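For reference, what this validation accepts and rejects (values illustrative):

from json import loads

loads('{"xml_plural": "organisaties"}')  # valid JSON: passes clean_kwargs
loads('not json')                        # raises ValueError -> forms.ValidationError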
@admin.register(Dataset)
# todo: how to show a form / allow uploads?
class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
list_display = ('source', 'is_imported', 'imported_on')
list_display = ('source', 'type', 'is_imported', 'imported_on')
search_fields = ('source', )
list_filter = ('is_imported', 'imported_on')
fields = ('source', 'is_imported', 'imported_on')
fields = ('source', 'type', 'kwargs', 'is_imported', 'imported_on')
actions = []
@@ -649,15 +664,30 @@ class DatasetAdmin(ImportExportModelAdmin, admin.ModelAdmin):
def import_(self, request, queryset):
for dataset in queryset:
options = {'url': [dataset.source]}
kwargs = {'url': dataset.source}
extra_kwargs = loads(dataset.kwargs)
kwargs = {**kwargs, **extra_kwargs}
# ok, it's not smart to say something is imported before it has been verified to be imported.
importers = {
'excel': excel,
'dutch_government': dutch_government,
'': excel
}
(import_datasets.si(**options)
(importers[dataset.type].import_datasets.si(**kwargs)
| dataset_import_finished.si(dataset)).apply_async()
self.message_user(request, "Import started, will run in parallel.")
import_.short_description = "+ Import"
actions.append('import_')
form = DatasetForm
save_as = True
save_on_top = True
preserve_filters = True
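To make the dispatch concrete: for a Dataset row with type 'dutch_government' and a kwargs field of '{"xml_plural": "organisaties"}' (the kwargs value is illustrative), the action above builds the task signature roughly like this:

kwargs = {'url': dataset.source}              # base argument
kwargs = {**kwargs, **loads(dataset.kwargs)}  # JSON options win on key clashes
# -> {'url': 'https://...', 'xml_plural': 'organisaties'}
# dispatched as:
# (importers['dutch_government'].import_datasets.si(**kwargs)
#  | dataset_import_finished.si(dataset)).apply_async()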
@app.task(queue='storage')
def dataset_import_finished(dataset):
......
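    # Plausible body (an assumption; the real body is collapsed above):
    # mark the dataset as imported and record the time, using the
    # datetime/pytz imports at the top of the file.
    dataset.is_imported = True
    dataset.imported_on = datetime.now(pytz.utc)
    dataset.save()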
@@ -5,54 +5,19 @@ Example:
failmap import_organizations dutch_government
Warning: this is XML, set aside your intuition about programming.
https://almanak-redactie.overheid.nl/archive/
"""
import logging
import xml.etree.ElementTree as ET
from os import rename
import requests
from failmap.organizations.sources import generic_dataset_import, print_progress_bar, read_data
from failmap.celery import app
from failmap.organizations.datasources import (download_http_get_no_credentials,
generic_dataset_import, read_data)
log = logging.getLogger(__package__)
LAYER = 'government'
COUNTRY = 'NL'
# https://almanak-redactie.overheid.nl/archive/
# the xml_plural / xml_single keys help parsing; they don't need to be in your specification.
datasets = [
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_ministeries.xml',
'description': 'Dutch ministries', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_gemeenschappelijke_regelingen.xml',
'description': 'Gemeenschappelijke Regelingen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'gemeenschappelijkeRegelingen', 'xml_single': 'gemeenschappelijkeRegeling'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_organisaties.xml',
'description': 'Organisaties', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_rechterlijke_macht.xml',
'description': 'Rechterlijke macht', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_waterschappen.xml',
'description': 'Waterschappen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_zelfstandige_bestuursorganen.xml',
'description': 'Zelfstandige bestuursorganen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'zelfstandigeBestuursorganen', 'xml_single': 'zelfstandigBestuursorgaan'},
]
namespaces = {
'p': 'https://almanak.overheid.nl/static/schema/oo/export/2.4.3',
}
def parse_data(dataset, filename):
data = read_data(filename)
@@ -68,7 +33,7 @@ def parse_data(dataset, filename):
# of course this doesn't work out of the box, so how do we autoregister a namespace?
ET.register_namespace('p', ns)
# so just fake / overwrite the namespaces variable
namespaces['p'] = ns
namespaces = {'p': ns}
organizations = root.find('p:%s' % dataset['xml_plural'], namespaces)
@@ -77,7 +42,7 @@ def parse_data(dataset, filename):
for organization in organizations.iterfind('p:%s' % dataset['xml_single'], namespaces):
name = emulate_get(organization, 'p:naam', namespaces)
if not name:
# gemeenschappelijke regelingen...
# gemeenschappelijke regelingen has a title, not a name.
name = emulate_get(organization, 'p:titel', namespaces)
abbreviation = emulate_get(organization, 'p:afkorting', namespaces)
@@ -128,28 +93,8 @@ def emulate_get(xml, element, namespaces):
return ""
def download(url, filename_to_save):
# https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
# post / get / credentials / protocol, whatever...
response = requests.get(url, stream=True, timeout=(1200, 1200))
response.raise_for_status()
with open(filename_to_save, 'wb') as f:
filename = f.name
i = 0
for chunk in response.iter_content(chunk_size=1024):
i += 1
print_progress_bar(1, 100, ' download')
if chunk: # filter out keep-alive new chunks
f.write(chunk)
# save as cachable resource
# this of course doesn't work if you call it a few times while a download is running, but well, good enough
rename(filename, filename_to_save)
return filename_to_save
def import_datasets(**options):
generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
@app.task(queue='storage')
def import_datasets(**dataset):
generic_dataset_import(dataset=dataset,
parser_function=parse_data,
download_function=download_http_get_no_credentials)
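download_http_get_no_credentials replaces the per-module download functions removed in this diff, but its body is not shown here. A sketch based on the removed code (assumed; the shared helper may differ, e.g. in timeouts or progress reporting):

import requests

def download_http_get_no_credentials(url, filename_to_save):
    # Plain HTTP GET without authentication, streamed to disk in chunks.
    response = requests.get(url, stream=True, timeout=(1200, 1200))
    response.raise_for_status()
    with open(filename_to_save, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive chunks
                f.write(chunk)
    return filename_to_save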
@@ -6,18 +6,16 @@ Mind the format! See parse_data
"""
import logging
from os import rename
import iso3166
import pyexcel as p
import requests
from failmap.celery import app
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
from failmap.organizations.datasources import (download_http_get_no_credentials,
generic_dataset_import)
log = logging.getLogger(__package__)
# todo: these datasets have to come from a table in the admin. That will give a nice UI.
datasets = []
@@ -54,12 +52,16 @@ def parse_data(dataset, filename):
validate_record(record)
sites = record['Websites (csv)'].strip().split(',')
sites = [x.strip() for x in sites]
# todo: column numbers might still be easier for people who enter data?
found_organizations.append(
{
'name': record['Name'],
'address': record['Address'],
'geocoding_hint': record.get('Hint', ''),
'websites': record['Websites (csv)'],
'websites': sites,
'country': record['Countrycode'],
'layer': record['Layer'],
'lat': record.get('Lat', ''),
@@ -96,33 +98,8 @@ def validate_record(record):
raise ValueError('Countrycode is not a valid 3166 country code.')
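validate_record is collapsed except for this raise; given the iso3166 import above, the country check plausibly looks like this (an assumption; the real function likely validates more columns):

import iso3166

def validate_record(record):
    # Assumed check: Countrycode must be a known ISO 3166 alpha-2 code,
    # e.g. 'NL' as used by the dutch_government importer.
    if record['Countrycode'] not in iso3166.countries_by_alpha2:
        raise ValueError('Countrycode is not a valid 3166 country code.')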
def download(url, filename_to_save):
# post / get / credentials / protocol, whatever...
response = requests.get(url, stream=True, timeout=(10, 10))
response.raise_for_status()
with open(filename_to_save, 'wb') as f:
filename = f.name
i = 0
for chunk in response.iter_content(chunk_size=1024):
i += 1
print_progress_bar(1, 100, ' download')
if chunk: # filter out keep-alive new chunks
f.write(chunk)
rename(filename, filename_to_save)
return filename_to_save
@app.task(queue='storage')
def import_datasets(**options):
if not options['url']:
raise ValueError('Please supply an URL for a dataset to download.')
datasets = [
{'url': options['url'][0],
'description': 'Randomly uploaded file.'},
]
generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
def import_datasets(**dataset):
generic_dataset_import(dataset=dataset,
parser_function=parse_data,
download_function=download_http_get_no_credentials)
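From parse_data above, each spreadsheet record is expected to carry at least these columns (values illustrative):

record = {
    'Name': 'Example Organization',
    'Address': 'Example Street 1, Example City',
    'Hint': '',                                        # optional geocoding hint
    'Websites (csv)': 'example.com, www.example.com',  # split and stripped into a list
    'Countrycode': 'NL',                               # validated against ISO 3166
    'Layer': 'government',
    'Lat': '',                                         # optional coordinate
}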
import logging
from django.core.management.base import BaseCommand
from failmap.organizations.sources import dutch_government, excel
log = logging.getLogger(__package__)
importers = {
# failmap import_organizations dutch_government
'dutch_government': dutch_government,
# failmap import_organizations excel https://example.com/example.xlsx
'excel': excel
}
class Command(BaseCommand):
"""
Specify an importer and you'll get all the organizations you'll ever dream of
"""
def add_arguments(self, parser):
parser.add_argument('importer', nargs=1, help='The importer you want to use.', choices=importers)
parser.add_argument('url', nargs='*', help='URL of a file to download (for excel).', default='')
super().add_arguments(parser)
def handle(self, *args, **options):
try:
if options['importer'][0] not in importers:
print("Importer does not exist. Please specify a valid importer from this list: %s " % importers.keys())
return
importer_module = importers[options['importer'][0]]
importer_module.import_datasets(**options)
except KeyboardInterrupt:
log.info("Received keyboard interrupt. Stopped.")
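To trace the data flow of the command (a sketch; Django adds default options such as verbosity as well):

# failmap import_organizations excel https://example.com/example.xlsx
# ends up calling, roughly:
importers['excel'].import_datasets(importer=['excel'],
                                   url=['https://example.com/example.xlsx'])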
@@ -3,16 +3,10 @@ import logging
from django.core.management.base import BaseCommand
from failmap.organizations.models import Url
from failmap.organizations.sources import dutch_government
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
}
class Command(BaseCommand):
"""
Specify an importer and you'll get all the organizations you'll ever dream of
......
@@ -4,16 +4,10 @@ from django.core.management.base import BaseCommand
from django.db.models import Count
from failmap.organizations.models import Url
from failmap.organizations.sources import dutch_government
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
}
class Command(BaseCommand):
"""
Specify an importer and you'll get all the organizations you'll ever dream of
......
# Generated by Django 2.1.3 on 2018-12-13 08:05
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('organizations', '0043_dataset'),
]
operations = [
migrations.AddField(
model_name='dataset',
name='kwargs',
field=models.CharField(
blank=True, help_text='A dictionary with extra options for the parser to handle the dataset. This is different per parser. This field is highly coupled with the code of the parser.', max_length=5000, null=True),
),
migrations.AddField(
model_name='dataset',
name='type',
field=models.CharField(
blank=True, help_text='To determine what importer is needed: xls, xlsx, json, dutch_government.', max_length=255, null=True),
),
]
# Generated by Django 2.1.3 on 2018-12-13 08:08
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('organizations', '0044_auto_20181213_0805'),
]
operations = [
migrations.AlterField(
model_name='dataset',
name='kwargs',
field=models.TextField(
blank=True, help_text='A dictionary with extra options for the parser to handle the dataset. This is different per parser. This field is highly coupled with the code of the parser.', max_length=5000, null=True),
),
]
# Generated by Django 2.1.3 on 2018-12-13 08:54
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('organizations', '0045_auto_20181213_0808'),
]
operations = [
migrations.AlterField(
model_name='dataset',
name='kwargs',
field=models.TextField(
blank=True, default='{}', help_text='A JSON / dictionary with extra options for the parser to handle the dataset. This is different per parser. This field is highly coupled with the code of the parser.', max_length=5000, null=True),
),
]
@@ -459,3 +459,17 @@ class Dataset(models.Model):
source = models.URLField()
is_imported = models.BooleanField(default=False)
imported_on = models.DateTimeField(blank=True, null=True)
type = models.CharField(
max_length=255,
blank=True,
null=True,
help_text="To determine what importer is needed: xls, xlsx, json, dutch_government."
)
kwargs = models.TextField(
max_length=5000,
blank=True,
null=True,
help_text="A JSON / dictionary with extra options for the parser to handle the dataset. "
"This is different per parser. This field is highly coupled with the code of the parser.",
default='{}'
)
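A Dataset row that the admin action can dispatch might then look like this (a sketch; the kwargs keys are borrowed from the dutch_government dataset specification earlier in this diff):

dataset = Dataset.objects.create(
    source='https://almanak-redactie.overheid.nl/archive/exportOO_ministeries.xml',
    type='dutch_government',  # must match a key in DatasetAdmin's importers dict
    kwargs='{"xml_plural": "organisaties", "xml_single": "organisatie"}',
)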