Commit d6a4dd5f authored by Elger Jonker

[wip] import datasets


Former-commit-id: c4e65f0e
parent f07258c2
......@@ -17,6 +17,7 @@ const failmap = {
// options.icon is undefined; can't access its "createIcon" property
// happens. The options/icon seems to be consumed after the first expansion by markerClusterGroup, or something like that.
// icon is then null. And even if we don't make a marker, the same issue happens.
// Is the problem that the retract function does not store anything correctly?
{
// zoomToBoundsOnClick: false,
// spiderfyOnMaxZoom: false,
......
......@@ -574,7 +574,7 @@ class CoordinateAdmin(LeafletGeoAdminMixin, ImportExportModelAdmin):
"If you want to move the coordinate, preferably do so by creating a new one and setting the"
" current one as dead (+date etc). Then the map will show coordinates over time, which is "
"pretty neat.",
'fields': ('organization', 'geojsontype', 'area', 'edit_area')
'fields': ('organization', 'geojsontype', 'area', 'edit_area', 'creation_metadata')
}),
('Life cycle', {
......
import logging
from django.core.management.base import BaseCommand
from failmap.organizations.sources import dutch_government
log = logging.getLogger(__package__)
importers = {
'dutch_government': dutch_government,
}
class Command(BaseCommand):
"""
Specify an importer and you'll be getting all organizations you'll ever dream of
"""
def add_arguments(self, parser):
parser.add_argument('importer', nargs=1, help='The importer you want to use.', choices=importers)
super().add_arguments(parser)
def handle(self, *args, **options):
try:
            # argparse already validates against `choices=importers`; keep a readable error as a safety net.
            if options['importer'][0] not in importers:
                log.error("Importer does not exist. Please specify a valid importer "
                          "from this list: %s" % ", ".join(importers.keys()))
                return
importer_module = importers[options['importer'][0]]
importer_module.import_datasets()
except KeyboardInterrupt:
log.info("Received keyboard interrupt. Stopped.")
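Note: a minimal usage sketch for this command. A management command's name equals its file name, which this diff view does not show, so 'import_organizations' below is an assumption:

# hedged usage sketch; the command name is assumed, since the file name is not visible here
from django.core.management import call_command

call_command('import_organizations', 'dutch_government')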
import logging
from django.core.management.base import BaseCommand
from failmap.organizations.models import Url
log = logging.getLogger(__package__)
class Command(BaseCommand):
    """
    Backfills missing is_dead_since and created_on dates on urls.
    """
    def handle(self, *args, **options):
        # backfill is_dead_since for urls that are dead but have no date of death
        urls = Url.objects.all().filter(is_dead=True, is_dead_since__isnull=True)
        for url in urls:
            # fall back to the onboarding date when there is no creation date
            url.is_dead_since = url.created_on if url.created_on else url.onboarded_on
            url.save()

        # backfill missing creation dates from the onboarding date
        urls = Url.objects.all().filter(created_on__isnull=True)
        for url in urls:
            url.created_on = url.onboarded_on
            url.save()
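Note: the same backfill could be expressed as three bulk updates, avoiding a save() per row (with the caveat that update() skips model signals); a sketch:

from django.db.models import F

from failmap.organizations.models import Url

# dead urls with a creation date: is_dead_since = created_on
Url.objects.filter(is_dead=True, is_dead_since__isnull=True,
                   created_on__isnull=False).update(is_dead_since=F('created_on'))
# remaining dead urls (no creation date): fall back to the onboarding date
Url.objects.filter(is_dead=True, is_dead_since__isnull=True).update(is_dead_since=F('onboarded_on'))
# missing creation dates: fall back to the onboarding date
Url.objects.filter(created_on__isnull=True).update(created_on=F('onboarded_on'))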
import logging
from django.core.management.base import BaseCommand
from django.db.models import Count
from failmap.organizations.models import Url
log = logging.getLogger(__package__)
class Command(BaseCommand):
    """
    Removes duplicate urls, keeping the oldest of each set.
    """
def handle(self, *args, **options):
        # .order_by() clears the model's default ordering, so values().annotate() groups by url correctly
        double_urls = Url.objects.values('url').annotate(num_urls=Count('url')).order_by().filter(num_urls__gt=1)
        log.debug("%s duplicate urls: %s" % (len(double_urls), double_urls))
# keep the oldest:
for double_url in double_urls:
log.debug("%s: %s" % (double_url['url'], double_url['num_urls']))
            # order_by('created_on') without "-" sorts ascending, so the oldest url comes first
            all_doubles = list(Url.objects.all().filter(url=double_url['url']).order_by('created_on'))
            # delete everything except that oldest url
            Url.objects.all().filter(url=double_url['url']).exclude(pk=all_doubles[0].pk).delete()
# Generated by Django 2.1.3 on 2018-12-10 13:18
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('organizations', '0041_auto_20181207_1951'),
]
operations = [
migrations.AlterField(
model_name='organization',
name='internal_notes',
            field=models.TextField(blank=True, help_text="These notes can contain information on WHY this organization was added. Can be handy if it's not straightforward. This helps with answering questions why the organization was added later on. These notes will not be published, but are also not secret.", max_length=2500, null=True),
),
]
......@@ -43,7 +43,7 @@ class Organization(models.Model):
name = models.CharField(max_length=250, db_index=True)
internal_notes = models.TextField(
max_length=500,
        max_length=2500,  # for a TextField, max_length is enforced in forms/admin only, not in the database
help_text="These notes can contain information on WHY this organization was added. Can be handy if it's not "
"straightforward. This helps with answering questions why the organization was added lateron. "
"These notes will not be published, but are also not secret.",
......
"""
Importer for Dutch governmental organizations, using open data.
Warning: this is XML, set aside your intuition about programming.
"""
import logging
import xml.etree.ElementTree as ET
from os import rename
import requests
from failmap.organizations.sources import generic_dataset_import, print_progress_bar
log = logging.getLogger(__package__)
LAYER = 'government'
COUNTRY = 'NL'
# https://almanak-redactie.overheid.nl/archive/
# xml_plural / xml_single tell the parser which elements to read; they are not part of the generic dataset specification.
datasets = [
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_gemeenschappelijke_regelingen.xml',
'description': 'Gemeenschappelijke Regelingen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'gemeenschappelijkeRegelingen', 'xml_single': 'gemeenschappelijkeRegeling'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_organisaties.xml',
'description': 'Organisaties', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_rechterlijke_macht.xml',
'description': 'Rechterlijke macht', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_waterschappen.xml',
'description': 'Waterschappen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_zelfstandige_bestuursorganen.xml',
'description': 'Zelfstandige bestuursorganen', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'zelfstandigeBestuursorganen', 'xml_single': 'zelfstandigBestuursorgaan'},
{'url': 'https://almanak-redactie.overheid.nl/archive/exportOO_ministeries.xml',
'description': 'Dutch ministries', 'layer': LAYER, 'country': COUNTRY,
'xml_plural': 'organisaties', 'xml_single': 'organisatie'},
]
namespaces = {
'p': 'https://almanak.overheid.nl/static/schema/oo/export/2.4.3',
}
def parse_data(dataset, data):
    # This is an XML format for which an XSD is available.
    # Each document can use a different namespace version, which makes parsing harder:
    # the correct namespace for the "p" prefix has to be derived from the document itself.
found_organizations = []
root = ET.fromstring(data)
ns = root.attrib['{http://www.w3.org/2001/XMLSchema-instance}schemaLocation'].split(' ')[0]
log.debug('Using namespace: %s' % ns)
    # ET.register_namespace only affects serialization, not find()/iterfind(),
    # so also overwrite the "p" prefix in the namespaces mapping used for lookups below.
    ET.register_namespace('p', ns)
    namespaces['p'] = ns
organizations = root.find('p:%s' % dataset['xml_plural'], namespaces)
    # ElementTree's find() has no dict.get-like variant with a default,
    # hence the emulate_get helper below.
for organization in organizations.iterfind('p:%s' % dataset['xml_single'], namespaces):
name = emulate_get(organization, 'p:naam', namespaces)
if not name:
            # gemeenschappelijke regelingen use 'titel' instead of 'naam'
name = emulate_get(organization, 'p:titel', namespaces)
abbreviation = emulate_get(organization, 'p:afkorting', namespaces)
contact = organization.find('p:contact', namespaces)
bezoekAdres = contact.find('p:bezoekAdres', namespaces)
adres = bezoekAdres.find('p:adres', namespaces)
straat = emulate_get(adres, 'p:straat', namespaces)
huisnummer = emulate_get(adres, 'p:huisnummer', namespaces)
postcode = emulate_get(adres, 'p:postcode', namespaces)
plaats = emulate_get(adres, 'p:plaats', namespaces)
site = emulate_get(contact, 'p:internet', namespaces)
        if not postcode and not plaats:
            # no usable address: hint the geocoder with the organization name instead
            geocoding_hint = "%s, Nederland" % name
        else:
            geocoding_hint = "Nederland"
found_organizations.append(
{
'name': "%s (%s)" % (name, abbreviation) if abbreviation else name,
'address': "%s %s, %s, %s" % (straat, huisnummer, postcode, plaats),
# make sure that the geocoder is looking at the Netherlands.
'geocoding_hint': geocoding_hint,
'websites': [site],
'country': dataset['country'],
'layer': dataset['layer'],
'lat': None,
'lng': None,
'dataset': dataset
}
)
# debug_organizations(found_organizations)
return found_organizations
def emulate_get(xml, element, namespaces):
    # Truth-testing an Element is unreliable: an element without children is falsy.
    # This therefore does NOT work:
    # return xml.find(element, namespaces).text if xml.find(element, namespaces) else ""
    # Compare the result against None instead.
    found = xml.find(element, namespaces)
    return found.text if found is not None else ""
def download(url, filename_to_save):
    # https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
    # post / get / credentials / protocol, whatever...
    response = requests.get(url, stream=True, timeout=(1200, 1200))
    response.raise_for_status()

    # Download to a temporary name and rename afterwards: the cachable resource only
    # appears under its final name once the download completed successfully.
    # (Concurrent downloads of the same url would still clash, but well, good enough.)
    tmp_filename = filename_to_save + '.part'
    total = int(response.headers.get('content-length', 0))
    downloaded = 0
    with open(tmp_filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                downloaded += len(chunk)
                if total:
                    print_progress_bar(downloaded, total, ' download')
    rename(tmp_filename, filename_to_save)
    return filename_to_save
def import_datasets():
generic_dataset_import(datasets=datasets, parser_function=parse_data, download_function=download)
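A small sketch for exercising parse_data against a previously saved export, bypassing download(); the local file name is hypothetical:

# hypothetical local test of the parser; the file path is an assumption
if __name__ == '__main__':
    # read bytes: ET.fromstring rejects a str that carries an XML encoding declaration
    with open('exportOO_waterschappen.xml', 'rb') as f:
        raw_xml = f.read()
    organizations = parse_data(datasets[3], raw_xml)  # datasets[3] is the Waterschappen entry above
    print("parsed %s organizations" % len(organizations))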
......@@ -446,6 +446,10 @@ TOOLS = {
'output_dir': OUTPUT_DIR + os.environ.get('OPENSTREETMAP_OUTPUT_DIR',
"scanners/resources/output/openstreetmap/"),
},
'organizations': {
'import_data_dir': OUTPUT_DIR + os.environ.get('ORGANIZATION_IMPORT_DATA_DIR',
"scanners/resources/data/organizations/"),
},
'sslscan': {
# this is beta functionality and not supported in production
# these are installed system wide and don't require a path (they might when development continues)
......
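Presumably an importer reads this directory from the Django settings like the other TOOLS entries; a hedged sketch (that TOOLS lives in the settings module is an assumption):

# hedged sketch: where an importer would read its data directory from
from django.conf import settings

import_data_dir = settings.TOOLS['organizations']['import_data_dir']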
......@@ -134,3 +134,6 @@ tenacity
# game
spectra # fancy colors for teams
markdown # explanations for urls and organizations from the admin
# geocoding organizations from python
googlemaps
\ No newline at end of file
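The new googlemaps dependency hints at how the importer's geocoding_hint is meant to be used. A hedged sketch of the client API (the key and the call site are assumptions; this commit does not show the geocoding code):

# hedged sketch: geocoding an imported organization with the googlemaps client
import googlemaps

gmaps = googlemaps.Client(key='YOUR-API-KEY')  # the key is a placeholder
results = gmaps.geocode('Hoogheemraadschap van Delfland, Nederland')
if results:
    location = results[0]['geometry']['location']
    print(location['lat'], location['lng'])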