Commit 21e3c221 authored by Mantas Zimnickas

Merge remote-tracking branch 'origin/master'

Merged with:

git reset --hard HEAD
git merge -Xours origin/master
parents 42eb6edd 10855cb2
Pipeline #138548823 failed in 2 minutes and 33 seconds
......@@ -48,10 +48,10 @@ The data structure description of such a data source looks like this:
resources:
  salys:
    type: json
    source: https://example.com/salys/
    pull: https://example.com/salys/
  miestai:
    type: json
    source: https://example.com/salys/{salis.kodas}/miestai/
    pull: https://example.com/salys/{salis.kodas}/miestai/
.. code-block:: yaml
......
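Note the `{salis.kodas}` placeholder: the cities resource is pulled once per country record, with the country's code substituted into the URL. A rough sketch of the idea (illustrative only, not project code; `requests` and the JSON response shapes are assumptions):

.. code-block:: python

    from types import SimpleNamespace

    import requests  # assumed dependency, for illustration

    def pull_cities(countries_url, cities_template):
        # Pull the parent resource, then resolve `{salis.kodas}`
        # against each country record to build the cities URL.
        for salis in requests.get(countries_url).json():
            url = cities_template.format(salis=SimpleNamespace(**salis))
            yield from requests.get(url).json()

    cities = pull_cities(
        'https://example.com/salys/',
        'https://example.com/salys/{salis.kodas}/miestai/',
    )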
......@@ -73,7 +73,9 @@ table:
- the `level` column indicates the data maturity level
- the `source` column performs data selection and transformation
- the `source` column performs data selection
- the `prepare` column performs data validation and transformation (see the example right after this list)
- the `ref` column manages object identifiers and relations between tables
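For a concrete picture of this split, here is a single property row in the table notation used later in the tests (values borrowed from the SQL examples below): the raw column is selected in `source`, `prepare` stays free for validation or transformation expressions, and `ref` records the relation to the `country` model::

    d | r | b | m | property   | source     | prepare | type | ref         | level
      |   |   |   | country_id | COUNTRY_ID |         | ref  | country[id] | 4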
......@@ -152,7 +154,7 @@ this is how our inventory table would look in YAML format:
countries:
  type: sql
  title: Database
  source: postgresql://user:[email protected]/dbname
  pull: postgresql://user:[email protected]/dbname
.. code-block:: yaml
......
......@@ -301,7 +301,7 @@ id code country
resources:
  sql:
    type: sql
    source: postgresql://[email protected]/dbname
    pull: postgresql://[email protected]/dbname
.. code-block:: yaml
......@@ -405,7 +405,7 @@ Suppose we have a CSV file, which is accessible at the address
resources:
  csv:
    type: csv
    source: https://example.com/
    pull: https://example.com/
.. code-block:: yaml
......@@ -521,7 +521,7 @@ Suppose we have a JSON file, which is accessible at the address
resources:
  json:
    type: json
    source: https://example.com/countries.json
    pull: https://example.com/countries.json
.. code-block:: yaml
......@@ -615,7 +615,7 @@ Suppose we have an XML file, which is accessible at the address
resources:
  xml:
    type: xml
    source: https://example.com/countries.xml
    pull: https://example.com/countries.xml
.. code-block:: yaml
......@@ -710,7 +710,7 @@ id code country
resources:
  xlsx:
    type: xlsx
    source: https://example.com/countries.xlsx
    pull: https://example.com/countries.xlsx
.. code-block:: yaml
......
import csv
import pathlib

from ruamel.yaml import YAML

from spinta.utils.path import is_ignored

yaml = YAML(typ='safe')

DEFAULTS = {
    'dataset': '',
    'resource': '',
    'base': '',
    'model': '',
    'property': '',
    'source': '',
    'prepare': '',
    'type': '',
    'ref': '',
    'level': '',
    'access': '',
    'title': '',
    'description': '',
}


def manifest_to_inventory(path: pathlib.Path):
    ignore = [
        '.travis.yml',
        '/prefixes.yml',
        '/schema/',
        '/env/',
    ]
    for file in path.glob('**/*.yml'):
        if is_ignored(ignore, path, file):
            continue
        data = next(yaml.load_all(file.read_text()))
        if data['type'] == 'dataset':
            yield from dataset_to_inventory(data)


def dataset_to_inventory(data):
    yield {
        **DEFAULTS,
        'dataset': data['name'],
    }
    for k, v in data.get('resources', {}).items():
        yield from resource_to_inventory(k, v)


def resource_to_inventory(name, data):
    yield {
        **DEFAULTS,
        'resource': name,
        'type': data.get('type'),
    }
    for origin, objects in data.get('objects', {}).items():
        for k, v in objects.items():
            # Blank row separates models in the rendered table.
            yield {**DEFAULTS}
            yield from model_to_inventory(k, v)


def model_to_inventory(name, data):
    props = data.get('properties', {})
    if '_id' in props:
        source = props['_id'].get('source', [])
        if not isinstance(source, list):
            source = [source]
        ref = ', '.join(s.lower() for s in source)
    else:
        ref = ''
    sources = data.get('source', [])
    if not isinstance(sources, list):
        sources = [sources]
    for source in sources:
        yield {
            **DEFAULTS,
            'model': name,
            'source': source,
            'ref': ref,
        }
    for k, v in props.items():
        if k == '_id':
            # Expand a (possibly composite) primary key into separate
            # integer properties, unless a property with the same name
            # is already defined explicitly.
            source = v.get('source', [])
            if not isinstance(source, list):
                source = [source]
            for s in filter(None, source):
                k = s.lower()
                if k not in props:
                    yield from prop_to_inventory(k, {**v, 'type': 'integer', 'source': s})
        else:
            yield from prop_to_inventory(k, v)


def prop_to_inventory(name, data):
    yield {
        **DEFAULTS,
        'property': name,
        'type': data.get('type'),
        'source': data.get('source'),
    }


def writecsv(f, rows):
    writer = csv.DictWriter(f, [
        'dataset',
        'resource',
        'base',
        'model',
        'property',
        'source',
        'prepare',
        'type',
        'ref',
        'level',
        'access',
        'title',
        'description',
    ])
    writer.writeheader()
    for row in rows:
        writer.writerow(row)
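A minimal usage sketch of the helpers above (not part of the module; the manifest dict is an assumed example, and the trailing comments show the CSV these functions would emit for it):

.. code-block:: python

    import sys

    example_manifest = {
        'type': 'dataset',
        'name': 'rc/geo',
        'resources': {
            'sql': {'type': 'sql', 'objects': {}},
        },
    }

    writecsv(sys.stdout, dataset_to_inventory(example_manifest))
    # dataset,resource,base,model,property,source,prepare,type,ref,level,access,title,description
    # rc/geo,,,,,,,,,,,,
    # ,sql,,,,,,sql,,,,,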
......@@ -91,6 +91,7 @@ def writecsv(f, models, dataset='', resource=''):
        'model',
        'property',
        'source',
        'prepare',
        'type',
        'ref',
        'level',
......
from typing import Iterable, Dict


def inventory(rows: Iterable[Dict]):
    cols = [
        'dataset',
        'resource',
        'base',
        'model',
        'property',
        'source',
        'prepare',
        'type',
        'ref',
        'level',
        'access',
        'title',
        'description',
    ]
    hpos = cols.index('property')
    hsize = 1  # hierarchical column size
    bsize = 3  # border size
    sizes = dict(
        [(c, 1) for c in cols[:hpos]] +
        [(c, len(c)) for c in cols[hpos:]]
    )
    rows = list(rows)

    # Hierarchical columns (before 'property') are one character wide;
    # longer values overflow into the 'property' column, so grow that
    # column instead of the column itself.
    for row in rows:
        for i, col in enumerate(cols):
            val = str(row[col])
            if i < hpos:
                size = (hsize + bsize) * (hpos - i) + sizes['property']
                if size < len(val):
                    sizes['property'] += len(val) - size
            elif sizes[col] < len(val):
                sizes[col] = len(val)

    line = []
    for col in cols:
        size = sizes[col]
        line.append(col[:size].ljust(size))

    depth = 0
    lines = [line]
    for row in rows:
        line = []
        # The first non-empty hierarchical value sets this row's depth.
        for i, col in enumerate(cols[:hpos + 1]):
            val = row[col]
            if val:
                depth = i
                break
        else:
            val = ''
            if depth < hpos:
                depth += 1
            else:
                depth = 2
        line += [' ' * hsize] * depth
        size = (hsize + bsize) * (hpos - depth) + sizes['property']
        line += [val.ljust(size)]
        for col in cols[hpos + 1:]:
            val = str(row[col])
            size = sizes[col]
            line.append(val.ljust(size))
        lines.append(line)

    lines = [' | '.join(line) for line in lines]
    indent = ' '
    return '\n'.join([indent + l.rstrip() for l in lines]) + '\n' + indent
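A quick sketch of how this helper is used by the tests further below; the rows here are illustrative, and `DEFAULTS` comes from `lodam.services.inventory`:

.. code-block:: python

    from lodam.services.inventory import DEFAULTS
    from lodam.testing import inventory

    rows = [
        {**DEFAULTS, 'dataset': 'rc/geo'},
        {**DEFAULTS, 'resource': 'sql', 'type': 'sql'},
    ]
    # Renders an aligned text table in which the narrow d/r/b/m columns
    # overflow into the 'property' column.
    print(inventory(rows))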
......@@ -11,22 +11,22 @@ resources:
'':
  type: xlsx
  source:
    # 2013
    - "https://www.epolicija.lt/c/document_library/get_file?uuid=8c466021-bdac-4f0a-a437-8fe4bf7a5181&groupId=10156"
    # 2014
    - "https://www.epolicija.lt/c/document_library/get_file?uuid=d894584b-40ec-4111-8ffa-d786d5abd3ac&groupId=10156"
    # 2015
    - "https://www.epolicija.lt/c/document_library/get_file?uuid=796dc0e9-a304-404b-9720-6b53676bad15&groupId=10156"
    # 2016
    - "https://www.epolicija.lt/c/document_library/get_file?uuid=a4ca8d1c-5f4f-4d0c-89b3-0c9baffac093&groupId=10156"
    # 2017
    - "https://www.epolicija.lt/c/document_library/get_file?uuid=39221a0e-c0aa-406c-82df-0d87535c1b49&groupId=10156"
  objects:
    '':
      policija/eismas/ivykis:
        local: true
        stars: 2
        source:
          # 2013
          - "https://www.epolicija.lt/c/document_library/get_file?uuid=8c466021-bdac-4f0a-a437-8fe4bf7a5181&groupId=10156"
          # 2014
          - "https://www.epolicija.lt/c/document_library/get_file?uuid=d894584b-40ec-4111-8ffa-d786d5abd3ac&groupId=10156"
          # 2015
          - "https://www.epolicija.lt/c/document_library/get_file?uuid=796dc0e9-a304-404b-9720-6b53676bad15&groupId=10156"
          # 2016
          - "https://www.epolicija.lt/c/document_library/get_file?uuid=a4ca8d1c-5f4f-4d0c-89b3-0c9baffac093&groupId=10156"
          # 2017
          - "https://www.epolicija.lt/c/document_library/get_file?uuid=39221a0e-c0aa-406c-82df-0d87535c1b49&groupId=10156"
        properties:
          savivaldybe:
            type: string
......
#!/usr/bin/env python
"""
Generate inventory CSV file from manifest YAML files.

Usage:

    $ env/bin/python scripts/manifest-to-csv path/to/manifest

"""

import pathlib
import sys

import click

from lodam.services.inventory import manifest_to_inventory, writecsv


@click.command()
@click.argument('manifest')
@click.option('-o', '--output', help="output file")
def main(manifest: str, output: str):
    table = manifest_to_inventory(pathlib.Path(manifest))
    if output:
        with open(output, 'w') as f:
            writecsv(f, table)
    else:
        writecsv(sys.stdout, table)


if __name__ == "__main__":
    main()
from typing import Iterable, Dict

import io
import csv

import sqlalchemy as sa

from lodam.services import sqlschema


def pretty(rows: Iterable[Dict]):
    cols = [
        'dataset',
        'resource',
        'base',
        'model',
        'property',
        'source',
        'type',
        'ref',
        'level',
        'access',
        'title',
        'description',
    ]
    hpos = cols.index('property')
    hsize = 1  # hierarchical column size
    bsize = 3  # border size
    sizes = dict(
        [(c, 1) for c in cols[:hpos]] +
        [(c, len(c)) for c in cols[hpos:]]
    )
    rows = list(rows)
    for row in rows:
        for i, col in enumerate(cols):
            val = str(row[col])
            if i < hpos:
                size = (hsize + bsize) * (hpos - i) + sizes['property']
                if size < len(val):
                    sizes['property'] += len(val) - size
            elif sizes[col] < len(val):
                sizes[col] = len(val)

    line = []
    for col in cols:
        size = sizes[col]
        line.append(col[:size].ljust(size))

    depth = 0
    lines = [line]
    for row in rows:
        line = []
        for i, col in enumerate(cols[:hpos + 1]):
            val = row[col]
            if val:
                depth = i
                break
        else:
            val = ''
            if depth < hpos:
                depth += 1
            else:
                depth = 2
        line += [' ' * hsize] * depth
        size = (hsize + bsize) * (hpos - depth) + sizes['property']
        line += [val.ljust(size)]
        for col in cols[hpos + 1:]:
            val = str(row[col])
            size = sizes[col]
            line.append(val.ljust(size))
        lines.append(line)

    lines = [' | '.join(line) for line in lines]
    indent = ' '
    return '\n'.join([indent + l.rstrip() for l in lines]) + '\n' + indent
from lodam.testing import inventory
def inspect(sql, dataset='datasets/test/data', resource='sqlite'):
......@@ -97,14 +24,14 @@ def test_no_pk():
bar TEXT NOT NULL
);
''')
assert pretty(table) == '''\
d | r | b | m | property | source | type | ref | level | access | title | description
datasets/test/data | | | | | | |
| sqlite | | | | | | |
| | | | | | | | |
| | | baz | BAZ | | | | | |
| | | | foo | foo | integer | | 3 | | |
| | | | bar | bar | string | | 3 | | |
assert inventory(table) == '''\
d | r | b | m | property | source | prepare | type | ref | level | access | title | description
datasets/test/data | | | | | | | |
| sqlite | | | | | | | |
| | | | | | | | | |
| | | baz | BAZ | | | | | | |
| | | | foo | foo | | integer | | 3 | | |
| | | | bar | bar | | string | | 3 | | |
'''
......@@ -115,13 +42,13 @@ def test_pk():
PRIMARY KEY(ID)
);
''')
assert pretty(table) == '''\
d | r | b | m | property | source | type | ref | level | access | title | description
datasets/test/data | | | | | | |
| sqlite | | | | | | |
| | | | | | | | |
| | | baz | BAZ | | id | | | |
| | | | id | ID | integer | | 4 | | |
assert inventory(table) == '''\
d | r | b | m | property | source | prepare | type | ref | level | access | title | description
datasets/test/data | | | | | | | |
| sqlite | | | | | | | |
| | | | | | | | | |
| | | baz | BAZ | | | id | | | |
| | | | id | ID | | integer | | 4 | | |
'''
......@@ -133,14 +60,14 @@ def test_two_pkeys():
PRIMARY KEY(foo, bar)
);
''')
assert pretty(table) == '''\
d | r | b | m | property | source | type | ref | level | access | title | description
datasets/test/data | | | | | | |
| sqlite | | | | | | |
| | | | | | | | |
| | | baz | BAZ | | foo, bar | | | |
| | | | foo | foo | integer | | 4 | | |
| | | | bar | bar | integer | | 4 | | |
assert inventory(table) == '''\
d | r | b | m | property | source | prepare | type | ref | level | access | title | description
datasets/test/data | | | | | | | |
| sqlite | | | | | | | |
| | | | | | | | | |
| | | baz | BAZ | | | foo, bar | | | |
| | | | foo | foo | | integer | | 4 | | |
| | | | bar | bar | | integer | | 4 | | |
'''
......@@ -158,18 +85,18 @@ def test_fkeys():
FOREIGN KEY (COUNTRY_ID) REFERENCES COUNTRY (ID)
);
''')
assert pretty(table) == '''\
d | r | b | m | property | source | type | ref | level | access | title | description
datasets/test/data | | | | | | |
| sqlite | | | | | | |
| | | | | | | | |
| | | city | CITY | | id | | | |
| | | | id | ID | integer | | 4 | | |
| | | | country_id | COUNTRY_ID | ref | country[id] | 4 | | |
| | | | name | NAME | string | | 3 | | |
| | | | | | | | |
| | | country | COUNTRY | | id | | | |
| | | | id | ID | integer | | 4 | | |
| | | | area | AREA | integer | | 3 | | |
| | | | name | NAME | string | | 3 | | |
assert inventory(table) == '''\
d | r | b | m | property | source | prepare | type | ref | level | access | title | description
datasets/test/data | | | | | | | |
| sqlite | | | | | | | |
| | | | | | | | | |
| | | city | CITY | | | id | | | |
| | | | id | ID | | integer | | 4 | | |
| | | | country_id | COUNTRY_ID | | ref | country[id] | 4 | | |
| | | | name | NAME | | string | | 3 | | |
| | | | | | | | | |
| | | country | COUNTRY | | | id | | | |
| | | | id | ID | | integer | | 4 | | |
| | | | area | AREA | | integer | | 3 | | |
| | | | name | NAME | | string | | 3 | | |
'''
import pathlib

from spinta.testing.utils import create_manifest_files

from lodam.testing import inventory
from lodam.services.inventory import manifest_to_inventory


def test_pkey(tmpdir):
    create_manifest_files(tmpdir, {
        'datasets/rc/geo.yml': {
            'type': 'dataset',
            'name': 'rc/geo',
            'resources': {
                'sql': {
                    'type': 'sql',
                    'objects': {
                        '': {
                            'country': {
                                'source': 'COUNTRIES',
                                'properties': {
                                    '_id': {
                                        'type': 'pk',
                                        'source': 'ID',
                                    },
                                    'name': {
                                        'type': 'string',
                                        'source': 'NAME',
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    })
    table = manifest_to_inventory(pathlib.Path(tmpdir))
    assert inventory(table) == '''\
d | r | b | m | property | source | prepare | type | ref | level | access | title | description
rc/geo | | | | | | | |
| sql | | | sql | | | | |
| | | | | | | | | |
| | | country | COUNTRIES | | | id | | | |
| | | | id | ID | | integer | | | | |
| | | | name | NAME | | string | | | | |
'''
def test_multiple_pkeys(tmpdir):
    create_manifest_files(tmpdir, {
        'datasets/rc/geo.yml': {
            'type': 'dataset',
            'name': 'rc/geo',
            'resources': {
                'sql': {
                    'type': 'sql',
                    'objects': {
                        '': {
                            'country': {
                                'source': 'COUNTRIES',
                                'properties': {
                                    '_id': {
                                        'type': 'pk',
                                        'source': ['ID1', 'ID2'],
                                    },
                                }
                            }