Commit 483d8a24 authored by Mantas Zimnickas

Add csv-to-manifest script

#836
parent b691f9c1
Pipeline #111601048 passed with stage
in 2 minutes and 24 seconds
import collections
import csv
import datetime
import json
import os
import pickle
import re
import pathlib
from ruamel.yaml import YAML
......@@ -30,6 +32,11 @@ def load_spinta_context():
return context
def get_csv_rows(path: pathlib.Path):
    """Yield the rows of the CSV file at ``path`` one at a time.

    Each yielded row is the list of strings produced by ``csv.reader``.
    Because this is a generator, the file is only opened once iteration
    starts and is closed when the generator finishes or is closed.
    """
    # The csv module asks for newline='' so that line breaks embedded in
    # quoted fields reach the parser untranslated.
    with path.open(newline='') as f:
        yield from csv.reader(f)
def get_gsheet_rows(sheet_url, cache, creds_file):
spreadsheet_id, sheet_id = parse_sheet_url(sheet_url)
cache_file = f'gsheet-{spreadsheet_id}-{sheet_id}.json'
......@@ -49,34 +56,38 @@ def get_gsheet_rows(sheet_url, cache, creds_file):
with open(cache_file, 'w') as f:
json.dump(data, f, ensure_ascii=False)
return iter(data.get('values', []))
def update_manifest_files(context, rows):
rows = iter(data.get('values', []))
# Skip first header row, because column names are on the second row.
next(rows, None)
return rows
def update_manifest_files(context, rows):
# Header row
columns = next(rows, None)
columns = columns or [None]
if columns[0] == 'dataset':
schema = [
'dataset',
'resource',
'origin',
'model',
'property',
'type',
'ref',
'const',
'title',
'description',
'source.object',
'source.property',
]
else:
raise Exception(f"Unknown first column {columns[0]!r} in second row.")
schema = [
'dataset',
'resource',
'origin',
'model',
'property',
'type',
'ref',
'const',
'title',
'description',
'table',
'column',
]
unknown_columns = set(columns[:len(schema)]) - set(schema)
if unknown_columns:
unknown_columns = ', '.join(sorted(unknown_columns, key=columns.index))
raise Exception(f"Unknown columns: {unknown_columns}.")
manifest_dir = context.get('store').manifests['default'].path
......@@ -125,10 +136,10 @@ def update_manifest_files(context, rows):
else:
model = origin[row.model]
if row.source.object:
if row.table:
if 'source' not in model:
model['source'] = row.source.object
elif model['source'] != row.source.object:
model['source'] = row.table
elif model['source'] != row.table:
raise Exception(f"Row {i}, model {row.model} already has different source {model['source']!r} set.")
# Property
......@@ -147,11 +158,11 @@ def update_manifest_files(context, rows):
prop['title'] = row.title
if row.description:
prop['description'] = row.description
if row.source.property:
if ',' in row.source.property:
prop['source'] = [x.strip() for x in row.source.property.split(',')]
if row.column:
if ',' in row.column:
prop['source'] = [x.strip() for x in row.column.split(',')]
else:
prop['source'] = row.source.property
prop['source'] = row.column
else:
raise Exception(f"Row {i}, property {row.property} already defined for {row.model}.")
......
#!/usr/bin/env python
"""
Generate manifest YAML files from a CSV file.
The CSV file should contain a list of properties and some metadata.
Usage:
$ env/bin/python scripts/csv-to-manifest path/to.csv
"""
import pathlib
import click
from lodam.services.gsheets import get_csv_rows
from lodam.services.gsheets import load_spinta_context
from lodam.services.gsheets import update_manifest_files
@click.command()
@click.argument('csv')
def main(csv: str):
    # NOTE: the description lives in comments rather than a docstring,
    # because click would display a docstring as the command's --help text.
    #
    # Command entry point: read rows from the CSV file given on the command
    # line and pass them to the manifest updater.
    csv_path = pathlib.Path(csv)
    csv_rows = get_csv_rows(csv_path)
    spinta_context = load_spinta_context()
    update_manifest_files(spinta_context, csv_rows)


if __name__ == "__main__":
    main()
import datetime
import pathlib
import textwrap
import re
from ruamel.yaml import YAML
from spinta.testing.context import create_test_context
from lodam.services.gsheets import update_manifest_files
from lodam.services.gsheets import get_csv_rows
yaml = YAML(typ='safe')
......@@ -30,8 +33,7 @@ def test_create_new_file(postgresql, config, tmpdir):
})
rows = _read_rows('''
Open Data Manifest | | | | | | | | | | VPT (new) | |
dataset | resource | origin | model | property | type | ref | const | title | description | object | property | comment
dataset | resource | origin | model | property | type | ref | const | title | description | table | column | comment
gov/vpt/new/ataskaitos/atn1 | | ATN1 | valstybe/pirkimas/ataskaita | etapas | string | | award | Etapas | Aprašymas | | |
gov/vpt/new/ataskaitos/atn1 | | ATN1 | valstybe/pirkimas/ataskaita | org | ref | org | | | | ATN1 | ORG |
''')
......@@ -113,8 +115,7 @@ def test_update_existing_file(postgresql, config, tmpdir):
})
rows = _read_rows('''
Open Data Manifest | | | | | | | | | | VPT | |
dataset | resource | origin | model | property | type | ref | const | title | description | object | property | comment
dataset | resource | origin | model | property | type | ref | const | title | description | table | column | comment
gov/vpt/ataskaitos | | ATN1 | valstybe/pirkimas | etapas | string | | award | Etapas | Aprašymas | | |
gov/vpt/ataskaitos | | ATN1 | valstybe/pirkimas | org | ref | org | | | | ATN1 | ORG |
gov/vpt/ataskaitos | | ATN1 | valstybe/pirkimas | title | string | | | Pavadinimas | | ATN1 | TITLE |
......@@ -166,3 +167,58 @@ def test_update_existing_file(postgresql, config, tmpdir):
},
},
}
def test_csv_file(postgresql, config, tmpdir):
    # Rows read from a CSV file on disk must produce the expected dataset
    # manifest YAML file under the manifest directory.
    root = pathlib.Path(tmpdir)
    context = create_test_context(config)
    manifests = {
        'default': {
            'backend': 'default',
            'path': str(root),
        },
    }
    context.load({'manifests': manifests})

    # The table is written pipe-separated for readability; the regex turns
    # each " | " separator into a comma before the file is saved.
    table = textwrap.dedent('''\
        dataset | resource | origin | model | property | type | ref | const | title | description | table | column | comment
        gov/vpt/new/ataskaitos/atn1 | | ATN1 | valstybe/pirkimas/ataskaita | etapas | string | | award | Etapas | Aprašymas | | |
        gov/vpt/new/ataskaitos/atn1 | | ATN1 | valstybe/pirkimas/ataskaita | org | ref | org | | | | ATN1 | ORG |
    ''')
    csv_path = root / 'schema.csv'
    csv_path.write_text(re.sub(r' *\| *', ',', table))

    csv_rows = get_csv_rows(csv_path)
    update_manifest_files(context, csv_rows)

    # Exactly one manifest file is expected to be generated.
    generated = sorted(str(p.relative_to(root)) for p in root.glob('**/*.yml'))
    assert generated == [
        'datasets/gov/vpt/new/ataskaitos/atn1.yml',
    ]

    # Expected manifest content, assembled from the innermost level outwards.
    properties = {
        'etapas': {  # property
            'type': 'string',
            'title': 'Etapas',
            'description': 'Aprašymas',
            'const': 'award',
        },
        'org': {  # property
            'type': 'ref',
            'object': 'org',
            'source': 'ORG',
        },
    }
    model = {
        'source': 'ATN1',
        'properties': properties,
    }
    objects = {
        'ATN1': {  # origin
            'valstybe/pirkimas/ataskaita': model,  # model
        },
    }
    expected = {
        'type': 'dataset',
        'name': 'gov/vpt/new/ataskaitos/atn1',
        'resources': {
            '': {  # resource
                'objects': objects,
            },
        },
    }

    manifest_path = root / 'datasets/gov/vpt/new/ataskaitos/atn1.yml'
    assert yaml.load(manifest_path.read_text()) == expected
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment