Commit 2a3f4edf authored by Asitha Senanayake's avatar Asitha Senanayake
Browse files

feat: add helper functions to convert official AGS4 dictionaries in .json to .ags

parent 90d769aa
Loading
Loading
Loading
Loading

python_ags4/utils.py

0 → 100644
+282 −0
Original line number Diff line number Diff line
# Copyright (C) 2020-2025  Asitha Senanayake
#
# This file is part of python_ags4.
#
# python_ags4 is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# https://github.com/asitha-sena/python-ags4
# https://gitlab.com/ags-data-format-wg/ags-python-library

from pandas import DataFrame, concat, read_json

from .AGS4 import AGS4Error


def get_DICT_table_from_json_file(filepath):
    '''Convert AGS4 dictionary in .json format to DICT table in .ags format.

    The official standard dictionaries in .json format is available at
    https://gitlab.com/AGS-DFWG-Web/ASG4

    Parameters
    ----------
    filepath : str
        Path to JSON file

    Returns
    -------
    Pandas DataFrame with DICT table

    '''

    # Extract heading DICT_TYPE='HEADING' rows from JSON data
    heading_rows = read_json(filepath).rename(columns={'group': 'DICT_GRP',
                                                       'heading': 'DICT_HDNG',
                                                       'suggested_type': 'DICT_DTYP',
                                                       'description': 'DICT_DESC',
                                                       'suggested_unit': 'DICT_UNIT',
                                                       'example': 'DICT_EXMP'})\
                                      .pipe(lambda df: df.assign(HEADING='DATA',
                                                                 DICT_TYPE='HEADING',
                                                                 DICT_STAT=df.heading_status.map({'*': 'KEY',
                                                                                                  'R': 'REQUIRED',
                                                                                                  '*R': 'KEY+REQUIRED',
                                                                                                  'R*': 'KEY+REQUIRED',
                                                                                                  '': 'OTHER',
                                                                                                  'Deprecated': 'DEPRECATED'}),
                                                                 DICT_PGRP='',
                                                                 DICT_REM='',
                                                                 FILE_FSET='',
                                                                 in_group_order=df.in_group_order.astype('int'),
                                                                 group_order=df.group_order.astype('int')))

    # Extract heading DICT_TYPE='GROUP' rows from JSON data
    group_rows = heading_rows.groupby('DICT_GRP').first()\
                             .reset_index()\
                             .drop('DICT_DESC', axis=1)\
                             .rename(columns={'group_description': 'DICT_DESC'})\
                             .pipe(lambda df: df.assign(HEADING='DATA',
                                                        DICT_TYPE='GROUP',
                                                        DICT_HDNG='',
                                                        DICT_STAT='',
                                                        DICT_DTYP='',
                                                        DICT_UNIT='',
                                                        DICT_EXMP='',
                                                        DICT_PGRP=df['parent'],
                                                        FILE_FSET='',
                                                        in_group_order=0))

    # Create UNIT and TYPE rows
    unit_and_type_rows = DataFrame({'HEADING': ['UNIT', 'TYPE'],
                                    'DICT_TYPE': ['', 'PA'],
                                    'DICT_GRP': ['', 'X'],
                                    'DICT_HDNG': ['', 'X'],
                                    'DICT_STAT': ['', 'PA'],
                                    'DICT_DTYP': ['', 'PT'],
                                    'DICT_DESC': ['', 'X'],
                                    'DICT_UNIT': ['', 'PU'],
                                    'DICT_EXMP': ['', 'X'],
                                    'DICT_PGRP': ['', 'X'],
                                    'DICT_REM': ['', 'X'],
                                    'FILE_FSET': ['', 'X'],
                                    'group_order': [-1, 0]})

    # Combine all rows
    DICT = concat([unit_and_type_rows, heading_rows, group_rows])

    # Mark deprecated groups and headings
    mask = DICT.group_status.eq('Deprecated')
    DICT.loc[mask & DICT.DICT_TYPE.eq('GROUP'), 'DICT_STAT'] = 'DEPRECATED'
    DICT.loc[mask & DICT.DICT_TYPE.eq('GROUP'), 'DICT_REM'] = 'This group has been deprecated and will be removed in a future version.'
    DICT.loc[mask & DICT.DICT_TYPE.eq('HEADING'), 'DICT_REM'] = 'This heading is in a deprecated group and will be removed in a future version.'

    DICT.loc[DICT.heading_status.eq('DEPRECATED'), 'DICT_REM'] = 'This heading has been deprecated and will be removed in a future version.'

    # Sort rows and keep only relevant columns
    DICT = DICT.sort_values(by=['group_order', 'in_group_order'])\
               .loc[:, ['HEADING', 'DICT_TYPE', 'DICT_GRP', 'DICT_HDNG', 'DICT_STAT', 'DICT_DTYP', 'DICT_DESC', 'DICT_UNIT', 'DICT_EXMP',
                        'DICT_PGRP', 'DICT_REM', 'FILE_FSET']]\
               .reset_index(drop=True)\

    # Found some linebreak characters in the descriptions that are not needed in DICT_DESC
    # Replace them with spaces
    DICT.loc[:, 'DICT_DESC'] = DICT.DICT_DESC.str.replace(r'(\r\n)', '', regex=True)
    DICT.loc[:, 'DICT_DESC'] = DICT.DICT_DESC.str.replace(r'(\n)', '', regex=True)

    # Some DICT_DESC entries in the json file are enclosed by double quotes.
    # These need to be either single quotes or double double quotes according to
    # AGS4 Rule 5. The library correctly handles this when writing the DICT table
    # to a .ags file but it was decided to replace double quotes with single
    # quotes to be consistent with the existing built-in standard dictionary
    # files
    DICT.loc[:, 'DICT_DESC'] = DICT.DICT_DESC.str.replace(r'(")', '\'', regex=True)

    return DICT


def get_ABBR_table_from_json_file(filepath, filepath_ELRG, version='4.1'):
    '''Convert AGS4 abbreviations in .json format to ABBR table in .ags format.

    The official ABBR list in .json format is available at
    https://gitlab.com/AGS-DFWG-Web/ASG4

    Parameters
    ----------
    filepath : str
        Path to JSON file with AGS4 abbreviations list
    filepath_ELRG : str or None
        Path to JSON file with ELRG_CODE list
    version : {'4.0', '4.1'}
        Version of standard dictionary.

    Returns
    -------
    Pandas DataFrame with ABBR table

    '''

    # Check version provided by user
    if version not in ['4.0', '4.1']:
        raise AGS4Error("Invalid version number. Only '4.0' and '4.1' are valid entries.")

    # Extract heading "DATA" rows from JSON data with abbreviations list
    data_rows = read_json(filepath).rename(columns={'Group': 'ABBR_HDNG', 'Code': 'ABBR_CODE', 'Description': 'ABBR_DESC'})\
                                   .assign(HEADING='DATA', ABBR_LIST=version, ABBR_REM='', FILE_FSET='')\
                                   .query("Version.str.contains(@version) & Status.str.contains('Approved', case=False)")\
                                   .sort_values(by=['ABBR_HDNG', 'ABBR_CODE'])

    # Create UNIT and TYPE rows
    unit_and_type_rows = DataFrame({'HEADING': ['UNIT', 'TYPE'],
                                    'ABBR_HDNG': ['', 'X'],
                                    'ABBR_CODE': ['', 'X'],
                                    'ABBR_DESC': ['', 'X'],
                                    'ABBR_LIST': ['', 'X'],
                                    'ABBR_REM': ['', 'X'],
                                    'FILE_FSET': ['', 'X']})

    # Extract ELRG_CODE list
    if filepath_ELRG is not None:
        elrg_codes = read_json(filepath_ELRG).rename(columns={'code': 'ABBR_CODE', 'description': 'ABBR_DESC'})\
                                    .assign(HEADING='DATA', ABBR_HDNG='ELRG_CODE', ABBR_LIST=version, ABBR_REM='', FILE_FSET='')\
                                    .query("version.str.contains(@version) & status.str.contains('Approved', case=False)")\
                                    .sort_values(by=['ABBR_HDNG', 'ABBR_CODE'])

    else:
        elrg_codes = DataFrame()

    # Combine all rows
    ABBR = concat([unit_and_type_rows, data_rows, elrg_codes])

    # Sort columns and reset index
    ABBR = ABBR.loc[:, ['HEADING', 'ABBR_HDNG', 'ABBR_CODE', 'ABBR_DESC', 'ABBR_LIST', 'ABBR_REM', 'FILE_FSET']]\
               .reset_index(drop=True)

    return ABBR


def get_TYPE_table_from_json_file(filepath, version='4.1'):
    '''Convert AGS4 abbreviations in .json format to TYPE table in .ags format.

    The official TYPE list in .json format is available at
    https://gitlab.com/AGS-DFWG-Web/ASG4

    Parameters
    ----------
    filepath : str
        Path to JSON file
    version : {'4.0', '4.1'}
        Version of standard dictionary.

    Returns
    -------
    Pandas DataFrame with ABBR table

    '''

    # Check version provided by user
    if version not in ['4.0', '4.1']:
        raise AGS4Error("Invalid version number. Only '4.0' and '4.1' are valid entries.")

    # Extract heading "DATA" rows from JSON data
    data_rows = read_json(filepath).rename(columns={'Type': 'TYPE_TYPE', 'Desc': 'TYPE_DESC'})\
                                   .assign(HEADING='DATA', FILE_FSET='')\
                                   .query("Version.str.contains(@version)")\
                                   .sort_values(by=['TYPE_TYPE', 'TYPE_DESC'])

    # Create UNIT and TYPE rows
    unit_and_type_rows = DataFrame({'HEADING': ['UNIT', 'TYPE'],
                                    'TYPE_TYPE': ['', 'X'],
                                    'TYPE_DESC': ['', 'X'],
                                    'FILE_FSET': ['', 'X']})

    # Combine all rows
    TYPE = concat([unit_and_type_rows, data_rows])\

    # Sort columns and reset index
    TYPE = TYPE.loc[:, ['HEADING', 'TYPE_TYPE', 'TYPE_DESC', 'FILE_FSET']]\
               .reset_index(drop=True)\

    return TYPE


def get_UNIT_table_from_json_file(filepath, version='4.1'):
    '''Convert AGS4 abbreviations in .json format to UNIT table in .ags format.

    The official UNIT list in .json format is available at
    https://gitlab.com/AGS-DFWG-Web/ASG4

    Parameters
    ----------
    filepath : str
        Path to JSON file
    version : {'4.0', '4.1'}
        Version of standard dictionary.

    Returns
    -------
    Pandas DataFrame with ABBR table

    '''

    # Check version provided by user
    if version not in ['4.0', '4.1']:
        raise AGS4Error("Invalid version number. Only '4.0' and '4.1' are valid entries.")

    # Extract heading "DATA" rows from JSON data
    data_rows = read_json(filepath).rename(columns={'Unit': 'UNIT_UNIT', 'Description': 'UNIT_DESC'})\
                                   .assign(HEADING='DATA', UNIT_REM='', FILE_FSET='')\
                                   .query("Version.str.contains(@version) & Status.str.contains('Approved', case=False)")\
                                   .pipe(lambda df: df.drop_duplicates(subset='UNIT_UNIT', keep='first'))\
                                   .sort_values(by=['UNIT_UNIT', 'UNIT_DESC'], key=lambda x: x.str.lower())

    # The 'key' argument is used in sort_values() to apply the sorting after
    # changing all entries to lower case. This is necessary because
    # sort_values() is case sensitive by default, with uppercase letters having
    # higher precedence than lower case letters.

    # Create UNIT and TYPE rows
    unit_and_type_rows = DataFrame({'HEADING': ['UNIT', 'TYPE'],
                                    'UNIT_UNIT': ['', 'X'],
                                    'UNIT_DESC': ['', 'X'],
                                    'UNIT_REM': ['', 'X'],
                                    'FILE_FSET': ['', 'X']})

    # Combine all rows
    UNIT = concat([unit_and_type_rows, data_rows])\

    # Sort columns and reset index
    UNIT = UNIT.loc[:, ['HEADING', 'UNIT_UNIT', 'UNIT_DESC', 'UNIT_REM', 'FILE_FSET']]\
               .reset_index(drop=True)\

    return UNIT