Commit 1a735662 by mrpudn: Initial commit (parent dd92eaa9)
*.jsonl filter=lfs diff=lfs merge=lfs -text
# python
__pycache__
*.pyc
# virtual environment
env
# jmnedict
data/jmnedict.xml
# junk
.DS_Store
workflow:
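  # never create pipelines for tag pushes; run them for every other event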
rules:
- if: $CI_COMMIT_TAG
when: never
- when: always
stages:
- static-analysis
- test
- update
- publish
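# hidden base templates: shared Python 3.9 image with project dependencies preinstalled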
.python:
image: "python:3.9"
before_script:
- python --version
- pip install --requirement requirements.txt
.python_dev:
extends: .python
before_script:
- !reference [.python, before_script]
- pip install --requirement requirements-dev.txt
flake8:
extends: .python_dev
stage: static-analysis
rules:
- if: '$CI_PIPELINE_SOURCE != "schedule"'
script:
- flake8 bin/*
test:
extends: .python_dev
stage: test
rules:
- if: '$CI_PIPELINE_SOURCE != "schedule"'
script:
- bin/test
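# shared rules: only run these jobs in scheduled or manually triggered pipelines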
.automatic_update:
rules:
- if: '$CI_PIPELINE_SOURCE == "schedule"'
- if: '$CI_PIPELINE_SOURCE == "manual"'
.update:
extends:
- .python
- .automatic_update
stage: update
artifacts:
paths:
- data/
expire_in: 3 days
download:
extends: .update
script:
- bin/download
convert:
extends: .update
needs:
- job: download
artifacts: true
script:
- bin/convert
.install_git:
before_script:
# install git
- apt-get install --yes git
# configure git username and email
- git config --global user.name "${GITLAB_USER_NAME}"
- git config --global user.email "${GITLAB_USER_EMAIL}"
# configure origin remote url
- git remote set-url origin git@${CI_SERVER_HOST}:${CI_PROJECT_PATH}
.install_ssh:
before_script:
# install openssh-client
- apt-get install --yes openssh-client
# set up ssh keys
# (public key should be added as a deploy key)
# (private key should be added as the `SSH_PRIVATE_KEY` CI variable)
- eval $(ssh-agent -s)
- echo "${SSH_PRIVATE_KEY}" | tr -d '\r' | ssh-add -
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
- ssh-keyscan "${CI_SERVER_HOST}" >> ~/.ssh/known_hosts
- chmod 644 ~/.ssh/known_hosts
.publish_git:
before_script:
- apt-get update --yes
- !reference [.install_git, before_script]
- !reference [.install_ssh, before_script]
script:
# make sure the commit ref is checked out (not detached HEAD)
- git checkout $CI_COMMIT_REF_NAME
# commit any updates
- |
if [[ -n "$(git status --porcelain)" ]]; then
git add --all
git commit --message "$MESSAGE"
git push --push-option ci.skip origin $CI_COMMIT_REF_NAME
else
echo "Nothing to publish!"
fi
# tag the current commit
- git tag $VERSION
- git push --push-option ci.skip origin $VERSION
.install_gitlfs:
before_script:
# install git-lfs
- apt-get install --yes git-lfs
- git lfs env
.publish_gitlfs:
extends: .publish_git
before_script:
- !reference [.publish_git, before_script]
- !reference [.install_gitlfs, before_script]
publish:
extends:
- .publish_gitlfs
- .automatic_update
stage: publish
script:
- VERSION=$(date --utc +%Y.%m.%d)
- MESSAGE="Update $VERSION"
- !reference [.publish_gitlfs, script]
# Contributing Instructions
Instructions for contributing to this project.
## Setup
Create and activate a virtual environment:
```sh
$ python -m venv env
$ source env/bin/activate
```
Install dependencies:
```sh
$ pip install -r requirements-dev.txt
```
## Development
Lint the source code with `flake8`:
```sh
$ flake8 bin/*
```
Run the unit tests:
```sh
$ bin/test
```
## Updating Data Files
Download the latest JMnedict data:
```sh
$ bin/download
```
Convert the downloaded data to the jsonlines format:
```sh
$ bin/convert
```
The data files should now be updated.
This package redistributes the ENAMDICT/JMnedict dictionary file. This file is
the property of the Electronic Dictionary Research and Development Group
(EDRDG), and is used in conformance with the Group's license. See the links
below for more information.
http://www.edrdg.org/
http://nihongo.monash.edu/enamdict_doc.html
http://www.edrdg.org/edrdg/licence.html
The remainder of this project is licensed under the MIT license below:
Copyright (c) 2022 Zach McAuliffe
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# jmnedict-jsonl
[JMnedict] redistributed in the [jsonlines] format.
[![pipeline status](https://gitlab.com/x4ku/jmnedict-jsonl/badges/main/pipeline.svg)](https://gitlab.com/x4ku/jmnedict-jsonl/-/commits/main)
## Overview
This project redistributes [JMnedict] in the [jsonlines] format.
You can download the latest files here:
| File                 | Description                            |
| -------------------- | -------------------------------------- |
| [`jmnedict.jsonl`]   | JMnedict records                       |
| [`jmnedict.json`]    | JMnedict record schema                 |
| [`jmnedict-dtd.xml`] | JMnedict DTD (extracted from the XML)  |
These files are updated automatically every month.
## Usage
You can read and work with the data files from this project as-is, line by line.
Each line is a JSON array containing a record's *values*. There is no need to
load the entire file into memory if your use case does not require it: simply
read and process one line at a time.
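For example, a minimal sketch that streams the [`jmnedict.jsonl`] file one line
at a time (assuming it has been downloaded to the `data/` directory used
elsewhere in this README):

```py
import json

# Stream the records one at a time; each line is a JSON array of values.
with open('data/jmnedict.jsonl') as file:
    for line in file:
        record = json.loads(line)
        # work with `record` here
```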
Storing only the values, rather than repeating the same dictionary *keys* on
every line, greatly reduces the file size while still preserving the complex
structure of these records in JSON.
If you would prefer to convert these records into dictionaries/objects, a
reference implementation is provided below.
### Converting Records to Dictionaries
The reference implementation below uses the [`jmnedict.json`] schema file to
convert records from the [`jmnedict.jsonl`] file into dictionaries/objects.
```py
import json
from pathlib import Path


def map_schema(value, schema):
    # a list schema describes each element with its single item
    if type(schema) is list:
        return [map_schema(v, schema[0]) for v in value]
    # a dict schema pairs its keys with the positional values
    if type(schema) is dict:
        kvs = zip(schema.keys(), value, schema.values())
        return {k: map_schema(v, s) for k, v, s in kvs}
    # anything else is a plain scalar value
    return value


with open(Path('schema') / 'jmnedict.json') as file:
    schema = json.load(file)

with open(Path('data') / 'jmnedict.jsonl') as file:
    records = [map_schema(json.loads(line), schema) for line in file]
```
The `records` variable should now contain the records as dictionaries.
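If holding every record in memory is a concern, the same mapping can also be
applied lazily. A minimal sketch, reusing `map_schema`, `schema`, and the
imports from the block above:

```py
def iter_records(path, schema):
    # yield one mapped record at a time instead of building a full list
    with open(path) as file:
        for line in file:
            yield map_schema(json.loads(line), schema)


for record in iter_records(Path('data') / 'jmnedict.jsonl', schema):
    ...  # process each record as it streams in
```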
## License
See the [`LICENSE`] file before using any of the files provided by this project
in your own work.
<!-- links -->
[`LICENSE`]: LICENSE
[`jmnedict.json`]: schema/jmnedict.json
[`jmnedict.jsonl`]: data/jmnedict.jsonl
[`jmnedict-dtd.xml`]: schema/jmnedict-dtd.xml
[JMnedict]: http://nihongo.monash.edu/enamdict_doc.html
[jsonlines]: https://jsonlines.org/
#!/usr/bin/env python3
import json
import logging
import os
from pathlib import Path
from bs4 import BeautifulSoup
logging.basicConfig(
level=os.getenv('LOG_LEVEL', 'INFO'),
format=(
'[%(levelname)s] %(asctime)s '
'(%(funcName)s:%(lineno)s) '
'%(message)s'
)
)
logger = logging.getLogger(__name__)
def main():
data_dir = Path('data')
input_path = data_dir / 'jmnedict.xml'
output_path = data_dir / 'jmnedict.jsonl'
convert_records(input_path, output_path)
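# Build a lazy generator pipeline: read <entry> fragments, convert each one to
# a record, and write one JSON line per record.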
def convert_records(input_path, output_path):
logger.info('Converting JMnedict ...')
stream = input_stream(input_path)
stream = record_adapter(stream)
stream = stream_writer(stream, output_path)
read_stream(stream)
logger.info('Converted JMnedict')
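# Yield one <entry>...</entry> fragment at a time so the whole XML file never
# has to be held in memory.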
def input_stream(path):
with open(path) as file:
record = None
recording = False
for line in file:
if recording:
record.append(line)
if line.strip() == '</entry>':
recording = False
yield ''.join(record)
elif line.strip() == '<entry>':
recording = True
record = [line]
def record_adapter(stream):
for record in stream:
yield build_record(soup(record))
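# A record is a positional array: [ent_seq, k_ele entries, r_ele entries, trans entries].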
def build_record(el):
return [
int_one(el, 'ent_seq'),
el_all_map(el, 'k_ele', build_k_ele),
el_all_map(el, 'r_ele', build_r_ele),
el_all_map(el, 'trans', build_trans)
]
def build_k_ele(el):
return [
text_one(el, 'keb'),
text_all(el, 'ke_inf'),
text_all(el, 'ke_pri')
]
def build_r_ele(el):
return [
text_one(el, 'reb'),
text_all(el, 're_restr'),
text_all(el, 're_inf'),
text_all(el, 're_pri')
]
def build_trans(el):
return [
text_all(el, 'name_type'),
text_all(el, 'xref'),
el_all_map(el, 'trans_det', build_trans_det)
]
def build_trans_det(el):
return [
el_text(el),
el.get('xml:lang', 'eng')
]
def soup(markup):
return BeautifulSoup(markup, features='lxml')
def float_all(el, key):
return el_all_map(el, key, el_float)
def int_all(el, key):
return el_all_map(el, key, el_int)
def text_all(el, key):
return el_all_map(el, key, el_text)
def el_all_map(el, key, func):
return list(filter(None, map(func, el_all(el, key))))
def el_all(el, key):
return el.find_all(key) if el else []
def float_one(el, key):
return el_float(el_one(el, key))
def int_one(el, key):
return el_int(el_one(el, key))
def text_one(el, key):
return el_text(el_one(el, key))
def el_one(el, key):
return el.find(key) if el else None
def el_float(el):
try:
return float(el_text(el) or '')
except ValueError:
return None
def el_int(el):
try:
return int(el_text(el) or '')
except ValueError:
return None
def el_text(el):
return el.text.strip() if el and el.text else None
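# Write each record to the output file as a single JSON line while passing it
# through the generator chain.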
def stream_writer(stream, path, write=None):
write = write or write_json
with open(path, 'w') as file:
for record in stream:
write(record, file)
yield record
def write_json(obj, file):
file.write(f'{json.dumps(obj, ensure_ascii=False)}\n')
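# Drain the stream so the upstream generators (and the writer) actually execute.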
def read_stream(stream):
for record in stream:
pass
if __name__ == '__main__':
main()
#!/usr/bin/env python3
import gzip
import logging
import os
import requests
from pathlib import Path
logging.basicConfig(
level=os.getenv('LOG_LEVEL', 'INFO'),
format=(
'[%(levelname)s] %(asctime)s '
'(%(funcName)s:%(lineno)s) '
'%(message)s'
)
)
logger = logging.getLogger(__name__)
JMNEDICT_URL = 'http://ftp.edrdg.org/pub/Nihongo/JMnedict.xml.gz'
def main():
data_dir = Path('data')
jmnedict_path = data_dir / 'jmnedict.xml'
schema_dir = Path('schema')
dtd_path = schema_dir / 'jmnedict-dtd.xml'
download_jmnedict(jmnedict_path)
extract_jmnedict_dtd(jmnedict_path, dtd_path)
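# Download the gzipped JMnedict archive and write the decompressed XML to disk.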
def download_jmnedict(path):
logger.info('Downloading JMnedict ...')
response = requests.get(JMNEDICT_URL, stream=True)
response.raise_for_status()
with gzip.GzipFile(fileobj=response.raw) as gzfile:
with open(path, 'wb') as file:
file.write(gzfile.read())
logger.info('Downloaded JMnedict')
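# Copy the embedded DTD (every line up to and including the closing ']>') into
# a separate file.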
def extract_jmnedict_dtd(jmnedict_path, dtd_path):
logger.info('Extracting JMnedict DTD ...')
with open(jmnedict_path) as jmnedict:
with open(dtd_path, 'w') as dtd:
for line in jmnedict:
dtd.write(line)
if line.lstrip().startswith(']>'):
break
logger.info('Extracted JMnedict DTD')
if __name__ == '__main__':
main()
#!/usr/bin/env python3
import importlib.machinery
import importlib.util
import unittest
from pathlib import Path
bin_dir = Path('bin')
# import bin/convert script as a module
module_path = str(bin_dir / 'convert')
loader = importlib.machinery.SourceFileLoader('convert', module_path)
spec = importlib.util.spec_from_loader('convert', loader)
convert = importlib.util.module_from_spec(spec)
loader.exec_module(convert)
class TestConvertUtilities(unittest.TestCase):
def test_text_all(self):
doc = convert.soup('''
<foo>one</foo>
<foo>two</foo>
<foo>three</foo>
''')
self.assertEqual(convert.text_all(doc, 'foo'), ['one', 'two', 'three'])
def test_text_all_none(self):
self.assertEqual(convert.text_all(None, 'foo'), [])
def test_text_all_empty(self):
doc = convert.soup('''
<foo>one</foo>
<foo></foo>
<foo>three</foo>
''')
self.assertEqual(convert.text_all(doc, 'foo'), ['one', 'three'])
def test_int_all(self):
doc = convert.soup('''
<foo>1</foo>
<foo>2</foo>
<foo>3</foo>
''')
self.assertEqual(convert.int_all(doc, 'foo'), [1, 2, 3])
def test_int_all_invalid(self):
doc = convert.soup('''
<foo>1</foo>
<foo>2.34</foo>
<foo>test</foo>
''')
self.assertEqual(convert.int_all(doc, 'foo'), [1])
def test_int_all_none(self):
self.assertEqual(convert.int_all(None, 'foo'), [])
def test_int_all_empty(self):
doc = convert.soup('''
<foo>1</foo>
<foo></foo>
<foo>3</foo>
''')
self.assertEqual(convert.int_all(doc, 'foo'), [1, 3])
def test_float_all(self):
doc = convert.soup('''
<foo>1.2</foo>
<foo>2.3</foo>
<foo>3.4</foo>
''')
self.assertEqual(convert.float_all(doc, 'foo'), [1.2, 2.3, 3.4])
def test_float_all_invalid(self):
doc = convert.soup('''
<foo>1.2</foo>
<foo>3</foo>
<foo>test</foo>
''')
self.assertEqual(convert.float_all(doc, 'foo'), [1.2, 3.0])
def test_float_all_none(self):
self.assertEqual(convert.float_all(None, 'foo'), [])
def test_float_all_empty(self):
doc = convert.soup('''
<foo>1.2</foo>
<foo></foo>
<foo>3.4</foo>
''')
self.assertEqual(convert.float_all(doc, 'foo'), [1.2, 3.4])
class TestConvertRecords(unittest.TestCase):
def test_build_record(self):
doc = convert.soup('''
<entry>
<ent_seq>12345</ent_seq>