Commit c952dd18 authored by Christoph Jansen's avatar Christoph Jansen

init

parents
/.idea
/MANIFEST
/dist
/venv
__pycache__
Copyright 2018 Christoph Jansen
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
include README.md LICENSE NOTICE requirements.txt
# Deep Teaching Commons
This Python module is part of the [deep.TEACHING](http://www.deep-teaching.org) project and provides common functionality across Jupyter notebooks and teaching material.
## Installation
Option 1: Install in user's home directory
```bash
pip3 install --user deep-teaching-commons
```
Option 2: Install in a virtual environment
```bash
pip3 install --user virtualenv
# create virtual environment called venv
virtualenv venv
source venv/bin/activate
pip install deep-teaching-commons
```
## License
[MIT](/LICENSE)
## Acknowledgements
The Deep Teaching Commons software is developed at HTW Berlin - University of Applied Sciences.
The work is supported by the German Federal Ministry of Education and Research (BMBF).
import os
# Default root directory for downloaded datasets; the leading '~' is expanded
# to the current user's home directory when this module is imported.
BASE_DATA_DIR = os.path.expanduser('~/deep.TEACHING/data')
import os
import shutil
import zipfile
import tempfile
from glob import glob
# third party
import requests
# internal
from deep_teaching_commons import config
class GermEval2014:
    """Downloader and reader for the GermEval 2014 NER corpus.

    The corpus archive is downloaded once into ``<base_data_dir>/GermEval2014``
    and its ``*.tsv`` files are exposed as generators of token sequences.

    Args:
        base_data_dir: Root directory for datasets. Defaults to
            ``config.BASE_DATA_DIR`` when ``None``.
        data_url: URL of the corpus zip archive. Defaults to the archive
            hosted by TU Darmstadt when ``None``.
        auto_download: If true, ``download()`` is called from the constructor.
        verbose: If true, progress messages are printed to stdout.
    """

    def __init__(self, base_data_dir=None, data_url=None, auto_download=True, verbose=True):
        self.base_data_dir = base_data_dir
        if self.base_data_dir is None:
            self.base_data_dir = config.BASE_DATA_DIR

        self.data_url = data_url
        if self.data_url is None:
            self.data_url = 'https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/' \
                            'GermEval2014_complete_data.zip'

        self.verbose = verbose

        self.data_dir = os.path.join(self.base_data_dir, 'GermEval2014')
        self.train_file = os.path.join(self.data_dir, 'NER-de-train.tsv')
        self.test_file = os.path.join(self.data_dir, 'NER-de-test.tsv')
        self.val_file = os.path.join(self.data_dir, 'NER-de-dev.tsv')

        if auto_download:
            if self.verbose:
                print('auto download is active, attempting download')
            self.download()

    def download(self):
        """Download and unpack the corpus unless the data directory exists.

        The archive is streamed into a temporary directory, extracted there,
        and every ``*.tsv`` file found is moved into ``self.data_dir``. On any
        failure the freshly created data directory is removed again so that a
        later call retries the download.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            FileNotFoundError: If the archive contains no ``*.tsv`` files.
        """
        if os.path.exists(self.data_dir):
            if self.verbose:
                print('data directory already exists, no download required')
            return

        if self.verbose:
            print('data directory does not exist, starting download')

        os.makedirs(self.data_dir)
        temp_dir = tempfile.mkdtemp()

        try:
            temp_path = os.path.join(temp_dir, 'corpus.zip')

            # A plain file download is a GET request; stream it so the archive
            # is never held in memory as a whole.
            r = requests.get(self.data_url, stream=True)
            try:
                r.raise_for_status()
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=4096):
                        if chunk:
                            f.write(chunk)
            finally:
                # Release the pooled connection even if writing fails midway.
                r.close()

            with zipfile.ZipFile(temp_path, 'r') as z:
                z.extractall(temp_dir)

            # '**' together with recursive=True matches the .tsv files at any
            # depth, independent of how the archive nests its directories.
            source_paths = glob(os.path.join(temp_dir, '**', '*.tsv'), recursive=True)
            if not source_paths:
                # Fail loudly: an empty data directory would otherwise make
                # every later call believe the corpus is already present.
                raise FileNotFoundError('no *.tsv files found in downloaded archive')

            for source_path in source_paths:
                file_name = os.path.basename(source_path)
                shutil.move(source_path, os.path.join(self.data_dir, file_name))
        except BaseException:
            # Roll back on any failure (including KeyboardInterrupt) so a
            # half-filled directory is not mistaken for a finished download.
            shutil.rmtree(self.data_dir, ignore_errors=True)
            raise
        finally:
            shutil.rmtree(temp_dir, ignore_errors=True)

        if self.verbose:
            print('data successfully downloaded')

    @staticmethod
    def sequences(data_file):
        """Yield the sentences of a corpus file as lists of tuples.

        Lines starting with ``#`` are skipped, blank lines separate sentences,
        and every other line must consist of four whitespace-separated
        fields: index, token, outer IOB tag, inner IOB tag.

        Args:
            data_file: Path to a ``*.tsv`` corpus file.

        Yields:
            list: One non-empty list of ``(token, iob_outer, iob_inner)``
            tuples per sentence.
        """
        # Read as UTF-8 explicitly; the platform default encoding would
        # garble German umlauts on systems with a non-UTF-8 locale.
        with open(data_file, encoding='utf-8') as f:
            sequence = []
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                if not line:
                    # Only emit real sentences; repeated blank lines must not
                    # produce empty sequences.
                    if sequence:
                        yield sequence
                        sequence = []
                    continue
                _, token, iob_outer, iob_inner = line.split()
                sequence.append((token, iob_outer, iob_inner))
            # The last sentence may not be followed by a blank line.
            if sequence:
                yield sequence

    def train_sequences(self):
        """Return a generator over the sentences of the training split."""
        return self.sequences(self.train_file)

    def test_sequences(self):
        """Return a generator over the sentences of the test split."""
        return self.sequences(self.test_file)

    def val_sequences(self):
        """Return a generator over the sentences of the validation split."""
        return self.sequences(self.val_file)
certifi==2018.1.18
chardet==3.0.4
idna==2.6
requests==2.18.4
urllib3==1.22
#!/usr/bin/env python3
"""Packaging script for the deep-teaching-commons distribution."""

# setuptools is required here: distutils silently ignores ``install_requires``
# (so the ``requests`` dependency was never installed) and the distutils
# module no longer exists in Python 3.12 and later.
from setuptools import setup

setup(
    name='deep-teaching-commons',
    version='0.1',
    description='A Python module for common functionality across notebooks and teaching material.',
    author='Christoph Jansen',
    author_email='Christoph.Jansen@htw-berlin.de',
    url='https://gitlab.com/deep.TEACHING/deep-teaching-commons',
    packages=[
        'deep_teaching_commons',
        'deep_teaching_commons.data',
        'deep_teaching_commons.data.text'
    ],
    license='MIT',
    platforms=['any'],
    install_requires=[
        'requests'
    ]
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment