Commit ad49af70 authored by Biagio's avatar Biagio

refactor

parent 57dd9fd9
......@@ -2,77 +2,12 @@
## AKN4UN
### Installation
[Installation and Usage](/settings/README.md)
* requires python3.7+
* clone this repo
* cd to the repo
* create a virtual environment: `python3 -m venv venv`
* load the virtual environment: `source venv/bin/activate`
* install the dependencies: `pip install -r requirements.txt`
* install the spaCy model: `python -m spacy download en_core_web_md`
[Documents](/documents)
## Usage
[Ontology](/ontology)
* download all the documents: `python run.py --download`
* to parse one document: `python run.py --parse <filepath> [--outdir <output dir>]`
* to parse all the documents: `python run.py --parseall [--outdir <output dir>]`
* to use with a GUI: `python run.py --gui [--port <port_no>]`
(it will return *.akn zip archives, also saved locally in
`keld/server/converted/`)
[Source code](/development)
**All the converted files will be written in the directory `out` unless `--outdir` is specified**
## Troubleshooting
If you are experiencing import errors, add the repository to your PYTHONPATH
as follows:
```export PYTHONPATH=${PYTHONPATH}:<full/path/to/the/repo/>```
# How it works
The first step of the conversion consists in loading the provided word document
and converting it (or rather: its parts) into txt.
The second step is parsing the text top to bottom and using pattern matching to
identify structural elements such as the document title, number, the paragraphs,
sections, annexes and so on.
The pattern matching process uses [replus](https://pypi.org/project/replus/), which
provides a method to write modular, template-based extensible regular expressions.
Depending on the result of the pattern matching on the text, the text itself is
mapped as-is into Objects that work as a proxy for Akoma Ntoso xml generation.
Before the objects are appended, the text is qualified via a `paragraph_qualifier`,
whose job is to determine whether the text represents a preambular or an operational
element; the text is then appended accordingly.
Once all the objects are appended, a downward recursive algorithm is used to
ensure that all the elements are placed according to their hierarchical value
(e.g. if a `section` and a `paragraph` happen to be siblings, the latter will be
set as a child of the former).
The next step is to correctly generate the eIds of the objects (which may have
prefixes depending on their parent(s)).
Once the structure is in place, it is possible to run pattern matching
and machine learning algorithms to identify all inline elements, such as dates,
references, roles, organizations and so on.
The inline pattern matching also uses [replus](https://pypi.org/project/replus/);
the `match objects` are passed
through a series of `resolvers` which will extract the metadata, build the
attributes and the corresponding Top-Level Concept to be added to the AKN
`references`.
Other than regexes, inline elements are recognized using spaCy with some
customized NER.
Once the structure and the inlines are done, another spaCy-powered custom
algorithm identifies SDGs (Sustainable Development Goals) together with their
targets and respective indicators. The
results are mapped into AKN `keywords`, `references` and custom name-spaced
(akn4un) elements that will link the results to their corresponding elements.
The last step simply consists in writing the AKN to an xml file and validating it.
[Data](/data)
import os
# Directory that contains this module; every path below is built relative to it.
_HERE = os.path.dirname(os.path.realpath(__file__))

# "test_docs" directory located two levels above this module.
TEST_DOCS_DIR = os.path.abspath(
    os.path.join(_HERE, os.pardir, os.pardir, "test_docs")
)

# "data" directory located three levels above this module.
DATA_DIR = os.path.abspath(
    os.path.join(_HERE, os.pardir, os.pardir, os.pardir, "data")
)

# Sub-directories of DATA_DIR named "input" and "output".
INPUT_DIR = os.path.abspath(os.path.join(DATA_DIR, "input"))
OUTPUT_DIR = os.path.abspath(os.path.join(DATA_DIR, "output"))
......@@ -4,7 +4,8 @@ import os
def setup_logger(name, log_file_path, level=logging.DEBUG):
log_dir, _ = os.path.split(log_file_path)
os.makedirs(log_dir, exist_ok=True)
if log_dir != "":
os.makedirs(log_dir, exist_ok=True)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
handler = logging.FileHandler(log_file_path)
......
......@@ -4,11 +4,11 @@ import re
import requests
from bs4 import BeautifulSoup
from keld.commons import TEST_DOCS_DIR
from keld.commons import INPUT_DIR
from tqdm import tqdm
DOC_HOMEPAGE = "https://www.un.org/en/sections/documents/general-assembly-resolutions/"
DOC_DIR = TEST_DOCS_DIR
DOC_DIR = INPUT_DIR
os.makedirs(DOC_DIR, exist_ok=True)
tqdm.monitor_interval = 0
......@@ -87,7 +87,7 @@ def _download_all_resolutions():
fname = re.findall("filename=(.+)", r.headers["Content-Disposition"])[0]
else:
fname = r.url.split("/")[-1]
with open(os.path.join(TEST_DOCS_DIR, fname), "wb") as f:
with open(os.path.join(INPUT_DIR, fname), "wb") as f:
f.write(r.content)
resolutions_to_download[k] = True
write_progress()
......
......@@ -6,19 +6,19 @@ import zipfile
from keld.aknschema import validate
from keld.body_parser import parse as parse_doc
from keld.commons import INPUT_DIR, OUTPUT_DIR
from keld.commons.logging import setup_logger
from keld.doc_scraper import download_everything
from tqdm import tqdm
from keld.commons import TEST_DOCS_DIR
from keld.commons.logging import setup_logger
here = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
tqdm.monitor_interval = 0
logger = setup_logger(f"keld", os.path.join(here, f"keld.log"))
logger = setup_logger(f"keld", "keld.log")
def parse(filepath, output_dir=None):
output_dir = output_dir or os.path.abspath(output_dir or "output")
output_dir = output_dir or OUTPUT_DIR
os.makedirs(output_dir, exist_ok=True)
doc = parse_doc(filepath)
doc.hierarchize()
......@@ -50,7 +50,7 @@ def parse(filepath, output_dir=None):
def parse_all(batch_no=None, output_dir=None):
docs_to_parse = os.listdir(TEST_DOCS_DIR)
docs_to_parse = os.listdir(INPUT_DIR)
batches = [
docs_to_parse[:150],
docs_to_parse[150:300],
......@@ -64,9 +64,9 @@ def parse_all(batch_no=None, output_dir=None):
pbar = tqdm(batch)
os.makedirs(output_dir or "out", exist_ok=True)
for i, docname in enumerate(pbar):
if not docname.endswith((".doc", ".DOC")):
if not docname.lower().endswith((".doc", ".docx")):
continue
docpath = os.path.join(TEST_DOCS_DIR, docname)
docpath = os.path.join(INPUT_DIR, docname)
try:
parse(docpath)
except KeyboardInterrupt:
......
from keld.inline_parser import parse_inlines
from keld.doc_handler.serializers import markup
def test_1():
    """Print the inline markup produced for several date-range phrasings.

    Each sample sentence is run through ``parse_inlines`` and the resulting
    references are rendered back onto the sentence with ``markup``; the
    output is printed for manual inspection (nothing is asserted).
    """
    date_range_samples = (
        "this is a range of dates: from 10 october 1990 to 20 october 1991",
        "this is a range of dates: from october 1990 to 20 october 1991",
        "this is a range of dates: from october 1990 to october 1991",
        "this is a range of dates: from 1990 to october 1991",
        "this is a range of dates: from 10 to 20 october 1991",
        "this is a range of dates: from 1990 to 20 october 1991",
        "this is a range of dates: from 1990 to 1991",
    )
    for sample in date_range_samples:
        found_refs = parse_inlines(sample)
        rendered = markup(sample, found_refs)
        print(rendered)


# Allow running this file directly as a quick manual check.
if __name__ == "__main__":
    test_1()
This diff is collapsed.
{
"73rd - 2018": [
"http://www.un.org/en/ga/73/resolutions.shtml",
true
],
"72nd - 2017": [
"http://www.un.org/en/ga/72/resolutions.shtml",
true
],
"71st - 2016": [
"http://research.un.org/en/docs/ga/quick/regular/71",
true
],
"70th - 2015": [
"http://research.un.org/en/docs/ga/quick/regular/70",
true
],
"69th - 2014": [
"http://research.un.org/en/docs/ga/quick/regular/69",
true
],
"68th - 2013": [
"http://research.un.org/en/docs/ga/quick/regular/68",
true
],
"67th - 2012": [
"http://research.un.org/en/docs/ga/quick/regular/67",
true
],
"66th - 2011": [
"http://research.un.org/en/docs/ga/quick/regular/66",
true
],
"65th - 2010": [
"http://research.un.org/en/docs/ga/quick/regular/65",
true
],
"64th - 2009": [
"http://research.un.org/en/docs/ga/quick/regular/64",
true
],
"63rd - 2008": [
"http://research.un.org/en/docs/ga/quick/regular/63",
true
],
"62nd - 2007": [
"http://research.un.org/en/docs/ga/quick/regular/62",
true
],
"61st - 2006": [
"http://research.un.org/en/docs/ga/quick/regular/61",
true
],
"60th - 2005": [
"http://research.un.org/en/docs/ga/quick/regular/60",
true
],
"59th - 2004": [
"http://research.un.org/en/docs/ga/quick/regular/59",
true
],
"58th - 2003": [
"http://research.un.org/en/docs/ga/quick/regular/58",
true
],
"57th - 2002": [
"http://research.un.org/en/docs/ga/quick/regular/57",
true
],
"56th - 2001": [
"http://research.un.org/en/docs/ga/quick/regular/56",
true
],
"55th - 2000": [
"http://research.un.org/en/docs/ga/quick/regular/55",
true
],
"54th - 1999": [
"http://research.un.org/en/docs/ga/quick/regular/54",
true
],
"53rd - 1998": [
"http://research.un.org/en/docs/ga/quick/regular/53",
true
],
"52nd - 1997": [
"http://research.un.org/en/docs/ga/quick/regular/52",
true
],
"51st - 1996": [
"http://research.un.org/en/docs/ga/quick/regular/51",
true
],
"50th - 1995": [
"http://research.un.org/en/docs/ga/quick/regular/50",
true
],
"49th - 1994": [
"http://research.un.org/en/docs/ga/quick/regular/49",
true
],
"48th - 1993": [
"http://research.un.org/en/docs/ga/quick/regular/48",
true
],
"47th - 1992": [
"http://research.un.org/en/docs/ga/quick/regular/47",
true
],
"46th - 1991": [
"http://research.un.org/en/docs/ga/quick/regular/46",
true
],
"45th - 1990": [
"http://research.un.org/en/docs/ga/quick/regular/45",
true
],
"44th - 1989": [
"http://research.un.org/en/docs/ga/quick/regular/44",
true
],
"43rd - 1988": [
"http://research.un.org/en/docs/ga/quick/regular/43",
true
],
"42nd - 1987": [
"http://research.un.org/en/docs/ga/quick/regular/42",
true
],
"41st - 1986": [
"http://research.un.org/en/docs/ga/quick/regular/41",
true
],
"40th - 1985": [
"http://research.un.org/en/docs/ga/quick/regular/40",
true
],
"39th - 1984": [
"http://research.un.org/en/docs/ga/quick/regular/39",
true
],
"38th - 1983": [
"http://research.un.org/en/docs/ga/quick/regular/38",
true
],
"37th - 1982": [
"http://research.un.org/en/docs/ga/quick/regular/37",
true
],
"36th - 1981": [
"http://research.un.org/en/docs/ga/quick/regular/36",
true
],
"35th - 1980": [
"http://research.un.org/en/docs/ga/quick/regular/35",
true
],
"34th - 1979": [
"http://research.un.org/en/docs/ga/quick/regular/34",
true
],
"33rd - 1978": [
"http://research.un.org/en/docs/ga/quick/regular/33",
true
],
"32nd - 1977": [
"http://research.un.org/en/docs/ga/quick/regular/32",
true
],
"31st - 1976": [
"http://research.un.org/en/docs/ga/quick/regular/31",
true
],
"30th - 1975": [
"http://research.un.org/en/docs/ga/quick/regular/30",
true
],
"29th - 1974": [
"http://research.un.org/en/docs/ga/quick/regular/29",
true
],
"28th - 1973": [
"http://research.un.org/en/docs/ga/quick/regular/28",
true
],
"27th - 1972": [
"http://research.un.org/en/docs/ga/quick/regular/27",
true
],
"26th - 1971": [
"http://research.un.org/en/docs/ga/quick/regular/26",
true
],
"25th - 1970": [
"http://research.un.org/en/docs/ga/quick/regular/25",
true
],
"24th - 1969": [
"http://research.un.org/en/docs/ga/quick/regular/24",
true
],
"23rd - 1968": [
"http://research.un.org/en/docs/ga/quick/regular/23",
true
],
"22nd - 1967": [
"http://research.un.org/en/docs/ga/quick/regular/22",
true
],
"21st - 1966": [
"http://research.un.org/en/docs/ga/quick/regular/21",
true
],
"20th - 1965": [
"http://research.un.org/en/docs/ga/quick/regular/20",
true
],
"19th - 1964": [
"http://research.un.org/en/docs/ga/quick/regular/19",
true
],
"18th - 1963": [
"http://research.un.org/en/docs/ga/quick/regular/18",
true
],
"17th - 1962": [
"http://research.un.org/en/docs/ga/quick/regular/17",
true
],
"16th - 1961": [
"http://research.un.org/en/docs/ga/quick/regular/16",
true
],
"15th - 1960": [
"http://research.un.org/en/docs/ga/quick/regular/15",
true
],
"14th - 1959": [
"http://research.un.org/en/docs/ga/quick/regular/14",
true
],
"13th - 1958": [
"http://research.un.org/en/docs/ga/quick/regular/13",
true
],
"12th - 1957": [
"http://research.un.org/en/docs/ga/quick/regular/12",
true
],
"11th - 1956": [
"http://research.un.org/en/docs/ga/quick/regular/11",
true
],
"10th - 1955": [
"http://research.un.org/en/docs/ga/quick/regular/10",
true
],
"9th - 1954": [
"http://research.un.org/en/docs/ga/quick/regular/9",
true
],
"8th - 1953": [
"http://research.un.org/en/docs/ga/quick/regular/8",
true
],
"7th - 1952": [
"http://research.un.org/en/docs/ga/quick/regular/7",
true
],
"6th - 1951": [
"http://research.un.org/en/docs/ga/quick/regular/6",
true
],
"5th - 1950": [
"http://research.un.org/en/docs/ga/quick/regular/5",
true
],
"4th - 1949": [
"http://research.un.org/en/docs/ga/quick/regular/4",
true
],
"3rd - 1948": [
"http://research.un.org/en/docs/ga/quick/regular/3",
true
],
"2nd - 1947": [
"http://research.un.org/en/docs/ga/quick/regular/2",
true
],
"1st - 1946": [
"http://research.un.org/en/docs/ga/quick/regular/1",
true
],
"Special Sessions": [
"http://research.un.org/en/docs/ga/quick/special",
true
],
"Emergency Special Sessions": [
"http://research.un.org/en/docs/ga/quick/emergency",
true
]
}
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment