Commit 6c06d961 authored by Davide Liga

qualifier updated from master

parents c1cc6b04 19f2b03e
*.zip
*.xml
*.html
out/
tmp/
# Byte-compiled / optimized / DLL files
......
# UN Challenge 2019
## AKN4UN
### Installation
* requires Python 3.7+
* clone this repo
* cd to the repo
* create a virtual environment: `python3 -m venv venv`
* load the virtual environment: `source venv/bin/activate`
* install the dependencies: `pip install -r requirements.txt`
* install the spaCy model: `python -m spacy download en_core_web_md`
## Usage
* download all the documents: `python run.py --download`
* to parse one document: `python run.py --parse <filepath>`
* to parse all the documents: `python run.py --parseall`
* to use with a GUI: `python run.py --gui [--port port_no]`
(this returns *.akn zip archives, which are also saved locally in `keld/server/converted/`)
**All converted files are written to the `out` directory.**
## Troubleshooting
If you are experiencing import errors, export the `PYTHONPATH` as follows:
```export PYTHONPATH=${PYTHONPATH}:<full/path/to/the/repo/>```
# How it works
The first step of the conversion consists of loading the provided Word document
and converting it (or rather, its parts) into plain text.
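As a rough illustration of this step (assuming, as the parser code further down suggests, that `load_document` yields `(part_type, text)` pairs such as `"coverpage"`, `"main"` and `"footnotes"`):

```python
# Hedged sketch: assumes load_document yields (part_type, text) pairs,
# which is how the parse() function below consumes it.
from keld.doc_loader import load_document

for part_type, text in load_document("test_docs/example.docx"):  # hypothetical file name
    print(part_type, text[:80])
```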
The second step is parsing the text top to bottom, using pattern matching to
identify structural elements such as the document title and number, paragraphs,
sections, annexes and so on.
The pattern matching process uses [replus](https://pypi.org/project/replus/), which
provides a way to write modular, template-based, extensible regular expressions.
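As an illustration (not the repo's actual templates), a replus `Engine` can be built from a set of JSON pattern files and queried line by line; the match attributes used here (`type`, `value`, `end`) mirror how `structure_parser` uses them below:

```python
# Illustrative sketch only: the template directory is hypothetical and the
# Engine constructor arguments may differ from the real setup.
from replus import Engine

engine = Engine("keld/structure_parser/patterns")  # hypothetical template directory

line = "1. Decides to remain actively seized of the matter;"
m = engine.search(line)
if m is not None:
    print(m.type, m.value)        # e.g. "numbers", "1."
    print(line[m.end:].lstrip())  # the text following the matched number
```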
Depending on the result of the pattern matching, the text is mapped as-is into
objects that act as proxies for Akoma Ntoso XML generation.
Before the objects are appended, the text is qualified via a `paragraph_qualifier`,
whose job is to determine whether the text represents a preambular or an operational
element; the text is then appended accordingly.
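A condensed sketch of that qualification step, based on the `qualify_paragraph` helper and its call sites visible in the diff below (the import path is assumed):

```python
# Condensed sketch; qualify_paragraph returns the qualification, the span of
# the trigger word and its normalized name, as the helper below shows.
from keld.structure_parser.helpers import qualify_paragraph  # assumed module path

text = "Recalling its resolution 70/1 of 25 September 2015,"
qualification, (start, end), term = qualify_paragraph(text)

if qualification == "preambular":
    refers_to = "#preamble"    # element-level attribute, as in parse_main below
elif qualification == "operational":
    refers_to = "operational"
# a ("term", (start, end), attributes) triple is also appended to the element's
# inlines so the trigger word is marked up as an AKN <term>
```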
Once all the objects are appended, a downward recursive algorithm ensures that
every element is placed according to its hierarchical rank (e.g. if a `section`
and a `paragraph` happen to be siblings, the latter is made a child of the former).
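A minimal sketch of such a pass, assuming a simple numeric rank per tag (the real implementation walks the document parts and uses its own `HIERARCHY`/`SIBLINGS` tables, see `doc_container` below):

```python
# Minimal sketch of downward hierarchization; the ranking is illustrative only.
from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    tag: str
    children: List["Node"] = field(default_factory=list)

RANK = {"section": 1, "paragraph": 2, "point": 3}

def hierarchize(children):
    result = []
    for element in children:
        # a lower-ranked element following a higher-ranked sibling becomes its child
        if result and RANK[result[-1].tag] < RANK[element.tag]:
            result[-1].children.append(element)
        else:
            result.append(element)
    for element in result:
        element.children = hierarchize(element.children)  # recurse downwards
    return result

# a paragraph and a point that follow a section end up nested under it
top = hierarchize([Node("section"), Node("paragraph"), Node("point")])
```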
The next step is to generate the eIds of the objects correctly (an eId may carry
a prefix depending on its parent(s)).
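A minimal sketch of the prefixing scheme, following the `parent_eid__own_eid` convention visible in `_Base.eid` below (the tag-to-abbreviation map is an illustrative subset):

```python
import re

EID_MAP = {"section": "sec", "paragraph": "para"}  # illustrative subset

def build_eid(parent_eid, tag, num):
    # sketch only; the real logic lives in _Base.eid / _build_eids below
    base = EID_MAP.get(tag, tag)
    own = re.sub(r"\W", "", f"{base}_{num}")
    return f"{parent_eid}__{own}" if parent_eid else own

build_eid(None, "section", "I")        # -> 'sec_I'
build_eid("sec_I", "paragraph", "1.")  # -> 'sec_I__para_1'
```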
Once the structure is in place, pattern matching and machine learning algorithms
can be run to identify all inline elements, such as dates, references, roles,
organizations and so on.
The inline pattern matching also uses [replus](https://pypi.org/project/replus/);
the match objects are passed through a series of `resolvers`, which extract the
metadata and build the attributes and the corresponding Top-Level Class entry to
be added to the AKN `references`.
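As a hedged example of what a resolver can produce (the names here are hypothetical; the real resolvers live in `inline_parser`), the output is the same `(tag, (start, end), attributes)` triple the parser appends to an element's inlines, plus an entry registered under the document's `references`:

```python
# Hypothetical resolver: turn a matched span into inline attributes plus a
# Top-Level Class entry for the AKN <references> block.
def resolve_organization(text, span, document):
    start, end = span
    show_as = text[start:end]      # e.g. "the General Assembly"
    eid = "unitedNations"          # illustrative eId
    document.references[eid] = {
        "tag": "TLCOrganization",
        "eId": eid,
        "href": f"/akn/ontology/organizations/un/{eid}",
        "showAs": show_as,
    }
    return "organization", span, {"refersTo": f"#{eid}"}
```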
In addition to regexes, inline elements are also recognized using spaCy with some
customized NER.
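In its simplest form the statistical side is just the `en_core_web_md` pipeline run over the paragraph text; the project layers its own customization on top, which is not reproduced here:

```python
# Plain spaCy pass; the repo adds customized NER on top of this.
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("Recalling its resolution 70/1 of 25 September 2015 on the 2030 Agenda,")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. dates, organizations, laws
```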
Once the structure and the inlines are done, another spaCy-powered custom
algorithm identifies SDGs together with their targets and respective indicators.
The results are mapped into AKN `keywords`, `references` and custom namespaced
(akn4un) elements that link them to their corresponding elements.
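One way to approximate that step is word-vector similarity between the paragraph and the SDG goal descriptions, which the `md` model supports out of the box; the project's own `get_sdg` (see the diff below) returns goal, target and indicator entries carrying `id` and `similarity` fields. A hedged sketch of the similarity idea, with made-up goal descriptions:

```python
# Hedged approximation of SDG detection via vector similarity; the goal
# descriptions are illustrative, not the project's data.
import spacy

nlp = spacy.load("en_core_web_md")
GOALS = {
    "4": "Ensure inclusive and equitable quality education for all",
    "13": "Take urgent action to combat climate change and its impacts",
}

paragraph = nlp("Reaffirming the importance of quality education and lifelong learning,")
scores = {gid: paragraph.similarity(nlp(desc)) for gid, desc in GOALS.items()}
best_goal = max(scores, key=scores.get)  # mapped to keywords/references downstream
```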
The last step simply consists of writing the AKN to an XML file and validating it.
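A usage sketch of the validator added in this commit (the module path and output file name are assumed):

```python
from keld.validator import validate  # assumed module path

with open("out/A_RES_73_1.xml", "rb") as f:  # hypothetical output file
    akn_bytes = f.read()

is_valid, report = validate(akn_bytes, remote=False)  # remote=True posts to the validation endpoint
if not is_valid:
    for error in report["errors"]:
        print(error)
```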
import os
from .cleaner import get_text
from .doc_handler.doc_reader import read_doc
TEST_DOCS_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, "test_docs"))
def load_document(docpath):
    doc_parts = read_doc(docpath)
    return get_text(doc_parts)
from .main import download_all, parse, parse_all
import requests
import json
import traceback
from lxml import etree
import os
VALIDATION_ENDPOINT = "http://sinatra.cirsfid.unibo.it/node/akn-validation/Validate/"
here = os.path.dirname(os.path.abspath(__file__))
SCHEMA_PATH = os.path.join(here, "akomantoso30.xml")
akn_schema = etree.parse(SCHEMA_PATH)
schema = etree.XMLSchema(akn_schema)
def remote_validate(akn_string):
    r = requests.post(
......@@ -10,12 +18,29 @@ def remote_validate(akn_string):
        json={"source": akn_string},
        timeout=30
    )
    try:
        if r.status_code == 200:
            validation = json.loads(r.text)
            # validation_xml = badgerfish.etree(validation, root=etree.Element("validation"))
            # is_valid = validation["success"]
            # validation_xml = validation_xml
            return validation["success"], validation
        else:
            return False, dict(success="unknown", errors=[dict(message=f"Could not validate from remote ({r.status_code}): {r.text}")])
    except:
        return False, dict(success="unknown", errors=[dict(message=traceback.format_exc())])
def local_validate(akn):
    valid = schema.validate(akn)
    if not valid:
        validation = dict(success=False, errors=[str(err) for err in schema.error_log])
        return False, validation
    return True, {"success": True}
def validate(akn_string, remote=False):
    if remote:
        return remote_validate(akn_string)
    else:
        return local_validate(etree.fromstring(akn_string))
This diff is collapsed.
from keld.doc_handler.doc_container import (
    DocContainer, DocNumber, DocTitle, Session, AgendaItem, Formula, Paragraph, Point, Container
)
from keld.commons.helpers import normalize_name
TYPE_MAP = {
    "doc_num": DocNumber,
......@@ -12,7 +13,7 @@ TYPE_MAP = {
}
def qualify(text):
def qualify_paragraph(text):
    words = text.split()
    qualification = None
    start = end = 0
......@@ -28,4 +29,4 @@ def qualify(text):
            end = start + len(w)
            break
    return qualification, (start, end)
    return qualification, (start, end), normalize_name(text[start:end])
import os
import re
from keld import load_document
from keld.commons.helpers import to_camel_case
from keld.doc_handler.doc_container import (
    DocContainer, DocNumber, DocTitle, Session, AgendaItem, Formula,
    Paragraph, Point, Container, P, Section, Heading
    Paragraph, Point, Container, P, Section, Heading, AuthorialNote
)
from keld.doc_handler.helpers import normalize_name, AUTHORITY_MAP
from keld.inline_parser import parse_inlines
from replus import Engine
from unidecode import unidecode
from .helpers import qualify
from keld.qualifier import qualifier
from keld.doc_loader import load_document
from .helpers import qualify_paragraph
here = os.path.dirname(os.path.abspath(__file__))
......@@ -29,12 +29,15 @@ def parse(docpath):
parse_coverpage(text, document)
if type_ == "main":
parse_main(text, document)
if type_ == "footnotes":
parse_footnotes(text, document)
return document
def parse_main(main_text, document):
previous_type = None
previous_value = None
attachments = document.attachments
target = document.cover_page
last_element = None
for _p in main_text.split("\n"):
......@@ -43,6 +46,7 @@ def parse_main(main_text, document):
continue
m = engine.search(p)
if m is not None:
qualification, (qstart, qend), qterm = qualify_paragraph(p[m.end:].lstrip()) # qualifier.qualify_paragraph(p)[0]
if m.type == "numbers":
first_group = m.groups()[0].key
if first_group == "doc_num":
......@@ -54,20 +58,56 @@ def parse_main(main_text, document):
target = document.main_body
num = m.value
e = Paragraph(p[m.end:].lstrip(), num=num, parent=document)
if qualification == "operational":
attributes = {
"refersTo": f"#{qterm}",
"$reference": {
"tag": "TLCTerm",
"eId": "operational",
"href": f"/akn/ontology/terms/un/{qterm}",
"showAs": "Operational"
}
}
e.inlines.append(("term", (qstart, qend), attributes))
e.attributes["refersTo"] = "operational"
elif qualification == "preambular":
show_as = p[qstart:qend]
attributes = {
"refersTo": f"#{qterm}",
"$reference": {
"tag": "TLCTerm",
"eId": qterm,
"href": f"/akn/ontology/terms/un/{qterm}",
"showAs": show_as
}
}
e.inlines.append(("term", (qstart, qend), attributes))
e.attributes["refersTo"] = "#preamble"
target.append(e)
last_element = e
elif first_group == "point_num":
num = m.value
if last_element is not None and last_element.tag == "paragraph":
if last_element is not None and last_element.tag in ["paragraph", "container"]:
e = Point(text=p[m.end:].lstrip(), num=num, parent=last_element)
last_element.append(e)
else:
target = document.main_body
e = Paragraph(text=p[m.end:].lstrip(), num=num, parent=document)
target.append(e)
elif first_group == "act_num":
pass # Todo
else:
raise ValueError(f"Unknown number: {m.value}")
elif m.type == "annex":
annex_doc = DocContainer(
doctype="annex",
authority=document.authority,
date=document.date,
)
attachments.append(annex_doc)
annex_doc.eid = f"annex_{len(attachments)}"
document = annex_doc
else:
if m.type != "roman":
if m.type == "session_number":
......@@ -137,25 +177,41 @@ def parse_main(main_text, document):
section = Section(num=previous_value, parent=document)
heading = Heading(p, parent=section)
section.append(heading)
target = document.main_body
target.append(section)
else:
e = Container(p, parent=document)
qualification, (start, end) = qualify(p)
if qualification is not None:
tag = "term"
qualification, (start, end), term = qualify_paragraph(p) # qualifier.qualify_paragraph(p)[0]
if qualification == "preambular":
show_as = p[start:end]
name = normalize_name(show_as)
attributes = {
"refersTo": f"#{name}",
"refersTo": f"#{term}",
"$reference": {
"tag": "TLCTerm",
"eId": name,
"href": f"/akn/ontology/terms/un/{name}",
"eId": term,
"href": f"/akn/ontology/terms/un/{term}",
"showAs": show_as
}
}
e.inlines.append((tag, (start, end), attributes))
e.attributes["name"] = name
e = Container(p, parent=document)
e.inlines.append(("term", (start, end), attributes))
e.attributes["name"] = term
elif qualification == "operational":
attributes = {
"refersTo": f"#{term}",
"$reference": {
"tag": "TLCTerm",
"eId": "operational",
"href": f"/akn/ontology/terms/un/{term}",
"showAs": "Operational"
}
}
target = document.main_body
e = Paragraph(p, parent=document, refersTo=qualification)
e.inlines.append(("term", (start, end), attributes))
else:
target = document.main_body
e = Paragraph(p, parent=document)
last_element = e
target.append(e)
previous_type = None
previous_value = p
......@@ -207,3 +263,8 @@ def parse_coverpage(text, document):
)
e.inlines.append((tag, offset, attributes))
target.append(e)
def parse_footnotes(footnotes, document):
    for note_id, text in footnotes:
        document.authorial_notes.append(AuthorialNote(note_id, text=text, parent=document, placement="bottom"))
{
    "patterns": [
        "^annex *$"
    ]
}
\ No newline at end of file
......@@ -3,6 +3,6 @@
"\\[.+\\]"
],
"patterns": [
"^{{subheading}}$"
"^{{subheading}} *$"
]
}
\ No newline at end of file
from .doc_to_text import get_text
import os
TEST_DOCS_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir, "test_docs"))
import re
from lxml import etree
AUTHORITY_MAP = {
    "generalAssembly": "unga",
    "theGeneralAssembly": "unga",
    "unitedNations": "un",
    "theUnitedNations": "un"
}
def to_camel_case(text):
    text = text.lower()
......@@ -10,3 +18,9 @@ def to_camel_case(text):
def etree_to_string(xml):
    return etree.tostring(xml, encoding="utf-8", method="xml", pretty_print=True).decode("utf-8")
def normalize_name(text):
    atcc = to_camel_case(text)
    return AUTHORITY_MAP.get(atcc, atcc)
from .constants import Qualifications, Types
from .doc_container import DocContainer
from .doc_reader import read_doc
......@@ -6,7 +6,7 @@ from keld.ner_sdg import get_sdg
from .helpers import ner_entities
from .hierarchy import HIERARCHY, SIBLINGS
from .serializers import etree_element, markup
from .serializers import etree_element, markup, note_pattern
EID_MAP = {
"section": "sec",
......@@ -39,6 +39,12 @@ class _Base:
self.__children.remove(element)
except ValueError:
pass
if self.__class__.__name__ == "DocContainer":
for attr in self.components:
try:
self.components[attr].remove(element)
except ValueError:
pass
@property
def eid(self):
......@@ -46,6 +52,7 @@ class _Base:
parent_eid = self.parent.eid
if parent_eid is not None and self._eid is not None:
return f"{parent_eid}__{self._eid}"
# assert self._eid is not None, "here!"
return self._eid
@eid.setter
......@@ -92,7 +99,7 @@ class _Base:
def hierarchize(self):
if self.__class__.__name__ == "DocContainer":
for attr in ["cover_page", "preface", "preamble", "main_body", "conclusions"]:
for attr in ["cover_page", "preface", "preamble", "main_body", "conclusions", "attachments"]:
self._hierarchize(attr)
else:
self._hierarchize("children")
......@@ -101,15 +108,17 @@ class _Base:
eid_counter = Counter()
for element in getattr(self, attr):
if not hasattr(element, "num"):
element.build_eids()
continue
eid_counter[element.tag] += 1
eid_base = EID_MAP.get(element.tag, element.tag)
eid_num = element.num or eid_counter[element.tag]
element.eid = re.sub(r"\W$", "", f"{eid_base}_{eid_num}")
eid_num = element.num or f"{attr}_pg{eid_counter[element.tag]}"
element.eid = re.sub(r"\W", "", f"{eid_base}_{eid_num}")
element.build_eids()
def build_eids(self):
if self.__class__.__name__ == "DocContainer":
for attr in ["cover_page", "preface", "preamble", "main_body", "conclusions"]:
for attr in ["cover_page", "preface", "preamble", "main_body", "conclusions", "attachments"]:
self._build_eids(attr)
else:
self._build_eids("children")
......@@ -161,14 +170,14 @@ class _Base:
c.nlp()
def get_sdg(self, document):
for sdg, target_list in get_sdg(self.text):
sdg_id = sdg['index']
sdg_confidence = sdg['similarity']
for goal_list, target_list, indicator_list in get_sdg(self.text):
sdg_id = goal_list['id']
sdg_confidence = goal_list['similarity']
eid = f"keyword_{len(document.classification) + 1}"
reference_id = f"concept_sdg_{sdg_id}"
kw_attributes = dict(
eId=eid,
dictionary="SDGO",
dictionary="SDGIO",
value=f"goal_{sdg_id}",
href=f"/akn/ontology/concepts/un/sdg_{sdg_id}",
showAs=f"SDG {sdg_id}",
......@@ -183,32 +192,47 @@ class _Base:
document.references[reference_id] = Reference("TLCConcept", **ref_attributes)
sdg_source = Reference("source", namespace="akn4un", href=f"#{self.eid}")
classification = Reference(
"classification",
"sdgGoal",
namespace="akn4un",
value=kw_attributes["value"],
confidence=sdg_confidence,
name="SDGO"
name="SDGIO"
)
sdg_source.children.append(classification)
for tgt in target_list:
tgt_id = tgt['index']
tgt_id = tgt['id']
tgt_confidence = tgt['similarity']
tgt_kw_attributes = {k: f"{v}_{tgt_id}" for k, v in kw_attributes.items() if k != "dictionary"}
tgt_kw_attributes["dictionary"] = "SDGO"
tgt_kw_attributes["dictionary"] = "SDGIO"
tgt_ref_attributes = {k: f"{v}_{tgt_id}" for k, v in ref_attributes.items()}
document.classification[tgt_kw_attributes["eId"]] = Reference("keyword", **tgt_kw_attributes)
document.references[tgt_ref_attributes["eId"]] = Reference("TLCConcept", **tgt_ref_attributes)
sub_classification = Reference(
"subClassification",
tgt_classification = Reference(
"sdgTarget",
namespace="akn4un",
value=tgt_kw_attributes["value"],
confidence=tgt_confidence,
name="SDGO"
name="SDGIO"
)
sdg_source.children.append(tgt_classification)
for ind in indicator_list:
ind_id = ind['id']
ind_confidence = ind['similarity']
ind_kw_attributes = {k: f"{v}_{ind_id}" for k, v in kw_attributes.items() if k != "dictionary"}
ind_kw_attributes["dictionary"] = "SDGIO"
ind_ref_attributes = {k: f"{v}_{ind_id}" for k, v in ref_attributes.items()}
document.classification[ind_kw_attributes["eId"]] = Reference("keyword", **ind_kw_attributes)
document.references[ind_ref_attributes["eId"]] = Reference("TLCConcept", **ind_ref_attributes)
ind_classification = Reference(
"sdgIndicator",
namespace="akn4un",
value=ind_kw_attributes["value"],
confidence=ind_confidence,
name="SDGIO"
)
sdg_source.children.append(sub_classification)
break
sdg_source.children.append(ind_classification)
document.sdg_sources.append(sdg_source)
break
class Base(_Base):
......@@ -222,39 +246,44 @@ class Base(_Base):
def serialize(self, from_=None):
self.inlines.sort(key=lambda x: x[1][0])
if from_ == "mainBody" and self.tag == "container":
my_node = etree_element("crossHeading", refersTo="#preamble")
if self.tag == "point" and self.parent.tag == "container":
my_node = etree_element("item")
self.content = False
elif self.tag == "heading":
return markup(self.text, self.inlines, root="heading")
else:
my_node = etree_element(self.tag)
if self.eid:
my_node.set("eId", self.eid)
my_node.set("eId", f"{self.eid}")
if self.num:
num = etree_element("num")
num.text = re.sub(r"\W$", r"", self.num)
num.text = self.num
my_node.append(num)
if self.content:
if self.content and len(self.children) == 0:
content = etree_element("content")
my_node.append(content)
else:
content = my_node
if len(self.children) == 0:
if self.text != "":
p = markup(self)
p = markup(self.text, self.inlines)
content.append(p)
else:
if self.text != "":
if self.text.endswith(":"):
tag = "intro"
if self.text.strip().endswith(":"):
intro = etree_element("intro")
intro.append(markup(self.text, self.inlines))
else:
tag = "alinea"
p = markup(self, root=tag)
content.append(p)
alinea = etree_element("intro")
_content = etree_element("content")
_content.append(markup(self.text, self.inlines))
intro = alinea
_l = etree_element("list")
_l.append(intro)
content.append(_l)
else:
_l = content
content = _l
for c in self.children:
_l.append(c)
content.append(c.serialize(from_=self.tag))
for k, v in self.attributes.items():
if v is None:
......@@ -275,7 +304,7 @@ class RBase(_Base):
def serialize(self, from_=None):
self.inlines.sort(key=lambda x: x[1][0])
my_node = etree_element("p")
tag = markup(self, self.tag)
tag = markup(self.text, self.inlines, root=self.tag)
my_node.append(tag)
for k, v in self.attributes.items():
if v is None:
......@@ -297,6 +326,7 @@ class Reference:
for k, v in self.attributes.items():
if v is None:
v = ""
v = re.sub(note_pattern, "", f"{v}")
my_node.set(k, f"{v}")
for c in self.children:
my_node.append(c.serialize())
......
from collections import Counter
from lxml import etree
from .base import _Base, Base, RBase, Reference
from .constants import Types, DocTypes
from .serializers import create_meta
from .constants import DocTypes
from .serializers import create_meta, akn_string
from .serializers import etree_element, NS_MAP, markup
from collections import Counter
class DocContainer(_Base):
......@@ -21,6 +22,8 @@ class DocContainer(_Base):
self.preamble = []
self.main_body = []
self.conclusions = []
self.attachments = []
self.authorial_notes = []
self.ref_counter = Counter()
self.classification = {}
self.references = dict(
......@@ -51,32 +54,20 @@ class DocContainer(_Base):
"preface": self.preface,
"preamble": self.preamble,
"mainBody": self.main_body,
"conclusions": self.conclusions
"conclusions": self.conclusions,
"attachments": self.attachments
}
@property
def base_uri(self):
return f"/akn/un/statement/{self.doctype}/{self.authority}/{self.date}/{self.number.replace('/', '-')}"
def __str__(self):
my_str = "\n".join(str(c) for c in self.__children)
return my_str