Commit fa80fa41 authored by Biagio

minor hotfix + new patterns

parent dbe23692
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -194,7 +194,8 @@ class _Base:
classification = Reference(
"sdgGoal",
namespace="akn4un",
value=kw_attributes["value"],
href=f"#{eid}",
# value=kw_attributes["value"],
confidence=sdg_confidence,
name="SDGIO"
)
......@@ -210,7 +211,8 @@ class _Base:
tgt_classification = Reference(
"sdgTarget",
namespace="akn4un",
value=tgt_kw_attributes["value"],
href=f"#{eid}",
# value=tgt_kw_attributes["value"],
confidence=tgt_confidence,
name="SDGIO"
)
......
......@@ -10,9 +10,10 @@ from .serializers import etree_element, NS_MAP, markup
class DocContainer(_Base):
def __init__(self, doctype=None, authority=None, number=None, date=None, publication_date=None):
def __init__(self, doctype=None, authority=None, number=None, date=None, docname=None, publication_date=None):
super().__init__(_type="document")
self.doctype = doctype or DocTypes.RESOLUTION
self.docname = docname or "deliberation"
self.authority = authority or "xxxx"
self.number = number or "xx-xxxx"
self.date = date or "xxxx-xx-xx"
......@@ -60,7 +61,7 @@ class DocContainer(_Base):
@property
def base_uri(self):
return f"/akn/un/statement/{self.doctype}/{self.authority}/{self.date}/{self.number.replace('/', '-')}"
return f"/akn/un/statement/{self.docname}/{self.authority}/{self.date}/{self.number.replace('/', '-')}"
def nlp(self):
for attr in ["children", "cover_page", "preface", "preamble", "main_body", "conclusions", "authorial_notes"]:
......
{
"authority":[
"General Assembly",
"United Nations"
"United Nations",
"(?<!united )states"
],
"patterns": [
"{{authority}}"
......
......@@ -2,7 +2,10 @@
"resolution": [
"A(/RES)?/\\d+(/\\d+)?(/Corr\\.? ?\\d+)?( \\w)?\\b",
"A/C\\.\\d+/\\d+/SR\\.\\d+",
"(par(\\.|agraph)( n(\\.|umber|um))? ?\\d+( of( the)?)? )?res(\\.|olution)( n(\\.|umber|um))? ?\\d+/\\d{4}"
"(par(\\.|agraph)( n(\\.|umber|um))? ?\\d+( of( the)?)? )?res(\\.|olution)( n(\\.|umber|um))? ?\\d+/\\d{4}",
"resolutions? \\d{4}/\\d+ of {{date}}",
"treaty series,? vol\\.? 610, No\\.? \\d+",
"article \\d+, par(\\.|agraph) \\d+"
],
"patterns": [
"{{resolution}}"
......
......@@ -18,7 +18,7 @@ logger = setup_logger(f"keld", os.path.join(here, f"keld.log"))
def parse(filepath, output_dir=None):
output_dir = os.path.abspath(output_dir or "output")
output_dir = output_dir or os.path.abspath(output_dir or "output")
os.makedirs(output_dir, exist_ok=True)
doc = parse_doc(filepath)
doc.hierarchize()
......@@ -49,7 +49,7 @@ def parse(filepath, output_dir=None):
return valid, akn_filename
def parse_all(batch_no=None):
def parse_all(batch_no=None, output_dir=None):
docs_to_parse = os.listdir(TEST_DOCS_DIR)
batches = [
docs_to_parse[:150],
......@@ -62,7 +62,7 @@ def parse_all(batch_no=None):
else:
batch = docs_to_parse
pbar = tqdm(batch)
os.makedirs("out", exist_ok=True)
os.makedirs(output_dir or "out", exist_ok=True)
for i, docname in enumerate(pbar):
if not docname.endswith((".doc", ".DOC")):
continue
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment