Commit 30bea27e authored by francesco.sovrano

Bug fix: SDG Classifier

parent cb69b474
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>un-challange-2019</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.python.pydev.PyDevBuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.python.pydev.pythonNature</nature>
    </natures>
</projectDescription>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?eclipse-pydev version="1.0"?><pydev_project>
    <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
    <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
</pydev_project>
import os
import sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/lib")
import common as lib
import sdg_recognizer as sdg
import sdg_target_recognizer as tgt
sdg_model = sdg.SDGRecognizer(
    with_semantic_shifting=True,
    with_topic_scaling=True,
    with_document_log_length_scaling=True,
    with_stemmed_tfidf=True,
    cache_queries=True,
    log=False
)
tgt_model_list = [
    tgt.SDGTargetRecognizer(
        target_id=i,
        with_semantic_shifting=True,
        with_topic_scaling=True,
        with_document_log_length_scaling=True,
        with_stemmed_tfidf=True,
        cache_queries=True,
        log=False
    )
    for i in range(sdg_model.target_size)
]
# Build the similarity queries. The goal is to measure how similar each query is to the corpus.
queries = [
    u"He is hungry and needs nutrition, they need to help him.",
    u"the third and the second ship arriving from Moscow. That's for the SDG 4. He is hungry and needs nutrition, they need to help him.",
    u"We are going to tackle the Sustainable Development Goal 7.",
    u"We are going to tackle the seventh Sustainable Development Goal.",
    u"there many different countries trying to work together on something",
    u"sdg 4",
    u"4",
    u"hello everyone there something new to discuss, 4 new topics indeed",
    u"There are 17 Sustainable Development Goals.",
    u"resolution 70/1 of 25 September 2015, entitled “Transforming our world: the 2030 Agenda for Sustainable Development”, in which it adopted a comprehensive, far-reaching and people-centred set of universal and transformative 17 Sustainable Development Goals and targets, its commitment to working tirelessly for the full implementation of the Agenda by 2030, its recognition that eradicating poverty in all its forms and dimensions, including extreme poverty, is the greatest global challenge and an indispensable requirement for sustainable development, its commitment to achieving sustainable development in its three dimensions — economic, social and environmental — in a balanced and integrated manner, and to building upon the achievements of the Millennium Development Goals and seeking to address their unfinished business,",
    u"The 17 Sustainable Development Goals and 169 targets which we are announcing today demonstrate the scale and ambition of this new universal Agenda. They seek to build on the Millennium Development Goals and complete what they did not achieve. They seek to realize the human rights of all and to achieve gender equality and the empowerment of all women and girls. They are integrated and indivisible and balance the three dimensions of sustainable development: the economic, social and environmental.",
    u"*Notes with appreciation* that the preparatory process and the high-level segment of UNISPACE+50 resulted in documents aimed at articulating a comprehensive, inclusive and strategically oriented vision on strengthening international cooperation in the exploration and peaceful uses of outer space, in which space is seen as a major driver of and contributor to the achievement of the Sustainable Development Goals2 for the benefit of all countries;",
    u"*Noting* that, while considerable progress has been made over the past decade across all areas of development, the pace of progress observed in recent years is insufficient and uneven to fully meet the Sustainable Development Goals and targets by 2030, especially in the area of rural poverty eradication, "
]
# Compute the similarity of each query
for index, query in enumerate(queries):
    print('----------------------------------------')
    print('Query', index + 1)
    print('Content: "{}"\n'.format(query))
    # Get the SDG
    # Compute similarities
    tfidf_similarity, docvec_similarity, weighted_similarity, topic_similarity = sdg_model.get_query_similarity(query)
    #=======================================================================
    # print('topic_similarity for every sub-corpus', topic_similarity)
    # print('normalized combined similarities', weighted_similarity)
    # # Compute similarity rankings
    # tfidf_ranking = 1 + sdg_model.get_similarity_ranking(tfidf_similarity.flatten())
    # docvec_ranking = 1 + sdg_model.get_similarity_ranking(docvec_similarity.flatten())
    # final_ranking = 1 + sdg_model.get_similarity_ranking(weighted_similarity)
    # print('tf-idf ranking', tfidf_ranking)
    # print('docvec ranking', docvec_ranking)
    # print('final ranking', final_ranking)
    # print('')
    #=======================================================================
    index_list = sdg_model.get_index_of_most_similar_documents(weighted_similarity, threshold=0.75)
    print('SDG classification', index_list)
    # Get the target
    for idx in index_list:
        tgt_model = tgt_model_list[idx['index'] - 1]
        # Compute similarities
        tfidf_similarity, docvec_similarity, weighted_similarity, topic_similarity = tgt_model.get_query_similarity(query)
        #=======================================================================
        # print('topic_similarity for every sub-corpus', topic_similarity)
        # print('normalized combined similarities', weighted_similarity)
        # # Compute similarity rankings
        # tfidf_ranking = 1 + tgt_model.get_similarity_ranking(tfidf_similarity.flatten())
        # docvec_ranking = 1 + tgt_model.get_similarity_ranking(docvec_similarity.flatten())
        # final_ranking = 1 + tgt_model.get_similarity_ranking(weighted_similarity)
        # print('tf-idf ranking', tfidf_ranking)
        # print('docvec ranking', docvec_ranking)
        # print('final ranking', final_ranking)
        # print('')
        #=======================================================================
        # Classify query
        print('SDG {}, Target {}'.format(idx['index'], tgt_model.get_index_of_most_similar_documents(weighted_similarity, threshold=0.75)))
    print('----------------------------------------')
{
"roles": [
{
"name": "unsecretarygeneral",
"description": "Secretary-General"
},
{
"name": "faoCommissary",
"description": "Commissary"
},
{
"name": "memberNation",
"description": "Member Nation"
},
{
"name": "faocommissary",
"description": "Staff Commissary"
},
{
"name": "executiveDirector",
"description": "Executive Director"
},
{
"name": "supremeCommander",
"description": "Supreme Commander"
},
{
"name": "supremeCommanderForTheAlliedPowers",
"description": "Supreme Commander for the Allied Powers"
},
{
"name": "theSpecialRepresentativeOfTheDirector-general",
"description": "Special Representative of the Director-General"
},
{
"name": "unhighcommissioner",
"description": "United Nations High Commissioner"
},
{
"name": "presidentOfTheGeneralAssembly",
"description": "President of the General Assembly"
},
{
"name": "chairman",
"description": "Chairman"
},
{
"name": "permanentRepresentative",
"description": "Permanent Representative"
},
{
"name": "primeMinister",
"description": "Prime Minister"
},
{
"name": "executiveBoard",
"description": "Executive Board"
},
{
"name": "undpadministrator",
"description": "Administrator"
},
{
"name": "headsOfStateOrGovernment",
"description": "Heads of State or Government"
},
{
"name": "director-general",
"description": "Director-General"
},
{
"name": "faoDeputyDirectorGeneral",
"description": "Deputy Director-General"
},
{
"name": "assistantdirectorgeneral",
"description": "Assistant Directors-General"
},
{
"name": "president",
"description": "President"
},
{
"name": "externalAuditor",
"description": "External Auditor"
},
{
"name": "rapporteur",
"description": "Rapporteur"
},
{
"name": "unundersecretarygeneral",
"description": "Undersecretary"
},
{
"name": "director",
"description": "Director"
},
{
"name": "independentchairperson",
"description": "Independent Chairman"
},
{
"name": "executiveSecretary",
"description": "Executive Secretary"
},
{
"name": "chairperson",
"description": "Chairperson"
},
{
"name": "viceChairperson",
"description": "Vice-Chairpersons"
},
{
"name": "vicechairperson",
"description": "Vice-Chairman"
},
{
"name": "faostaffpensioncommitteemember",
"description": "Member"
},
{
"name": "faococ-ieechairperson",
"description": "Chair"
},
{
"name": "faococ-ieevicechairperson",
"description": "Vice-Chairs"
},
{
"name": "comptroller",
"description": "Comptroller"
},
{
"name": "auditorGeneral",
"description": "Auditor General"
},
{
"name": "permanentSecretary",
"description": "Permanent Secretary"
},
{
"name": "actingDirector-general",
"description": "Acting Director-General"
},
{
"name": "faoDirecteurDeCabinet",
"description": "Chef de Cabinet"
},
{
"name": "faoOfficerInCharge",
"description": "Officer-in-Charge"
},
{
"name": "specialRepresentative",
"description": "Special Representative"
},
{
"name": "memberStates",
"description": "Member States"
},
{
"name": "headsOfStateAndGovernment",
"description": "Heads of State and Government"
},
{
"name": "presidentOfTheAssembly",
"description": "President of the Assembly"
},
{
"name": "residentCoordinator",
"description": "resident coordinator"
},
{
"name": "residentRepresentative",
"description": "resident representative"
},
{
"name": "membersOfTheTaskForce",
"description": "members of the Task Force"
},
{
"name": "co-chairpersons",
"description": "Co-Chairperson"
},
{
"name": "governmentLegalAdvisers",
"description": "Government legal adviser"
},
{
"name": "secretariatPursuant",
"description": "Secretariat pursuant"
},
{
"name": "advisoryBodyOfExperts",
"description": "Advisory Body of Experts"
},
{
"name": "secretariat",
"description": "Secretariat"
},
{
"name": "seniorRepresentativesOfInternationalOrganizations",
"description": "senior representatives"
}
]
}
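For orientation, a minimal sketch of how a roles list like the one above could be turned into a name-to-description lookup; the file name roles.json and the printed role are illustrative assumptions, not part of the commit.

import json

# Load the roles list and index it by the "name" field.
# "roles.json" is a hypothetical file name used only for this sketch.
with open("roles.json", "r", encoding="utf-8") as f:
    roles = json.load(f)["roles"]
role_lookup = {role["name"]: role["description"] for role in roles}
print(role_lookup.get("presidentOfTheGeneralAssembly"))  # -> "President of the General Assembly"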
#!/usr/bin/python
import re
import random
import sys

class SentenceGenerator():
    def __init__(self, filename, markovLength=1):
        # These mappings can get fairly large -- they're stored on the instance to
        # save copying time.
        # (tuple of words) -> {dict: word -> number of times the word appears following the tuple}
        # Example entry:
        #   ('eyes', 'turned') => {'to': 2.0, 'from': 1.0}
        # Used briefly while first constructing the normalized self.mapping
        self.tempMapping = {}
        # (tuple of words) -> {dict: word -> *normalized* number of times the word appears following the tuple}
        # Example entry:
        #   ('eyes', 'turned') => {'to': 0.66666666, 'from': 0.33333333}
        self.mapping = {}
        # Contains the set of words that can start sentences
        self.starts = []
        self.markovLength = markovLength
        self.buildMapping(self.wordlist(filename))

    # We want to be able to compare words independently of their capitalization.
    def fixCaps(self, word):
        # Ex: "FOO" -> "foo"
        if word.isupper() and word != "I":
            word = word.lower()
        # Ex: "LaTeX" -> "Latex"
        elif word[0].isupper():
            word = word.lower().capitalize()
        # Ex: "wOOt" -> "woot"
        else:
            word = word.lower()
        return word

    # Tuples can be hashed; lists can't. We need hashable values for dict keys.
    # This looks like a hack (and it is, a little) but in practice it doesn't
    # affect processing time too negatively.
    def toHashKey(self, lst):
        return tuple(lst)

    # Returns the contents of the file, split into a list of words and
    # (some) punctuation.
    def wordlist(self, filename):
        with open(filename, 'r') as f:
            wordlist = [self.fixCaps(w) for w in re.findall(r"[\w']+|[.,!?;]", f.read())]
        return wordlist

    # Adds "word" to the "self.tempMapping" dict under "history".
    # self.tempMapping (and self.mapping) both match each word to a list of possible next
    # words.
    # Given history = ["the", "rain", "in"] and word = "Spain", we add "Spain" to
    # the entries for ["the", "rain", "in"], ["rain", "in"], and ["in"].
    def addItemToTempMapping(self, history, word):
        while len(history) > 0:
            first = self.toHashKey(history)
            if first in self.tempMapping:
                if word in self.tempMapping[first]:
                    self.tempMapping[first][word] += 1.0
                else:
                    self.tempMapping[first][word] = 1.0
            else:
                self.tempMapping[first] = {}
                self.tempMapping[first][word] = 1.0
            history = history[1:]

    # Builds and normalizes self.mapping.
    def buildMapping(self, wordlist):
        self.starts.append(wordlist[0])
        for i in range(1, len(wordlist) - 1):
            if i <= self.markovLength:
                history = wordlist[: i + 1]
            else:
                history = wordlist[i - self.markovLength + 1 : i + 1]
            follow = wordlist[i + 1]
            # If the last element was a period, add the next word to the start list
            if history[-1] == "." and follow not in ".,!?;":
                self.starts.append(follow)
            self.addItemToTempMapping(history, follow)
        # Normalize the values in self.tempMapping and put them into self.mapping
        for first, followset in self.tempMapping.items():
            total = sum(followset.values())
            # Normalizing here:
            self.mapping[first] = dict([(k, v / total) for k, v in followset.items()])

    # Returns the next word in the sentence (chosen randomly),
    # given the previous ones.
    def next(self, prevList):
        total = 0.0
        retval = ""
        index = random.random()
        # Shorten prevList until it's in self.mapping
        while self.toHashKey(prevList) not in self.mapping:
            prevList.pop(0)
        # Get a random word from self.mapping, given prevList
        for k, v in self.mapping[self.toHashKey(prevList)].items():
            total += v
            if total >= index and retval == "":
                retval = k
        return retval

    def genSentence(self):
        # Start with a random "starting word"
        curr = random.choice(self.starts)
        sent = curr.capitalize()
        prevList = [curr]
        # Keep adding words until we hit a period
        while curr not in ".":
            curr = self.next(prevList)
            prevList.append(curr)
            # If prevList has gotten too long, trim it
            if len(prevList) > self.markovLength:
                prevList.pop(0)
            if curr not in ".,!?;":
                sent += " "  # Add spaces between words (but not punctuation)
            sent += curr
        return sent
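A minimal usage sketch for the generator above, not part of the original file: build a chain from a plain-text corpus and print one sentence. The corpus path is hypothetical.

if __name__ == "__main__":
    # "corpus.txt" is a placeholder path for illustration only.
    generator = SentenceGenerator("corpus.txt", markovLength=2)
    print(generator.genSentence())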
@@ -86,7 +86,7 @@ def get_entities(text):
 def get_sdg(text):
     _, _, weighted_similarity, _ = sdg_model.get_query_similarity(text)
-    sdg_list = sdg_model.get_index_of_most_similar_documents(weighted_similarity, threshold=1.2)
+    sdg_list = sdg_model.get_index_of_most_similar_documents(weighted_similarity, threshold=0.85)
     result = []
     for sdg in sdg_list:
         tgt_model = tgt_model_list[sdg['index'] - 1]
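The commit's core change is this hunk, which lowers the selection threshold from 1.2 to 0.85. Purely as a reading aid, here is a sketch of the kind of relative-threshold filter a call like get_index_of_most_similar_documents could plausibly apply; the method's internals are not shown in this diff, so the scoring rule, the comparison against the mean score, and the 'index'/'similarity' keys are all assumptions.

import numpy as np

def select_most_similar(weighted_similarity, threshold):
    # Assumption: each score is compared against the mean score, so a threshold
    # above 1.0 keeps only clearly dominant matches and a lower threshold is more
    # permissive. This mirrors the observed behaviour, not the library's real code.
    scores = np.asarray(weighted_similarity).flatten()
    cutoff = threshold * scores.mean()
    return [
        {'index': i + 1, 'similarity': float(s)}  # 1-based, matching idx['index'] - 1 above
        for i, s in enumerate(scores)
        if s >= cutoff
    ]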
from keld import TEST_DOCS_DIR
from keld.commons import read_doc
import os

if __name__ == "__main__":
    for docname in os.listdir(TEST_DOCS_DIR):
        if not docname.lower().endswith(".doc"):
            continue
        # print("\n", docname, "\n")
        docpath = os.path.join(TEST_DOCS_DIR, "N1846610.DOC")
        doc_parts = read_doc(docpath)
        for part in doc_parts:
            main = doc_parts["word/document.xml"]
            os.makedirs("tmp", exist_ok=True)
            root, tail = os.path.split(part)
            os.makedirs(os.path.join("tmp", root), exist_ok=True)
            with open(os.path.join("tmp", part), "w") as f:
                f.write(str(doc_parts[part]))
        # break
from keld.cleaner import clean_doc
from keld.commons import read_doc
from keld import TEST_DOCS_DIR
from keld.serializer import Serializer
import os
from lxml import etree

if __name__ == "__main__":
    docpath = os.path.join(TEST_DOCS_DIR, "N1840398.DOC")
    doc = read_doc(docpath)
    doc = clean_doc(doc)
    serializer = Serializer()
    doc.hierarchize()
    doc.parse_inlines()
    akn = serializer.serialize(doc)
    print(etree.tostring(akn, pretty_print=True).decode("utf-8"))
from keld.cleaner import clean_doc
from keld.commons import read_doc
from keld import TEST_DOCS_DIR
from keld.serializer import Serializer
import os
from lxml import etree

if __name__ == "__main__":
    os.makedirs("out", exist_ok=True)
    for docname in os.listdir(TEST_DOCS_DIR):
        if not docname.lower().endswith(".doc"):
            continue
        print("###", docname)
        docpath = os.path.join(TEST_DOCS_DIR, docname)
        doc = read_doc(docpath)
        doc = clean_doc(doc)
        serializer = Serializer()
        for p in doc:
            for p0 in p:
                if p0.type not in ["paragraph", "SingleTxt"]:
                    print(f"<{p0.type}>", p0.text)
                    continue
        doc.hierarchize()
        doc.parse_inlines()
        akn = serializer.serialize(doc)
        fname = docname.lower().replace(".doc", ".xml")
        xml_string = etree.tostring(akn, pretty_print=True).decode("utf-8")
        with open(os.path.join("out", fname), "w") as f:
            f.write(xml_string)
from keld.commons import read_doc
from keld.cleaner import get_text
from keld import TEST_DOCS_DIR
import os

if __name__ == "__main__":
    os.makedirs("tmp2", exist_ok=True)
    for docname in os.listdir(TEST_DOCS_DIR):
        if not docname.lower().endswith(".doc"):
            continue
        print(f"\n##### {docname} #####\n")
        text = get_text(read_doc(os.path.join(TEST_DOCS_DIR, docname)))
        all_text = []
        for _, t in text:
            print(t)
            all_text.append(t)  # collect the text so the output file is not empty
        with open(os.path.join("tmp2", docname.replace(".DOC", ".txt")), "w") as f:
            f.write("\n".join(all_text))
from keld import load_document
from keld.ner_sdg import get_sdg
from pprint import pprint
import os
from keld import TEST_DOCS_DIR
from keld.body_parser import parse
from keld.commons import setup_logger
from keld.serializer import Serializer
from tqdm import tqdm

here = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
logger = setup_logger("tmp", os.path.join(here, "tmp.log"))
tqdm.monitor_interval = 0

if __name__ == "__main__":
    serializer = Serializer()
    pbar = tqdm(os.listdir(TEST_DOCS_DIR))
    ndoc = 5
    os.makedirs("out", exist_ok=True)
    for i, docname in enumerate(pbar):
        if not docname.endswith((".doc", ".DOC")):
            continue
        pbar.set_description(docname)
        docpath = os.path.join(TEST_DOCS_DIR, docname)
        try:
            doc = load_document(docpath)
            doc = doc[1][1]
            print('Doc:', doc)
            print('')
            pprint(get_sdg(doc))
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # Log the failure and continue with the next document.
            logger.exception(f"Fatal error for {docname}")
            tqdm.write(f">>>> Fatal error for {docname}: {e}")
        if i == ndoc:
            break
    pbar.close()
@@ -14,8 +14,6 @@ cobble==0.1.3
 cryptography==2.6.1
 cymem==2.0.2
 docutils==0.14
-en-core-web-md==2.1.0
-en-core-web-sm==2.1.0
 gensim==3.7.2
 idna==2.8