From 66213c856ce6409a29db2e44e3357b16e53dfa45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anna=20Vernerov=C3=A1?= <1109323-Ansa211@users.noreply.gitlab.com> Date: Fri, 11 Sep 2020 12:41:01 +0000 Subject: [PATCH] Pos (attribute capturing the part of speech) --- CHANGELOG.rst | 4 +- doc/examples/compute-slovko.py | 6 +- .../scripts/dynamic_properties/misc_info.py | 63 ++++++++++++------- vallex/scripts/mapreduce.py | 4 +- vallex/scripts/mapreducers/slovko.py | 20 +++--- vallex/scripts/transforms/add_valdiff.py | 2 +- 6 files changed, 57 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index df79f6f..f984a76 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -235,7 +235,7 @@ User Interface (ui) - Set the window icon / favicon. [Jonathan L. Verner] - Fix search not working on windows. [Jonathan L. Verner] - The query user enters is anded with a query constructed from the + The query user enters is added to a query constructed from the pathnames of the lexicons to search through. On windows, these pathnames typically contain '\' characters, which have special meaning in a regular expression. We must therefore escape them (and others). @@ -309,7 +309,7 @@ Library (lib) - Parse derivedN, Numbered attributes + attributes with refs. [Anša Vernerová] - Rename ParamFunctor to ValencySlot and Scope to FunctorCombination. [Anša Vernerová] -- Rename noun, verb, computed reflexive attrs to IsNoun, isVerb, isReflexverb. +- Rename noun, verb, computed reflexive attrs to isNoun, isVerb, isReflexverb. [Anša Vernerová] - Don't crash if frame has bad format. [Jonathan L. Verner] - When saving, do not put additional newlines after lexemes. [Jonathan diff --git a/doc/examples/compute-slovko.py b/doc/examples/compute-slovko.py index 30a9166..534a32b 100644 --- a/doc/examples/compute-slovko.py +++ b/doc/examples/compute-slovko.py @@ -62,7 +62,7 @@ noun_forms_typical = defaultdict(lambda: {'prod': [], 'noprod': []}) # type: ig noun_forms_special = defaultdict(lambda: {'prod': [], 'noprod': []}) # type: ignore for lu in lexicon_coll.lexical_units: - if not lu.dynamic_attrs['isNoun']._data or not 'valdiff' in lu.attribs: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun')) or not 'valdiff' in lu.attribs: continue derived = lu.attribs.get('derivedV', {})._data.get('ids', []) # type: ignore if not derived: @@ -76,8 +76,8 @@ for lu in lexicon_coll.lexical_units: spec = lu.attribs['valdiff'] - type_ = 'prod' if lu.dynamic_attrs['productive']._data else 'noprod' - class_ = lu.attribs['class']._data.strip().strip('?').split(' ')[0].split('/')[0].strip() if 'class' in lu.attribs else 'unspecified' + type_ = 'prod' if lu.dynamic_attrs['pos']._data == 'stem noun' else 'noprod' + class_ = lu.attribs['class']._data[0].strip().strip('?').split(' ')[0].split('/')[0].strip() if 'class' in lu.attribs else 'unspecified' verb_forms[class_][type_].extend(sum([verb.match_key_values(['frame', funct, 'forms']) for funct in ACTANT_FUNCTORS], [])) diff --git a/vallex/scripts/dynamic_properties/misc_info.py b/vallex/scripts/dynamic_properties/misc_info.py index 77fa50b..14df240 100644 --- a/vallex/scripts/dynamic_properties/misc_info.py +++ b/vallex/scripts/dynamic_properties/misc_info.py @@ -1,34 +1,49 @@ +import logging +from vallex.log import log +import sys +import os # for better debugging + import re from vallex import Attrib - -def compute_isNoun(lu): - attr = Attrib('isNoun', dynamic=True, help='Is it a noun (True/False)?', data='blu-n' in lu._id) - lu.dynamic_attrs[attr.name] = attr - - -def compute_isVerb(lu): - attr = Attrib('isVerb', dynamic=True, help='Is it a verb (True/False)?', data='blu-v' in lu._id) +PDTVALLEX_POS_MAPPING = { + 'V': 'verb', + 'N': 'stem noun', + 'Nx': 'root noun', + 'A': 'adjective', + 'D': 'adverb' +} + + +def compute_pos(lu): + attr = Attrib('pos', dynamic=True, help='Detailed part of speech (verb / stem noun / root noun)') + attr._data = 'unknown' + if 'blu-v' in lu._id: + attr._data = 'verb' + elif 'blu-n' in lu._id: + attr._data = 'stem noun' + for var in ['', '1', '2', '3', '4']: + if 'no-aspect'+var in lu.lemma._data.keys(): + attr._data = 'root noun' + break + elif 'v-w' in lu._id: + attr._data = PDTVALLEX_POS_MAPPING[lu.lemma._data.keys()[0]] lu.dynamic_attrs[attr.name] = attr -REFLVERB_RGX = re.compile(r'.*T[12]?\s+S[IE].*') -"""A regexp for recognizing reflexive verbs from the id of their parent lexeme.""" +REFLEXIVE_RGX = re.compile(r'.*\s+\bs[ei]\d?\b\s*$') +"""A regexp for recognizing reflexive lemmas.""" +OPT_REFLEXIVE_RGX = re.compile(r'.*\s+\(s[ei]\d?\)\s*$') +"""A regexp for recognizing optionally reflexive lemmas.""" -def compute_isReflexverb(lu): - attr = Attrib('isReflexverb', dynamic=True, help='Is it a reflexive verb (True/False)?') - attr._data = bool(lu._parent and REFLVERB_RGX.match(lu._parent._id)) +def compute_isReflexive(lu): + attr = Attrib('isReflexive', dynamic=True, help='Is the lemma reflexive (always/optionally/never)?') + if sum(1 for val in lu.lemma._data.values() if REFLEXIVE_RGX.match(val)) > 0: + attr._data = 'always' + elif sum(1 for val in lu.lemma._data.values() if OPT_REFLEXIVE_RGX.match(val)) > 0: + attr._data = 'optionally' + else: + attr._data = 'never' lu.dynamic_attrs[attr.name] = attr - - -def compute_productive(lu): - if 'blu-n' in lu._id: - attr = Attrib('productive', dynamic=True, help='Is it a productive noun (True/False)?') - attr._data = True - for var in ['', '1', '2', '3', '4']: - if 'no-aspect'+var in lu.lemma._data.keys(): - attr._data = False - break - lu.dynamic_attrs[attr.name] = attr diff --git a/vallex/scripts/mapreduce.py b/vallex/scripts/mapreduce.py index a7eec40..54c1d13 100644 --- a/vallex/scripts/mapreduce.py +++ b/vallex/scripts/mapreduce.py @@ -30,9 +30,9 @@ from vallex.scripts.mapreduce import emit def map_functor_count(lu): - if lu.isNoun == ['True']: + if lu.dynamic_attrs['pos']._data in ('stem noun','root noun'): emit(('noun',), len(lu.frame.functor)) - elif lu.isVerb == ['True']: + elif lu.dynamic_attrs['pos']._data == 'verb': emit(('verb',), len(lu.frame.functor)) def reduce_functor_count(key, resuts): diff --git a/vallex/scripts/mapreducers/slovko.py b/vallex/scripts/mapreducers/slovko.py index 4bc494e..a5f51be 100644 --- a/vallex/scripts/mapreducers/slovko.py +++ b/vallex/scripts/mapreducers/slovko.py @@ -20,11 +20,11 @@ from vallex.scripts.mapreduce import emit @requires('collection') def map_table1_noun_verb_forms_summary(lu, collection): - if lu.isNoun == ['False'] or not lu.valdiff: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun') and lu.valdiff): raise TestDoesNotApply noun = lu - type_ = 'prod' if noun.productive == ['True'] else 'noprod' + type_ = 'prod' if noun.pos == ['stem noun'] else 'noprod' class_ = noun.class_[0] if noun.class_ else 'unspecified' for verb in [collection.id2lu(id) for id in noun.derivedV.ids]: @@ -54,11 +54,11 @@ def map_table1_noun_verb_forms_summary(lu, collection): @requires('collection') def map_table1b_differing_actant_summary(lu, collection): - if lu.isNoun == ['False'] or not lu.valdiff: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun') and lu.valdiff): raise TestDoesNotApply noun = lu - type_ = 'prod' if noun.productive == ['True'] else 'noprod' + type_ = 'prod' if noun.pos == ['stem noun'] else 'noprod' class_ = noun.class_[0] if noun.class_ else 'unspecified' for verb in [collection.id2lu(id) for id in noun.derivedV.ids]: @@ -80,10 +80,10 @@ def map_table1b_differing_actant_summary(lu, collection): @requires('collection') def map_table2_spec_nom_forms(lu, collection): - if lu.isNoun == ['False'] or not lu.valdiff: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun') and lu.valdiff): raise TestDoesNotApply - type_ = 'prod' if lu.productive == ['True'] else 'noprod' + type_ = 'prod' if lu.pos == ['stem noun'] else 'noprod' class_ = lu.class_[0] if lu.class_ else 'unspecified' for funct in lu.valdiff.actant.eq: @@ -100,10 +100,10 @@ def reduce_table2_spec_nom_forms(key, results): @requires('collection') def map_table3a_actant_spec_forms(lu, collection): - if lu.isNoun == ['False'] or not lu.valdiff: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun') and lu.valdiff): raise TestDoesNotApply - type_ = 'prod' if lu.productive == ['True'] else 'noprod' + type_ = 'prod' if lu.pos == ['stem noun'] else 'noprod' class_ = lu.class_[0] if lu.class_ else 'unspecified' # Actant functors which have at least one specific form @@ -114,10 +114,10 @@ def map_table3a_actant_spec_forms(lu, collection): @requires('collection') def map_table3b_actant_spec_forms(lu, collection): - if lu.isNoun == ['False'] or not lu.valdiff: + if not (lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun') and lu.valdiff): raise TestDoesNotApply - type_ = 'prod' if lu.productive == ['True'] else 'noprod' + type_ = 'prod' if lu.pos == ['stem noun'] else 'noprod' class_ = lu.class_[0] if lu.class_ else 'unspecified' # Actant functors which specific forms in parentheses (joined together by ',') diff --git a/vallex/scripts/transforms/add_valdiff.py b/vallex/scripts/transforms/add_valdiff.py index 66cc5cf..0eac258 100644 --- a/vallex/scripts/transforms/add_valdiff.py +++ b/vallex/scripts/transforms/add_valdiff.py @@ -20,7 +20,7 @@ from vallex.scripts import changes, requires, TestDoesNotApply, TestFailed @changes('valdiff') @requires('lumap') def transform_lu_add_valdiff(lu, lumap): - if 'isNoun' not in lu.dynamic_attrs or not lu.dynamic_attrs['isNoun']._data: + if 'pos' not in lu.dynamic_attrs or not lu.dynamic_attrs['pos']._data in ('stem noun', 'root noun'): raise TestDoesNotApply if 'derivedV' not in lu.attribs or lu.attribs['derivedV']._data['ids'] == []: raise TestDoesNotApply -- GitLab