Class: PROIEL::Converter::CoNLLU
- Inherits:
-
Object
- Object
- PROIEL::Converter::CoNLLU
- Defined in:
- lib/proiel/cli/converters/conll-u.rb,
lib/proiel/cli/converters/conll-u/syntax.rb,
lib/proiel/cli/converters/conll-u/morphology.rb
Overview
Converter that outputs CoNLL-U.
This converter relies on certain assumptions about correct linguistic annotation in order to produce a meaningful representation in CoNLL-U.
Defined Under Namespace
Constant Summary collapse
- OBLIQUENESS_HIERARCHY =
['nsubj', 'obj', 'iobj', 'obl', 'advmod', 'csubj', 'xcomp', 'ccomp', 'advcl']
- REL_TO_POS =
{ 'acl' => 'VERB', 'advcl' => 'VERB', 'advcl:cmp' => 'NOUN', 'advmod' => 'ADV', 'amod' => 'ADJ', 'appos' => 'NOUN', 'ccomp' => 'VERB', 'conj' => 'X', 'csubj' => 'VERB', 'csubj:pass' => 'NOUN', 'dep' => 'X', 'det' => 'DET', 'dislocated' => 'X', 'fixed' => 'X', 'flat:foreign' => 'X', 'flat:name' => 'PROPN', 'nmod' => 'NOUN', 'nsubj' => 'NOUN', 'nsubj:pass' => 'NOUN', 'nsubj:outer' => 'NOUN', 'nummod' => 'NUM', 'obj' => 'NOUN', 'obl' => 'NOUN', 'obl:agent' => 'NOUN', 'obl:arg' => 'NOUN', 'orphan' => 'NOUN', 'parataxis' => 'VERB', 'root' => 'VERB', 'vocative' => 'NOUN', 'xcomp' => 'VERB' }
- RELATION_MAPPING =
{ 'adnom' => 'dep', 'adv' => [['advcl', lambda(&:clausal?) ], ['advmod', lambda { |x| x.adverb? } ], ['advmod', lambda(&:adjectival?) ], # adjective for adverb ['obl', lambda { |x| x.nominal? or x.preposition? or x.has_preposition? } ], ['advcl', lambda(&:subjunction?) ], ['obl', lambda { |x| true } ], ], 'ag' => 'obl:agent', # add :agent' once defined 'apos' => [['flat:name', lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ], ['acl', lambda { |x| x.clausal? and x.head and x.head.nominal? } ], # add :relcl ? ['appos', lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.nominal? } ], ['parataxis', lambda { |x| x.clausal? and x.head and x.head.clausal? } ], # what to do about sentential appositions? attempt here to make them parataxis, but there are some legitimate nominal appos under root nominals, so overgenerates slightly ['advcl', lambda(&:clausal?) ], ['appos', lambda { |x| true } ], ], 'arg' => 'dep', 'atr' => [['nummod', lambda(&:cardinal?) ], ['det', lambda { |x| x.pronominal? and !x.clausal? and !(!x.genitive? and x.head and x.head.genitive?) } ], #TODO check ['acl', lambda { |x| x.clausal? } ], # add :relcl? ['nmod', lambda(&:nominal?) ], ['advmod', lambda { |x| x.head and !x.head.nominal? and x.head.clausal? } ], ['amod', lambda { |x| true } ], #default ], 'aux' => [['det', lambda(&:determiner?) ], ['fixed', lambda { |x| x.head and x.head.subjunction? } ], ['fixed', lambda { |x| x.head and x.head.conjunction? } ], ['fixed', lambda { |x| x.head and x.head.adverb? and x.relative? } ], ['fixed', lambda { |x| x.head and x.head.pronominal? and x.verb? } ], ['aux:pass', lambda { |x| x.clausal? and x.head.passive? } ], ['aux', lambda(&:clausal?) ], #v2 probably want the modal particle an to go here too in ['advmod', lambda(&:negation?) ], ['discourse', lambda { |x| x.particle? or x.interjection? } ], ['advmod', lambda { |x| x.adjectival? or x.adverb? 
} ], # make subjunctions in root sentences "mark" ['mark', lambda { |x| x.subjunction? } ], ['cc', lambda(&:conjunction?) ], ['flat:foreign', lambda(&:foreign?) ], # We need some more distinctions to get Gothic and Armenian. Introduce language in the treebank? (Read from xml) ['mark', lambda { |x| ['R-'].include? x.part_of_speech } ], #"R-" as infinitive marker in Gothic ['expl:pv', lambda { |x| ['Pk' ].include? x.part_of_speech } ], #reflexive as valency reducer ['amod', lambda { |x| x.preposition? } ], # Armenian DOM ['fixed', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech } ], # NB there are a lot of bogus annotations with 'Px' # MISANNOTATION IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps' ], 'comp' => [['csubj:pass', lambda { |x| x.head and x.head.passive? and !x.head.has_subject?} ], ['csubj', lambda { |x| x.head and x.head.has_copula? and !x.head.has_subject?} ], ['ccomp', lambda { |x| true } ], ], 'expl' => 'expl', 'narg' => [['acl', lambda(&:clausal?) ], ['nmod', lambda(&:nominal?) ], ['nmod', lambda(&:adjectival?) ], # nominaliezed in this function ['nmod', lambda { |x| true } ], ], 'nonsub' => 'dep', 'obj' => 'obj', 'obl' => [# normally a preposition will be subordinate to its noun, this captures adverbial use of prepositions ['advmod', lambda { |x| x.adverb? } ], ['obl', lambda { |x| x.has_preposition? or x.preposition? } ], ['obl', lambda { |x| x.head and x.head.adverb? } ], ['obl:arg', lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.clausal? } ],# if nominal (NB check for presence of article!) TODO: should be 'obj' if the verb is monovalent (even by elision) #['obl:arg', lambda(&:adjectival?) ], # OBL adjectives are nominalized ['advcl', lambda(&:clausal?) ], # this seems to happen with ad libros legendos etc. but check closer! ['obl', lambda { |x| true } ], ], 'parpred' => 'parataxis', 'part' => 'nmod', 'per' => 'dep', 'pid' => ['ERROR', lambda { |x| raise 'Remaining pid edge!' } ], 'pred' => [['root', lambda(&:root?) 
], ['ERROR', lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }], ], 'rel' => 'acl', # add :relcl? 'sub' => [['nsubj:pass', lambda { |x| x.head and x.head.passive? } ], #['obl', lambda { |x| x.head and x.head.part_of_speech == 'Df' } ], ['nsubj', lambda { |x| true }], ], 'voc' => [['discourse', lambda { |x| x.part_of_speech == 'I-' } ], ['vocative', lambda { |x| true } ], ], 'xadv' => [['advcl', lambda(&:clausal?)], #add :contr ? ['xcomp', lambda { |x| x.nominal? or x.pronominal? or x.cardinal?} ], ['advcl', lambda(&:subjunction?)], ['advmod', lambda { |x| true } ], # add :contr ? ], 'xobj' => 'xcomp', # copula cases have already been taken care of 'xsub' => 'xsub', }
- DEPONENTS =
Regular expressions, keyed by language code, used to try to guess deponency based on the lemma.
{ 'lat' => /r\Z/, 'grc' => /ομαι\Z/ }
- COPULAR_LEMMATA =
['sum,V-,lat', 'eo#2,V-,lat','εἰμί#1,V-,grc', 'быти,V-,orv','стати#2,V-,orv','бꙑти,V-,chu']
- AUXILIARIES =
COPULAR_LEMMATA + []
- DETERMINERS =
['S-', 'Pd', 'Px']
- NEGATION_LEMMATA =
['non,Df,lat', 'ne,Df,lat', 'μή,Df,grc', 'μήγε,Df,grc', 'μηδαμῶς,Df,grc', 'μηδέποτε,Df,grc', 'μηδέπω,Df,grc', 'μηκέτι,Df,grc', 'μήπω,Df,grc', 'μήτε,Df,grc', 'μήτι,Df,grc', 'μήτιγε,Df,grc', 'οὐ,Df,grc', 'οὐδαμῇ,Df,grc', 'οὐδαμῶς,Df,grc', 'οὐδέ,Df,grc', 'οὐδέποτε,Df,grc', 'οὐδέπω,Df,grc', 'οὐκέτι,Df,grc', 'οὐκοῦν,Df,grc', 'οὔπω,Df,grc', 'οὔτε,Df,grc', 'οὔτι,Df,grc', 'οὐχί,Df,grc', 'не,Df,chu', 'ни,Df,chu', 'нѣ,Df,chu', 'nei,Df,got', 'ni,Df,got', 'nibai#2,Df,got', 'nih,Df,got', 'не,Df,orv', 'ни,Df,orv', 'ниже,Df,orv', 'нѣ,Df,orv', ]
- TAM_PARTICLE_LEMMATA =
['ἄν,Df,grc', ]
- PARTICLE_LEMMATA =
[ 'at,Df,lat', 'atque,Df,lat', 'autem,Df,lat', 'certe,Df,lat', 'en,Df,lat', 'equidem,Df,lat', 'ergo,Df,lat', 'et,Df,lat', 'enim,Df,lat', 'etenim,Df,lat', 'etiam,Df,lat', 'igitur,Df,lat', 'immo,Df,lat', 'itaque,Df,lat', 'nam,Df,lat', 'namque,Df,lat', 'nonne,Df,lat', 'nonne,Du,lat', 'num,Df,lat', 'quidem,Df,lat', 'quoque,Df,lat', 'sic,Df,lat', 'siquidem,Df,lat', 'tamen,Df,lat', 'tum,Df,lat', 'tunc,Df,lat', 'vero,Df,lat', 'ἅμα,Df,grc', 'ἀνά,Df,grc', 'ἆρα,Df,grc', 'ἄραγε,Df,grc', 'ἀτάρ,Df,grc', 'ἅτε,Df,grc', 'αὗ,Df,grc', 'αὖθις,Df,grc', 'γάρ,Df,grc', 'γε,Df,grc', 'γοῦν,Df,grc', 'δέ,Df,grc', 'δή,Df,grc', 'δῆθεν,Df,grc', 'δηλαδή,Df,grc', 'δηλονότι,Df,grc', 'δῆτα,Df,grc', 'εἶτα,Df,grc', 'ἔτι,Df,grc', 'ἦ#2,Df,grc', 'ἤγουν,Df,grc', 'ἤδη,Df,grc', 'ἤτοι,Df,grc', 'καίτοι,Df,grc', 'καίτοιγε,Df,grc', 'μέν,Df,grc', 'μενοῦνγε,Df,grc', 'μέντοι,Df,grc', 'μήν,Df,grc', 'νά,Df,grc', 'νῦν#1,Df,grc', 'νυν#2,Df,grc', 'νυνί,Df,grc', 'οὖν,Df,grc', 'πέρ,Df,grc', 'πῃ,Df,grc', 'ποτε,Df,grc', 'πού,Df,grc', 'πω,Df,grc', 'πως,Df,grc', 'τάχα,Df,grc', 'τε,Df,grc', 'τοι,Df,grc', 'τοιγαροῦν,Df,grc', 'τοίνυν,Df,grc', 'бо,Df,chu', 'же,Df,chu', 'занѥ,Df,chu', 'ибо,Df,chu', 'иде,Df,chu', 'ижде,Df,chu', 'ли,Df,chu', 'обаче,Df,chu', 'оубо,Df,chu', 'ти,Df,chu', 'тѣ,Df,chu', 'ꙗко#2,Df,chu', 'an,Df,got', 'auk,Df,got', 'aufto,Df,got', 'nu,Df,got', 'ussindo,Df,got', 'waitei,Df,got', 'þan,Df,got', 'nuh,Df,got', 'nunu,Df,got', 'raihtis,Df,got', 'sunsaiw,Df,got', 'unte,Df,got', 'þande,Df,got', 'þannu,Df,got', 'þanuh,Df,got', 'þaruh,Df,got', 'али,Df,orv', 'аль,Df,orv', 'ано,Df,orv', 'атъ,Df,orv', 'ать,Df,orv', 'бо,Df,orv', 'вѣдь,Df,orv', 'да#2,Df,orv', 'еда,Df,orv', 'же,Df,orv', 'зане,Df,orv', 'занеже,Df,orv', 'ибо,Df,orv', 'ино,Df,orv', 'ли,Df,orv', 'ну,Df,orv', 'понеже,Df,orv', 'си,Df,orv', 'ти,Df,orv', 'убо,Df,orv', 'ужь,Df,orv', 'ци,Df,orv', 'яко,Df,orv', 'якоже,Df,orv', ]
- COMPARISON_LEMMATA =
['alja,Df,got', 'ar̄awel,Df,xcl', 'atque,Df,lat', 'baycʻ,Df,xcl', 'etʻe,Df,xcl', 'ibrew,Df,xcl', 'ibrew z-,Df,xcl', 'kʻan z,Df,xcl', 'licet,Df,lat', 'nibai,Df,got', 'nisi,Df,lat', 'orpēs,Df,xcl', 'praeterquam,Df,lat', 'quam,Df,lat', 'quasi,Df,lat', 'quemadmodum,Df,lat', 'si,Df,lat', 'sicut,Df,lat', 'swaswe,Df,got', 'swe,Df,got', 'tamquam,Df,lat', 'tʻe,Df,xcl', 'ut,Df,lat', 'velut,Df,lat', 'þau,Df,got', 'ἅτε,Df,grc', 'εἰ,Df,grc', 'ἤ,Df,grc', 'ἤπερ,Df,grc', 'καθάπερ,Df,grc', 'καθώς,Df,grc', 'οἷα,Df,grc', 'ὁμοίως,Df,grc', 'ὅτι,Df,grc', 'ὡς,Df,grc', 'ὡσεί,Df,grc', 'ὥσπερ,Df,grc', 'ако,Df,orv', 'акъже,Df,orv', 'акы,Df,orv', 'акꙑ,Df,chu', 'будьто,Df,orv', 'како,Df,orv', 'ли,Df,chu', 'неже,Df,chu', 'нежели,Df,chu', 'нежели,Df,orv', 'окꙑ,Df,chu', 'развѣ,Df,chu', 'тъкъмо,Df,chu', 'чьто,Df,orv', 'яко,Df,orv', 'якоже,Df,orv', 'ꙗко,Df,chu', 'ꙗкоже,Df,chu' ]
- POS_MAP =
{ 'A-' => [['ADJ', lambda { |x| true } ]], 'C-' => [['CCONJ', lambda { |x| true } ]], 'Df' => [['AUX', lambda(&:tam_particle?)], ['ADV', lambda(&:negation?), 'Polarity=Neg'], ['ADV', lambda { |x| true } ] ], 'Dq' => [['ADV', lambda { |x| true }, 'PronType=Rel']], 'Du' => [['ADV', lambda { |x| true }, 'PronType=Int']], 'F-' => [['X', lambda { |x| true } ]], 'G-' => [['SCONJ', lambda { |x| true } ]], 'I-' => [['INTJ', lambda { |x| true } ]], 'Ma' => [['NUM', lambda { |x| true } ]], 'Mo' => [['ADJ', lambda { |x| true } ]], 'N-' => [['SCONJ', lambda { |x| true } ]], #irrelevant for our purposes 'Nb' => [['NOUN', lambda { |x| true } ]], 'Ne' => [['PROPN', lambda { |x| true } ]], 'Pc' => [['PRON', lambda { |x| true }, 'PronType=Rcp']], 'Pd' => [['DET', lambda { |x| true } ]], 'Pi' => [['PRON', lambda { |x| true }, 'PronType=Int']], 'Pk' => [['AUX', lambda { |x| x.relation == 'aux' }], ['PRON', lambda { |x| true }, 'PronType=Prs|Reflex=Yes']], 'Pp' => [['PRON', lambda { |x| true }, 'PronType=Prs']], 'Pr' => [['PRON', lambda { |x| true }, 'PronType=Rel']], 'Ps' => [['DET', lambda { |x| true }, 'Poss=Yes']], ### NB no evidence for a pronominal/determiner-like nature here 'Pt' => [['DET', lambda { |x| true }, 'Poss=Yes|Reflex=Yes' ]], ### NB no evidence for a pronominal/determiner-like nature here 'Px' => [['DET', lambda { |x| true } ]], 'Py' => [['PRON', lambda { |x| true } ]], 'R-' => [['ADP', lambda { |x| true } ]], 'V-' => [['AUX', lambda(&:auxiliary?)], ['VERB', lambda { |x| true } ]], 'S-' => [['DET', lambda { |x| true }, 'Definite=Def|PronType=Dem']], # (we only have definite articles) 'X-' => [['X', lambda { |x| true } ]] }
- MORPHOLOGY_MAP =
{ :person => {'1' => 'Person=1', '2' => 'Person=2', '3' => 'Person=3' } , :number => {'s' => 'Number=Sing', 'd' => 'Number=Dual', 'p' => 'Number=Plur' } , :tense => {'p' => 'Tense=Pres', 'i' => 'Tense=Past|Aspect=Imp', 'r' => 'Tense=Past|Aspect=Perf', #'Tense=Perfect', 's' => 'VerbForm=PartRes|Tense=Past', # tags Perf is not universal 'a' => 'Tense=Past|Aspect=Perf', 'u' => 'Tense=Past', 'l' => 'Tense=Pqp', 'f' => 'Tense=Fut', # tag FutPerfect is not universal 't' => 'Tense=Fut|Aspect=Perf', #FutPerfect' }, :mood => {'i' => 'VerbForm=Fin|Mood=Ind', 's' => 'VerbForm=Fin|Mood=Sub', 'm' => 'VerbForm=Fin|Mood=Imp', 'o' => 'VerbForm=Fin|Mood=Opt', 'n' => 'VerbForm=Inf', 'p' => 'VerbForm=Part', 'd' => 'VerbForm=Ger', # Gdv (gerundive) is not universal 'g' => 'VerbForm=Gdv', 'u' => 'VerbForm=Sup', 'e'=> 'VerbForm=Fin|Mood=Ind,Sub', 'f'=> 'VerbForm=Fin|Mood=Imp,Ind', 'h'=> 'VerbForm=Fin|Mood=Imp,Sub', 't' => 'VerbForm=Fin' }, :voice => {'a' => 'Voice=Act', # Med is not universal 'm' => 'Voice=Mid', 'p' => 'Voice=Pass', 'e' => 'Voice=Mid,Pass' }, :gender => {'m' => 'Gender=Masc', 'f' => 'Gender=Fem', 'n' => 'Gender=Neut', 'p' => 'Gender=Fem,Masc', 'o' => 'Gender=Masc,Neut', 'r' => 'Gender=Fem,Neut' }, :case => {'n' => 'Case=Nom', 'a' => 'Case=Acc', # Obl(ique) is not universal 'o' => 'Case=Obl', 'g' => 'Case=Gen', 'c' => 'Case=Dat,Gen', 'e' => 'Case=Acc,Dat', 'd' => 'Case=Dat', 'b' => 'Case=Abl', 'i' => 'Case=Ins', 'l' => 'Case=Loc', 'v' => 'Case=Voc' }, :degree => {'p' => 'Degree=Pos', 'c' => 'Degree=Cmp', 's' => 'Degree=Sup' }, # The whole strength category is not universal :strength => {'s' => 'Strength=Strong', 'w' => 'Strength=Weak' }, :inflection => {}, }
Class Method Summary collapse
Class Method Details
.process(tb, options = []) ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 11

# Converts every sentence of a treebank and prints the result to standard
# output, one sentence block at a time.
#
# The treebank is walked as sources -> divs -> sentences. Each sentence is
# wrapped in a Sentence object, converted, and serialised with +to_conll+.
# A sentence that converts successfully is printed as three comment lines
# (+# source+, +# text+, +# sent_id+), the converted body, and a blank line.
# A sentence whose conversion raises a StandardError is skipped and reported
# on standard error. A summary line with the failure count is always written
# to standard error at the end.
#
# @param tb [#sources] treebank; each source must respond to +title+ and
#   +divs+, each div to +title+ and +sentences+, and each sentence to +id+,
#   +citation+ and +tokens+ (whose elements respond to +form+)
# @param options [Array] accepted for interface compatibility; this method
#   does not read it
# @return [nil]
def process(tb, options = [])
  error_count = 0
  sentence_count = 0

  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        sentence_count += 1
        # Built outside the begin/rescue below, so an exception raised by the
        # constructor is not counted as a conversion error but propagates.
        converter = Sentence.new(sentence)

        begin
          # Do the conversion first to avoid spurious headers if the conversion fails
          conll = converter.convert.to_conll

          puts "# source = #{source.title}, #{div.title}"
          # using printable_form would give us punctuation, which must then be added to the tree
          puts "# text = #{sentence.tokens.map(&:form).compact.join(' ')}"
          puts "# sent_id = #{sentence.id}"
          puts conll
          puts
        rescue => e
          error_count += 1
          $stderr.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
          # A bare `raise 'message'` produces a RuntimeError; for those only the
          # message is shown. Any other error class also gets its backtrace.
          $stderr.puts e.backtrace.join("\n") unless e.is_a?(RuntimeError)
        end
      end
    end
  end

  $stderr.puts "#{error_count} sentences out of #{sentence_count} could not be converted"
end