Class: PROIEL::Converter::CoNLLU

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb,
lib/proiel/cli/converters/conll-u/syntax.rb,
lib/proiel/cli/converters/conll-u/morphology.rb

Defined Under Namespace

Classes: Sentence, Token

Constant Summary collapse

# Ordered list of dependency relation labels.
# NOTE(review): judging by the name, the order presumably runs from least to
# most oblique — confirm against the code that consumes this constant.
# Frozen so the shared list cannot be mutated by accident.
OBLIQUENESS_HIERARCHY =
  %w[nsubj obj iobj obl advmod csubj xcomp ccomp advcl].freeze
RELATION_MAPPING =
{
  "adnom" => "dep",
  "adv" =>  [["advcl", lambda(&:clausal?) ],
             ["advmod", lambda { |x| x.adverb? or x.preposition? } ],
             ["advmod", lambda(&:adjectival?) ], # adjective for adverb
             ["obl", lambda(&:nominal?) ], 
             ["advmod", lambda { |x| true } ],
            ],
  "ag" => "obl:agent", # add :agent" once defined
  "apos" => [["flat:name", lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
             ["appos", lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.nominal? } ],
             ["acl", lambda { |x| x.clausal? and x.head and x.head.nominal? } ],  # add :relcl ?
             # what to do about sentential appositions?
             ["advcl", lambda(&:clausal?) ],
             ["appos", lambda { |x| true } ],
            ],
  "arg" => "dep",
  "atr" => [["nummod", lambda(&:cardinal?) ],
            ["det", lambda { |x| x.pronominal? and !(!x.genitive? and x.head and x.head.genitive?) } ], #TODO check
            ["nmod", lambda(&:nominal?) ], 
            ["acl", lambda { |x| x.clausal? } ],  # add :relcl?
            ["advmod", lambda { |x| x.head and x.head.clausal? } ],
            ["amod", lambda { |x| true } ], #default
           ],
  "aux" => [["det", lambda(&:determiner?) ],
            ["aux:pass", lambda { |x| x.clausal? and x.head.passive?  } ],
            ["aux", lambda(&:clausal?) ], #v2 probably want the modal particle an to go here too in 
            ["advmod", lambda(&:negation?) ],
            ["discourse", lambda { |x| x.particle? or x.interjection? } ],
            # include subjunctions that are aux here; (root sentences with subjunction)
            ["advmod", lambda { |x| x.adjectival? or x.adverb? or x.subjunction? } ],
            ["cc", lambda(&:conjunction?) ],
            ["flat:foreign", lambda(&:foreign?) ],
            # We need some more distinctions to get Gothic and Armenian. Introduce language in the treebank? (Read from xml)
            ["mark", lambda { |x| ['R-'].include? x.part_of_speech  } ], #'R-' as infinitive marker in Gothic
            ["aux", lambda { |x| ['Pk' ].include? x.part_of_speech  } ], #reflexive as valency reducer
            ['amod', lambda { |x| x.preposition? } ], # Armenian DOM
            ['fixed', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech } ], # NB there are a lot of bogus annotations with 'Px'
            
            # MISANNOTATION  IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
           ],
  "comp" => [['csubj:pass', lambda { |x| x.head and x.head.passive? } ],
             ['csubj', lambda { |x| x.head and x.head.copula? } ],
             ['ccomp', lambda { |x| true } ],
            ],
  "expl" => "expl",
  "narg" => [['acl', lambda(&:clausal?) ],
             ['nmod', lambda(&:nominal?) ], 
             ['nmod', lambda(&:adjectival?) ], # nominaliezed in this function
             ['nmod', lambda { |x| true } ],
            ],
  "nonsub" => "dep",
  "obj" => "obj:dir",
  "obl" => [# normally a preposition will be subordinate to its noun, this captures adverbial use of prepositions
            ["advmod", lambda { |x| x.adverb? or x.preposition? } ], 
            ["obl", lambda { |x| x.has_preposition? } ],
            ["iobj", lambda(&:nominal?) ],# if nominal (NB check for presence of article!) TODO: should be "obj" if the verb is monovalent (even by elision)
            ["iobj", lambda(&:adjectival?) ], # OBL adjectives are nominalized 
            ["advcl", lambda(&:clausal?) ], # this seems to happen with ad libros legendos etc. but check closer!
            ["iobj", lambda { |x| true } ], 
           ],
  "parpred" => "parataxis",
  "part" => "nmod",
  "per" => "dep",
  "pid" => ["ERROR", lambda { |x| raise "Remaining pid edge!" } ],
  "pred" => [["root", lambda(&:root?) ],
             ["ERROR", lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
            ],
  "rel" => "acl", # add :relcl?
  "sub" => [["nsubj:pass", lambda { |x| x.head and x.head.passive? } ],
            ["nsubj", lambda { |x| true }],
           ],
  "voc" => "vocative",
  "xadv" => [["advcl", lambda(&:clausal?)], #add :contr ?
             ["advmod", lambda { |x| true } ], # add :contr ?
            ],
  "xobj" => "xcomp", # copula cases have already been taken care of
  "xsub" => "xsub",
}
# Try to guess deponency based on the lemma: each key maps to a pattern that
# is anchored at the end of the string it is matched against.
# NOTE(review): the keys look like language codes ('lat', 'grc') — confirm.
DEPONENTS =
  { 'lat' => /r\Z/,
    'grc' => /ομαι\Z/ }.freeze
# Lemma keys treated as copulas.
# NOTE(review): each key looks like "lemma,part-of-speech,language" — confirm.
COPULAR_LEMMATA =
  ['sum,V-,lat', 'εἰμί#1,V-,grc'].freeze
# Currently identical to the copulas; the empty array is the slot for any
# further auxiliary lemma keys.
AUXILIARIES =
  (COPULAR_LEMMATA + []).freeze
# Part-of-speech tags listed as determiners.
DETERMINERS =
  %w[S- Pd Px].freeze
# Lemma keys listed as negations, one per line.
# NOTE(review): each key looks like "lemma,part-of-speech,language" — confirm.
# Frozen so the shared list cannot be mutated by accident.
NEGATION_LEMMATA = %w[
  non,Df,lat
  ne,Df,lat
  μή,Df,grc
  μήγε,Df,grc
  μηδαμῶς,Df,grc
  μηδέποτε,Df,grc
  μηδέπω,Df,grc
  μηκέτι,Df,grc
  μήπω,Df,grc
  μήτε,Df,grc
  μήτι,Df,grc
  μήτιγε,Df,grc
  οὐ,Df,grc
  οὐδαμῇ,Df,grc
  οὐδαμῶς,Df,grc
  οὐδέ,Df,grc
  οὐδέποτε,Df,grc
  οὐδέπω,Df,grc
  οὐκέτι,Df,grc
  οὐκοῦν,Df,grc
  οὔπω,Df,grc
  οὔτε,Df,grc
  οὔτι,Df,grc
  οὐχί,Df,grc
  не,Df,chu
  ни,Df,chu
  нѣ,Df,chu
  nei,Df,got
  ni,Df,got
  nibai#2,Df,got
  nih,Df,got
].freeze
# Lemma keys listed as TAM particles.
# NOTE(review): "TAM" presumably stands for tense/aspect/mood — confirm.
# Frozen so the shared list cannot be mutated by accident.
TAM_PARTICLE_LEMMATA =
  ['ἄν,Df,grc'].freeze
# Lemma keys listed as particles, one per line.
# NOTE(review): each key looks like "lemma,part-of-speech,language" — confirm.
# Frozen so the shared list cannot be mutated by accident.
PARTICLE_LEMMATA = %w[
  at,Df,lat
  atque,Df,lat
  autem,Df,lat
  certe,Df,lat
  ergo,Df,lat
  et,Df,lat
  enim,Df,lat
  etiam,Df,lat
  igitur,Df,lat
  immo,Df,lat
  itaque,Df,lat
  nam,Df,lat
  nonne,Df,lat
  nonne,Du,lat
  quidem,Df,lat
  quoque,Df,lat
  sic,Df,lat
  tamen,Df,lat
  tum,Df,lat
  tunc,Df,lat
  vero,Df,lat
  ἅμα,Df,grc
  ἀνά,Df,grc
  ἆρα,Df,grc
  ἄραγε,Df,grc
  ἀτάρ,Df,grc
  ἅτε,Df,grc
  αὗ,Df,grc
  αὖθις,Df,grc
  γάρ,Df,grc
  γε,Df,grc
  γοῦν,Df,grc
  δέ,Df,grc
  δή,Df,grc
  δῆθεν,Df,grc
  δηλαδή,Df,grc
  δηλονότι,Df,grc
  δῆτα,Df,grc
  εἶτα,Df,grc
  ἔτι,Df,grc
  ἦ#2,Df,grc
  ἤγουν,Df,grc
  ἤδη,Df,grc
  ἤτοι,Df,grc
  καίτοι,Df,grc
  καίτοιγε,Df,grc
  μέν,Df,grc
  μενοῦνγε,Df,grc
  μέντοι,Df,grc
  μήν,Df,grc
  νά,Df,grc
  νῦν#1,Df,grc
  νυν#2,Df,grc
  νυνί,Df,grc
  οὖν,Df,grc
  πέρ,Df,grc
  πῃ,Df,grc
  ποτε,Df,grc
  πού,Df,grc
  πω,Df,grc
  πως,Df,grc
  τάχα,Df,grc
  τε,Df,grc
  τοι,Df,grc
  τοιγαροῦν,Df,grc
  τοίνυν,Df,grc
  бо,Df,chu
  же,Df,chu
  занѥ,Df,chu
  ибо,Df,chu
  иде,Df,chu
  ижде,Df,chu
  ли,Df,chu
  обаче,Df,chu
  оубо,Df,chu
  ти,Df,chu
  тѣ,Df,chu
  ꙗко#2,Df,chu
  an,Df,got
  auk,Df,got
  aufto,Df,got
  nu,Df,got
  ussindo,Df,got
  waitei,Df,got
  þan,Df,got
  nuh,Df,got
  nunu,Df,got
  raihtis,Df,got
  sunsaiw,Df,got
  unte,Df,got
  þande,Df,got
  þannu,Df,got
  þanuh,Df,got
  þaruh,Df,got
].freeze
# Maps a two-character part-of-speech tag to an ordered list of candidate
# entries of the form [target tag, predicate, optional feature string].
# Each predicate is a one-argument lambda that receives a token-like object.
# NOTE(review): the consumer is not visible here; presumably the first entry
# whose predicate is truthy wins — confirm.
#
# Predicates are literal `lambda { |x| ... }` blocks on purpose:
# `lambda(&:sym)` calls Kernel#lambda without a literal block, which is
# deprecated since Ruby 3.0 and raises ArgumentError on Ruby 3.3+.
POS_MAP =
{
  'A-' => [['ADJ', lambda { |x| true }]],
  'C-' => [['CCONJ', lambda { |x| true }]],
  'Df' => [['AUX', lambda { |x| x.TAM_particle? }],
           ['ADV', lambda { |x| x.negation? }, "Polarity=Neg"],
           ['ADV', lambda { |x| true }]],
  'Dq' => [['ADV', lambda { |x| true }, "PronType=Rel"]],
  'Du' => [['ADV', lambda { |x| true }, "PronType=Int"]],
  'F-' => [['X', lambda { |x| true }]],
  'G-' => [['SCONJ', lambda { |x| true }]],
  'I-' => [['INTJ', lambda { |x| true }]],
  'Ma' => [['NUM', lambda { |x| true }]],
  'Mo' => [['ADJ', lambda { |x| true }]],
  'N-' => [['SCONJ', lambda { |x| true }]], # irrelevant for our purposes
  'Nb' => [['NOUN', lambda { |x| true }]],
  'Ne' => [['PROPN', lambda { |x| true }]],
  'Pc' => [['PRON', lambda { |x| true }, "PronType=Rcp"]],
  'Pd' => [['DET', lambda { |x| true }]],
  'Pi' => [['PRON', lambda { |x| true }, "PronType=Int"]],
  'Pk' => [['AUX', lambda { |x| x.relation == 'aux' }],
           ['PRON', lambda { |x| true }, "PronType=Prs|Reflex=Yes"]],
  'Pp' => [['PRON', lambda { |x| true }, "PronType=Prs"]],
  'Pr' => [['PRON', lambda { |x| true }, "PronType=Rel"]],
  # NB no evidence for a pronominal/determiner-like nature of 'Ps' and 'Pt'
  'Ps' => [['ADJ', lambda { |x| true }, "Poss=Yes"]],
  'Pt' => [['ADJ', lambda { |x| true }, "Poss=Yes|Reflex=Yes"]],
  'Px' => [['DET', lambda { |x| true }]],
  'Py' => [['PRON', lambda { |x| true }]],
  'R-' => [['ADP', lambda { |x| true }]],
  'V-' => [['AUX', lambda { |x| x.auxiliary? }],
           ['VERB', lambda { |x| true }]],
  'S-' => [['DET', lambda { |x| true }, "Definite=Def|PronType=Dem"]], # (we only have definite articles)
  'X-' => [['X', lambda { |x| true }]]
}.freeze
# Maps a morphological field (Symbol) to a table from that field's
# one-character value to a "Feature=Value" string; several features are
# joined with "|" and alternative values with ",".
# The :inflection table is empty, so that field maps to no feature here.
# Frozen at the top level so fields cannot be added or removed by accident.
MORPHOLOGY_MAP =
{
  person: { '1' => 'Person=1',
            '2' => 'Person=2',
            '3' => 'Person=3' },
  number: { 's' => 'Number=Sing',
            'd' => 'Number=Dual',
            'p' => 'Number=Plur' },
  tense: { 'p' => 'Tense=Pres',
           'i' => 'Tense=Past|Aspect=Imp',
           'r' => 'Tense=Past|Aspect=Perf', # 'Tense=Perfect'
           's' => 'Aspect=Res',
           # tags Perf is not universal
           'a' => 'Tense=Past|Aspect=Perf',
           'u' => 'Tense=Past',
           'l' => 'Tense=Pqp',
           'f' => 'Tense=Fut',
           # tag FutPerfect is not universal
           't' => 'Tense=Fut|Aspect=Perf' }, # 'FutPerfect'
  mood: { 'i' => 'VerbForm=Fin|Mood=Ind',
          's' => 'VerbForm=Fin|Mood=Sub',
          'm' => 'VerbForm=Fin|Mood=Imp',
          'o' => 'VerbForm=Fin|Mood=Opt',
          'n' => 'VerbForm=Inf',
          'p' => 'VerbForm=Part',
          'd' => 'VerbForm=Ger',
          # Gdv (gerundive) is not universal
          'g' => 'VerbForm=Gdv',
          'u' => 'VerbForm=Sup',
          'e' => 'VerbForm=Fin|Mood=Ind,Sub',
          'f' => 'VerbForm=Fin|Mood=Imp,Ind',
          'h' => 'VerbForm=Fin|Mood=Imp,Sub',
          't' => 'VerbForm=Fin' },
  voice: { 'a' => 'Voice=Act',
           # Med is not universal
           'm' => 'Voice=Mid',
           'p' => 'Voice=Pass',
           'e' => 'Voice=Mid,Pass' },
  gender: { 'm' => 'Gender=Masc',
            'f' => 'Gender=Fem',
            'n' => 'Gender=Neut',
            'p' => 'Gender=Fem,Masc',
            'o' => 'Gender=Masc,Neut',
            'r' => 'Gender=Fem,Neut' },
  case: { 'n' => 'Case=Nom',
          'a' => 'Case=Acc',
          # Obl(ique) is not universal
          'o' => 'Case=Obl',
          'g' => 'Case=Gen',
          'c' => 'Case=Dat,Gen',
          'e' => 'Case=Acc,Dat',
          'd' => 'Case=Dat',
          'b' => 'Case=Abl',
          'i' => 'Case=Ins',
          'l' => 'Case=Loc',
          'v' => 'Case=Voc' },
  degree: { 'p' => 'Degree=Pos',
            'c' => 'Degree=Cmp',
            's' => 'Degree=Sup' },
  # The whole strength category is not universal
  strength: { 'w' => 'Strength=Weak',
              's' => 'Strength=Strong' },
  inflection: {},
}.freeze

Class Method Summary collapse

Class Method Details

.process(tb, options = []) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/proiel/cli/converters/conll-u.rb', line 14

# Converts every sentence reachable through tb.sources -> divs -> sentences
# and prints the result.
#
# For each sentence that converts, three comment lines (source, text, sent_id),
# the converted body and an empty line are written to standard output. A
# sentence whose conversion raises a StandardError is reported on STDERR and
# skipped. A closing tally of failures is written to STDERR.
#
# @param tb [#sources] treebank-like object
# @param options [Array] accepted but not read by this method
# @return [nil]
def process(tb, options = [])
  failed = 0
  seen = 0

  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        seen += 1
        converter = Sentence.new(sentence)

        begin
          # Convert before printing anything, so that a failed conversion
          # does not leave an orphaned header in the output.
          conll = converter.convert.to_conll

          puts "# source = #{source.title}, #{div.title}"
          # Plain token forms only: printable_form would bring in punctuation,
          # which would then have to be added to the tree as well.
          forms = sentence.tokens.map(&:form).compact
          puts "# text = #{forms.join(' ')}"
          puts "# sent_id = #{sentence.id}"
          puts conll
          puts
        rescue StandardError => error
          failed += 1
          STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{error}"
          # A RuntimeError (e.g. from a bare `raise "..."`) gets the one-line
          # report only; any other error class also gets its backtrace.
          unless error.is_a?(RuntimeError)
            STDERR.puts error.backtrace.join("\n")
          end
        end
      end
    end
  end

  STDERR.puts "#{failed} sentences out of #{seen} could not be converted"
end