Class: PROIEL::Converter::CoNLLU

Inherits:

Object

Object → PROIEL::Converter::CoNLLU

show all

Defined in: lib/proiel/cli/converters/conll-u.rb,
lib/proiel/cli/converters/conll-u/syntax.rb,
lib/proiel/cli/converters/conll-u/morphology.rb

Overview

Converter that outputs CoNLL-U.

This converter relies on certain assumptions about correct linguistic annotation in order to produce a meaningful representation in CoNLL-U.

Defined Under Namespace

Classes: Sentence, Token

Constant Summary collapse

# Dependency relation labels ranked by obliqueness. The order of the entries is
# significant (presumably least oblique first -- confirm against the callers).
OBLIQUENESS_HIERARCHY = %w[nsubj obj iobj obl advmod csubj xcomp ccomp advcl]

# Lookup table from a dependency relation label to a universal part-of-speech
# tag. How the table is consulted is not visible here; presumably it supplies a
# POS for tokens whose POS must be inferred from their relation (confirm).
REL_TO_POS = {
  'acl'          => 'VERB',
  'advcl'        => 'VERB',
  'advcl:cmp'    => 'NOUN',
  'advmod'       => 'ADV',
  'amod'         => 'ADJ',
  'appos'        => 'NOUN',
  'ccomp'        => 'VERB',
  'conj'         => 'X',
  'csubj'        => 'VERB',
  'csubj:pass'   => 'NOUN',
  'dep'          => 'X',
  'det'          => 'DET',
  'dislocated'   => 'X',
  'fixed'        => 'X',
  'flat:foreign' => 'X',
  'flat:name'    => 'PROPN',
  'nmod'         => 'NOUN',
  'nsubj'        => 'NOUN',
  'nsubj:pass'   => 'NOUN',
  'nsubj:outer'  => 'NOUN',
  'nummod'       => 'NUM',
  'obj'          => 'NOUN',
  'obl'          => 'NOUN',
  'obl:agent'    => 'NOUN',
  'obl:arg'      => 'NOUN',
  'orphan'       => 'NOUN',
  'parataxis'    => 'VERB',
  'root'         => 'VERB',
  'vocative'     => 'NOUN',
  'xcomp'        => 'VERB'
}

# Maps each source (PROIEL) relation tag to a CoNLL-U dependency relation.
#
# A value is either a String (unconditional mapping) or an ordered list of
# [relation, predicate] pairs, where the predicate is a lambda taking the token.
# Most lists end in a catch-all `lambda { |x| true }`, which suggests the
# consumer (not shown here) uses the first pair whose predicate accepts the
# token. Pairs labelled 'ERROR' raise instead of yielding a relation.
#
# Every conditional entry, including 'pid', is a list of pairs so that all
# entries can be processed uniformly.
RELATION_MAPPING = {
  'adnom' => 'dep',
  'adv' =>  [['advcl', lambda(&:clausal?) ],
             ['advmod', lambda { |x| x.adverb? } ],
             ['advmod', lambda(&:adjectival?) ], # adjective for adverb
             ['obl', lambda { |x| x.nominal? or x.preposition? or x.has_preposition? } ],
             ['advcl', lambda(&:subjunction?) ],
             ['obl', lambda { |x| true } ],
            ],
  'ag' => 'obl:agent',
  'apos' => [['flat:name', lambda { |x| x.proper_noun? and x.head and x.head.proper_noun? } ],
             ['acl', lambda { |x| x.clausal? and x.head and x.head.nominal? } ],  # add :relcl ?
             ['appos', lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.nominal? } ],
             ['parataxis', lambda { |x| x.clausal? and x.head and x.head.clausal? } ],
             # what to do about sentential appositions? attempt here to make them parataxis, but there are some legitimate nominal appos under root nominals, so overgenerates slightly
             ['advcl', lambda(&:clausal?) ],
             ['appos', lambda { |x| true } ],
            ],
  'arg' => 'dep',
  'atr' => [['nummod', lambda(&:cardinal?) ],
            ['det', lambda { |x| x.pronominal? and !x.clausal? and !(!x.genitive? and x.head and x.head.genitive?) } ], #TODO check
            ['acl', lambda { |x| x.clausal? } ],  # add :relcl?
            ['nmod', lambda(&:nominal?) ],
            ['advmod', lambda { |x| x.head and !x.head.nominal? and x.head.clausal? } ],
            ['amod', lambda { |x| true } ], #default
           ],
  'aux' => [['det', lambda(&:determiner?) ],
            ['fixed', lambda { |x| x.head and x.head.subjunction? } ],
            ['fixed', lambda { |x| x.head and x.head.conjunction? } ],
            ['fixed', lambda { |x| x.head and x.head.adverb? and x.relative? } ],
            ['fixed', lambda { |x| x.head and x.head.pronominal? and x.verb? } ],
            # guard against a missing head, like the sibling rules above
            ['aux:pass', lambda { |x| x.clausal? and x.head and x.head.passive? } ],
            ['aux', lambda(&:clausal?) ], #v2 probably want the modal particle an to go here too
            ['advmod', lambda(&:negation?) ],
            ['discourse', lambda { |x| x.particle? or x.interjection? } ],
            ['advmod', lambda { |x| x.adjectival? or x.adverb? } ],
            # make subjunctions in root sentences "mark"
            ['mark', lambda { |x| x.subjunction? } ],
            ['cc', lambda(&:conjunction?) ],
            ['flat:foreign', lambda(&:foreign?) ],
            # We need some more distinctions to get Gothic and Armenian. Introduce language in the treebank? (Read from xml)
            ['mark', lambda { |x| ['R-'].include? x.part_of_speech  } ], #"R-" as infinitive marker in Gothic
            ['expl:pv', lambda { |x| ['Pk' ].include? x.part_of_speech  } ], #reflexive as valency reducer
            ['amod', lambda { |x| x.preposition? } ], # Armenian DOM
            ['fixed', lambda { |x| ['Px', 'Pr'].include? x.part_of_speech } ], # NB there are a lot of bogus annotations with 'Px'

            # MISANNOTATION  IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
           ],
  'comp' => [['csubj:pass', lambda { |x| x.head and x.head.passive? and !x.head.has_subject?} ],
             ['csubj', lambda { |x| x.head and x.head.has_copula? and !x.head.has_subject?} ],
             ['ccomp', lambda { |x| true } ],
            ],
  'expl' => 'expl',
  'narg' => [['acl', lambda(&:clausal?) ],
             ['nmod', lambda(&:nominal?) ],
             ['nmod', lambda(&:adjectival?) ], # nominalized in this function
             ['nmod', lambda { |x| true } ],
            ],
  'nonsub' => 'dep',
  'obj' => 'obj',
  'obl' => [# normally a preposition will be subordinate to its noun, this captures adverbial use of prepositions
            ['advmod', lambda { |x| x.adverb? } ],
            ['obl', lambda { |x| x.has_preposition? or x.preposition? } ],
            ['obl', lambda { |x| x.head and x.head.adverb? } ],
            ['obl:arg', lambda { |x| (x.nominal? or x.adjectival?) and x.head and x.head.clausal? } ],# if nominal (NB check for presence of article!) TODO: should be 'obj' if the verb is monovalent (even by elision)
            #['obl:arg', lambda(&:adjectival?) ], # OBL adjectives are nominalized
            ['advcl', lambda(&:clausal?) ], # this seems to happen with ad libros legendos etc. but check closer!
            ['obl', lambda { |x| true } ],
           ],
  'parpred' => 'parataxis',
  'part' => 'nmod',
  'per' => 'dep',
  # wrapped in a list like every other conditional entry
  'pid' => [['ERROR', lambda { |x| raise 'Remaining pid edge!' } ],
           ],
  'pred' => [['root', lambda(&:root?) ],
             ['ERROR', lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
            ],
  'rel' => 'acl', # add :relcl?
  'sub' => [['nsubj:pass', lambda { |x| x.head and x.head.passive? } ],
            #['obl', lambda { |x| x.head and x.head.part_of_speech == 'Df' } ],
            ['nsubj', lambda { |x| true }],
           ],
  'voc' => [['discourse', lambda { |x| x.part_of_speech == 'I-' } ],
            ['vocative', lambda { |x| true } ],
           ],
  'xadv' => [['advcl', lambda(&:clausal?)], #add :contr ?
             ['xcomp', lambda { |x| x.nominal? or x.pronominal? or x.cardinal?} ],
             ['advcl', lambda(&:subjunction?)],
             ['advmod', lambda { |x| true } ], # add :contr ?
            ],
  'xobj' => 'xcomp', # copula cases have already been taken care of
  'xsub' => 'xsub',
}

# Try to guess deponency based on the lemma.
#
# Keyed by language code; each value is a pattern matched against the end of a
# lemma (Latin lemmata ending in "r", Greek lemmata ending in "ομαι").
DEPONENTS = {
  'lat' => /r\Z/,
  'grc' => /ομαι\Z/
}

# Lemmata treated as copulas. Each entry appears to follow the pattern
# "lemma[#variant],part-of-speech tag,language code".
COPULAR_LEMMATA = [
  'sum,V-,lat',
  'eo#2,V-,lat',
  'εἰμί#1,V-,grc',
  'быти,V-,orv',
  'стати#2,V-,orv',
  'бꙑти,V-,chu'
]

# Lemmata treated as auxiliaries: currently the copular lemmata and nothing
# else. This is a separate Array object, not an alias of COPULAR_LEMMATA.
AUXILIARIES = COPULAR_LEMMATA.dup

# Part-of-speech tags counted as determiners.
DETERMINERS = %w[S- Pd Px]

# Lemmata treated as negations. Each entry appears to follow the pattern
# "lemma[#variant],part-of-speech tag,language code"; all entries carry the
# 'Df' tag. Entries are grouped by language code, starting with Latin ('lat').
NEGATION_LEMMATA =

['non,Df,lat', 'ne,Df,lat',
 # Ancient Greek ('grc')
 'μή,Df,grc',
 'μήγε,Df,grc',
 'μηδαμῶς,Df,grc',
 'μηδέποτε,Df,grc',
 'μηδέπω,Df,grc',
 'μηκέτι,Df,grc',
 'μήπω,Df,grc',
 'μήτε,Df,grc',
 'μήτι,Df,grc',
 'μήτιγε,Df,grc',
 'οὐ,Df,grc',
 'οὐδαμῇ,Df,grc',
 'οὐδαμῶς,Df,grc',
 'οὐδέ,Df,grc',
 'οὐδέποτε,Df,grc',
 'οὐδέπω,Df,grc',
 'οὐκέτι,Df,grc',
 'οὐκοῦν,Df,grc',
 'οὔπω,Df,grc',
 'οὔτε,Df,grc',
 'οὔτι,Df,grc',
 'οὐχί,Df,grc',
 # Old Church Slavonic ('chu')
 'не,Df,chu',
 'ни,Df,chu',
 'нѣ,Df,chu',
 # Gothic ('got')
 'nei,Df,got',
 'ni,Df,got',
 'nibai#2,Df,got',
 'nih,Df,got',
 # Old Russian ('orv')
 'не,Df,orv',
 'ни,Df,orv',
 'ниже,Df,orv',
 'нѣ,Df,orv',
]

# Lemmata of tense/aspect/mood particles; currently a single Greek ('grc') entry.
TAM_PARTICLE_LEMMATA = ['ἄν,Df,grc']

# Lemmata treated as particles. Each entry appears to follow the pattern
# "lemma[#variant],part-of-speech tag,language code". Entries are grouped by
# language code, starting with Latin ('lat').
PARTICLE_LEMMATA =

[ 'at,Df,lat',
  'atque,Df,lat',
  'autem,Df,lat',
  'certe,Df,lat',
  'en,Df,lat',
  'equidem,Df,lat',
  'ergo,Df,lat',
  'et,Df,lat',
  'enim,Df,lat',
  'etenim,Df,lat',
  'etiam,Df,lat',
  'igitur,Df,lat',
  'immo,Df,lat',
  'itaque,Df,lat',
  'nam,Df,lat',
  'namque,Df,lat',
  'nonne,Df,lat',
  'nonne,Du,lat',
  'num,Df,lat',
  'quidem,Df,lat',
  'quoque,Df,lat',
  'sic,Df,lat',
  'siquidem,Df,lat',
  'tamen,Df,lat',
  'tum,Df,lat',
  'tunc,Df,lat',
  'vero,Df,lat',
  # Ancient Greek ('grc')
  'ἅμα,Df,grc',
  'ἀνά,Df,grc',
  'ἆρα,Df,grc',
  'ἄραγε,Df,grc',
  'ἀτάρ,Df,grc',
  'ἅτε,Df,grc',
  'αὗ,Df,grc',
  'αὖθις,Df,grc',
  'γάρ,Df,grc',
  'γε,Df,grc',
  'γοῦν,Df,grc',
  'δέ,Df,grc',
  'δή,Df,grc',
  'δῆθεν,Df,grc',
  'δηλαδή,Df,grc',
  'δηλονότι,Df,grc',
  'δῆτα,Df,grc',
  'εἶτα,Df,grc',
  'ἔτι,Df,grc',
  'ἦ#2,Df,grc',
  'ἤγουν,Df,grc',
  'ἤδη,Df,grc',
  'ἤτοι,Df,grc',
  'καίτοι,Df,grc',
  'καίτοιγε,Df,grc',
  'μέν,Df,grc',
  'μενοῦνγε,Df,grc',
  'μέντοι,Df,grc',
  'μήν,Df,grc',
  'νά,Df,grc',
  'νῦν#1,Df,grc',
  'νυν#2,Df,grc',
  'νυνί,Df,grc',
  'οὖν,Df,grc',
  'πέρ,Df,grc',
  'πῃ,Df,grc',
  'ποτε,Df,grc',
  'πού,Df,grc',
  'πω,Df,grc',
  'πως,Df,grc',
  'τάχα,Df,grc',
  'τε,Df,grc',
  'τοι,Df,grc',
  'τοιγαροῦν,Df,grc',
  'τοίνυν,Df,grc',
  # Old Church Slavonic ('chu')
  'бо,Df,chu',
  'же,Df,chu',
  'занѥ,Df,chu',
  'ибо,Df,chu',
  'иде,Df,chu',
  'ижде,Df,chu',
  'ли,Df,chu',
  'обаче,Df,chu',
  'оубо,Df,chu',
  'ти,Df,chu',
  'тѣ,Df,chu',
  'ꙗко#2,Df,chu',
  # Gothic ('got')
  'an,Df,got',
  'auk,Df,got',
  'aufto,Df,got',
  'nu,Df,got',
  'ussindo,Df,got',
  'waitei,Df,got',
  'þan,Df,got',
  'nuh,Df,got',
  'nunu,Df,got',
  'raihtis,Df,got',
  'sunsaiw,Df,got',
  'unte,Df,got',
  'þande,Df,got',
  'þannu,Df,got',
  'þanuh,Df,got',
  'þaruh,Df,got',
  # Old Russian ('orv')
  'али,Df,orv',
  'аль,Df,orv',
  'ано,Df,orv',
  'атъ,Df,orv',
  'ать,Df,orv',
  'бо,Df,orv',
  'вѣдь,Df,orv',
  'да#2,Df,orv',
  'еда,Df,orv',
  'же,Df,orv',
  'зане,Df,orv',
  'занеже,Df,orv',
  'ибо,Df,orv',
  'ино,Df,orv',
  'ли,Df,orv',
  'ну,Df,orv',
  'понеже,Df,orv',
  'си,Df,orv',
  'ти,Df,orv',
  'убо,Df,orv',
  'ужь,Df,orv',
  'ци,Df,orv',
  'яко,Df,orv',
  'якоже,Df,orv',
]

# Lemmata that can introduce comparisons. Each entry appears to follow the
# pattern "lemma,part-of-speech tag,language code"; all entries carry the 'Df'
# tag. Unlike the other lemma lists, the languages are interleaved here
# ('got', 'xcl', 'lat', 'grc', 'orv', 'chu'), and two 'xcl' lemmata contain a
# space.
COMPARISON_LEMMATA =

['alja,Df,got',
 'ar̄awel,Df,xcl',
 'atque,Df,lat',
 'baycʻ,Df,xcl',
 'etʻe,Df,xcl',
 'ibrew,Df,xcl',
 'ibrew z-,Df,xcl',
 'kʻan z,Df,xcl',
 'licet,Df,lat',
 'nibai,Df,got',
 'nisi,Df,lat',
 'orpēs,Df,xcl',
 'praeterquam,Df,lat',
 'quam,Df,lat',
 'quasi,Df,lat',
 'quemadmodum,Df,lat',
 'si,Df,lat',
 'sicut,Df,lat',
 'swaswe,Df,got',
 'swe,Df,got',
 'tamquam,Df,lat',
 'tʻe,Df,xcl',
 'ut,Df,lat',
 'velut,Df,lat',
 'þau,Df,got',
 'ἅτε,Df,grc',
 'εἰ,Df,grc',
 'ἤ,Df,grc',
 'ἤπερ,Df,grc',
 'καθάπερ,Df,grc',
 'καθώς,Df,grc',
 'οἷα,Df,grc',
 'ὁμοίως,Df,grc',
 'ὅτι,Df,grc',
 'ὡς,Df,grc',
 'ὡσεί,Df,grc',
 'ὥσπερ,Df,grc',
 'ако,Df,orv',
 'акъже,Df,orv',
 'акы,Df,orv',
 'акꙑ,Df,chu',
 'будьто,Df,orv',
 'како,Df,orv',
 'ли,Df,chu',
 'неже,Df,chu',
 'нежели,Df,chu',
 'нежели,Df,orv',
 'окꙑ,Df,chu',
 'развѣ,Df,chu',
 'тъкъмо,Df,chu',
 'чьто,Df,orv',
 'яко,Df,orv',
 'якоже,Df,orv',
 'ꙗко,Df,chu',
 'ꙗкоже,Df,chu'
]

# Maps a source part-of-speech tag to candidate universal POS tags.
#
# Each value is an ordered list of rules of the form
# [upos, predicate] or [upos, predicate, features], where the predicate is a
# lambda taking the token and `features` is a CoNLL-U feature string. Every
# list ends in a rule that always matches, so the consumer (not shown here)
# presumably takes the first rule whose predicate holds.
POS_MAP = {
  'A-' => [['ADJ', ->(_x) { true }]],
  'C-' => [['CCONJ', ->(_x) { true }]],
  'Df' => [
    ['AUX', ->(x) { x.tam_particle? }],
    ['ADV', ->(x) { x.negation? }, 'Polarity=Neg'],
    ['ADV', ->(_x) { true }]
  ],
  'Dq' => [['ADV', ->(_x) { true }, 'PronType=Rel']],
  'Du' => [['ADV', ->(_x) { true }, 'PronType=Int']],
  'F-' => [['X', ->(_x) { true }]],
  'G-' => [['SCONJ', ->(_x) { true }]],
  'I-' => [['INTJ', ->(_x) { true }]],
  'Ma' => [['NUM', ->(_x) { true }]],
  'Mo' => [['ADJ', ->(_x) { true }]],
  # irrelevant for our purposes
  'N-' => [['SCONJ', ->(_x) { true }]],
  'Nb' => [['NOUN', ->(_x) { true }]],
  'Ne' => [['PROPN', ->(_x) { true }]],
  'Pc' => [['PRON', ->(_x) { true }, 'PronType=Rcp']],
  'Pd' => [['DET', ->(_x) { true }]],
  'Pi' => [['PRON', ->(_x) { true }, 'PronType=Int']],
  'Pk' => [
    ['AUX', ->(x) { x.relation == 'aux' }],
    ['PRON', ->(_x) { true }, 'PronType=Prs|Reflex=Yes']
  ],
  'Pp' => [['PRON', ->(_x) { true }, 'PronType=Prs']],
  'Pr' => [['PRON', ->(_x) { true }, 'PronType=Rel']],
  # NB no evidence for a pronominal/determiner-like nature for 'Ps' and 'Pt'
  'Ps' => [['DET', ->(_x) { true }, 'Poss=Yes']],
  'Pt' => [['DET', ->(_x) { true }, 'Poss=Yes|Reflex=Yes']],
  'Px' => [['DET', ->(_x) { true }]],
  'Py' => [['PRON', ->(_x) { true }]],
  'R-' => [['ADP', ->(_x) { true }]],
  'V-' => [
    ['AUX', ->(x) { x.auxiliary? }],
    ['VERB', ->(_x) { true }]
  ],
  # we only have definite articles
  'S-' => [['DET', ->(_x) { true }, 'Definite=Def|PronType=Dem']],
  'X-' => [['X', ->(_x) { true }]]
}

# Maps morphological field values to CoNLL-U feature strings.
#
# The outer keys are morphological fields (symbols); each inner hash maps a
# one-character source value to a feature string. A feature string may carry
# several features separated by "|" and several values separated by ",".
MORPHOLOGY_MAP = {
  person: {
    '1' => 'Person=1',
    '2' => 'Person=2',
    '3' => 'Person=3'
  },
  number: {
    's' => 'Number=Sing',
    'd' => 'Number=Dual',
    'p' => 'Number=Plur'
  },
  tense: {
    'p' => 'Tense=Pres',
    'i' => 'Tense=Past|Aspect=Imp',
    'r' => 'Tense=Past|Aspect=Perf', # rather than 'Tense=Perfect'
    's' => 'VerbForm=PartRes|Tense=Past',
    # the tag Perf is not universal
    'a' => 'Tense=Past|Aspect=Perf',
    'u' => 'Tense=Past',
    'l' => 'Tense=Pqp',
    'f' => 'Tense=Fut',
    # the tag FutPerfect is not universal
    't' => 'Tense=Fut|Aspect=Perf'
  },
  mood: {
    'i' => 'VerbForm=Fin|Mood=Ind',
    's' => 'VerbForm=Fin|Mood=Sub',
    'm' => 'VerbForm=Fin|Mood=Imp',
    'o' => 'VerbForm=Fin|Mood=Opt',
    'n' => 'VerbForm=Inf',
    'p' => 'VerbForm=Part',
    'd' => 'VerbForm=Ger',
    # Gdv (gerundive) is not universal
    'g' => 'VerbForm=Gdv',
    'u' => 'VerbForm=Sup',
    'e' => 'VerbForm=Fin|Mood=Ind,Sub',
    'f' => 'VerbForm=Fin|Mood=Imp,Ind',
    'h' => 'VerbForm=Fin|Mood=Imp,Sub',
    't' => 'VerbForm=Fin'
  },
  voice: {
    'a' => 'Voice=Act',
    # Med is not universal
    'm' => 'Voice=Mid',
    'p' => 'Voice=Pass',
    'e' => 'Voice=Mid,Pass'
  },
  gender: {
    'm' => 'Gender=Masc',
    'f' => 'Gender=Fem',
    'n' => 'Gender=Neut',
    'p' => 'Gender=Fem,Masc',
    'o' => 'Gender=Masc,Neut',
    'r' => 'Gender=Fem,Neut'
  },
  case: {
    'n' => 'Case=Nom',
    'a' => 'Case=Acc',
    # Obl(ique) is not universal
    'o' => 'Case=Obl',
    'g' => 'Case=Gen',
    'c' => 'Case=Dat,Gen',
    'e' => 'Case=Acc,Dat',
    'd' => 'Case=Dat',
    'b' => 'Case=Abl',
    'i' => 'Case=Ins',
    'l' => 'Case=Loc',
    'v' => 'Case=Voc'
  },
  degree: {
    'p' => 'Degree=Pos',
    'c' => 'Degree=Cmp',
    's' => 'Degree=Sup'
  },
  # The whole strength category is not universal
  strength: {
    's' => 'Strength=Strong',
    'w' => 'Strength=Weak'
  },
  inflection: {}
}

Class Method Summary collapse

.process(tb, options = []) ⇒ Object

Class Method Details

.process(tb, options = []) ⇒ `Object`

# File 'lib/proiel/cli/converters/conll-u.rb', line 11

# Converts every sentence in a treebank to CoNLL-U and prints it to standard
# output, walking sources, then their divs, then their sentences.
#
# For each sentence that converts, three comment lines (source, text, sent_id),
# the converted sentence and a blank line are printed. A sentence whose
# conversion raises a StandardError is skipped and reported on STDERR; the
# backtrace is printed as well unless the error is a RuntimeError. A summary of
# the failure count is written to STDERR at the end.
#
# @param tb the treebank; must respond to `sources`
# @param options accepted for interface compatibility; not used by this method
# @return [nil]
def process(tb, options = [])
  failures = 0
  total = 0

  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        total += 1
        converter = Sentence.new(sentence)

        begin
          # Convert before printing anything, so a failed conversion does not
          # leave stray header lines in the output.
          conll = converter.convert.to_conll

          puts "# source = #{source.title}, #{div.title}"
          # printable_form would give us punctuation, which would then have to
          # be added to the tree; plain forms are used instead.
          puts "# text = #{sentence.tokens.map { |token| token.form }.compact.join(' ')}"
          puts "# sent_id = #{sentence.id}"
          puts conll
          puts
        rescue StandardError => e
          failures += 1
          STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"

          unless e.is_a?(RuntimeError)
            STDERR.puts e.backtrace.join("\n")
          end
        end
      end
    end
  end

  STDERR.puts "#{failures} sentences out of #{total} could not be converted"
end