Class: PROIEL::Treebank

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/treebank.rb

Overview

A class representing a PROIEL treebank containing any number of sources. The sources must use the same annotation scheme.

Constant Summary collapse

# Available metadata elements for sources.
METADATA_ELEMENTS = %i[
  title
  alternative_title
  author
  citation_part
  principal
  funder
  distributor
  distributor_address
  date
  license
  license_url
  reference_system
  editor
  editorial_note
  annotator
  reviewer
  electronic_text_editor
  electronic_text_title
  electronic_text_version
  electronic_text_publisher
  electronic_text_place
  electronic_text_date
  electronic_text_original_url
  electronic_text_license
  electronic_text_license_url
  printed_text_editor
  printed_text_title
  printed_text_edition
  printed_text_publisher
  printed_text_place
  printed_text_date
  chronology_composition
  chronology_manuscript
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Treebank

Creates a new treebank object.



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/proiel/treebank.rb', line 67

# Creates a new, empty treebank object: no schema information, no sources,
# no dictionaries, and empty ID lookup tables.
def initialize
  # Schema information is unknown until something has been loaded.
  @annotation_schema, @schema_version = nil, nil

  # Top-level containers, filled as files are loaded.
  @sources, @dictionaries = [], []

  # Lookup tables keyed by ID; each one is a separate hash object.
  @source_index, @div_index, @sentence_index, @token_index = {}, {}, {}, {}
end

Instance Attribute Details

#annotation_schema ⇒ AnnotationSchema (readonly)

Returns annotation schema for the treebank.

Returns:

  • (AnnotationSchema)

    annotation schema for the treebank



18
19
20
# File 'lib/proiel/treebank.rb', line 18

# Reader for the treebank's annotation schema.
#
# @return [AnnotationSchema] annotation schema for the treebank
def annotation_schema
  @annotation_schema
end

#dictionaries ⇒ Array<Dictionary> (readonly)

Returns dictionaries in the treebank.

Returns:

  • (Array<Dictionary>)

    dictionaries in the treebank



27
28
29
# File 'lib/proiel/treebank.rb', line 27

# Reader for the treebank's dictionaries.
#
# @return [Array<Dictionary>] dictionaries in the treebank
def dictionaries
  @dictionaries
end

#schema_version ⇒ String (readonly)

Returns PROIEL XML schema version for the treebank.

Returns:

  • (String)

    PROIEL XML schema version for the treebank



21
22
23
# File 'lib/proiel/treebank.rb', line 21

# Reader for the treebank's PROIEL XML schema version.
#
# @return [String] PROIEL XML schema version for the treebank
def schema_version
  @schema_version
end

#sources ⇒ Array<Source> (readonly)

Returns sources in the treebank.

Returns:

  • (Array<Source>)

    sources in the treebank



24
25
26
# File 'lib/proiel/treebank.rb', line 24

# Reader for the treebank's sources.
#
# @return [Array<Source>] sources in the treebank
def sources
  @sources
end

Instance Method Details

#find_div(id) ⇒ nil, Div

Finds the Div object corresponding to a div ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Div)

Raises:

  • (ArgumentError)


146
147
148
149
150
# File 'lib/proiel/treebank.rb', line 146

# Looks up the Div object registered under a div ID.
#
# @param id [Integer] ID of the div to look up
#
# @return [nil, Div] the value stored in the div index for `id`
#
# @raise [ArgumentError] if `id` is not an Integer
def find_div(id)
  return @div_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#find_sentence(id) ⇒ nil, Sentence

Finds the Sentence object corresponding to a sentence ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Sentence)

Raises:

  • (ArgumentError)


157
158
159
160
161
# File 'lib/proiel/treebank.rb', line 157

# Looks up the Sentence object registered under a sentence ID.
#
# @param id [Integer] ID of the sentence to look up
#
# @return [nil, Sentence] the value stored in the sentence index for `id`
#
# @raise [ArgumentError] if `id` is not an Integer
def find_sentence(id)
  return @sentence_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#find_source(id) ⇒ nil, Source

Finds the Source object corresponding to a source ID.

Parameters:

  • id (String)

Returns:

  • (nil, Source)

Raises:

  • (ArgumentError)


135
136
137
138
139
# File 'lib/proiel/treebank.rb', line 135

# Looks up the Source object registered under a source ID.
#
# @param id [String] ID of the source to look up
#
# @return [nil, Source] the value stored in the source index for `id`
#
# @raise [ArgumentError] if `id` is not a String
def find_source(id)
  return @source_index[id] if id.is_a?(String)

  raise ArgumentError, 'string expected'
end

#find_token(id) ⇒ nil, Token

Finds the PROIEL::Token object corresponding to a token ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Token)

Raises:

  • (ArgumentError)


168
169
170
171
172
# File 'lib/proiel/treebank.rb', line 168

# Looks up the PROIEL::Token object registered under a token ID.
#
# @param id [Integer] ID of the token to look up
#
# @return [nil, Token] the value stored in the token index for `id`
#
# @raise [ArgumentError] if `id` is not an Integer
def find_token(id)
  return @token_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#load_from_xml(f) ⇒ Treebank

Loads one or more PROIEL XML files.

Parameters:

  • f (String, IO, Array)

    PROIEL XML files to load

Returns:

  • (Treebank)



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/proiel/treebank.rb', line 85

# Loads one or more PROIEL XML files into this treebank.
#
# Every source and dictionary found in a file is appended to the treebank
# and indexed. The first file loaded determines the treebank's annotation
# schema and schema version; each later file must agree with both.
#
# @param f [String, IO, Array] a filename, an open IO object, or an array
#   of these
#
# @return [Treebank] the treebank itself
#
# @raise [ArgumentError] if `f` is not a String, an IO or an Array
# @raise [SchemaMismatch] if the annotation schema or schema version of a
#   file differs from what the treebank already holds
def load_from_xml(f)
  case f
  when Array
    f.each { |item| load_from_xml(item) }
  when String
    # The block form closes the file handle as soon as loading finishes,
    # including when parsing or schema validation raises.
    File.open(f) { |io| load_from_xml(io) }
  when IO
    tf = PROIELXML::Reader.parse_io(f)

    tf.proiel.sources.each do |s|
      # NOTE(review): the bare parentheses around `s` in the sixth argument
      # look like the remains of a wrapping helper call whose name was lost;
      # confirm against the original lib/proiel/treebank.rb.
      @sources << Source.new(self, s.id, tf.proiel.export_time, s.language, s.dialect,
                             (s), s.alignment_id) do |source|
        build_divs(s, source)
      end

      index_source_objects!(@sources.last)
    end

    tf.proiel.dictionaries.each do |d|
      @dictionaries << Dictionary.new(self, tf.proiel.export_time, d.language, d.dialect, d)

      index_dictionary_objects!(@dictionaries.last)
    end

    file_annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    file_schema_version = tf.proiel.schema_version

    # The first file loaded fixes the schema; later files are compared to it.
    @annotation_schema ||= file_annotation_schema
    @schema_version ||= file_schema_version

    # NOTE: when this raises, the sources and dictionaries of the offending
    # file have already been appended and indexed above.
    unless @annotation_schema == file_annotation_schema && @schema_version == file_schema_version
      raise SchemaMismatch
    end

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. Pass it down to the source object?
    # @export_time = tf.proiel.export_time
  else
    raise ArgumentError, 'expected filename, IO or array of these'
  end

  self
end