Class: PROIEL::Treebank

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/treebank.rb

Overview

A class representing a PROIEL treebank containing any number of sources. All sources must share the same annotation schema and PROIEL XML schema version.

Constant Summary collapse

METADATA_ELEMENTS =

Available metadata elements for sources.

%i(
  title
  author
  citation_part
  principal
  funder
  distributor
  distributor_address
  date
  license
  license_url
  reference_system
  editor
  editorial_note
  annotator
  reviewer
  electronic_text_editor
  electronic_text_title
  electronic_text_version
  electronic_text_publisher
  electronic_text_place
  electronic_text_date
  electronic_text_original_url
  electronic_text_license
  electronic_text_license_url
  printed_text_editor
  printed_text_title
  printed_text_edition
  printed_text_publisher
  printed_text_place
  printed_text_date
)

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Treebank

Creates a new treebank object.



61
62
63
64
65
66
67
68
69
70
# File 'lib/proiel/treebank.rb', line 61

# Creates a new, empty treebank object.
#
# No schema information is known and no sources are present until a
# PROIEL XML file has been loaded.
def initialize
  # Schema details are filled in by the first file that gets loaded.
  @annotation_schema, @schema_version = nil, nil
  @sources = []

  # ID lookup tables used by the find_* methods; each gets its own
  # empty hash.
  @source_index, @div_index, @sentence_index, @token_index = Array.new(4) { {} }
end

Instance Attribute Details

#annotation_schema ⇒ AnnotationSchema (readonly)

Returns annotation schema for the treebank.

Returns:

  • (AnnotationSchema)

    annotation schema for the treebank



18
19
20
# File 'lib/proiel/treebank.rb', line 18

# Returns the annotation schema for the treebank (read-only attribute).
#
# @return [AnnotationSchema] annotation schema for the treebank
def annotation_schema
  @annotation_schema
end

#schema_version ⇒ String (readonly)

Returns PROIEL XML schema version for the treebank.

Returns:

  • (String)

    PROIEL XML schema version for the treebank



21
22
23
# File 'lib/proiel/treebank.rb', line 21

# Returns the PROIEL XML schema version for the treebank (read-only
# attribute).
#
# @return [String] PROIEL XML schema version for the treebank
def schema_version
  @schema_version
end

#sources ⇒ Array<Source> (readonly)

Returns sources in the treebank.

Returns:

  • (Array<Source>)

    sources in the treebank



24
25
26
# File 'lib/proiel/treebank.rb', line 24

# Returns the sources in the treebank (read-only attribute).
#
# The array returned is the treebank's own internal array, not a copy.
#
# @return [Array<Source>] sources in the treebank
def sources
  @sources
end

Instance Method Details

#find_div(id) ⇒ nil, Div

Finds the Div object corresponding to a div ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Div)

Raises:

  • (ArgumentError)


133
134
135
136
137
# File 'lib/proiel/treebank.rb', line 133

# Finds the Div object corresponding to a div ID.
#
# @param id [Integer] ID of the div to look up
# @return [nil, Div] the entry stored in the div index under +id+, or
#   +nil+ when the index has no such entry
# @raise [ArgumentError] if +id+ is not an Integer
def find_div(id)
  return @div_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#find_sentence(id) ⇒ nil, Sentence

Finds the Sentence object corresponding to a sentence ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Sentence)

Raises:

  • (ArgumentError)


144
145
146
147
148
# File 'lib/proiel/treebank.rb', line 144

# Finds the Sentence object corresponding to a sentence ID.
#
# @param id [Integer] ID of the sentence to look up
# @return [nil, Sentence] the entry stored in the sentence index under
#   +id+, or +nil+ when the index has no such entry
# @raise [ArgumentError] if +id+ is not an Integer
def find_sentence(id)
  return @sentence_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#find_source(id) ⇒ nil, Source

Finds the Source object corresponding to a source ID.

Parameters:

  • id (String)

Returns:

  • (nil, Source)

Raises:

  • (ArgumentError)


122
123
124
125
126
# File 'lib/proiel/treebank.rb', line 122

# Finds the Source object corresponding to a source ID.
#
# Unlike the other find_* methods, source IDs are strings.
#
# @param id [String] ID of the source to look up
# @return [nil, Source] the entry stored in the source index under
#   +id+, or +nil+ when the index has no such entry
# @raise [ArgumentError] if +id+ is not a String
def find_source(id)
  return @source_index[id] if id.is_a?(String)

  raise ArgumentError, 'string expected'
end

#find_token(id) ⇒ nil, Token

Finds the PROIEL::Token object corresponding to a token ID.

Parameters:

  • id (Integer)

Returns:

  • (nil, Token)

Raises:

  • (ArgumentError)


155
156
157
158
159
# File 'lib/proiel/treebank.rb', line 155

# Finds the PROIEL::Token object corresponding to a token ID.
#
# @param id [Integer] ID of the token to look up
# @return [nil, Token] the entry stored in the token index under +id+,
#   or +nil+ when the index has no such entry
# @raise [ArgumentError] if +id+ is not an Integer
def find_token(id)
  return @token_index[id] if id.is_a?(Integer)

  raise ArgumentError, 'integer expected'
end

#load_from_xml(f) ⇒ Treebank

Loads one or more PROIEL XML files.

Parameters:

  • f (String, IO, Array)

    PROIEL XML files to load

Returns:

  • (Treebank)

    the treebank itself



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/proiel/treebank.rb', line 78

# Loads one or more PROIEL XML files into this treebank.
#
# An array is loaded element by element (each element is passed back to
# this method), a string is treated as a filename to open, and an IO is
# parsed directly.
#
# The first file loaded fixes the treebank's annotation schema and schema
# version. Every later file must match both; a file that does not is
# rejected before any of its sources are added.
#
# @param f [String, IO, Array] PROIEL XML files to load
# @return [Treebank] the treebank itself
# @raise [ArgumentError] if +f+ is not a String, an IO or an Array
# @raise [SchemaMismatch] if the annotation schema or schema version of a
#   file differs from what is already loaded
def load_from_xml(f)
  case f
  when Array
    f.each { |filename| load_from_xml(filename) }
  when String
    # Block form so the file handle is closed once loading finishes, also
    # when parsing or schema validation raises.
    File.open(f) { |io| load_from_xml(io) }
  when IO
    tf = PROIELXML::Reader.parse_io(f)

    annotation_schema = AnnotationSchema.new(tf.proiel.annotation)
    schema_version = tf.proiel.schema_version

    @annotation_schema ||= annotation_schema
    @schema_version ||= schema_version

    # Validate before touching @sources so that a mismatching file does not
    # leave some of its sources behind in the treebank.
    compatible = @annotation_schema == annotation_schema &&
                 @schema_version == schema_version
    raise SchemaMismatch unless compatible

    # FIXME: consolidate export times? This is a design flaw in PROIEL XML
    # 2.0: export time ought to be per source not per PROIEL XML file, so
    # not clear what to do here. Pass it down to the source object?
    #@export_time = tf.proiel.export_time

    tf.proiel.sources.each do |s|
      # NOTE(review): the fifth argument is the parsed source object itself;
      # confirm that this is what Source.new expects here.
      source = Source.new(self, s.id, tf.proiel.export_time, s.language,
                          (s), s.alignment_id) { |new_source| build_divs(s, new_source) }

      @sources << source
      index_objects!(source)
    end
  else
    raise ArgumentError, 'expected filename, IO or array of these'
  end

  self
end