Class: Wapiti::Dataset

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Comparable, Enumerable
Defined in:
lib/wapiti/dataset.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sequences = []) ⇒ Dataset

Returns a new instance of Dataset.



55
56
57
# File 'lib/wapiti/dataset.rb', line 55

# Creates a dataset around the given collection of sequences.
#
# The argument is stored as-is (no copy is made), so the dataset and the
# caller share the same object.
#
# @param sequences [Object] the collection to wrap; defaults to a new empty Array
def initialize(sequences = [])
  @sequences = sequences
end

Instance Attribute Details

#sequences ⇒ Object (readonly)

Returns the value of attribute sequences.



11
12
13
# File 'lib/wapiti/dataset.rb', line 11

# @return [Object] the collection stored in @sequences by the constructor
def sequences
  @sequences
end

Class Method Details

.open(path, format: File.extname(path), **opts) ⇒ Object

Raises:

  • (ArgumentError)


41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/wapiti/dataset.rb', line 41

# Reads the file at +path+ as UTF-8 and hands its content to #parse.
#
# @param path [String] location of the file to read
# @param format [String] '.xml' or 'xml' (compared case-insensitively)
#   wraps the content in a REXML::Document before parsing; any other value
#   passes the raw string to #parse. Defaults to the extension of +path+.
# @param opts [Hash] extra keyword options forwarded unchanged to #parse
# @return [Object] whatever #parse returns
# @raise [ArgumentError] if +path+ reports itself as tainted (only possible
#   on Rubies that still implement taint tracking)
def open(path, format: File.extname(path), **opts)
  # Object#tainted? was removed in Ruby 3.2. Only consult it where it still
  # exists, otherwise every call would fail with NoMethodError.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError, "cannot open dataset from tainted path: '#{path}'"
  end

  input = File.read(path, encoding: 'utf-8')

  case format.downcase
  when '.xml', 'xml'
    parse(REXML::Document.new(input), **opts)
  else
    parse(input, **opts)
  end
end

.parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/wapiti/dataset.rb', line 15

# Builds a dataset from one of three input representations.
#
# * Array: each element is a sequence, itself a list of token entries.
#   For every entry, index 0 is a whitespace-separated string whose first
#   field becomes the token value and whose remaining fields become the
#   observations; index 1 is the label (converted with #to_s) and index 2
#   the score.
# * String: split on +separator+; every chunk is passed to Sequence.parse
#   and sequences that report #empty? are dropped.
# * REXML::Document: every 'dataset/sequence' element becomes a sequence;
#   the text of each child element is split on opts[:spacer] (whitespace by
#   default) and each piece becomes a token labelled with the element name.
#
# @param dataset [Array, String, REXML::Document] the input to convert
# @param separator [Regexp, String] sequence delimiter for String input;
#   by default two or more consecutive line breaks
# @param opts [Hash] forwarded to Sequence.parse for String input; the
#   :spacer entry is read for XML input
# @return [Object] the result of calling +new+ with the built sequences
# @raise [ArgumentError] if +dataset+ is none of the supported types
def parse(dataset, separator: /(?:\r?\n){2,}/, **opts)
  case dataset
  when Array
    new(dataset.map { |seq|
      Sequence.new(seq.map { |tk|
        value, *obs = tk[0].split(/\s+/)
        Token.new value, label: tk[1].to_s, observations: obs, score: tk[2]
      })
    })
  when String
    new(dataset.split(separator).map { |seq|
      Sequence.parse(seq, **opts)
    }.reject(&:empty?))
  when REXML::Document
    new(dataset.elements.to_a('dataset/sequence').map { |seq|
      Sequence.new(seq.elements.to_a.map { |sgm|
        sgm.text.strip.split(opts[:spacer] || /\s+/).map { |tk|
          Token.new tk, label: sgm.name
        }
      }.flatten)
    })
  else
    # Report the class of the actual argument. The previous message
    # referenced an undefined local and raised NameError instead.
    raise ArgumentError, "unknown input type: #{dataset.class}"
  end
end

Instance Method Details

#&(other) ⇒ Object



113
114
115
# File 'lib/wapiti/dataset.rb', line 113

# Combines this dataset with +other+ by applying +&+ to the two sequence
# collections (for Arrays: the elements common to both).
#
# @param other [#sequences] the dataset to intersect with
# @return [Dataset] a new dataset wrapping the result
def &(other)
  shared = sequences & other.sequences
  Dataset.new(shared)
end

#+(other) ⇒ Object



101
102
103
# File 'lib/wapiti/dataset.rb', line 101

# Combines this dataset with +other+ by applying <tt>+</tt> to the two
# sequence collections (for Arrays: concatenation into a new Array).
#
# @param other [#sequences] the dataset to append
# @return [Dataset] a new dataset wrapping the result
def +(other)
  combined = sequences + other.sequences
  Dataset.new(combined)
end

#-(other) ⇒ Object



105
106
107
# File 'lib/wapiti/dataset.rb', line 105

# Combines this dataset with +other+ by applying +-+ to the two sequence
# collections (for Arrays: the receiver's elements not found in +other+).
#
# @param other [#sequences] the dataset whose sequences are removed
# @return [Dataset] a new dataset wrapping the result
def -(other)
  remaining = sequences - other.sequences
  Dataset.new(remaining)
end

#<=>(other) ⇒ Object



80
81
82
# File 'lib/wapiti/dataset.rb', line 80

# Orders datasets by comparing their sequence collections.
#
# @param other [Object] the object to compare with
# @return [Integer, nil] the result of +sequences <=> other.sequences+, or
#   nil when +other+ is not a Dataset
def <=>(other)
  return nil unless Dataset === other

  sequences <=> other.sequences
end

#concat(other) ⇒ Object



84
85
86
87
# File 'lib/wapiti/dataset.rb', line 84

# Appends the sequences of +other+ to this dataset in place by calling
# +concat+ on the receiver's own sequence collection.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [self] the receiver, to allow chaining
def concat(other)
  tap { sequences.concat(other.sequences) }
end

#each ⇒ Object



59
60
61
62
63
64
65
66
# File 'lib/wapiti/dataset.rb', line 59

# Iterates over the sequences of this dataset.
#
# The block is captured explicitly: calling Proc.new without a block (the
# previous way of forwarding it) raises ArgumentError on Ruby 3.0 and later.
#
# @yield [sequence] each element of +sequences+ in turn
# @return [self] when a block is given
# @return [Enumerator] when no block is given
def each(&block)
  return to_enum unless block_given?

  sequences.each(&block)
  self
end

#eql?(other) ⇒ Boolean

Returns:

  • (Boolean)


76
77
78
# File 'lib/wapiti/dataset.rb', line 76

# Hash-style equality: true when +other+ is a Dataset with the same hash
# value as the receiver.
#
# The type check prevents a dataset from being considered eql? to an
# arbitrary object that happens to share its hash value, such as the very
# collection it wraps (#hash is simply +sequences.hash+).
#
# @param other [Object] the object to compare with
# @return [Boolean]
def eql?(other)
  Dataset === other && hash == other.hash
end

#hash ⇒ Object



72
73
74
# File 'lib/wapiti/dataset.rb', line 72

# Hash value of the dataset, delegated to its sequence collection.
#
# @return [Object] the value of +sequences.hash+
def hash
  sequences.hash
end

#inspect ⇒ Object



159
160
161
# File 'lib/wapiti/dataset.rb', line 159

# Short description of the dataset showing the value of +size+ in braces.
#
# @return [String]
def inspect
  count = "{#{size}}"
  "#<Wapiti::Dataset sequences=#{count}>"
end

#labels ⇒ Object



68
69
70
# File 'lib/wapiti/dataset.rb', line 68

# Collects the distinct labels used by the elements of all sequences.
#
# @return [Array] the unique +label+ values, flattened and sorted
def labels
  # Deduplicate within each sequence first, then across the whole dataset.
  per_sequence = map { |seq| seq.map(&:label).uniq }
  per_sequence.flatten.uniq.sort
end

#sample(n = 1, **opts) ⇒ Object



89
90
91
# File 'lib/wapiti/dataset.rb', line 89

# Builds a new dataset from the result of +sequences.sample+.
#
# @param n [Integer] passed to +sequences.sample+ as the element count
# @param opts [Hash] keyword options forwarded to +sequences.sample+
# @return [Dataset] a new dataset wrapping the sampled sequences
def sample(n = 1, **opts)
  picked = sequences.sample(n, **opts)
  Dataset.new picked
end

#save(path, format: File.extname(path), **opts) ⇒ Object

Raises:

  • (ArgumentError)


143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/wapiti/dataset.rb', line 143

# Serializes the dataset and writes it to +path+ as UTF-8, replacing any
# existing content (the file is opened with mode 'w').
#
# @param path [String] destination file
# @param format [String] '.txt' or 'txt' writes the result of #to_s,
#   '.xml' or 'xml' the result of #to_xml (compared case-insensitively).
#   Defaults to the extension of +path+.
# @param opts [Hash] keyword options forwarded to #to_s or #to_xml
# @return [Object] the return value of File.write
# @raise [ArgumentError] if +format+ is not one of the supported values, or
#   if +path+ reports itself as tainted (only possible on Rubies that still
#   implement taint tracking)
def save(path, format: File.extname(path), **opts)
  # Object#tainted? was removed in Ruby 3.2. Only consult it where it still
  # exists, otherwise every call would fail with NoMethodError.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError, "cannot write dataset to tainted path: '#{path}'"
  end

  output =
    case format.downcase
    when '.txt', 'txt'
      to_s(**opts)
    when '.xml', 'xml'
      to_xml(**opts)
    else
      raise ArgumentError, "unknown format: '#{format}'"
    end

  File.write(path, output, encoding: 'utf-8', mode: 'w')
end

#slice(start, length = 1) ⇒ Object



93
94
95
96
97
98
99
# File 'lib/wapiti/dataset.rb', line 93

# Builds a new dataset from a slice of the sequence collection.
#
# @param start [Integer, Range] a Range is passed to +sequences.slice+ on
#   its own; any other value is passed together with +length+
# @param length [Integer] number of elements to take; ignored when +start+
#   is a Range
# @return [Dataset] a new dataset wrapping the result of +sequences.slice+
def slice(start, length = 1)
  picked =
    if Range === start
      sequences.slice(start)
    else
      sequences.slice(start, length)
    end

  Dataset.new picked
end

#to_a(**opts) ⇒ Object



125
126
127
# File 'lib/wapiti/dataset.rb', line 125

# Converts the dataset by calling +to_a+ on every sequence.
#
# @param opts [Hash] keyword options forwarded to each sequence's +to_a+
# @return [Array] one converted entry per sequence
def to_a(**opts)
  map do |sequence|
    sequence.to_a(**opts)
  end
end

#to_s(separator: "\n\n", **opts) ⇒ Object



117
118
119
# File 'lib/wapiti/dataset.rb', line 117

# Renders the dataset as text: every sequence is converted with +to_s+ and
# the results are joined with +separator+.
#
# @param separator [String] placed between sequences; a blank line by default
# @param opts [Hash] keyword options forwarded to each sequence's +to_s+
# @return [String]
def to_s(separator: "\n\n", **opts)
  rendered = map { |sequence| sequence.to_s(**opts) }
  rendered.join(separator)
end

#to_txt(separator: "\n", **opts) ⇒ Object



121
122
123
# File 'lib/wapiti/dataset.rb', line 121

# Renders the dataset as text: every sequence is converted with
# +to_sentence+ and the results are joined with +separator+.
#
# @param separator [String] placed between sequences; a line break by default
# @param opts [Hash] keyword options forwarded to each +to_sentence+ call
# @return [String]
def to_txt(separator: "\n", **opts)
  sentences = map { |sequence| sequence.to_sentence(**opts) }
  sentences.join(separator)
end

#to_xml(**opts) ⇒ Object



129
130
131
132
133
134
135
136
137
# File 'lib/wapiti/dataset.rb', line 129

# Serializes the dataset with Builder::XmlMarkup: emits the XML
# instruction, then a +dataset+ element inside which every sequence is
# asked to write itself via +to_xml+.
#
# @param opts [Hash] keyword options passed to Builder::XmlMarkup.new
# @return [Object] whatever the builder's +dataset+ call returns
def to_xml(**opts)
  builder = Builder::XmlMarkup.new(**opts)
  builder.instruct!
  builder.dataset do |node|
    each { |sequence| sequence.to_xml(node) }
  end
end

#to_yml(**opts) ⇒ Object



139
140
141
# File 'lib/wapiti/dataset.rb', line 139

# Converts the dataset by calling +to_h+ on every sequence. Note that the
# result is a plain Ruby structure; no YAML string is produced here.
#
# @param opts [Hash] keyword options forwarded to each sequence's +to_h+
# @return [Array] one converted entry per sequence
def to_yml(**opts)
  map do |sequence|
    sequence.to_h(**opts)
  end
end

#|(other) ⇒ Object



109
110
111
# File 'lib/wapiti/dataset.rb', line 109

# Combines this dataset with +other+ by applying +|+ to the two sequence
# collections (for Arrays: the union without duplicates).
#
# @param other [#sequences] the dataset to unite with
# @return [Dataset] a new dataset wrapping the result
def |(other)
  united = sequences | other.sequences
  Dataset.new(united)
end