Class: Wapiti::Dataset
- Inherits:
-
Object
- Object
- Wapiti::Dataset
- Extended by:
- Forwardable
- Includes:
- Comparable, Enumerable
- Defined in:
- lib/wapiti/dataset.rb
Instance Attribute Summary collapse
-
#sequences ⇒ Object
readonly
Returns the value of attribute sequences.
Class Method Summary collapse
- .open(path, format: File.extname(path), **opts) ⇒ Object
- .parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object
Instance Method Summary collapse
- #&(other) ⇒ Object
- #+(other) ⇒ Object
- #-(other) ⇒ Object
- #<=>(other) ⇒ Object
- #concat(other) ⇒ Object
- #each ⇒ Object
- #eql?(other) ⇒ Boolean
- #hash ⇒ Object
-
#initialize(sequences = []) ⇒ Dataset
constructor
A new instance of Dataset.
- #inspect ⇒ Object
- #labels ⇒ Object
- #sample(n = 1, **opts) ⇒ Object
- #save(path, format: File.extname(path), **opts) ⇒ Object
- #slice(start, length = 1) ⇒ Object
- #to_a(**opts) ⇒ Object
- #to_s(separator: "\n\n", **opts) ⇒ Object
- #to_txt(separator: "\n", **opts) ⇒ Object
- #to_xml(**opts) ⇒ Object
- #to_yml(**opts) ⇒ Object
- #|(other) ⇒ Object
Constructor Details
#initialize(sequences = []) ⇒ Dataset
Returns a new instance of Dataset.
55 56 57 |
# File 'lib/wapiti/dataset.rb', line 55
# Creates a dataset around the given collection of sequences.
#
# The collection is stored as passed in; no copy is made.
#
# @param sequences [Array] the sequences this dataset holds
#   (defaults to a new empty Array)
def initialize(sequences = [])
  @sequences = sequences
end
Instance Attribute Details
#sequences ⇒ Object (readonly)
Returns the value of attribute sequences.
11 12 13 |
# File 'lib/wapiti/dataset.rb', line 11
# Read-only accessor for the underlying collection of sequences.
#
# @return [Object] the value of the @sequences instance variable
def sequences
  @sequences
end
Class Method Details
.open(path, format: File.extname(path), **opts) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/wapiti/dataset.rb', line 41
# Reads a dataset file as UTF-8 and hands its content to +parse+.
#
# @param path [String] location of the file to read
# @param format [String] format selector, compared case-insensitively;
#   '.xml' or 'xml' wraps the content in a REXML::Document before parsing,
#   anything else is parsed as plain text. Defaults to the extension of
#   +path+.
# @param opts [Hash] extra options forwarded to +parse+
# @return [Object] whatever +parse+ returns
# @raise [ArgumentError] if +path+ is tainted (only on Rubies that still
#   implement the taint API)
def open(path, format: File.extname(path), **opts)
  # Object#tainted? was removed in Ruby 3.2; calling it unconditionally
  # raises NoMethodError there, so only consult it where it still exists.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError,
      "cannot open dataset from tainted path: '#{path}'"
  end

  input = File.read(path, encoding: 'utf-8')

  case format.downcase
  when '.xml', 'xml'
    parse(REXML::Document.new(input), **opts)
  else
    parse(input, **opts)
  end
end
.parse(dataset, separator: /(?:\r?\n){2,}/, **opts) ⇒ Object
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/wapiti/dataset.rb', line 15
# Builds a dataset from an Array, a String or a REXML::Document.
#
# @param dataset [Array, String, REXML::Document] the input to convert:
#   - Array: one entry per sequence, each a list of token tuples where
#     tk[0] is a whitespace-separated "value obs1 obs2 ..." string,
#     tk[1] is the label (converted with to_s) and tk[2] the score;
#   - String: split on +separator+, each piece is handed to
#     Sequence.parse together with +opts+; empty results are dropped;
#   - REXML::Document: every child element of each 'dataset/sequence'
#     element is split into tokens labelled with that element's name.
# @param separator [Regexp, String] delimiter between sequences in String
#   input (default: two or more consecutive line breaks)
# @param opts [Hash] forwarded to Sequence.parse for String input; for XML
#   input +:spacer+ overrides the token delimiter (default /\s+/)
# @return [Dataset] the newly built dataset
# @raise [ArgumentError] if +dataset+ is none of the supported types
def parse(dataset, separator: /(?:\r?\n){2,}/, **opts)
  case dataset
  when Array
    new(dataset.map { |seq|
      Sequence.new(seq.map { |tk|
        value, *obs = tk[0].split(/\s+/)
        Token.new value, label: tk[1].to_s, observations: obs, score: tk[2]
      })
    })
  when String
    new(dataset.split(separator).map { |seq|
      Sequence.parse(seq, **opts)
    }.reject(&:empty?))
  when REXML::Document
    new(dataset.elements.to_a('dataset/sequence').map { |seq|
      Sequence.new(seq.elements.to_a.map { |sgm|
        sgm.text.strip.split(opts[:spacer] || /\s+/).map { |tk|
          Token.new tk, label: sgm.name
        }
      }.flatten)
    })
  else
    # The parameter is named +dataset+; the previous message referenced an
    # undefined local (+input+) and raised NameError instead.
    raise ArgumentError, "unknown input type: #{dataset.class}"
  end
end
Instance Method Details
#&(other) ⇒ Object
113 114 115 |
# File 'lib/wapiti/dataset.rb', line 113
# Intersection of two datasets.
#
# @param other [#sequences] the dataset to intersect with
# @return [Dataset] a new dataset wrapping `sequences & other.sequences`
def &(other)
  shared = sequences & other.sequences
  Dataset.new(shared)
end
#+(other) ⇒ Object
101 102 103 |
# File 'lib/wapiti/dataset.rb', line 101
# Concatenation of two datasets; the receiver is left untouched.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [Dataset] a new dataset wrapping `sequences + other.sequences`
def +(other)
  combined = sequences + other.sequences
  Dataset.new(combined)
end
#-(other) ⇒ Object
105 106 107 |
# File 'lib/wapiti/dataset.rb', line 105
# Difference of two datasets.
#
# @param other [#sequences] the dataset whose sequences are removed
# @return [Dataset] a new dataset wrapping `sequences - other.sequences`
def -(other)
  remaining = sequences - other.sequences
  Dataset.new(remaining)
end
#<=>(other) ⇒ Object
80 81 82 |
# File 'lib/wapiti/dataset.rb', line 80
# Three-way comparison, delegated to the underlying sequences.
#
# @param other [Object] the object to compare against
# @return [Integer, nil] the result of `sequences <=> other.sequences`
#   when +other+ is a Dataset, otherwise nil
def <=>(other)
  case other
  when Dataset
    sequences <=> other.sequences
  else
    nil
  end
end
#concat(other) ⇒ Object
84 85 86 87 |
# File 'lib/wapiti/dataset.rb', line 84
# Appends the other dataset's sequences to this one in place.
#
# @param other [#sequences] the dataset whose sequences are appended
# @return [Dataset] self, to allow chaining
def concat(other)
  tap { sequences.concat(other.sequences) }
end
#each ⇒ Object
59 60 61 62 63 64 65 66 |
# File 'lib/wapiti/dataset.rb', line 59
# Iterates over the sequences of this dataset.
#
# The block is captured explicitly: the former `Proc.new` without a block
# raises ArgumentError on Ruby 3.0 and later.
#
# @yield [sequence] each element of +sequences+, in order
# @return [Dataset, Enumerator] self when a block is given, otherwise the
#   result of +to_enum+
def each(&block)
  return to_enum unless block

  sequences.each(&block)
  self
end
#eql?(other) ⇒ Boolean
76 77 78 |
# File 'lib/wapiti/dataset.rb', line 76
# Hash-based equality check.
#
# @param other [#hash] the object to compare against
# @return [Boolean] true when both objects report the same hash value
def eql?(other)
  own_hash = hash
  other_hash = other.hash
  own_hash == other_hash
end
#hash ⇒ Object
72 73 74 |
# File 'lib/wapiti/dataset.rb', line 72
# Hash value of the dataset, delegated to its sequences.
#
# @return [Object] the value of `sequences.hash`
def hash
  sequences.hash
end
#inspect ⇒ Object
159 160 161 |
# File 'lib/wapiti/dataset.rb', line 159
# Short description showing the value of +size+.
#
# @return [String] e.g. "#<Wapiti::Dataset sequences={3}>"
def inspect
  format('#<Wapiti::Dataset sequences={%s}>', size)
end
#labels ⇒ Object
68 69 70 |
# File 'lib/wapiti/dataset.rb', line 68
# Collects the distinct labels used across all sequences.
#
# @return [Array] the unique labels of every element of every sequence,
#   sorted
def labels
  per_sequence = map do |sq|
    sq.map(&:label).uniq
  end

  per_sequence.flatten.uniq.sort
end
#sample(n = 1, **opts) ⇒ Object
89 90 91 |
# File 'lib/wapiti/dataset.rb', line 89
# Draws a sample from the sequences.
#
# @param n [Integer] passed to `sequences.sample` as the sample size
# @param opts [Hash] extra options forwarded to `sequences.sample`
# @return [Dataset] a new dataset wrapping the sampled sequences
def sample(n = 1, **opts)
  picked = sequences.sample(n, **opts)
  Dataset.new(picked)
end
#save(path, format: File.extname(path), **opts) ⇒ Object
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/wapiti/dataset.rb', line 143
# Serializes the dataset and writes it to +path+ as UTF-8.
#
# @param path [String] destination file
# @param format [String] format selector, compared case-insensitively;
#   '.txt'/'txt' writes the result of +to_s+, '.xml'/'xml' the result of
#   +to_xml+. Defaults to the extension of +path+.
# @param opts [Hash] extra options forwarded to the serializer
# @return [Object] the return value of File.write
# @raise [ArgumentError] if +format+ is not recognized, or if +path+ is
#   tainted (only on Rubies that still implement the taint API)
def save(path, format: File.extname(path), **opts)
  # Object#tainted? was removed in Ruby 3.2; calling it unconditionally
  # raises NoMethodError there, so only consult it where it still exists.
  if path.respond_to?(:tainted?) && path.tainted?
    raise ArgumentError,
      "cannot write dataset to tainted path: '#{path}'"
  end

  output =
    case format.downcase
    when '.txt', 'txt'
      to_s(**opts)
    when '.xml', 'xml'
      to_xml(**opts)
    else
      raise ArgumentError, "unknown format: '#{format}'"
    end

  File.write(path, output, encoding: 'utf-8', mode: 'w')
end
#slice(start, length = 1) ⇒ Object
93 94 95 96 97 98 99 |
# File 'lib/wapiti/dataset.rb', line 93
# Extracts part of the dataset.
#
# @param start [Range, Object] a Range selects that span and +length+ is
#   ignored; any other value is passed as the start position
# @param length [Integer] number of elements requested when +start+ is
#   not a Range
# @return [Dataset] a new dataset wrapping the result of `sequences.slice`
def slice(start, length = 1)
  selected =
    case start
    when Range
      sequences.slice(start)
    else
      sequences.slice(start, length)
    end

  Dataset.new selected
end
#to_a(**opts) ⇒ Object
125 126 127 |
# File 'lib/wapiti/dataset.rb', line 125
# Converts every sequence with its own +to_a+.
#
# @param opts [Hash] options forwarded to each sequence's +to_a+
# @return [Array] one converted entry per sequence
def to_a(**opts)
  map do |sq|
    sq.to_a(**opts)
  end
end
#to_s(separator: "\n\n", **opts) ⇒ Object
117 118 119 |
# File 'lib/wapiti/dataset.rb', line 117
# Renders every sequence with its own +to_s+ and joins the results.
#
# @param separator [String] placed between sequences (default: blank line)
# @param opts [Hash] options forwarded to each sequence's +to_s+
# @return [String] the joined representation
def to_s(separator: "\n\n", **opts)
  rendered = map do |sq|
    sq.to_s(**opts)
  end

  rendered.join(separator)
end
#to_txt(separator: "\n", **opts) ⇒ Object
121 122 123 |
# File 'lib/wapiti/dataset.rb', line 121
# Renders every sequence with +to_sentence+ and joins the results.
#
# @param separator [String] placed between sequences (default: newline)
# @param opts [Hash] options forwarded to each sequence's +to_sentence+
# @return [String] the joined representation
def to_txt(separator: "\n", **opts)
  sentences = map do |sq|
    sq.to_sentence(**opts)
  end

  sentences.join(separator)
end
#to_xml(**opts) ⇒ Object
129 130 131 132 133 134 135 136 137 |
# File 'lib/wapiti/dataset.rb', line 129
# Serializes the dataset through a Builder::XmlMarkup instance.
#
# Emits the XML instruction, then a <dataset> element into which every
# sequence writes itself via its own +to_xml+.
#
# @param opts [Hash] options forwarded to Builder::XmlMarkup.new
# @return [Object] the return value of the builder's +dataset+ call
def to_xml(**opts)
  builder = Builder::XmlMarkup.new(**opts)
  builder.instruct!
  builder.dataset { |ds| each { |seq| seq.to_xml(ds) } }
end
#to_yml(**opts) ⇒ Object
139 140 141 |
# File 'lib/wapiti/dataset.rb', line 139
# Converts every sequence with its own +to_h+.
#
# Note that this returns the converted entries themselves; no
# serialization to a YAML string happens here.
#
# @param opts [Hash] options forwarded to each sequence's +to_h+
# @return [Array] one converted entry per sequence
def to_yml(**opts)
  map do |sq|
    sq.to_h(**opts)
  end
end