Class: Datasets::PennTreebank

Inherits:
Dataset
  • Object
show all
Defined in:
lib/datasets/penn-treebank.rb

Defined Under Namespace

Classes: Record

Constant Summary collapse

# Human-readable summary of the dataset; the constructor stores it in
# @metadata.description. The backtick/underscore link markup looks like
# reStructuredText and is kept verbatim.
DESCRIPTION =
<<~DESC
  `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
  corpus of English sentences with linguistic structure annotations. This
  function uses a variant distributed at
  `https://github.com/wojzaremba/lstm <https://github.com/wojzaremba/lstm>`_,
  which omits the annotation and splits the dataset into three parts:
  training, validation, and test.
DESC

Instance Attribute Summary

Attributes inherited from Dataset

#metadata

Instance Method Summary collapse

Methods inherited from Dataset

#clear_cache!, #to_table

Constructor Details

#initialize(type: :train) ⇒ PennTreebank

Returns a new instance of PennTreebank.


16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/datasets/penn-treebank.rb', line 16

# Creates a Penn Treebank dataset bound to one split of the corpus.
#
# @param type [Symbol] the split to read: :train, :test or :valid.
# @raise [ArgumentError] when type is not one of the accepted symbols
#   (the check uses Array#include?, so the string "train" is rejected).
def initialize(type: :train)
  accepted = %i[train test valid]
  unless accepted.include?(type)
    listing = accepted.map(&:inspect).join(", ")
    raise ArgumentError, "Type must be one of [#{listing}]: #{type.inspect}"
  end

  # Remember the split before handing control to the parent constructor.
  @type = type
  super()

  # @metadata is presumably prepared by Dataset#initialize -- confirm there.
  @metadata.id = "penn-treebank-#{@type}"
  @metadata.name = "Penn Treebank: #{@type}"
  @metadata.description = DESCRIPTION
  @metadata.url = "https://github.com/wojzaremba/lstm"
  @metadata.licenses = ["Apache-2.0"]
end

Instance Method Details

#each(&block) ⇒ Object


34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/datasets/penn-treebank.rb', line 34

# Iterates over the selected split, fetching the raw text file on first use.
#
# Without a block, returns an Enumerator over this same method. With a
# block, the file "ptb.<type>.txt" is looked up under cache_dir_path,
# downloaded from the wojzaremba/lstm GitHub repository when it is not
# there yet, and then handed to parse_data together with the block.
#
# NOTE(review): cache_dir_path, download and parse_data are defined outside
# this listing; what parse_data yields is not visible here.
def each(&block)
  return enum_for(:each) if block.nil?

  file_name = "ptb.#{@type}.txt"
  local_path = cache_dir_path + file_name

  # Only hit the network when the file is not already present.
  unless local_path.exist?
    remote_root = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
    download(local_path, [remote_root, file_name].join("/"))
  end

  parse_data(local_path, &block)
end