Class: Datasets::PMJTDatasetList

Inherits:
Dataset
  • Object
show all
Defined in:
lib/datasets/pmjt-dataset-list.rb

Defined Under Namespace

Classes: Record

Instance Attribute Summary

Attributes inherited from Dataset

#metadata

Instance Method Summary collapse

Methods inherited from Dataset

#clear_cache!, #to_table

Constructor Details

#initialize ⇒ PMJTDatasetList

Returns a new instance of PMJTDatasetList.



19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/datasets/pmjt-dataset-list.rb', line 19

# Sets up the dataset's metadata (id, name, URL, licenses, description)
# and records where the CSV file for this dataset is kept.
#
# The data path is cache_dir_path joined with "<metadata id>.csv".
def initialize
  super()

  @metadata.tap do |meta|
    # The id is assigned first; it is read back below to name the CSV file.
    meta.id = "pmjt-dataset-list"
    meta.name = "List of pre-modern Japanese text dataset"
    meta.url = "http://codh.rois.ac.jp/pmjt/"
    meta.licenses = ["CC-BY-SA-4.0"]
    meta.description = <<~TEXT
      Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
      In addition, some text has description, transcription, and tagging data.
    TEXT
  end

  @data_path = cache_dir_path + "#{@metadata.id}.csv"
end

Instance Method Details

#each(&block) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/datasets/pmjt-dataset-list.rb', line 33

# Walks the PMJT dataset list, yielding one record per CSV row.
#
# The list CSV for the pinned version is passed to +download+ together
# with @data_path, then read from @data_path with the first row treated
# as headers and the text transcoded from Windows-31J to UTF-8. Each row
# is converted with +create_record+ before being yielded.
#
# @yield [record] the object built by create_record for each row
# @return [Enumerator] when called without a block
def each(&block)
  unless block_given?
    return to_enum(__method__)
  end

  # Version of the published list this dataset is pinned to.
  version = "201901"
  csv_url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{version}.csv"
  download(@data_path, csv_url)

  # "external:internal" encoding pair: file is Windows-31J, rows are UTF-8.
  csv_options = {headers: :first_row, encoding: "Windows-31J:UTF-8"}
  CSV.open(@data_path, **csv_options) do |rows|
    rows.each { |row| yield create_record(row) }
  end
end