Class: Datasets::AozoraBunko

Inherits:
Dataset
  • Object
show all
Defined in:
lib/datasets/aozora-bunko.rb

Overview

Dataset for AozoraBunko

Defined Under Namespace

Classes: Book

Instance Attribute Summary

Attributes inherited from Dataset

#metadata

Instance Method Summary collapse

Methods inherited from Dataset

#clear_cache!, #to_table

Constructor Details

#initialize ⇒ AozoraBunko

Returns a new instance of AozoraBunko.



149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/datasets/aozora-bunko.rb', line 149

# Creates the dataset and fills in its descriptive metadata.
#
# Runs the parent constructor first, then assigns the id, name, url,
# licenses and description fields on @metadata.
# NOTE(review): @metadata is presumably created by the parent constructor;
# it is not assigned anywhere in this method.
def initialize
  super()

  # Built up front so the metadata assignments below read as one group.
  summary = <<~DESCRIPTION
    Aozora Bunko is an activity to collect free electronic books that anyone can access
    on the Internet like a library. The copyrighted works and the works that are said to be
    "free to read" are available after being digitized in text and XHTML (some HTML) formats.
  DESCRIPTION

  @metadata.tap do |meta|
    meta.id = 'aozora-bunko'
    meta.name = 'Aozora Bunko'
    meta.url = 'https://www.aozora.gr.jp/'
    meta.licenses = 'CC-BY-2.1-JP'
    meta.description = summary
  end
end

Instance Method Details

#each ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# File 'lib/datasets/aozora-bunko.rb', line 163

# Walks the CSV supplied by open_data and yields one Book per row.
#
# When called without a block, returns an enumerator over this same method.
#
# For every row, the two copyright-flag columns are passed through
# normalize_boolean, the row's fields are splatted positionally into
# Book.new, and the book is given this dataset's cache_path before
# being yielded.
def each
  unless block_given?
    return enum_for(__method__)
  end

  flag_columns = %w[作品著作権フラグ 人物著作権フラグ]

  open_data do |stream|
    # The file has a Byte Order Mark; the raw bytes are tagged as UTF-8
    # before being handed to the CSV parser.
    csv_text = stream.read
    csv_text.force_encoding(Encoding::UTF_8)

    CSV.parse(csv_text, headers: true) do |record|
      flag_columns.each do |column|
        record[column] = normalize_boolean(record[column])
      end

      # Field order in the CSV must match Book's positional members.
      yield(Book.new(*record.fields).tap { |entry| entry.cache_path = cache_path })
    end
  end
end