Class: Datasets::Wikipedia

Inherits:
Dataset
  • Object
show all
Defined in:
lib/datasets/wikipedia.rb

Defined Under Namespace

Classes: ArticlesListener, Contributor, Page, Revision

Instance Attribute Summary

Attributes inherited from Dataset

#metadata

Instance Method Summary collapse

Methods inherited from Dataset

#clear_cache!, #to_table

Constructor Details

#initialize(language: :en, type: :articles) ⇒ Wikipedia

Returns a new instance of Wikipedia.


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/datasets/wikipedia.rb', line 28

# Sets up the dataset and fills in its metadata from the given options.
#
# Both options are stored on the instance and interpolated verbatim into
# the metadata id, name and description.
#
# @param language [Symbol] language code of the Wikipedia edition
#   (defaults to +:en+). NOTE(review): the set of accepted values is not
#   visible here -- confirm against the code that builds the download URL.
# @param type [Symbol] kind of dump to read (defaults to +:articles+).
def initialize(language: :en, type: :articles)
  super()
  @language, @type = language, type

  meta = @metadata
  meta.id = "wikipedia-#{@language}-#{@type}"
  meta.name = "Wikipedia #{@type} (#{@language})"
  meta.url = "https://dumps.wikimedia.org/"
  meta.licenses = ["CC-BY-SA-3.0", "CC-BY-SA-4.0", "GFDL-1.3-or-later"]
  meta.description = "Wikipedia #{@type} in #{@language}"
end

Instance Method Details

#each(&block) ⇒ Object


44
45
46
47
48
49
50
51
52
# File 'lib/datasets/wikipedia.rb', line 44

# Iterates over the dataset by stream-parsing the input that +open_data+
# yields.
#
# The given block is wrapped in an ArticlesListener, which is handed to a
# REXML stream parser as its event listener.
# NOTE(review): what exactly is yielded to the block is decided by
# ArticlesListener (presumably one record per page) -- confirm there.
#
# @return [Enumerator] when called without a block; otherwise whatever
#   +open_data+ returns.
def each(&block)
  if block
    open_data do |stream|
      # Build the listener before the parser, as the parser takes it as
      # a constructor argument.
      handler = ArticlesListener.new(block)
      REXML::Parsers::StreamParser.new(stream, handler).parse
    end
  else
    to_enum(__method__)
  end
end