Class: Docs::Scraper

Inherits:
Doc
  • Object
show all
Includes:
Instrumentable
Defined in:
lib/docs/core/scraper.rb

Direct Known Subclasses

FileScraper, UrlScraper

Constant Summary

Constants inherited from Doc

Doc::DB_FILENAME, Doc::INDEX_FILENAME

Class Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Instrumentable

extended, included

Methods inherited from Doc

as_json, db_path, index_path, #initialize, path, store_page, store_pages

Constructor Details

This class inherits a constructor from Docs::Doc

Class Attribute Details

.base_url ⇒ Object

Returns the value of attribute base_url


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @base_url instance variable
# unchanged.
def base_url
  @base_url
end

.html_filters ⇒ Object

Returns the value of attribute html_filters


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @html_filters instance variable
# unchanged.
def html_filters
  @html_filters
end

.initial_paths ⇒ Object

Returns the value of attribute initial_paths


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @initial_paths instance variable
# unchanged.
def initial_paths
  @initial_paths
end

.options ⇒ Object

Returns the value of attribute options


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @options instance variable
# unchanged.
def options
  @options
end

.root_path ⇒ Object

Returns the value of attribute root_path


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @root_path instance variable
# unchanged.
def root_path
  @root_path
end

.text_filters ⇒ Object

Returns the value of attribute text_filters


6
7
8
# File 'lib/docs/core/scraper.rb', line 6

# Class-level attribute reader: returns the @text_filters instance variable
# unchanged.
def text_filters
  @text_filters
end

Class Method Details

.filters ⇒ Object


23
24
25
# File 'lib/docs/core/scraper.rb', line 23

# Returns one new array holding the HTML filters followed by the text
# filters. Each stack is converted with #to_a, so a nil stack contributes
# no entries.
def filters
  [html_filters, text_filters].flat_map(&:to_a)
end

.inherited(subclass) ⇒ Object


8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/docs/core/scraper.rb', line 8

# Class-inheritance hook: sets up filter autoloading on the new subclass and
# hands it its own copies of this class's scraper configuration.
#
# @param subclass [Class] the class that has just inherited from this one
def inherited(subclass)
  super

  # class_eval runs the block with the subclass as self, so `to_s` below is
  # the subclass's own name.
  subclass.class_eval do
    extend AutoloadHelper
    # presumably registers autoloads for filters under
    # docs/filters/<underscored subclass name> — confirm against AutoloadHelper
    autoload_all "docs/filters/#{to_s.demodulize.underscore}", 'filter'
  end

  # root_path is assigned as-is; the remaining attributes are copied
  # (dup / deep_dup / inheritable_copy) before being assigned.
  # NOTE(review): base_url is not propagated to the subclass here — confirm
  # that this is intended.
  subclass.root_path = root_path
  subclass.initial_paths = initial_paths.dup
  subclass.options = options.deep_dup
  subclass.html_filters = html_filters.inheritable_copy
  subclass.text_filters = text_filters.inheritable_copy
end

Instance Method Details

#base_url ⇒ Object


60
61
62
# File 'lib/docs/core/scraper.rb', line 60

# Returns the class-level base_url run through URL.parse. The parsed value
# is cached in @base_url, so parsing happens on the first call only.
def base_url
  return @base_url if @base_url

  @base_url = URL.parse(self.class.base_url)
end

#build_page(path) {|result| ... } ⇒ Object

Yields:

  • (result)

39
40
41
42
43
44
# File 'lib/docs/core/scraper.rb', line 39

# Requests the URL for +path+, passes the response through handle_response,
# and returns that result.
#
# @param path [Object] passed to url_for to obtain the URL to request
# @yield [result] the handle_response result, when a block is given
# @return [Object] the handle_response result
def build_page(path)
  handle_response(request_one(url_for(path))).tap do |page|
    yield page if block_given?
  end
end

#build_pages ⇒ Object


46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/docs/core/scraper.rb', line 46

# Requests every initial URL via request_all and yields each successfully
# handled response. The value returned from the request_all block is the
# list of internal URLs not seen before (compared case-insensitively);
# how request_all uses that list is not shown in this excerpt.
#
# @yield [page] each truthy handle_response result
# @return [Object] whatever request_all returns
def build_pages
  # Lower-cased URLs already handed to request_all.
  seen = Set.new(initial_urls.map(&:downcase))
  instrument 'running.scraper', urls: initial_urls

  request_all(initial_urls) do |response|
    page = handle_response(response)
    next unless page

    yield page

    links = page[:internal_urls]
    next unless links.present?

    # Set#add? returns nil for an existing member, so only new URLs survive.
    fresh = links.select { |link| seen.add?(link.downcase) }
    instrument 'queued.scraper', urls: fresh
    fresh
  end
end

#initial_paths ⇒ Object


76
77
78
# File 'lib/docs/core/scraper.rb', line 76

# Instance-level shortcut: returns the initial_paths configured on this
# object's class.
def initial_paths
  self.class.initial_paths
end

#initial_urls ⇒ Object


80
81
82
# File 'lib/docs/core/scraper.rb', line 80

# Returns the frozen list of URLs to start from: the root URL (as a string)
# followed by url_for(path) for every initial path. The list is built on the
# first call and cached in @initial_urls.
def initial_urls
  @initial_urls ||= begin
    urls = [root_url.to_s]
    initial_paths.each { |path| urls << url_for(path) }
    urls.freeze
  end
end

#options ⇒ Object


90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/docs/core/scraper.rb', line 90

# Returns the frozen, per-instance options hash, built on first use and
# cached in @options.
#
# It starts as a deep copy of the class-level options and is merged with
# :base_url, :root_url, :root_path and :initial_paths. When root_path? is
# true, '' and '/' are appended to :skip. When :only or :only_patterns is
# set, :only also receives the initial paths plus the root path (or '' and
# '/' when root_path? is false).
def options
  return @options if @options

  opts = self.class.options.deep_dup
  opts.merge!(base_url: base_url, root_url: root_url,
              root_path: root_path, initial_paths: initial_paths)

  (opts[:skip] ||= []).concat(['', '/']) if root_path?

  if opts[:only] || opts[:only_patterns]
    entry_paths = root_path? ? [root_path] : ['', '/']
    (opts[:only] ||= []).concat(initial_paths + entry_paths)
  end

  @options = opts.freeze
end

#pipeline ⇒ Object


84
85
86
87
88
# File 'lib/docs/core/scraper.rb', line 84

# Returns the ::HTML::Pipeline built from the class-level filters, with Docs
# assigned as its instrumentation service. The pipeline is created on the
# first call and cached in @pipeline.
def pipeline
  return @pipeline if @pipeline

  built = ::HTML::Pipeline.new(self.class.filters)
  built.instrumentation_service = Docs
  @pipeline = built
end

#root_path ⇒ Object


68
69
70
# File 'lib/docs/core/scraper.rb', line 68

# Instance-level shortcut: returns the root_path configured on this object's
# class.
def root_path
  self.class.root_path
end

#root_path? ⇒ Boolean

Returns:

  • (Boolean)

72
73
74
# File 'lib/docs/core/scraper.rb', line 72

# Reports whether a root path is configured: true only when root_path is
# present and is something other than '/'.
#
# @return [Boolean]
def root_path?
  return false unless root_path.present?

  root_path != '/'
end

#root_url ⇒ Object


64
65
66
# File 'lib/docs/core/scraper.rb', line 64

# Returns the URL to start from, cached in @root_url after the first call.
# When root_path? is true this is base_url joined with root_path and run
# through URL.parse; otherwise it is base_url.normalize.
def root_url
  @root_url ||=
    if root_path?
      URL.parse(File.join(base_url.to_s, root_path))
    else
      base_url.normalize
    end
end