Class: DaimonSkycrawlers::Crawler::Base

Inherits:
Object
  • Object
show all
Includes:
DaimonSkycrawlers::ConfigMixin, LoggerMixin
Defined in:
lib/daimon_skycrawlers/crawler/base.rb

Overview

The base class for crawlers.

Direct Known Subclasses

Default

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from LoggerMixin

included

Constructor Details

#initialize(base_url = nil, faraday_options: {}, options: {}) ⇒ Base

Returns a new instance of Base.

Parameters:

  • base_url (String) (defaults to: nil)

    Base URL for crawler

  • faraday_options (Hash) (defaults to: {})

    options for Faraday

  • options (Hash) (defaults to: {})

    additional options stored on the crawler



34
35
36
37
38
39
40
41
42
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 34

# Build a crawler with its counters reset and a no-op prepare hook.
#
# @param base_url [String, nil] base URL later handed to Faraday.new
# @param faraday_options [Hash] options later handed to Faraday.new
# @param options [Hash] stored as-is in @options
def initialize(base_url = nil, faraday_options: {}, options: {})
  super()
  # Bookkeeping state first: nothing processed, nothing skipped.
  @n_processed_urls = 0
  @skipped = false
  # Default hook does nothing; #prepare replaces it.
  @prepare = lambda { |_connection| }
  @base_url = base_url
  @faraday_options = faraday_options
  @options = options
end

Instance Attribute Details

#n_processed_urls ⇒ Object (readonly)

Returns the value of attribute n_processed_urls.



28
29
30
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 28

# Reader for the counter that the constructor sets to 0 and #process
# increments by one on every call. The increment happens before filters
# run, so skipped URLs are counted as well.
#
# @return [Integer] number of #process calls made so far
def n_processed_urls
  @n_processed_urls
end

#storage ⇒ Object

Retrieve storage instance



70
71
72
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 70

# Retrieve the storage instance, creating a Storage::RDB on first use
# when none has been assigned yet.
#
# @return [Object] the memoized storage instance
def storage
  return @storage if @storage

  @storage = Storage::RDB.new
end

Instance Method Details

#connection ⇒ Object



78
79
80
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 78

# Return the connection, building it from @base_url and
# @faraday_options on first use. A connection created earlier by
# #setup_connection is returned unchanged.
#
# @return [Object] whatever Faraday.new returned (memoized)
def connection
  return @connection if @connection

  @connection = Faraday.new(@base_url, @faraday_options)
end

#fetch(path, message = {}) ⇒ Object

Raises:

  • (NotImplementedError)


98
99
100
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 98

# Template method: the base class has no fetch behavior of its own.
# #process calls this with the resolved URL, the remaining message and
# the caller's block.
#
# @param path [Object] URL or path to fetch (unused here)
# @param message [Hash] extra message data (unused here)
# @raise [NotImplementedError] always
def fetch(path, message = {})
  raise(NotImplementedError, "Must implement this method in subclass")
end

#get(path, params = {}) ⇒ Object



102
103
104
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 102

# Send a GET request through the crawler's connection.
#
# Goes through the memoized #connection accessor rather than reading
# @connection directly, so it also works when neither #connection nor
# #setup_connection has been called yet.
#
# @param path [String] URL or path to request
# @param params [Hash] request parameters forwarded to the connection
# @return [Object] whatever the connection's #get returns
def get(path, params = {})
  connection.get(path, params)
end

#post(path, params = {}) ⇒ Object



106
107
108
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 106

# Send a POST request through the crawler's connection.
#
# Goes through the memoized #connection accessor rather than reading
# @connection directly, so it also works when neither #connection nor
# #setup_connection has been called yet.
#
# @param path [String] URL or path to request
# @param params [Hash] request parameters forwarded to the connection
# @return [Object] whatever the connection's #post returns
def post(path, params = {})
  connection.post(path, params)
end

#prepare(&block) ⇒ Object

Call this method before DaimonSkycrawlers.register_crawler. For example, you can log in before fetching a URL.



63
64
65
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 63

# Register a hook that #process calls with the connection right before
# fetching, e.g. to log in first.
#
# When no block is given, the hook is reset to a no-op (the same default
# the constructor installs) instead of storing nil, which would make the
# next #process call fail on @prepare.call.
#
# @yieldparam connection [Object] the crawler's connection
# @return [Proc] the hook now in effect
def prepare(&block)
  @prepare = block || ->(_connection) {}
end

#process(message, &block) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 82

# Process one message: resolve its URL against the connection, run the
# filters and, unless the URL was skipped, run the prepare hook and
# fetch it.
#
# The message is shallow-copied before :url is removed, so the caller's
# hash is left untouched; #fetch still receives the message without :url.
#
# @param message [Hash] must contain :url (absolute URL or path); the
#   remaining entries are forwarded to #fetch
# @param block [Proc] forwarded to #fetch
# @return [Object, nil] the result of #fetch, or nil when skipped
def process(message, &block)
  message = message.dup
  url = message.delete(:url)

  @skipped = false
  @n_processed_urls += 1
  # url can be a path
  url = connection.url_prefix + url

  apply_filters(url)

  return if skipped?

  @prepare.call(connection)
  fetch(url, message, &block)
end

#setup_connection(options = {}) {|faraday| ... } ⇒ Object

Set up connection

Parameters:

  • options (Hash) (defaults to: {})

    for Faraday

Yields:

  • (faraday)

Yield Parameters:

  • faraday (Faraday)


51
52
53
54
55
56
57
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 51

# Set up the connection, replacing any existing one.
#
# The given options are merged over @faraday_options; an empty result
# is passed to Faraday.new as nil. The caller's block is always invoked
# with the object Faraday.new yields.
#
# @param options [Hash] for Faraday
# @yieldparam faraday [Faraday] object yielded by Faraday.new
# @return [Object] the new connection, also stored in @connection
def setup_connection(options = {})
  faraday_options = @faraday_options.merge(options)
  faraday_options = nil if faraday_options.empty?
  @connection = Faraday.new(@base_url, faraday_options) { |faraday| yield faraday }
end

#skipped? ⇒ Boolean

Returns:

  • (Boolean)


74
75
76
# File 'lib/daimon_skycrawlers/crawler/base.rb', line 74

# Reader for the skip flag. The constructor sets the flag to false and
# #process resets it to false at the start of every call.
# NOTE(review): nothing shown here sets it to true — presumably the
# filters run by apply_filters do; verify.
#
# @return [Boolean] current value of @skipped
def skipped?
  @skipped
end