Class: Kimurai::Base

Inherits:
Object
  • Object
show all
Includes:
BaseHelper
Defined in:
lib/kimurai/base.rb,
lib/kimurai/base/saver.rb,
lib/kimurai/base/storage.rb

Direct Known Subclasses

ApplicationSpider

Defined Under Namespace

Classes: Saver, Storage

Constant Summary collapse

DMERGE_EXCLUDE =

Config keys that are not deep-merged: the config's `headers` hash option is excluded from deep merging.

[:headers]
# Formatter proc taking (severity, datetime, progname, msg) and returning one log line:
#
#   "<S>, [<datetime>#<pid>] [<M|C>: <thread object_id>] <severity> -- <progname>: <msg>\n"
#
# <S> is the first character of the severity; "M" marks the main thread and "C"
# any other thread. The line is passed through Rbcat.colorize only when
# Kimurai.configuration.colorize_logger is not `false` and Kimurai.env is "development".
LoggerFormatter =
proc do |severity, datetime, progname, msg|
  thread_marker = Thread.current == Thread.main ? "M" : "C"
  line = format(
    "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n",
    severity[0..0], datetime, Process.pid, thread_marker, Thread.current.object_id, severity, progname, msg
  )

  colorize = Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
  colorize ? Rbcat.colorize(line, predefined: [:jsonhash, :logger]) : line
end

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(engine = self.class.engine, config: {}) ⇒ Base

Returns a new instance of Base.



173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/kimurai/base.rb', line 173

# Builds a spider instance.
#
# @param engine [Object] engine later handed to BrowserBuilder.build by #browser;
#   defaults to the class-level engine
# @param config [Hash] per-instance options merged over the class-level config via
#   deep_merge_excl (keys listed in DMERGE_EXCLUDE are excluded from deep merging)
# @raise [RuntimeError] when a configured pipeline name matches no Pipeline descendant
def initialize(engine = self.class.engine, config: {})
  @engine = engine
  @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)

  # One pipeline instance per configured name, each holding a back-reference to this spider.
  @pipelines = self.class.pipelines.map do |pipeline_name|
    klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
    # Fail with an explicit message instead of a NoMethodError raised on nil.
    raise "Can't find pipeline `#{pipeline_name}` among Pipeline descendants" unless klass

    instance = klass.new
    instance.spider = self
    [pipeline_name, instance]
  end.to_h

  @logger = self.class.logger
  @savers = {}
end

Class Attribute Details

.run_info ⇒ Object (readonly)

Returns the value of attribute run_info.



27
28
29
# File 'lib/kimurai/base.rb', line 27

# Reader for the class-level @run_info.
# crawl! assigns a statistics hash to it when a run starts and resets it to nil
# once the run has finished, so this returns nil outside of a run.
def run_info
  @run_info
end

.savers ⇒ Object (readonly)

Returns the value of attribute savers.



27
28
29
# File 'lib/kimurai/base.rb', line 27

# Reader for the class-level @savers.
# crawl! sets it to an empty hash when a run starts and back to nil when the run
# has finished; #save_to stores Saver instances in it keyed by path during a full run.
def savers
  @savers
end

.storage ⇒ Object (readonly)

Returns the value of attribute storage.



27
28
29
# File 'lib/kimurai/base.rb', line 27

# Reader for the class-level @storage.
# crawl! assigns a new Storage when a run starts and resets it to nil when the
# run has finished; spider instances created by crawl! use it as their storage.
def storage
  @storage
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



170
171
172
# File 'lib/kimurai/base.rb', line 170

# Reader for the instance's @logger, which #initialize takes from self.class.logger.
def logger
  @logger
end

#with_info ⇒ Object

Returns the value of attribute with_info.



171
172
173
# File 'lib/kimurai/base.rb', line 171

# Returns @with_info, the flag marking an instance that takes part in a full run.
# crawl! sets it to true on the spider it creates; #add_event, #save_to and
# #storage check it to decide whether class-level run state may be used.
def with_info
  @with_info
end

Class Method Details

.add_event(scope, event) ⇒ Object



55
56
57
58
# File 'lib/kimurai/base.rb', line 55

# Increments the counter for `event` inside the `scope` group of the current
# run's events statistics. The increment is performed while holding @update_mutex.
#
# @param scope [Symbol] key inside @run_info[:events]
# @param event [Object] counter key inside that scope
# @return [Integer, nil] the new counter value, or nil when no run info is set
def self.add_event(scope, event)
  if @run_info
    @update_mutex.synchronize do
      scope_counters = @run_info[:events][scope]
      scope_counters[event] += 1
    end
  end
end

.completed? ⇒ Boolean

Returns:

  • (Boolean)


34
35
36
# File 'lib/kimurai/base.rb', line 34

# Tells whether the current run info has the :completed status.
#
# @return [Boolean, nil] nil when no run info is set
def self.completed?
  info = @run_info
  info && info[:status] == :completed
end

.config ⇒ Object



82
83
84
85
86
87
88
# File 'lib/kimurai/base.rb', line 82

# Effective class-level configuration.
#
# A class whose superclass is Object returns its own @config unchanged. Any other
# class merges its own @config (or an empty hash when unset) over the superclass
# config using deep_merge_excl with DMERGE_EXCLUDE.
def self.config
  return @config if superclass.equal?(::Object)

  own_options = @config || {}
  superclass.config.deep_merge_excl(own_options, DMERGE_EXCLUDE)
end

.crawl!(exception_on_fail: true) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/kimurai/base.rb', line 100

# Performs a full run of the spider and collects run statistics in @run_info.
#
# Flow: refuses to start when the spider is already running; initializes the shared
# Storage, savers hash, mutex and run info; calls `open_spider` if defined; creates a
# spider instance with `with_info` set; requests every start URL with :parse as the
# handler (or calls `parse` directly when there are no start URLs). Afterwards, if the
# spider instance was created, it destroys the browser driver (when one was built),
# records stop/running time, calls `close_spider` if defined, logs the summary and
# resets the class-level run state to nil.
#
# @param exception_on_fail [Boolean] re-raise the error that failed the run (default),
#   or return it together with the run info
# @return [false] when the spider is already running
# @return [Hash] the run info hash when the run completed
# @return [Array] `[run_info, error]` when the run failed and exception_on_fail is false
def self.crawl!(exception_on_fail: true)
  # Return explicitly: the outcome must not depend on what the logger call returns.
  if running?
    logger.error "Spider: already running: #{name}"
    return false
  end

  @storage = Storage.new
  @savers = {}
  @update_mutex = Mutex.new

  @run_info = {
    spider_name: name, status: :running, error: nil, environment: Kimurai.env,
    start_time: Time.now, stop_time: nil, running_time: nil,
    visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
    events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
  }

  ###

  logger.info "Spider: started: #{name}"
  open_spider if self.respond_to? :open_spider

  spider = self.new
  spider.with_info = true
  if start_urls
    start_urls.each do |start_url|
      if start_url.is_a?(Hash)
        # A Hash entry carries request_to's keyword arguments (url:, data:, ...);
        # splat it so it is not taken as the positional `delay` argument.
        spider.request_to(:parse, **start_url)
      else
        spider.request_to(:parse, url: start_url)
      end
    end
  else
    spider.parse
  end
rescue StandardError, SignalException, SystemExit => e
  @run_info.merge!(status: :failed, error: e.inspect)
  exception_on_fail ? raise(e) : [@run_info, e]
else
  @run_info.merge!(status: :completed)
ensure
  # Cleanup and reporting only happen when the spider instance was created.
  if spider
    spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

    stop_time  = Time.now
    total_time = (stop_time - @run_info[:start_time]).round(3)
    @run_info.merge!(stop_time: stop_time, running_time: total_time)

    close_spider if self.respond_to? :close_spider

    message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
    failed? ? logger.fatal(message) : logger.info(message)

    # The hash already returned to the caller stays intact; only the class state is cleared.
    @run_info = @storage = @savers = @update_mutex = nil
  end
end

.engine ⇒ Object



70
71
72
# File 'lib/kimurai/base.rb', line 70

# Engine configured for this class. When the class has none of its own, the
# superclass's engine is looked up and cached in @engine.
def self.engine
  return @engine if @engine

  @engine = superclass.engine
end

.failed? ⇒ Boolean

Returns:

  • (Boolean)


38
39
40
# File 'lib/kimurai/base.rb', line 38

# Tells whether the current run info has the :failed status.
#
# @return [Boolean, nil] nil when no run info is set
def self.failed?
  info = @run_info
  info && info[:status] == :failed
end

.items ⇒ Object



46
47
48
# File 'lib/kimurai/base.rb', line 46

# Items counters of the current run (the :items entry of the run info).
#
# @return [Hash, nil] nil when no run info is set
def self.items
  info = @run_info
  info && info[:items]
end

.logger ⇒ Object



92
93
94
95
96
97
98
# File 'lib/kimurai/base.rb', line 92

# Class-level logger, memoized in @logger.
#
# Uses Kimurai.configuration.logger when one is configured. Otherwise builds a
# Logger writing to STDOUT with LoggerFormatter and the class name as progname.
# The level name comes from ENV["LOG_LEVEL"], then Kimurai.configuration.log_level,
# then "DEBUG"; it is upcased and resolved as a constant under Logger.
def self.logger
  return @logger if @logger

  @logger = Kimurai.configuration.logger || begin
    level_name = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
    level = "Logger::#{level_name}".constantize
    Logger.new(STDOUT, formatter: LoggerFormatter, level: level, progname: name)
  end
end

.name ⇒ Object



66
67
68
# File 'lib/kimurai/base.rb', line 66

# Returns the class-level @name (where it is assigned is not shown in this listing).
# crawl! uses it in log messages and as :spider_name in the run info, and
# self.logger passes it to the Logger as progname.
def self.name
  @name
end

.parse!(handler, *args, **request) ⇒ Object



154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/kimurai/base.rb', line 154

# Runs a single handler on a fresh spider instance, outside of a full `.crawl!` run.
#
# - positional `args` given: calls the handler with them;
# - otherwise, keyword `request` given: forwards it to `request_to` as keyword
#   arguments (url:, data:, ...);
# - otherwise: calls the handler without arguments.
#
# The browser driver is destroyed afterwards if the spider built one.
#
# @param handler [Symbol] name of the spider method to run
# @return [Object] whatever the handler / request_to call returns
def self.parse!(handler, *args, **request)
  spider = self.new

  if args.present?
    spider.public_send(handler, *args)
  elsif request.present?
    # Splat so the options reach request_to as keywords, not as its positional `delay`.
    spider.request_to(handler, **request)
  else
    spider.public_send(handler)
  end
ensure
  # `spider` is nil when `self.new` raised; don't mask that error with a NoMethodError.
  spider.browser.destroy_driver! if spider&.instance_variable_get("@browser")
end

.pipelines ⇒ Object



74
75
76
# File 'lib/kimurai/base.rb', line 74

# Pipeline names configured for this class. When the class has none of its own,
# the superclass's pipelines are looked up and cached in @pipelines.
def self.pipelines
  return @pipelines if @pipelines

  @pipelines = superclass.pipelines
end

.running? ⇒ Boolean

Returns:

  • (Boolean)


30
31
32
# File 'lib/kimurai/base.rb', line 30

# Tells whether the current run info has the :running status.
#
# @return [Boolean, nil] nil when no run info is set
def self.running?
  info = @run_info
  info && info[:status] == :running
end

.start_urls ⇒ Object



78
79
80
# File 'lib/kimurai/base.rb', line 78

# Returns the class-level @start_urls (where it is assigned is not shown in this listing).
# crawl! requests each entry with :parse as the handler; an entry may be a Hash of
# request options or a plain URL value. When this is nil/false, crawl! calls
# `parse` directly instead.
def self.start_urls
  @start_urls
end

.update(type, subtype) ⇒ Object



50
51
52
53
# File 'lib/kimurai/base.rb', line 50

# Increments the `subtype` counter inside the `type` group of the current run info
# (for example the :requests counter of :visits). The increment is performed while
# holding @update_mutex.
#
# @param type [Symbol] key inside @run_info
# @param subtype [Symbol] counter key inside that group
# @return [Integer, nil] the new counter value, or nil when no run info is set
def self.update(type, subtype)
  if @run_info
    @update_mutex.synchronize do
      counters = @run_info[type]
      counters[subtype] += 1
    end
  end
end

.visits ⇒ Object



42
43
44
# File 'lib/kimurai/base.rb', line 42

# Visits counters of the current run (the :visits entry of the run info).
#
# @return [Hash, nil] nil when no run info is set
def self.visits
  info = @run_info
  info && info[:visits]
end

Instance Method Details

#add_event(scope = :custom, event) ⇒ Object



234
235
236
237
238
239
240
241
# File 'lib/kimurai/base.rb', line 234

# Records an event in the statistics of the current full run.
#
# With a single argument that argument is the event and the scope is :custom.
# The event is counted through the class-level add_event; events in the :custom
# scope are additionally written to the log.
#
# @param scope [Symbol] events group, :custom by default
# @param event [Object] event to count
# @raise [RuntimeError] when the instance is not part of a full run (`with_info` unset)
def add_event(scope = :custom, event)
  raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)" unless self.with_info

  self.class.add_event(scope, event)
  return unless scope == :custom

  logger.info "Spider: new event (scope: #{scope}): #{event}"
end

#browser ⇒ Object



187
188
189
# File 'lib/kimurai/base.rb', line 187

# Browser for this spider, built on first use by BrowserBuilder.build from the
# instance's engine and config, then cached in @browser.
def browser
  return @browser if @browser

  @browser = BrowserBuilder.build(@engine, @config, spider: self)
end

#console(response = nil, url: nil, data: {}) ⇒ Object



203
204
205
# File 'lib/kimurai/base.rb', line 203

# Opens a debugging session at this point via `binding.pry`.
# The body does not use the parameters itself; since they are locals of the
# binding, they are presumably meant to be inspected from the session, which is
# why the signature mirrors the handler arguments passed by #request_to.
#
# @param response [Object, nil]
# @param url [Object, nil]
# @param data [Hash]
def console(response = nil, url: nil, data: {})
  binding.pry
end

#request_to(handler, delay = nil, url:, data: {}, response_type: :html) ⇒ Object



191
192
193
194
195
196
197
198
199
200
201
# File 'lib/kimurai/base.rb', line 191

# Visits `url` with the browser and passes the response to the `handler` method.
#
# When the :skip_duplicate_requests config option is set and the URL is not unique,
# the request is skipped: the event is recorded (full runs only), a warning is logged
# and nil is returned. nil is also returned when the browser's visit returns a falsy
# value.
#
# @param handler [Symbol] public method called with the response and `url:` / `data:`
# @param delay [Object, nil] forwarded to the browser as `delay:` when given
# @param url [Object] URL to visit
# @param data [Hash] extra data forwarded to the handler
# @param response_type [Symbol] forwarded to browser.current_response
# @return [Object, nil] the handler's return value, or nil when nothing was processed
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  if @config[:skip_duplicate_requests] && !unique_request?(url)
    add_event(:duplicate_requests) if self.with_info
    logger.warn "Spider: request_to: not unique url: #{url}, skipped"
    # Skip regardless of what the logger call returned.
    return
  end

  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
  return unless visited

  # Pass keywords rather than a braces hash: this works for handlers declaring
  # `url:` / `data:` keywords as well as for handlers taking a trailing options hash.
  public_send(handler, browser.current_response(response_type), url: url, data: data)
end

#save_to(path, item, format:, position: true, append: false) ⇒ Object



219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/kimurai/base.rb', line 219

# Saves `item` to `path` through a Saver, creating the saver on first use.
#
# Savers are cached per path in the instance's @savers. During a full run
# (`with_info` set) the saver is additionally taken from / stored in the
# class-level savers hash, so it is shared by path through the class.
#
# @param path [Object] saver key and first argument of Saver.new
# @param item [Object] passed to the saver's `save`
# @param format [Object] forwarded to Saver.new
# @param position [Boolean] forwarded to Saver.new
# @param append [Boolean] forwarded to Saver.new
# @return [Object] whatever the saver's `save` returns
def save_to(path, item, format:, position: true, append: false)
  @savers[path] ||= begin
    options = { format: format, position: position, append: append }
    # Options are passed as keywords: NOTE(review) Saver#initialize is not visible
    # here; keywords work both for a keyword-argument initializer and for one that
    # takes a trailing options hash.
    if self.with_info
      self.class.savers[path] ||= Saver.new(path, **options)
    else
      Saver.new(path, **options)
    end
  end

  @savers[path].save(item)
end

#storage ⇒ Object



209
210
211
212
213
# File 'lib/kimurai/base.rb', line 209

# Storage used by this spider, resolved on first use and cached in @storage.
# During a full run (`.crawl!`, `with_info` set) this is the storage held by the
# class, shared between spider instances; otherwise each spider instance gets
# its own Storage.
def storage
  return @storage if @storage

  @storage =
    if self.with_info
      self.class.storage
    else
      Storage.new
    end
end

#unique?(scope, value) ⇒ Boolean

Returns:

  • (Boolean)


215
216
217
# File 'lib/kimurai/base.rb', line 215

# Delegates to `unique?` on this spider's #storage and returns its result.
# NOTE(review): presumably true the first time `value` is seen within `scope`;
# Storage#unique? is not shown here, so confirm against its implementation.
#
# @param scope [Object] passed through to the storage
# @param value [Object] passed through to the storage
def unique?(scope, value)
  storage.unique?(scope, value)
end