Class: Kimurai::Base

Inherits:
Object
  • Object
show all
Includes:
BaseHelper
Defined in:
lib/kimurai/base.rb,
lib/kimurai/base/saver.rb,
lib/kimurai/base/storage.rb

Direct Known Subclasses

ApplicationSpider

Defined Under Namespace

Classes: InvalidUrlError, Saver, Storage

Constant Summary collapse

DMERGE_EXCLUDE =

Don’t deep-merge the config’s headers hash option.

[:headers]
# Log formatter (called with severity, datetime, progname, msg).
# Produces one line containing the severity initial, timestamp, process id,
# a thread marker ("M" for the main thread, "C" otherwise) with the thread's
# object id, the full severity, the program name and the message.
# The line is passed through Rbcat.colorize only when colorizing has not been
# explicitly set to false and Kimurai.env is "development".
LoggerFormatter =
proc do |severity, datetime, progname, msg|
  thread = Thread.current
  thread_type = thread.equal?(Thread.main) ? "M" : "C"
  line = format(
    "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n",
    severity[0, 1], datetime, Process.pid, thread_type, thread.object_id, severity, progname, msg
  )

  plain = Kimurai.configuration.colorize_logger == false || Kimurai.env != "development"
  plain ? line : Rbcat.colorize(line, predefined: [:jsonhash, :logger])
end

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(engine = self.class.engine, config: {}) ⇒ Base

Returns a new instance of Base.



175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/kimurai/base.rb', line 175

# Builds a spider instance.
#
# Sets up the engine (falling back to the class-level engine when nil is given),
# the config (class-level config merged with +config+ via deep_merge_excl,
# passing DMERGE_EXCLUDE), one pipeline instance per class-level pipeline name
# (each gets this spider assigned through `spider=`), the class-level logger
# and an empty per-instance savers hash.
#
# @param engine [Object, nil] engine passed on to the browser builder
# @param config [Hash] per-instance config overrides
def initialize(engine = self.class.engine, config: {})
  spider_class = self.class
  @engine = engine || spider_class.engine
  @config = spider_class.config.deep_merge_excl(config, DMERGE_EXCLUDE)

  built = {}
  spider_class.pipelines.each do |pipeline_name|
    pipeline_class = Pipeline.descendants.find { |candidate| candidate.name == pipeline_name }
    pipeline = pipeline_class.new
    pipeline.spider = self
    built[pipeline_name] = pipeline
  end
  @pipelines = built

  @logger = spider_class.logger
  @savers = {}
end

Class Attribute Details

.run_info ⇒ Object (readonly)

Returns the value of attribute run_info.



29
30
31
# File 'lib/kimurai/base.rb', line 29

# Reader for @run_info (listed on this page as a read-only class attribute).
# `.crawl!` assigns the run statistics hash to @run_info when a run starts and
# sets it back to nil in its cleanup step.
def run_info
  @run_info
end

.savers ⇒ Object (readonly)

Returns the value of attribute savers.



29
30
31
# File 'lib/kimurai/base.rb', line 29

# Reader for @savers (listed on this page as a read-only class attribute).
# `.crawl!` assigns an empty hash to it at the start of a run; `#save_to`
# stores Saver objects in it, keyed by path, when the spider runs with info.
def savers
  @savers
end

.storage ⇒ Object (readonly)

Returns the value of attribute storage.



29
30
31
# File 'lib/kimurai/base.rb', line 29

# Reader for @storage (listed on this page as a read-only class attribute).
# `.crawl!` assigns a new Storage to it at the start of a run and sets it back
# to nil in its cleanup step.
def storage
  @storage
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



172
173
174
# File 'lib/kimurai/base.rb', line 172

# Reader for @logger, which #initialize sets from `self.class.logger`.
def logger
  @logger
end

#with_info ⇒ Object

Returns the value of attribute with_info.



173
174
175
# File 'lib/kimurai/base.rb', line 173

# Reader for @with_info. `.crawl!` sets it to true on the spider it creates
# (`spider.with_info = true`); instance methods check it to decide whether the
# class-level run statistics, storage and savers should be used.
def with_info
  @with_info
end

Class Method Details

.add_event(scope, event) ⇒ Object



57
58
59
60
# File 'lib/kimurai/base.rb', line 57

# Increments the counter of +event+ inside the +scope+ bucket of the current
# run's event statistics (`@run_info[:events]`), under @update_mutex.
#
# Does nothing and returns nil when there is no run info.
# A scope without a bucket gets a fresh zero-default counter hash first, so an
# unknown scope is counted instead of raising NoMethodError on nil.
#
# @param scope [Symbol] name of the events bucket
# @param event [Object] event key to count
# @return [Integer, nil] the updated count, or nil when there is no run info
def self.add_event(scope, event)
  return unless @run_info

  @update_mutex.synchronize do
    bucket = (@run_info[:events][scope] ||= Hash.new(0))
    bucket[event] += 1
  end
end

.completed? ⇒ Boolean

Returns:

  • (Boolean)


36
37
38
# File 'lib/kimurai/base.rb', line 36

# Tells whether the current run info has status :completed.
# Returns the (falsy) @run_info value itself when there is no run info.
def self.completed?
  info = @run_info
  return info unless info

  info[:status] == :completed
end

.config ⇒ Object



84
85
86
87
88
89
90
# File 'lib/kimurai/base.rb', line 84

# Effective class-level config.
# When the direct superclass is ::Object this returns @config as is; otherwise
# it takes the superclass's config and merges this class's own @config (or an
# empty hash) into it with deep_merge_excl, passing DMERGE_EXCLUDE.
def self.config
  return @config if superclass.equal?(::Object)

  inherited = superclass.config
  inherited.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
end

.crawl!(exception_on_fail: true) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/kimurai/base.rb', line 102

# Performs a full run of the spider.
#
# Flow, as implemented below:
# 1. Refuses to start (logs an error, returns false) when the spider is already running.
# 2. Creates the shared Storage, savers hash, update mutex and the run info hash.
# 3. Calls `open_spider` when the class responds to it, builds a spider instance
#    with `with_info = true`, then either requests every start URL with the
#    :parse handler or, when there are no start URLs, calls `spider.parse`.
# 4. On StandardError, SignalException or SystemExit marks the run as :failed
#    and stores `e.inspect` in the run info.
# 5. Cleanup (only when a spider instance was created): destroys the browser
#    driver if a browser was built, records stop/running time, calls
#    `close_spider` when available, logs the summary and resets the
#    class-level run state to nil.
#
# @param exception_on_fail [Boolean] re-raise the caught exception (true) or
#   return it together with the run info (false)
# @return [false] when the spider is already running
# @return [Hash] the run info hash on success (cleanup adds stop_time and
#   running_time to the same hash object)
# @return [Array] `[run_info, exception]` on failure when +exception_on_fail+ is false
def self.crawl!(exception_on_fail: true)
  if running?
    # Abort explicitly: the previous `log and return` form only returned when
    # the logger call happened to return a truthy value.
    logger.error "Spider: already running: #{name}"
    return false
  end

  @storage = Storage.new
  @savers = {}
  @update_mutex = Mutex.new

  @run_info = {
    spider_name: name, status: :running, error: nil, environment: Kimurai.env,
    start_time: Time.now, stop_time: nil, running_time: nil,
    visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
    events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
  }

  ###

  logger.info "Spider: started: #{name}"
  open_spider if self.respond_to? :open_spider

  spider = self.new
  spider.with_info = true
  if start_urls
    start_urls.each do |start_url|
      if start_url.is_a?(Hash)
        # Splat the hash so its entries arrive as keyword arguments; a
        # positional hash would be bound to `delay` on Ruby 3.
        spider.request_to(:parse, **start_url)
      else
        spider.request_to(:parse, url: start_url)
      end
    end
  else
    spider.parse
  end
rescue StandardError, SignalException, SystemExit => e
  @run_info.merge!(status: :failed, error: e.inspect)
  exception_on_fail ? raise(e) : [@run_info, e]
else
  @run_info.merge!(status: :completed)
ensure
  # `spider` is nil when the run was refused or failed before instantiation;
  # in that case no cleanup is performed here.
  if spider
    # Read the ivar directly so that no browser is built just to be destroyed.
    spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

    stop_time  = Time.now
    total_time = (stop_time - @run_info[:start_time]).round(3)
    @run_info.merge!(stop_time: stop_time, running_time: total_time)

    close_spider if self.respond_to? :close_spider

    message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
    failed? ? logger.fatal(message) : logger.info(message)

    # Multiple assignment with a single nil resets all four variables to nil.
    @run_info, @storage, @savers, @update_mutex = nil
  end
end

.engine ⇒ Object



72
73
74
# File 'lib/kimurai/base.rb', line 72

# Engine configured for this class; when none is set, the superclass's engine
# is looked up once and memoized in @engine.
def self.engine
  return @engine if @engine

  @engine = superclass.engine
end

.failed? ⇒ Boolean

Returns:

  • (Boolean)


40
41
42
# File 'lib/kimurai/base.rb', line 40

# Tells whether the current run info has status :failed.
# Returns the (falsy) @run_info value itself when there is no run info.
def self.failed?
  info = @run_info
  return info unless info

  info[:status] == :failed
end

.items ⇒ Object



48
49
50
# File 'lib/kimurai/base.rb', line 48

# The :items entry of the current run info, or nil when there is no run info.
def self.items
  @run_info[:items] if @run_info
end

.logger ⇒ Object



94
95
96
97
98
99
100
# File 'lib/kimurai/base.rb', line 94

# Class-level logger, memoized in @logger.
# Uses the logger from Kimurai.configuration when one is set; otherwise builds
# a STDOUT Logger with LoggerFormatter, this spider's name as progname and a
# level taken from ENV["LOG_LEVEL"], then the configured log_level, then "DEBUG".
def self.logger
  return @logger if @logger

  configured = Kimurai.configuration.logger
  @logger =
    if configured
      configured
    else
      level_name = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
      level = "Logger::#{level_name}".constantize
      Logger.new(STDOUT, formatter: LoggerFormatter, level: level, progname: name)
    end
end

.name ⇒ Object



68
69
70
# File 'lib/kimurai/base.rb', line 68

# Returns the class-level @name; the value is used in log messages, as the
# logger progname and as :spider_name in the run info.
# NOTE(review): defined as a singleton method, so it replaces the default
# class name lookup for this class; where @name is assigned is not shown here.
def self.name
  @name
end

.parse!(handler, *args, **request) ⇒ Object



156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/kimurai/base.rb', line 156

# Runs a single handler on a fresh spider instance, outside of a full
# `.crawl!` run, and returns the handler's (or request_to's) result.
#
# - With positional +args+: calls the handler directly with them.
# - With only keyword +request+ options: goes through `request_to`, passing
#   the options as keywords (a positional hash would be bound to `delay`
#   on Ruby 3).
# - With neither: calls the handler without arguments.
#
# The browser driver is destroyed afterwards if a browser was built.
#
# @param handler [Symbol] name of the spider method to run
# @param args [Array] positional arguments for a direct handler call
# @param request [Hash] keyword options forwarded to `request_to`
def self.parse!(handler, *args, **request)
  spider = self.new

  if !args.empty?
    spider.public_send(handler, *args)
  elsif !request.empty?
    spider.request_to(handler, **request)
  else
    spider.public_send(handler)
  end
ensure
  # Read the ivar directly so that no browser is built just to be destroyed.
  spider.browser.destroy_driver! if spider&.instance_variable_get("@browser")
end

.pipelines ⇒ Object



76
77
78
# File 'lib/kimurai/base.rb', line 76

# Pipelines configured for this class; when none are set, the superclass's
# pipelines are looked up once and memoized in @pipelines.
def self.pipelines
  return @pipelines if @pipelines

  @pipelines = superclass.pipelines
end

.running? ⇒ Boolean

Returns:

  • (Boolean)


32
33
34
# File 'lib/kimurai/base.rb', line 32

# Tells whether the current run info has status :running.
# Returns the (falsy) @run_info value itself when there is no run info.
def self.running?
  info = @run_info
  return info unless info

  info[:status] == :running
end

.start_urls ⇒ Object



80
81
82
# File 'lib/kimurai/base.rb', line 80

# Returns the class-level @start_urls.
# `.crawl!` iterates over it when it is truthy (each entry may be a Hash of
# request options or a plain URL) and calls `spider.parse` directly otherwise.
def self.start_urls
  @start_urls
end

.update(type, subtype) ⇒ Object



52
53
54
55
# File 'lib/kimurai/base.rb', line 52

# Adds one to the `@run_info[type][subtype]` counter while holding
# @update_mutex. Returns the new count, or nil when there is no run info.
def self.update(type, subtype)
  if @run_info
    @update_mutex.synchronize do
      counters = @run_info[type]
      counters[subtype] = counters[subtype] + 1
    end
  end
end

.visits ⇒ Object



44
45
46
# File 'lib/kimurai/base.rb', line 44

# The :visits entry of the current run info, or nil when there is no run info.
def self.visits
  @run_info[:visits] if @run_info
end

Instance Method Details

#add_event(scope = :custom, event) ⇒ Object



238
239
240
241
242
243
244
245
# File 'lib/kimurai/base.rb', line 238

# Records an event in the class-level run statistics.
# Only allowed during a full run (when with_info is set); raises otherwise.
# Events in the :custom scope (the default when a single argument is given)
# are additionally written to the log.
#
# @param scope [Symbol] events bucket, :custom by default
# @param event [Object] event key to count
def add_event(scope = :custom, event)
  raise "It's allowed to use `add_event` only while performing a full run (`.crawl!` method)" unless with_info

  self.class.add_event(scope, event)
  return unless scope == :custom

  logger.info "Spider: new event (scope: #{scope}): #{event}"
end

#browser ⇒ Object



189
190
191
# File 'lib/kimurai/base.rb', line 189

# Browser for this spider, built on first use by BrowserBuilder from the
# instance's engine and config and then memoized in @browser.
def browser
  return @browser if @browser

  @browser = BrowserBuilder.build(@engine, @config, spider: self)
end

#console(response = nil, url: nil, data: {}) ⇒ Object



207
208
209
# File 'lib/kimurai/base.rb', line 207

# Opens an interactive Pry session at this point via `binding.pry`.
# The body does not use the arguments itself; since the session is opened on
# this method's binding, `response`, `url` and `data` are visible there as locals.
# NOTE(review): `binding.pry` needs Pry to be loaded — where it is required is
# not visible here.
def console(response = nil, url: nil, data: {})
  binding.pry
end

#request_to(handler, delay = nil, url:, data: {}, response_type: :html) ⇒ Object

Raises:

  • (InvalidUrlError)


193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/kimurai/base.rb', line 193

# Visits +url+ with the browser and passes the response to +handler+.
#
# Steps:
# 1. Rejects URLs that do not parse as HTTP/HTTPS URIs.
# 2. When the :skip_duplicate_requests config option is set and the URL was
#    already requested, records a :duplicate_requests event (full runs only),
#    logs a warning and returns nil without visiting.
# 3. Visits the URL (passing +delay+ only when given) and returns nil when
#    the visit result is falsy.
# 4. Calls the handler with the current response and `url:` / `data:`.
#    These are passed as keywords so that handlers declared with keyword
#    parameters work on Ruby 3; handlers taking a trailing positional hash
#    still receive them as one hash.
#
# @param handler [Symbol] name of the public method that processes the response
# @param delay [Object, nil] forwarded to `browser.visit` as `delay:` when truthy
# @param url [String] URL to request
# @param data [Hash] arbitrary data handed over to the handler
# @param response_type [Symbol] forwarded to `browser.current_response`
# @return [Object, nil] the handler's result, or nil when the request was skipped
# @raise [InvalidUrlError] when +url+ is not an HTTP(S) URI
#   (NOTE(review): `URI.parse` itself may raise for strings it cannot parse at all)
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)

  if @config[:skip_duplicate_requests] && !unique_request?(url)
    add_event(:duplicate_requests) if with_info
    # Return explicitly: skipping must not depend on what the logger returns.
    logger.warn "Spider: request_to: not unique url: #{url}, skipped"
    return
  end

  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
  return unless visited

  public_send(handler, browser.current_response(response_type), url: url, data: data)
end

#save_to(path, item, format:, position: true, append: false) ⇒ Object



223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/kimurai/base.rb', line 223

# Saves +item+ through the Saver registered for +path+, creating it on first use.
#
# The Saver is cached per instance in @savers. During a full run (with_info
# set) it is also shared through the class-level savers hash, so all instances
# use the same Saver for a path. Options only take effect when the Saver for
# a path is created; later calls reuse the existing one.
#
# The options are passed to `Saver.new` as keyword arguments, which works on
# Ruby 3 for a keyword-based constructor and is still received as a single
# hash by a constructor that takes a trailing options hash.
#
# @param path [String] key and target path handed to the Saver
# @param item [Object] item to save
# @param format [Object] forwarded to the Saver as `format:`
# @param position [Boolean] forwarded to the Saver as `position:`
# @param append [Boolean] forwarded to the Saver as `append:`
# @return [Object] whatever the Saver's `save` returns
def save_to(path, item, format:, position: true, append: false)
  @savers[path] ||= begin
    options = { format: format, position: position, append: append }
    if with_info
      self.class.savers[path] ||= Saver.new(path, **options)
    else
      Saver.new(path, **options)
    end
  end

  @savers[path].save(item)
end

#storage ⇒ Object



213
214
215
216
217
# File 'lib/kimurai/base.rb', line 213

# Storage used by this spider, memoized in @storage.
# During a full run (`.crawl!`, with_info set) this is the Storage instance
# held by the class; otherwise each spider instance gets its own new Storage.
def storage
  return @storage if @storage

  @storage =
    if with_info
      self.class.storage
    else
      Storage.new
    end
end

#unique?(scope, value) ⇒ Boolean

Returns:

  • (Boolean)


219
220
221
# File 'lib/kimurai/base.rb', line 219

# Delegates to `storage.unique?(scope, value)` and returns its result.
# NOTE(review): the actual uniqueness semantics are implemented by Storage,
# which is not shown here.
def unique?(scope, value)
  storage.unique?(scope, value)
end