Class: Kimurai::Base

Inherits:
Object
  • Object
show all
Includes:
BaseHelper
Defined in:
lib/kimurai/base.rb,
lib/kimurai/base/saver.rb,
lib/kimurai/base/storage.rb

Direct Known Subclasses

ApplicationSpider

Defined Under Namespace

Classes: InvalidUrlError, Saver, Storage

Constant Summary collapse

DMERGE_EXCLUDE =

Don't deep-merge the config's `headers` hash option.

[:headers].freeze
# Formatter proc handed to the default Logger built by `.logger`.
# Produces one line per message containing the severity initial, timestamp,
# process id, a main/child thread marker ('M' or 'C'), the current thread's
# object id, the full severity, progname and message. The line is passed
# through Rbcat when colorizing is not explicitly disabled and the Kimurai
# environment is 'development'.
LoggerFormatter =
proc do |severity, datetime, progname, msg|
  thread_tag = Thread.current == Thread.main ? 'M' : 'C'
  fields = [severity[0..0], datetime, $PROCESS_ID, thread_tag,
            Thread.current.object_id, severity, progname, msg]
  line = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", *fields)

  # Only an explicit `false` turns colors off; nil (unset) keeps them on.
  colorize = Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
  colorize ? Rbcat.colorize(line, predefined: %i[jsonhash logger]) : line
end

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(engine = self.class.engine, config: {}) ⇒ Base

Returns a new instance of Base.



178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/kimurai/base.rb', line 178

# Builds a spider instance.
#
# @param engine [Object, nil] browser engine identifier; falls back to the
#   class-level engine when nil
# @param config [Hash] per-instance overrides merged over the class-level
#   config (keys listed in DMERGE_EXCLUDE are handled by `deep_merge_excl`)
def initialize(engine = self.class.engine, config: {})
  @engine = engine || self.class.engine
  @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)

  # One fresh pipeline object per configured name, each linked back to this spider.
  @pipelines = self.class.pipelines.each_with_object({}) do |pipeline_name, registry|
    pipeline_class = Pipeline.descendants.find { |candidate| candidate.name == pipeline_name }
    registry[pipeline_name] = pipeline_class.new.tap { |pipeline| pipeline.spider = self }
  end

  @logger = self.class.logger
  @savers = {}
end

Class Attribute Details

.run_infoObject (readonly)

Returns the value of attribute run_info.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Class-level reader for the statistics hash of the current `.crawl!` run.
# `.crawl!` assigns the hash when it starts and sets it back to nil in its
# final cleanup, so this is nil when no crawl is being tracked.
#
# @return [Hash, nil]
def run_info
  @run_info
end

.saversObject (readonly)

Returns the value of attribute savers.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Class-level reader for the savers registry. `.crawl!` initializes it to an
# empty hash; `#save_to` stores Saver instances in it keyed by path when the
# spider runs with `with_info` set.
#
# @return [Hash, nil] nil when `.crawl!` has not set it (or has reset it)
def savers
  @savers
end

.storageObject (readonly)

Returns the value of attribute storage.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Class-level reader for the Storage instance created by `.crawl!`.
# Spider instances running with `with_info` reuse it via `#storage`.
#
# @return [Storage, nil] nil when `.crawl!` has not set it (or has reset it)
def storage
  @storage
end

Instance Attribute Details

#loggerObject (readonly)

Returns the value of attribute logger.



175
176
177
# File 'lib/kimurai/base.rb', line 175

# Logger used by this spider instance; assigned in `#initialize` from the
# class-level `.logger`.
#
# @return [Object] the logger object
def logger
  @logger
end

#with_infoObject

Returns the value of attribute with_info.



176
177
178
# File 'lib/kimurai/base.rb', line 176

# Flag telling the instance to report to the class-level run state.
# `.crawl!` sets it to true; `#add_event`, `#request_to`, `#save_to` and
# `#storage` consult it to decide whether to use shared class-level objects.
#
# @return [Boolean, nil] nil unless a writer has assigned it
def with_info
  @with_info
end

Class Method Details

.add_event(scope, event) ⇒ Object



59
60
61
62
63
# File 'lib/kimurai/base.rb', line 59

# Increments the counter for +event+ inside the +scope+ bucket of the current
# run's `:events` statistics. Does nothing when no run is being tracked.
#
# @param scope [Symbol] key of the events bucket (e.g. :custom)
# @param event [Object] event identifier used as the counter key
# @return [Integer, nil] the new counter value, or nil when there is no run info
def self.add_event(scope, event)
  if @run_info
    # Counter updates go through the mutex created by `.crawl!`.
    @update_mutex.synchronize do
      bucket = @run_info[:events][scope]
      bucket[event] += 1
    end
  end
end

.completed?Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/kimurai/base.rb', line 37

# Tells whether the tracked run finished successfully.
#
# @return [Boolean, nil] true/false from the run status; nil when no run info is set
def self.completed?
  return @run_info unless @run_info # falsy (nil) when no run is being tracked

  @run_info[:status] == :completed
end

.configObject



87
88
89
90
91
92
93
# File 'lib/kimurai/base.rb', line 87

# Effective class-level configuration.
# The class whose superclass is Object returns its own @config unchanged;
# every other class returns its parent's config merged with its own @config
# (or an empty hash) via `deep_merge_excl` and DMERGE_EXCLUDE.
#
# @return [Hash, nil]
def self.config
  return @config if superclass.equal?(::Object)

  own_settings = @config || {}
  superclass.config.deep_merge_excl(own_settings, DMERGE_EXCLUDE)
end

.crawl!(exception_on_fail: true) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/kimurai/base.rb', line 105

# Runs a full crawl for this spider class while tracking run statistics.
#
# Sets up the shared class-level state (@storage, @savers, @update_mutex,
# @run_info), calls the optional `open_spider` hook, creates one spider
# instance with `with_info = true`, and either requests every entry of
# `start_urls` with the `:parse` handler or, when there are no start urls,
# calls `parse` directly.
#
# @param exception_on_fail [Boolean] when true a failure is re-raised after
#   being recorded; when false the method returns `[run_info, exception]`
# @return [false] if the spider is already running
# @return [Hash] the run info hash (status :completed) on success
# @return [Array] `[run_info, exception]` on failure with exception_on_fail: false
# @raise [StandardError, SignalException, SystemExit] the original error when
#   exception_on_fail is true
def self.crawl!(exception_on_fail: true)
  # Log and return as separate statements: chaining them with `and` would let
  # a logger whose #error returns nil/false fall through and start a second
  # crawl over the running one's state.
  if running?
    logger.error "Spider: already running: #{name}"
    return false
  end

  @storage = Storage.new
  @savers = {}
  @update_mutex = Mutex.new

  @run_info = {
    spider_name: name, status: :running, error: nil, environment: Kimurai.env,
    start_time: Time.new, stop_time: nil, running_time: nil,
    visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
    events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
  }

  # Shared state is ready; start the actual crawl.

  logger.info "Spider: started: #{name}"
  open_spider if respond_to? :open_spider

  spider = new
  spider.with_info = true
  if start_urls
    start_urls.each do |start_url|
      if start_url.instance_of?(Hash)
        # A Hash entry carries request_to's keyword arguments. It must be
        # double-splatted: passed positionally it would bind to `delay` and
        # the required `url:` keyword would be missing on Ruby 3+.
        spider.request_to(:parse, **start_url)
      else
        spider.request_to(:parse, url: start_url)
      end
    end
  else
    spider.parse
  end
rescue StandardError, SignalException, SystemExit => e
  @run_info.merge!(status: :failed, error: e.inspect)
  exception_on_fail ? raise(e) : [@run_info, e]
else
  @run_info.merge!(status: :completed)
ensure
  # Cleanup only runs when a spider instance was actually created.
  if spider
    # Check the ivar directly so no browser is built just to be destroyed.
    spider.browser.destroy_driver! if spider.instance_variable_get('@browser')

    stop_time  = Time.now
    total_time = (stop_time - @run_info[:start_time]).round(3)
    @run_info.merge!(stop_time: stop_time, running_time: total_time)

    close_spider if respond_to? :close_spider

    message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
    failed? ? logger.fatal(message) : logger.info(message)

    # Reset all shared class-level state; the hash already returned to the
    # caller stays intact because only the references are cleared.
    @run_info, @storage, @savers, @update_mutex = nil
  end
end

.engineObject



75
76
77
# File 'lib/kimurai/base.rb', line 75

# Engine configured for this spider class, inherited from the superclass
# (and then memoized on this class) when not set locally.
#
# @return [Object] the engine identifier
def self.engine
  return @engine if @engine

  @engine = superclass.engine
end

.failed?Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/kimurai/base.rb', line 41

# Tells whether the tracked run ended with an error.
#
# @return [Boolean, nil] true/false from the run status; nil when no run info is set
def self.failed?
  return @run_info unless @run_info # falsy (nil) when no run is being tracked

  @run_info[:status] == :failed
end

.itemsObject



49
50
51
# File 'lib/kimurai/base.rb', line 49

# Item counters of the tracked run (the `:items` entry of the run info).
#
# @return [Hash, nil] nil when no run info is set
def self.items
  stats = @run_info
  stats && stats[:items]
end

.loggerObject



97
98
99
100
101
102
103
# File 'lib/kimurai/base.rb', line 97

# Class-level logger, memoized in @logger.
# Prefers the logger set in `Kimurai.configuration`; otherwise builds a
# stdout Logger using LoggerFormatter, with the level taken from
# ENV['LOG_LEVEL'], then the configured log_level, then 'DEBUG'.
#
# @return [Object] the logger
def self.logger
  return @logger if @logger

  configured = Kimurai.configuration.logger
  return @logger = configured if configured

  level_name = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
  level = "Logger::#{level_name}".constantize
  @logger = Logger.new($stdout, formatter: LoggerFormatter, level: level, progname: name)
end

.nameObject



71
72
73
# File 'lib/kimurai/base.rb', line 71

# Spider name stored in the class-level @name. It is used in log messages,
# as the `spider_name` of the run info and as the default logger's progname.
# NOTE(review): @name is presumably assigned in the spider class body — the
# assignment is not visible here.
#
# @return [Object, nil] nil when @name has not been set
def self.name
  @name
end

.parse!(handler, *args, **request) ⇒ Object



159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/kimurai/base.rb', line 159

# Runs a single handler on a fresh spider instance, without the run-info
# tracking that `.crawl!` sets up.
#
# Dispatch rules:
# * positional +args+ given  -> `spider.public_send(handler, *args)`
#   (any keyword arguments are ignored in this case)
# * only keywords given      -> `spider.request_to(handler, **request)`
# * nothing given            -> `spider.public_send(handler)`
#
# The spider's browser driver is destroyed afterwards if one was built.
#
# @param handler [Symbol] name of the spider method to run
# @param args [Array] positional arguments forwarded to the handler
# @param request [Hash] keyword arguments for `#request_to` (e.g. url:, data:)
# @return [Object] whatever the invoked method returns
def self.parse!(handler, *args, **request)
  spider = new

  if args.present?
    spider.public_send(handler, *args)
  elsif request.present?
    # Double-splat is required: passing the hash positionally would bind it
    # to request_to's optional `delay` parameter and, on Ruby 3+, raise
    # ArgumentError for the missing `url:` keyword.
    spider.request_to(handler, **request)
  else
    spider.public_send(handler)
  end
ensure
  # `spider` is nil when `new` itself raised; check the ivar directly so a
  # browser is never built only to be destroyed.
  spider.browser.destroy_driver! if spider&.instance_variable_get('@browser')
end

.pipelinesObject



79
80
81
# File 'lib/kimurai/base.rb', line 79

# Pipeline names configured for this spider class, inherited from the
# superclass (and then memoized on this class) when not set locally.
#
# @return [Object] the pipelines collection
def self.pipelines
  return @pipelines if @pipelines

  @pipelines = superclass.pipelines
end

.running?Boolean

Returns:

  • (Boolean)


33
34
35
# File 'lib/kimurai/base.rb', line 33

# Tells whether a crawl is currently in progress for this class.
#
# @return [Boolean, nil] true/false from the run status; nil when no run info is set
def self.running?
  return @run_info unless @run_info # falsy (nil) when no run is being tracked

  @run_info[:status] == :running
end

.start_urlsObject



83
84
85
# File 'lib/kimurai/base.rb', line 83

# Start URLs stored in the class-level @start_urls. `.crawl!` iterates them,
# treating a Hash entry as request arguments and any other entry as the url;
# when this returns nil/false, `.crawl!` calls `parse` directly instead.
#
# @return [Object, nil] nil when @start_urls has not been set
def self.start_urls
  @start_urls
end

.update(type, subtype) ⇒ Object



53
54
55
56
57
# File 'lib/kimurai/base.rb', line 53

# Increments one counter of the current run's statistics, e.g.
# `update(:visits, :requests)`. Does nothing when no run is being tracked.
#
# @param type [Symbol] top-level key of the run info (such as :visits or :items)
# @param subtype [Symbol] counter key inside that entry
# @return [Integer, nil] the new counter value, or nil when there is no run info
def self.update(type, subtype)
  if @run_info
    # Counter updates go through the mutex created by `.crawl!`.
    @update_mutex.synchronize do
      counters = @run_info[type]
      counters[subtype] += 1
    end
  end
end

.visitsObject



45
46
47
# File 'lib/kimurai/base.rb', line 45

# Visit counters of the tracked run (the `:visits` entry of the run info).
#
# @return [Hash, nil] nil when no run info is set
def self.visits
  stats = @run_info
  stats && stats[:visits]
end

Instance Method Details

#add_event(scope = :custom, event) ⇒ Object



241
242
243
244
245
# File 'lib/kimurai/base.rb', line 241

# Records an event for this spider.
# The event is counted on the class only when the instance runs with
# `with_info`; events in the :custom scope are additionally written to the
# log regardless of `with_info`.
#
# @param scope [Symbol] events bucket, :custom when omitted
# @param event [Object] event identifier
# @return [Object, nil] the logger's return value for :custom events, else nil
def add_event(scope = :custom, event)
  self.class.add_event(scope, event) if with_info
  return unless scope == :custom

  logger.info "Spider: new event (scope: #{scope}): #{event}"
end

#browserObject



192
193
194
# File 'lib/kimurai/base.rb', line 192

# Browser for this spider, built on first use by BrowserBuilder from the
# instance's engine and config, then reused on later calls.
#
# @return [Object] the browser object returned by BrowserBuilder.build
def browser
  return @browser if @browser

  @browser = BrowserBuilder.build(@engine, @config, spider: self)
end

#console(response = nil, url: nil, data: {}) ⇒ Object



210
211
212
# File 'lib/kimurai/base.rb', line 210

# Debugging helper: stops execution and opens a Pry session at this point.
# The parameters are not used by the method body itself; they exist so that
# `response`, `url` and `data` are available as locals inside the session.
#
# @param response [Object, nil]
# @param url [Object, nil]
# @param data [Hash]
def console(response = nil, url: nil, data: {})
  binding.pry
end

#request_to(handler, delay = nil, url:, data: {}, response_type: :html) ⇒ Object

Raises:



196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/kimurai/base.rb', line 196

# Visits +url+ with the spider's browser and passes the response to +handler+.
#
# @param handler [Symbol] public method that receives
#   `(response, url:, data:)`
# @param delay [Object, nil] forwarded to `browser.visit` as `delay:` when truthy
# @param url [String] address to visit; must parse as an HTTP(S) URI
# @param data [Hash] passed through to the handler unchanged
# @param response_type [Symbol] passed to `browser.current_response`
# @return [Object, nil] the handler's return value; nil when the request is
#   skipped as a duplicate or `browser.visit` returns a falsy value
# @raise [InvalidUrlError] when +url+ does not parse to a URI::HTTP
#   (URI::HTTPS is a subclass and is accepted)
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)

  if @config[:skip_duplicate_requests] && !unique_request?(url)
    add_event(:duplicate_requests) if with_info
    # Log, then return unconditionally. Chaining with `and return` would make
    # the skip depend on the logger's return value, so a logger whose #warn
    # returns nil/false would let the duplicate request go through.
    logger.warn "Spider: request_to: not unique url: #{url}, skipped"
    return
  end

  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
  return unless visited

  public_send(handler, browser.current_response(response_type), url: url, data: data)
end

#save_to(path, item, format:, position: true, append: false) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/kimurai/base.rb', line 226

# Saves +item+ to the file at +path+ through a Saver that is created on first
# use and cached per path on this instance. When the spider runs with
# `with_info`, the Saver is taken from (or stored into) the class-level
# savers registry; otherwise it is private to this instance.
#
# @param path [String] target file path, also the cache key
# @param item [Object] item handed to `Saver#save`
# @param format [Symbol] output format passed to the Saver
# @param position [Boolean] passed to the Saver
# @param append [Boolean] passed to the Saver
# @return [Object] whatever `Saver#save` returns
def save_to(path, item, format:, position: true, append: false)
  unless @savers[path]
    saver_options = { format: format, position: position, append: append }
    @savers[path] =
      if with_info
        self.class.savers[path] ||= Saver.new(path, **saver_options)
      else
        Saver.new(path, **saver_options)
      end
  end

  @savers[path].save(item)
end

#storageObject



216
217
218
219
220
# File 'lib/kimurai/base.rb', line 216

# Storage used by this spider instance, resolved once and then reused.
# NOTE: under `.crawl!` (with_info set) the shared class-level Storage
# instance is used; otherwise each spider instance gets its own Storage.
#
# @return [Storage, nil]
def storage
  return @storage if @storage

  @storage = with_info ? self.class.storage : Storage.new
end

#unique?(scope, value) ⇒ Boolean

Returns:

  • (Boolean)


222
223
224
# File 'lib/kimurai/base.rb', line 222

# Delegates a uniqueness check to this spider's storage (see `#storage`).
# NOTE(review): presumably true only the first time +value+ is seen within
# +scope+ — confirm against Storage#unique?, which is not visible here.
#
# @param scope [Object] namespace passed to the storage
# @param value [Object] value to check
# @return [Boolean] whatever `Storage#unique?` returns
def unique?(scope, value)
  storage.unique?(scope, value)
end