Class: Tanakai::Base

Inherits:
Object
  • Object
show all
Includes:
BaseHelper
Defined in:
lib/tanakai/base.rb,
lib/tanakai/base/saver.rb,
lib/tanakai/base/storage.rb

Direct Known Subclasses

ApplicationSpider

Defined Under Namespace

Classes: InvalidUrlError, Saver, Storage

Constant Summary collapse

DMERGE_EXCLUDE =

Do not deep-merge the config's `headers` hash option.

[:headers]
# Log line formatter. Each line carries the severity initial, timestamp,
# process id, a thread marker ("M" for the main thread, "C" for any other),
# the current thread's object id, the padded severity, progname and message.
# The line is passed through Rbcat.colorize only when colorize_logger is not
# explicitly false and the environment is "development".
LoggerFormatter =
proc do |severity, datetime, progname, msg|
  thread_marker = Thread.main == Thread.current ? "M" : "C"
  line = format(
    "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n",
    severity[0..0], datetime, Process.pid, thread_marker, Thread.current.object_id, severity, progname, msg
  )

  if Tanakai.configuration.colorize_logger == false || Tanakai.env != "development"
    line
  else
    Rbcat.colorize(line, predefined: [:jsonhash, :logger])
  end
end

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(engine = self.class.engine, config: {}) ⇒ Base

Returns a new instance of Base.



176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/tanakai/base.rb', line 176

# Builds a spider instance.
#
# @param engine [Object] engine to use; the class-level engine is used when
#   this is omitted or nil
# @param config [Hash] per-instance options combined with the class-level
#   config through deep_merge_excl (with DMERGE_EXCLUDE)
def initialize(engine = self.class.engine, config: {})
  spider_class = self.class

  @engine = engine || spider_class.engine
  @config = spider_class.config.deep_merge_excl(config, DMERGE_EXCLUDE)

  # One pipeline instance per configured pipeline name, each bound to this spider.
  @pipelines = spider_class.pipelines.each_with_object({}) do |pipeline_name, registry|
    pipeline_class = Pipeline.descendants.find { |candidate| candidate.name == pipeline_name }
    registry[pipeline_name] = pipeline_class.new.tap { |pipeline| pipeline.spider = self }
  end

  @logger = spider_class.logger
  @savers = {}
end

Class Attribute Details

.run_info ⇒ Object (readonly)

Returns the value of attribute run_info.



30
31
32
# File 'lib/tanakai/base.rb', line 30

# Reader for @run_info: the statistics hash that .crawl! builds when a run
# starts and resets to nil when the run is cleaned up.
def run_info
  @run_info
end

.savers ⇒ Object (readonly)

Returns the value of attribute savers.



30
31
32
# File 'lib/tanakai/base.rb', line 30

# Reader for @savers: the path => Saver hash that .crawl! initialises to {}
# and that #save_to fills when the spider runs with info.
def savers
  @savers
end

.storage ⇒ Object (readonly)

Returns the value of attribute storage.



30
31
32
# File 'lib/tanakai/base.rb', line 30

# Reader for @storage: the Storage instance .crawl! creates for a run and
# that spider instances running with info reuse through #storage.
def storage
  @storage
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



173
174
175
# File 'lib/tanakai/base.rb', line 173

# Reader for @logger, which #initialize takes from the class-level logger.
def logger
  @logger
end

#with_info ⇒ Object

Returns the value of attribute with_info.



174
175
176
# File 'lib/tanakai/base.rb', line 174

# Reader for @with_info. .crawl! sets it to true on the spider it creates;
# when truthy, events, savers and storage are routed through the class-level
# run state instead of per-instance state.
def with_info
  @with_info
end

Class Method Details

.add_event(scope, event) ⇒ Object



58
59
60
61
# File 'lib/tanakai/base.rb', line 58

# Increments the counter for `event` inside the given events `scope` of the
# current run. Does nothing (returns nil) when no run info is present.
#
# @param scope [Symbol] key under run_info[:events]
# @param event [Object] counter key within that scope
# @return [Integer, nil] the updated count, or nil without run info
def self.add_event(scope, event)
  if @run_info
    @update_mutex.synchronize do
      scope_counters = @run_info[:events][scope]
      scope_counters[event] += 1
    end
  end
end

.completed? ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/tanakai/base.rb', line 37

# Whether the current run has status :completed.
#
# @return [Boolean, nil] nil when there is no run info
def self.completed?
  return unless @run_info

  @run_info[:status] == :completed
end

.config ⇒ Object



85
86
87
88
89
90
91
# File 'lib/tanakai/base.rb', line 85

# Effective class-level config. The class whose superclass is Object returns
# its own @config as is; every subclass combines the superclass config with
# its own @config (or {}) through deep_merge_excl with DMERGE_EXCLUDE.
def self.config
  return @config if superclass.equal?(::Object)

  own_config = @config || {}
  superclass.config.deep_merge_excl(own_config, DMERGE_EXCLUDE)
end

.crawl!(exception_on_fail: true, data: {}) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/tanakai/base.rb', line 103

# Runs the spider class end to end and collects run statistics.
#
# Sets up fresh class-level run state (storage, savers, mutex, run_info),
# calls the optional open_spider hook, then requests every entry of
# start_urls with the :parse handler. When start_urls is not set, #parse is
# called directly. After the run the optional close_spider hook is called and
# the class-level run state is cleared.
#
# A start_urls entry may be a Hash of request_to keyword options (url:,
# data:, ...). In that case the crawl-level `data` is only a default and the
# entry's own keys take precedence.
#
# @param exception_on_fail [Boolean] re-raise the error when the run fails;
#   when false, [run_info, error] is returned instead
# @param data [Hash] passed to the parse handler as `data:`
# @return [Hash, Array, false] run_info on success, [run_info, error] on a
#   non-raising failure, false when the spider is already running
def self.crawl!(exception_on_fail: true, data: {})
  # Explicit guard: `logger.error(...) and return false` would only return
  # when the logger call happened to return a truthy value.
  if running?
    logger.error "Spider: already running: #{name}"
    return false
  end

  @storage = Storage.new
  @savers = {}
  @update_mutex = Mutex.new

  @run_info = {
    spider_name: name, status: :running, error: nil, environment: Tanakai.env,
    start_time: Time.now, stop_time: nil, running_time: nil,
    visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
    events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
  }

  ###

  logger.info "Spider: started: #{name}"
  open_spider if self.respond_to? :open_spider

  spider = self.new
  spider.with_info = true
  if start_urls
    start_urls.each do |start_url|
      if start_url.is_a?(Hash)
        # Hash entries carry their own request options; expand them instead
        # of passing the whole Hash as the url.
        spider.request_to(:parse, **{ data: data }.merge(start_url))
      else
        spider.request_to(:parse, url: start_url, data: data)
      end
    end
  else
    spider.parse(data: data)
  end
rescue StandardError, SignalException, SystemExit => e
  # Signals and exit requests are caught too so the run is marked as failed
  # before the error is re-raised or returned.
  @run_info.merge!(status: :failed, error: e.inspect)
  exception_on_fail ? raise(e) : [@run_info, e]
else
  @run_info.merge!(status: :completed)
ensure
  # Cleanup only happens once a spider instance exists.
  if spider
    spider.browser.destroy_driver! if spider.instance_variable_get("@browser")

    stop_time  = Time.now
    total_time = (stop_time - @run_info[:start_time]).round(3)
    @run_info.merge!(stop_time: stop_time, running_time: total_time)

    close_spider if self.respond_to? :close_spider

    message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
    failed? ? logger.fatal(message) : logger.info(message)

    # Reset all class-level run state (every target receives nil).
    @run_info, @storage, @savers, @update_mutex = nil
  end
end

.engine ⇒ Object



73
74
75
# File 'lib/tanakai/base.rb', line 73

# Engine for this spider class. Uses the value set on the class itself and
# otherwise takes (and memoizes) the superclass's engine.
def self.engine
  return @engine if @engine

  @engine = superclass.engine
end

.failed? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/tanakai/base.rb', line 41

# Whether the current run has status :failed.
#
# @return [Boolean, nil] nil when there is no run info
def self.failed?
  return unless @run_info

  @run_info[:status] == :failed
end

.items ⇒ Object



49
50
51
# File 'lib/tanakai/base.rb', line 49

# Item counters (run_info[:items]) of the current run, or nil when there is
# no run info.
def self.items
  @run_info&.dig(:items)
end

.logger ⇒ Object



95
96
97
98
99
100
101
# File 'lib/tanakai/base.rb', line 95

# Memoized class-level logger. Prefers the logger from
# Tanakai.configuration; otherwise builds a STDOUT Logger that uses
# LoggerFormatter and the class name as progname. The level comes from
# ENV["LOG_LEVEL"], then the configured log_level, then "DEBUG".
def self.logger
  return @logger if @logger

  @logger = Tanakai.configuration.logger
  return @logger if @logger

  level_name = (ENV["LOG_LEVEL"] || Tanakai.configuration.log_level || "DEBUG").to_s.upcase
  level = "Logger::#{level_name}".constantize
  @logger = Logger.new(STDOUT, formatter: LoggerFormatter, level: level, progname: name)
end

.name ⇒ Object



69
70
71
# File 'lib/tanakai/base.rb', line 69

# Spider name stored in @name (not the Ruby class name). It is used as the
# logger progname, in the run info and in log messages.
def self.name
  @name
end

.parse!(handler, *args, **request) ⇒ Object



157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/tanakai/base.rb', line 157

# Runs a single handler on a fresh spider instance, outside of .crawl!.
#
# With positional arguments the handler is called with them directly; with
# only keyword arguments they are forwarded to #request_to; with neither the
# handler is called without arguments. The browser driver is destroyed
# afterwards if one was built.
#
# @param handler [Symbol] name of the public spider method to run
# @param args [Array] positional arguments for the handler
# @param request [Hash] keyword options for #request_to (e.g. url:, data:)
# @return [Object] whatever the handler / #request_to returns
def self.parse!(handler, *args, **request)
  spider = self.new

  if args.present?
    spider.public_send(handler, *args)
  elsif request.present?
    spider.request_to(handler, **request)
  else
    spider.public_send(handler)
  end
ensure
  # spider is nil when self.new itself raised; guarding here keeps that
  # original error from being replaced by a NoMethodError on nil.
  spider.browser.destroy_driver! if spider&.instance_variable_get("@browser")
end

.pipelines ⇒ Object



77
78
79
# File 'lib/tanakai/base.rb', line 77

# Pipeline names for this spider class. Uses the value set on the class
# itself and otherwise takes (and memoizes) the superclass's pipelines.
def self.pipelines
  return @pipelines if @pipelines

  @pipelines = superclass.pipelines
end

.running? ⇒ Boolean

Returns:

  • (Boolean)


33
34
35
# File 'lib/tanakai/base.rb', line 33

# Whether the current run has status :running.
#
# @return [Boolean, nil] nil when there is no run info
def self.running?
  return unless @run_info

  @run_info[:status] == :running
end

.start_urls ⇒ Object



81
82
83
# File 'lib/tanakai/base.rb', line 81

# Start URLs stored in @start_urls. .crawl! requests each entry with the
# :parse handler, or calls #parse directly when this is nil/false.
def self.start_urls
  @start_urls
end

.update(type, subtype) ⇒ Object



53
54
55
56
# File 'lib/tanakai/base.rb', line 53

# Increments the run_info[type][subtype] counter of the current run under
# the update mutex. Does nothing (returns nil) when no run info is present.
#
# @param type [Symbol] top-level run_info key holding a counter hash
# @param subtype [Symbol] counter key within that hash
# @return [Integer, nil] the updated count, or nil without run info
def self.update(type, subtype)
  if @run_info
    @update_mutex.synchronize do
      counters = @run_info[type]
      counters[subtype] += 1
    end
  end
end

.visits ⇒ Object



45
46
47
# File 'lib/tanakai/base.rb', line 45

# Visit counters (run_info[:visits]) of the current run, or nil when there
# is no run info.
def self.visits
  @run_info&.dig(:visits)
end

Instance Method Details

#add_event(scope = :custom, event) ⇒ Object



241
242
243
244
245
246
247
# File 'lib/tanakai/base.rb', line 241

# Records an event for this spider. When the spider runs with info, the
# event is counted on the class-level run statistics. Events in the :custom
# scope are additionally written to the log.
#
# @param scope [Symbol] events scope; defaults to :custom when only the
#   event is given
# @param event [Object] event identifier
def add_event(scope = :custom, event)
  self.class.add_event(scope, event) if with_info
  return unless scope == :custom

  logger.info "Spider: new event (scope: #{scope}): #{event}"
end

#browser ⇒ Object



190
191
192
# File 'lib/tanakai/base.rb', line 190

# Browser for this spider, built on first use from the instance's engine and
# config via BrowserBuilder and reused afterwards.
def browser
  return @browser if @browser

  @browser = BrowserBuilder.build(@engine, @config, spider: self)
end

#console(response = nil, url: nil, data: {}) ⇒ Object



210
211
212
# File 'lib/tanakai/base.rb', line 210

# Debugging entry point: opens a Pry session in this method's binding.
# The parameters are not used by the body itself; they are available as
# locals inside the session. The signature also matches the way #request_to
# invokes handlers (response, url:, data:).
def console(response = nil, url: nil, data: {})
  binding.pry
end

#request_to(handler, delay = nil, url:, data: {}, response_type: :html) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/tanakai/base.rb', line 194

# Visits `url` with the browser and passes the response to `handler`.
#
# @param handler [Symbol] public spider method called as
#   handler(response, url:, data:)
# @param delay [Object, nil] when given, forwarded to browser.visit as delay:
# @param url [String] address to request; its scheme must be http or https
# @param data [Hash] passed through to the handler
# @param response_type [Symbol] forwarded to browser.current_response
# @return [Object, nil] the handler's result; nil when the request is
#   skipped as a duplicate or the visit returns a falsy value
# @raise [InvalidUrlError] when the url scheme is not http/https
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  if %w[http https].exclude?(Addressable::URI.parse(url).scheme)
    raise InvalidUrlError, "Requested url scheme is invalid: #{url}"
  end

  if @config[:skip_duplicate_requests] && !unique_request?(url)
    add_event(:duplicate_requests) if self.with_info
    logger.warn "Spider: request_to: not unique url: #{url}, skipped"
    # Return unconditionally: chaining with `and return` would fall through
    # and visit the duplicate whenever the logger call returns a falsy value.
    return
  end

  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
  return unless visited

  public_send(handler, browser.current_response(response_type), **{ url: url, data: data })
end

#save_to(path, item, format:, position: true, append: false) ⇒ Object



226
227
228
229
230
231
232
233
234
235
236
237
# File 'lib/tanakai/base.rb', line 226

# Saves `item` through the Saver registered for `path`, creating the Saver
# on first use. When the spider runs with info, the Saver is shared through
# the class-level savers hash; otherwise it belongs to this instance only.
#
# @param path [String] saver key / destination passed to Saver.new
# @param item [Object] item handed to Saver#save
# @param format [Symbol] forwarded to Saver.new
# @param position [Boolean] forwarded to Saver.new
# @param append [Boolean] forwarded to Saver.new
# @return [Object] result of Saver#save
def save_to(path, item, format:, position: true, append: false)
  unless @savers[path]
    saver_options = { format: format, position: position, append: append }
    @savers[path] =
      if with_info
        self.class.savers[path] ||= Saver.new(path, **saver_options)
      else
        Saver.new(path, **saver_options)
      end
  end

  @savers[path].save(item)
end

#storage ⇒ Object



216
217
218
219
220
# File 'lib/tanakai/base.rb', line 216

# Storage used by this spider, memoized per instance. A spider running with
# info (as set up by `.crawl!`) reuses the class-level Storage instance;
# otherwise each spider instance gets its own Storage.
def storage
  return @storage if @storage

  @storage = with_info ? self.class.storage : Storage.new
end

#unique?(scope, value) ⇒ Boolean

Returns:

  • (Boolean)


222
223
224
# File 'lib/tanakai/base.rb', line 222

# Delegates to storage.unique?(scope, value) on this spider's storage.
#
# @param scope [Object] passed through to the storage
# @param value [Object] passed through to the storage
# @return [Boolean] per the YARD signature; the actual value is whatever
#   the storage returns
def unique?(scope, value)
  storage.unique?(scope, value)
end