Class: CobwebModule::Crawl

Inherits:
Object
Defined in:
lib/crawl.rb

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Crawl

Returns a new instance of Crawl.



# File 'lib/crawl.rb', line 4

def initialize(options={})
  @options = HashUtil.deep_symbolize_keys(options)

  setup_defaults

  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => Redis.new(@options[:redis_options]))
  @stats = Stats.new(@options)
  @debug = @options[:debug]
  @first_to_finish = false
end
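A minimal usage sketch, assuming a reachable Redis server; the option values here (:crawl_id, :url, :crawl_limit) are illustrative, not defaults:

require 'cobweb'

# Illustrative values -- point :redis_options at your own Redis server.
crawl = CobwebModule::Crawl.new(
  :crawl_id      => "example-crawl-1",
  :url           => "http://example.com/",
  :redis_options => { :host => "127.0.0.1", :port => 6379 },
  :crawl_limit   => 100,
  :debug         => false
)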

Instance Method Details

#already_crawled?(link = @options[:url]) ⇒ Boolean

Returns true if the requested URL is already in the "crawled" set

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 17

def already_crawled?(link=@options[:url])
  @redis.sismember "crawled", link
end

#already_queued?(link) ⇒ Boolean
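
Returns true if the link is already in the "queued" set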

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 21

def already_queued?(link)
  @redis.sismember "queued", link
end
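A sketch of the duplicate checks, assuming crawl is the instance constructed above and link is an illustrative URL:

link = "http://example.com/about"

# only consider a link that is neither crawled nor already queued
unless crawl.already_crawled?(link) || crawl.already_queued?(link)
  # safe to enqueue the link here
end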

#content ⇒ Object



# File 'lib/crawl.rb', line 125

def content
  raise "Content is not available" if @content.nil?
  CobwebModule::CrawlObject.new(@content, @options)
end
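Since #content raises when nothing has been fetched yet, callers typically guard it behind #retrieve; a sketch, again assuming the crawl instance from above:

if crawl.retrieve
  page = crawl.content   # safe: retrieve has populated @content
  puts page.mime_type
end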

#crawled_base_url ⇒ Object



# File 'lib/crawl.rb', line 213

def crawled_base_url
  @redis.get("crawled_base_url")
end

#debug_ap(value) ⇒ Object



# File 'lib/crawl.rb', line 246

def debug_ap(value)
  ap(value) if @options[:debug]
end

#debug_puts(value) ⇒ Object



# File 'lib/crawl.rb', line 250

def debug_puts(value)
  puts(value) if @options[:debug]
end

#finished ⇒ Object



# File 'lib/crawl.rb', line 189

def finished
  set_first_to_finish
  @stats.end_crawl(@options)
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 174

def finished?
  print_counters
  # finished when nothing is queued and nothing is currently running, or when the crawl limit has been reached
  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
    if queue_counter == 0 && @redis.smembers("currently_running").empty?
      finished
      return true
    end
  elsif (queue_counter == 0 && @redis.smembers("currently_running").empty?) || process_counter >= @options[:crawl_limit].to_i
    finished
    return true
  end
  false
end
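A hedged sketch of how a worker might consult #finished? after handling a job; enqueue_next_job is a hypothetical helper, not part of this class:

crawl.finished_processing   # remove this url from "currently_running"

if crawl.finished?
  # queues are drained, or the crawl limit was reached -- run finish handlers
else
  enqueue_next_job          # hypothetical helper
end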

#finished_processing ⇒ Object



# File 'lib/crawl.rb', line 170

def finished_processing
  @redis.srem "currently_running", @options[:url]
end

#first_to_finish? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 209

def first_to_finish?
  @first_to_finish
end

#lock(key, &block) ⇒ Object



# File 'lib/crawl.rb', line 225

def lock(key, &block)
  debug_puts "REQUESTING LOCK [#{key}]"
  set_nx = @redis.setnx("#{key}_lock", "locked")
  debug_puts "LOCK:#{key}:#{set_nx}"
  until set_nx
    debug_puts "===== WAITING FOR LOCK [#{key}] ====="
    sleep 0.01
    set_nx = @redis.setnx("#{key}_lock", "locked")
  end

  debug_puts "RECEIVED LOCK [#{key}]"
  @redis.expire("#{key}_lock", 10)
  begin
    result = yield
  ensure
    @redis.del("#{key}_lock")
    debug_puts "LOCK RELEASED [#{key}]"
  end
  result
end
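#lock is a simple Redis spin lock: SETNX acquires "#{key}_lock", the 10-second EXPIRE guards against a holder that dies mid-section, and the ensure block deletes the key so the lock is released even if the block raises. (A holder dying between SETNX and EXPIRE would leave a key with no TTL.) A usage sketch:

crawl.lock("update_queues") do
  # critical section -- runs while "update_queues_lock" is held in Redis
end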

#process(&block) ⇒ Object



# File 'lib/crawl.rb', line 156

def process(&block)
  if @options[:crawl_limit_by_page]
    if content.mime_type.match("text/html")
      increment_process_counter
    end
  else
    increment_process_counter
  end
  @redis.sadd "enqueued", @options[:url]

  yield if block_given?
  @redis.incr("crawl_job_enqueued_count")
end


#process_links(&block) ⇒ Object

# File 'lib/crawl.rb', line 84

def process_links(&block)

  # set the base url if this is the first page
  set_base_url @redis

  @cobweb_links = CobwebLinks.new(@options)
  if within_queue_limits?
    document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
    # remove duplicate links found within the same page
    document_links.uniq!

    # keep only links that are internal to this crawl
    internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }

    # reject links we have already crawled or queued
    internal_links.reject! { |link| @redis.sismember("crawled", link) }
    internal_links.reject! { |link| @redis.sismember("queued", link) }

    internal_links.each do |link|
      if within_queue_limits? && !already_queued?(link) && !already_crawled?(link)
        if status != CobwebCrawlHelper::CANCELLED
          yield link if block_given?
          unless link.nil?
            @redis.sadd "queued", link
            increment_queue_counter
          end
        else
          debug_puts "Cannot enqueue new content as crawl has been cancelled."
        end
      end
    end

    if @options[:store_inbound_links]
      document_links.each do |link|
        uri = URI.parse(link)
        # record the current page as an inbound link for each discovered url
        @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", @options[:url])
      end
    end
  end
end
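A sketch of driving #process_links with a block; the enqueue call is a hypothetical stand-in for whatever job backend you use:

crawl.process_links do |link|
  # each yielded link is internal, not yet crawled, and not yet queued
  MyJobQueue.enqueue(link)   # hypothetical enqueue helper
end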

#redis ⇒ Object



# File 'lib/crawl.rb', line 221

def redis
  @redis
end

#retrieve ⇒ Object



# File 'lib/crawl.rb', line 52

def retrieve
  unless @redis.sismember("currently_running", @options[:url])
    @redis.sadd("currently_running", @options[:url])
    unless already_crawled?
      if within_crawl_limits?
        @stats.update_status("Retrieving #{@options[:url]}...")
        @content = Cobweb.new(@options).get(@options[:url], @options)
        if @options[:url] == @redis.get("original_base_url")
          @redis.set("crawled_base_url", @content[:base_url])
        end
        update_queues

        if content.permitted_type?
          # update statistics for the newly retrieved content
          @stats.update_statistics(@content)
          return true
        end
      else
        decrement_queue_counter
      end
    else
      decrement_queue_counter
    end
  else
    debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
    debug_ap @redis.smembers("currently_running")
    decrement_queue_counter
  end
  false
end
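#retrieve returns true only when this worker actually fetched new content of a permitted type; a hedged sketch of the typical fetch-then-process flow:

if crawl.retrieve
  crawl.process_links do |link|
    MyJobQueue.enqueue(link)   # hypothetical enqueue helper
  end
  crawl.process do
    # per-page work goes here
  end
end
crawl.finished_processing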

#set_first_to_finish ⇒ Object



# File 'lib/crawl.rb', line 194

def set_first_to_finish
  @redis.watch("first_to_finish") do
    if !@redis.exists("first_to_finish")
      @redis.multi do
        debug_puts "set first to finish"
        @first_to_finish = true
        @redis.set("first_to_finish", 1)
      end
    else
      @redis.unwatch
    end
  end
end
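The WATCH/MULTI pair guards the check-and-set: if another worker writes first_to_finish between the exists check and EXEC, Redis discards the queued SET, so only one worker's SET takes effect.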

#statistics ⇒ Object



# File 'lib/crawl.rb', line 217

def statistics
  @stats.get_statistics
end

#to_be_processed? ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 152

def to_be_processed?
  (!finished? || within_process_limits?) && !@redis.sismember("enqueued", @options[:url])
end

#update_queues ⇒ Object



# File 'lib/crawl.rb', line 130

def update_queues
  lock("update_queues") do
    # move the url from the queued list to the crawled list, for both the
    # original url and the content url (to handle redirects)
    @redis.srem "queued", @options[:url]
    @redis.sadd "crawled", @options[:url]
    if content.url != @options[:url]
      @redis.srem "queued", content.url
      @redis.sadd "crawled", content.url
    end
    # increment the crawl counter, unless we are limiting by page -- in which case only count HTML pages
    if @options[:crawl_limit_by_page]
      if content.mime_type.match("text/html")
        increment_crawl_counter
      end
    else
      increment_crawl_counter
    end
    decrement_queue_counter
  end
end

#within_crawl_limits? ⇒ Boolean

Returns true if the crawl count is within the crawl limit

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 26

def within_crawl_limits?
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
end

#within_process_limits? ⇒ Boolean

Returns true if the processed count is within the crawl limit

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 31

def within_process_limits?
  @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
end

#within_queue_limits? ⇒ Boolean

Returns true if the queued count plus the crawled count is still within the crawl limit

Returns:

  • (Boolean)


# File 'lib/crawl.rb', line 36

def within_queue_limits?

  # if we are limiting by page we can't limit the queue size, as we don't know the MIME type until the content is retrieved
  if @options[:crawl_limit_by_page]
    return true

    # if a crawl limit is set, limit queue size to crawled + queue
  elsif @options[:crawl_limit].to_i > 0
    (queue_counter + crawl_counter) < @options[:crawl_limit].to_i

    # no crawl limit set so always within queue limit
  else
    true
  end
end
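A worked example of the middle branch, with illustrative numbers: with :crawl_limit => 100, crawl_counter == 60 and queue_counter == 39, the check is (39 + 60) < 100, so one more link may be queued; once queue_counter reaches 40 the sum hits the limit and the method returns false.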