Class: CobwebModule::Crawl

Inherits:

Object

Object
CobwebModule::Crawl

show all

Defined in: lib/crawl.rb

Instance Method Summary collapse

#already_crawled?(link = @options[:url]) ⇒ Boolean

Returns true if the url requested is already in the crawled queue.
#already_handled?(link) ⇒ Boolean
#already_queued?(link) ⇒ Boolean
#already_running?(link) ⇒ Boolean
#cancelled? ⇒ Boolean
#content ⇒ Object
#crawled_base_url ⇒ Object
#debug_ap(value) ⇒ Object
#debug_puts(value) ⇒ Object
#finish ⇒ Object
#finished? ⇒ Boolean
#finished_processing ⇒ Object
#first_to_finish? ⇒ Boolean
#initialize(options = {}) ⇒ Crawl constructor

A new instance of Crawl.
#lock(key, &block) ⇒ Object
#process(&block) ⇒ Object
#process_links(&block) ⇒ Object
#redis ⇒ Object
#retrieve ⇒ Object
#set_first_to_finish ⇒ Object
#statistics ⇒ Object
#to_be_processed? ⇒ Boolean
#update_counters ⇒ Object
#update_queues ⇒ Object
#within_crawl_limits? ⇒ Boolean

Returns true if the crawl count is within limits.
#within_process_limits? ⇒ Boolean

Returns true if the processed count is within limits.
#within_queue_limits? ⇒ Boolean

Returns true if the queue count is calculated to be still within limits when complete.

Constructor Details

#initialize(options = {}) ⇒ `Crawl`

# File 'lib/crawl.rb', line 4

# Builds a crawl handle from the supplied options.
#
# Option keys are passed through HashUtil.deep_symbolize_keys, then
# setup_defaults (defined elsewhere in this class) is run. A Redis
# namespace named after the Cobweb version and the :crawl_id option is
# opened on a RedisConnection built from :redis_options.
#
# @param options [Hash] crawl options; this method reads :crawl_id,
#   :redis_options and :debug
def initialize(options = {})
  @options = HashUtil.deep_symbolize_keys(options)
  setup_defaults

  namespace = "cobweb-#{Cobweb.version}-#{@options[:crawl_id]}"
  connection = RedisConnection.new(@options[:redis_options])
  @redis = Redis::Namespace.new(namespace, :redis => connection)

  @stats = Stats.new(@options)
  @debug = @options[:debug]
  # Only flipped to true by set_first_to_finish.
  @first_to_finish = false
end

Instance Method Details

#already_crawled?(link = @options[:url]) ⇒ `Boolean`

Returns true if the url requested is already in the crawled queue



17
18
19

# File 'lib/crawl.rb', line 17

# Checks whether a URL is a member of the Redis "crawled" set.
#
# @param link [String] URL to look up; defaults to this crawl's :url option
# @return [Boolean] whatever @redis.sismember reports for the "crawled" set
def already_crawled?(link = @options[:url])
  @redis.sismember("crawled", link)
end

#already_handled?(link) ⇒ `Boolean`



29
30
31

# File 'lib/crawl.rb', line 29

# Checks whether a URL has been seen in any crawl state.
#
# The checks run in order (crawled, queued, currently running) and stop
# at the first one that is truthy.
#
# @param link [String] URL to look up
# @return [Boolean] truthy when any of the three checks is truthy
def already_handled?(link)
  already_crawled?(link) ||
    already_queued?(link) ||
    already_running?(link)
end

#already_queued?(link) ⇒ `Boolean`



21
22
23

# File 'lib/crawl.rb', line 21

# Checks whether a URL is a member of the Redis "queued" set.
#
# @param link [String] URL to look up
# @return [Boolean] whatever @redis.sismember reports for the "queued" set
def already_queued?(link)
  @redis.sismember("queued", link)
end

#already_running?(link) ⇒ `Boolean`



25
26
27

# File 'lib/crawl.rb', line 25

# Checks whether a URL is a member of the Redis "currently_running" set.
#
# @param link [String] URL to look up
# @return [Boolean] whatever @redis.sismember reports for "currently_running"
def already_running?(link)
  @redis.sismember("currently_running", link)
end

#cancelled? ⇒ `Boolean`



33
34
35

# File 'lib/crawl.rb', line 33

# Reports whether the crawl statistics carry the "Cancelled" status.
#
# @return [Boolean] true when the :current_status statistic equals "Cancelled"
def cancelled?
  current_status = @stats.get_statistics[:current_status]
  current_status == "Cancelled"
end

#content ⇒ `Object`

# File 'lib/crawl.rb', line 157

# Wraps the retrieved content hash in a CobwebModule::CrawlObject.
#
# @return [CobwebModule::CrawlObject] new wrapper around @content and @options
# @raise [RuntimeError] "Content is not available" when @content is nil
def content
  if @content.nil?
    raise "Content is not available"
  end

  CobwebModule::CrawlObject.new(@content, @options)
end

#crawled_base_url ⇒ `Object`



260
261
262

# File 'lib/crawl.rb', line 260

# Reads the value stored under the "crawled_base_url" Redis key.
#
# @return [Object] whatever @redis.get returns for that key
def crawled_base_url
  key = "crawled_base_url"
  @redis.get(key)
end

#debug_ap(value) ⇒ `Object`



293
294
295

# File 'lib/crawl.rb', line 293

# Passes a value to `ap`, but only when the :debug option is set.
#
# @param value [Object] value to print
# @return [Object, nil] result of `ap`, or nil when debugging is off
def debug_ap(value)
  return unless @options[:debug]

  ap(value)
end

#debug_puts(value) ⇒ `Object`



297
298
299

# File 'lib/crawl.rb', line 297

# Writes a value with `puts`, but only when the :debug option is set.
#
# @param value [Object] value to print
# @return [nil]
def debug_puts(value)
  return unless @options[:debug]

  puts(value)
end

#finish ⇒ `Object`

# File 'lib/crawl.rb', line 229

# Wraps up the crawl: prints a debug summary, tries to claim the
# "first to finish" marker, then tells the stats object the crawl ended.
#
# @return [Object] whatever @stats.end_crawl returns
def finish
  rule = "========================================================================"

  debug_puts ""
  debug_puts rule
  debug_puts "finished crawl on #{@options[:url]}"
  print_counters
  debug_puts rule
  debug_puts ""

  set_first_to_finish
  @stats.end_crawl(@options)
end

#finished? ⇒ `Boolean`

# File 'lib/crawl.rb', line 208

# Decides whether the crawl has nothing left to do.
#
# With no crawl limit (:crawl_limit nil or 0) the crawl is finished when
# the queue counter is zero and the "currently_running" set is empty.
# With a limit it is finished when the queue counter is zero or the
# process counter has reached the limit, and nothing is running.
#
# @return [Boolean]
def finished?
  print_counters
  debug_puts @stats.get_status
  debug_puts "Already Finished!" if @stats.get_status == CobwebCrawlHelper::FINISHED

  limit = @options[:crawl_limit]

  if limit.nil? || limit == 0
    return false unless queue_counter == 0 && @redis.smembers("currently_running").empty?

    debug_puts "queue_counter is 0 and currently_running is empty so we're done"
    return true
  end

  out_of_work = queue_counter == 0 || process_counter >= limit.to_i
  return false unless out_of_work && @redis.smembers("currently_running").empty?

  debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{limit.to_i}"
  true
end

#finished_processing ⇒ `Object`



204
205
206

# File 'lib/crawl.rb', line 204

# Removes this crawl's :url from the Redis "currently_running" set.
#
# @return [Object] whatever @redis.srem returns
def finished_processing
  @redis.srem("currently_running", @options[:url])
end

#first_to_finish? ⇒ `Boolean`



256
257
258

# File 'lib/crawl.rb', line 256

# Reports the @first_to_finish flag, which starts out false in the
# constructor and is switched on by set_first_to_finish.
#
# @return [Boolean] current value of @first_to_finish
def first_to_finish?
  @first_to_finish
end

#lock(key, &block) ⇒ `Object`

# File 'lib/crawl.rb', line 272

# Runs the given block while holding a Redis-backed lock named "<key>_lock".
#
# The lock is taken with a single SET ... NX EX 30, so the key is created
# and given its 30 second expiry atomically. Acquiring with SETNX and then
# calling EXPIRE separately leaves a window in which a worker that dies
# between the two calls strands a lock with no TTL, and every other worker
# then spins here forever.
#
# While the lock is held elsewhere this polls every 0.01 seconds. The lock
# key is deleted when the block finishes, including when it raises.
#
# @param key [String] logical lock name; "_lock" is appended for the Redis key
# @yield the work to perform while holding the lock
# @return [Object] the block's return value
def lock(key, &block)
  lock_key = "#{key}_lock"

  debug_puts "REQUESTING LOCK [#{key}]"
  acquired = @redis.set(lock_key, "locked", :nx => true, :ex => 30)
  debug_puts "LOCK:#{key}:#{acquired}"

  until acquired
    debug_puts "===== WAITING FOR LOCK [#{key}] ====="
    sleep 0.01
    acquired = @redis.set(lock_key, "locked", :nx => true, :ex => 30)
  end

  debug_puts "RECEIVED LOCK [#{key}]"
  begin
    yield
  ensure
    # Always release, even if the block raised.
    @redis.del(lock_key)
  end
end

#process(&block) ⇒ `Object`

# File 'lib/crawl.rb', line 188

# Counts the current page as processed, runs the caller's block, then
# increments the "crawl_job_enqueued_count" Redis counter.
#
# The process counter is bumped under the "process-count" lock. When
# :crawl_limit_by_page is set, only content whose mime type matches
# "text/html" is counted.
#
# @yield optional work to perform after the counter update
# @return [Object] whatever @redis.incr returns
def process(&block)
  lock("process-count") do
    counts_as_processed = !@options[:crawl_limit_by_page] || content.mime_type.match("text/html")
    increment_process_counter if counts_as_processed
  end

  yield if block_given?
  @redis.incr("crawl_job_enqueued_count")
end

#process_links(&block) ⇒ `Object`

# File 'lib/crawl.rb', line 110

# Extracts the links from the retrieved content and queues the new
# internal ones.
#
# Steps, all skipped when within_queue_limits? is false:
# 1. Parse http/https links out of content.body and drop duplicates.
# 2. Keep links CobwebLinks considers internal; with :treat_https_as_http
#    rewrite a leading "https" to "http".
# 3. Under the "internal-links" lock, for each link that is still within
#    queue limits and not already handled: yield it to the caller, add it
#    to the "queued" set and increment the queue counter. Nothing is
#    queued once the crawl status is CANCELLED.
# 4. With :store_inbound_links, record this crawl's :url in the
#    "inbound_links_<md5 of normalized link>" set for every parsed link.
#
# @yield [link] each internal link about to be queued
# @return [Object] not meaningful; callers should rely on the side effects
def process_links(&block)
  # Per the original note this sets the base url if this is the first page;
  # set_base_url itself is defined elsewhere in the class.
  set_base_url @redis

  @cobweb_links = CobwebLinks.new(@options)
  return unless within_queue_limits?

  document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])
  # get rid of duplicate links in the same page
  document_links.uniq!

  internal_links = document_links.select { |link| @cobweb_links.internal?(link) }

  # if the site has the same content for http and https then normalize to http
  if @options[:treat_https_as_http]
    internal_links.map! { |link| link.gsub(/^https/, "http") }
  end

  # cheap pre-filter outside the lock; re-checked inside it below
  internal_links.reject! { |link| already_handled?(link) }

  lock("internal-links") do
    internal_links.each do |link|
      next unless within_queue_limits? && !already_handled?(link)

      if status != CobwebCrawlHelper::CANCELLED
        yield link if block_given?
        unless link.nil?
          @redis.sadd "queued", link
          increment_queue_counter
        end
      else
        debug_puts "Cannot enqueue new content as crawl has been cancelled."
      end
    end
  end

  if @options[:store_inbound_links]
    # The page being processed is the source of every link found on it.
    # This used to reference a bare `url`, which is not defined in this
    # method and raised NameError whenever the option was enabled.
    source_url = @options[:url]
    document_links.each do |link|
      uri = URI.parse(link).normalize
      @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", source_url)
    end
  end
end

#redis ⇒ `Object`



268
269
270

# File 'lib/crawl.rb', line 268

# Exposes the namespaced Redis handle created in the constructor.
#
# @return [Object] the value of @redis
def redis
  @redis
end

#retrieve ⇒ `Object`

# File 'lib/crawl.rb', line 64

# Fetches this crawl's :url unless something says it should be skipped.
#
# Skip conditions, checked in this order, each returning false:
# * the crawl is cancelled (prints a banner),
# * the url is already in "currently_running" (debug output, queue
#   counter decremented),
# * the url is already crawled (prints a banner, queue counter decremented),
# * after update_queues, the crawl is outside its crawl limits (prints a
#   banner, queue counter decremented).
#
# Otherwise the url is added to "currently_running", the content is
# fetched through Cobweb#get and stored in @content, counters are updated,
# and "crawled_base_url" is recorded when the url equals the stored
# "original_base_url".
#
# @return [Boolean] true only when content was fetched and is of a
#   permitted type (statistics are updated in that case); false otherwise
def retrieve
  if cancelled?
    puts "======================================="
    puts "CRAWL CANCELLED"
    puts "======================================="
    return false
  end

  if already_running?(@options[:url])
    debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
    debug_ap @redis.smembers("currently_running")
    decrement_queue_counter
    return false
  end

  if already_crawled?(@options[:url])
    puts "======================================="
    puts "ALREADY CRAWLED"
    puts "======================================="
    decrement_queue_counter
    return false
  end

  update_queues

  unless within_crawl_limits?
    puts "======================================="
    puts "OUTWITH CRAWL LIMITS"
    puts "======================================="
    decrement_queue_counter
    return false
  end

  @redis.sadd("currently_running", @options[:url])
  @stats.update_status("Retrieving #{@options[:url]}...")
  @content = Cobweb.new(@options).get(@options[:url], @options)
  update_counters

  if @options[:url] == @redis.get("original_base_url")
    @redis.set("crawled_base_url", @content[:base_url])
  end

  return false unless content.permitted_type?

  ## update statistics
  @stats.update_statistics(@content)
  true
end

#set_first_to_finish ⇒ `Object`

# File 'lib/crawl.rb', line 241

# Tries to claim the "first_to_finish" marker for this worker.
#
# The claim is a single SETNX, which only succeeds for the one caller that
# creates the key, so at most one worker ends up with @first_to_finish set.
# The WATCH/MULTI version assigned @first_to_finish inside the MULTI block,
# i.e. before EXEC ran, so a worker whose transaction was aborted by a
# concurrent writer still believed it had finished first.
#
# @first_to_finish is only ever switched on here, never reset to false.
#
# @return [Object] not meaningful; read the outcome via first_to_finish?
def set_first_to_finish
  return unless @redis.setnx("first_to_finish", 1)

  debug_puts "set first to finish"
  @first_to_finish = true
end

#statistics ⇒ `Object`



264
265
266

# File 'lib/crawl.rb', line 264

# Returns the crawl statistics as reported by the Stats object built in
# the constructor.
#
# @return [Object] whatever @stats.get_statistics returns (cancelled?
#   indexes it with :current_status, so presumably a Hash)
def statistics
  @stats.get_statistics
end

#to_be_processed? ⇒ `Boolean`



184
185
186

# File 'lib/crawl.rb', line 184

# Decides whether this crawl's :url should be handed on for processing.
#
# It qualifies when the crawl is not finished, the process count is
# within limits, and the url is not a member of the "queued" set. The
# checks run in that order and stop at the first one that fails.
#
# @return [Boolean]
def to_be_processed?
  return false if finished?
  return false unless within_process_limits?

  !@redis.sismember("queued", @options[:url])
end

#update_counters ⇒ `Object`

# File 'lib/crawl.rb', line 173

# Adjusts the counters after a page has been retrieved.
#
# The crawl counter is incremented unless :crawl_limit_by_page is set and
# the content's mime type does not match "text/html". The queue counter
# is always decremented.
#
# @return [Object] whatever decrement_queue_counter returns
def update_counters
  counts_as_crawl = !@options[:crawl_limit_by_page] || content.mime_type.match("text/html")
  increment_crawl_counter if counts_as_crawl
  decrement_queue_counter
end

#update_queues ⇒ `Object`

# File 'lib/crawl.rb', line 162

# Moves this crawl's :url from the "queued" set to the "crawled" set
# while holding the "update_queues" lock.
#
# @return [Object] whatever lock returns for the block
def update_queues
  lock("update_queues") do
    @redis.srem("queued", @options[:url])
    @redis.sadd("crawled", @options[:url])
  end
end

#within_crawl_limits? ⇒ `Boolean`

Returns true if the crawl count is within limits



38
39
40

# File 'lib/crawl.rb', line 38

# Reports whether another page may still be crawled.
#
# @return [Boolean] true when :crawl_limit is nil, otherwise whether the
#   crawl counter is below the limit converted with to_i
def within_crawl_limits?
  limit = @options[:crawl_limit]
  return true if limit.nil?

  crawl_counter < limit.to_i
end

#within_process_limits? ⇒ `Boolean`

Returns true if the processed count is within limits



43
44
45

# File 'lib/crawl.rb', line 43

# Reports whether another page may still be processed.
#
# @return [Boolean] true when :crawl_limit is nil, otherwise whether the
#   process counter is below the limit converted with to_i
def within_process_limits?
  limit = @options[:crawl_limit]
  return true if limit.nil?

  process_counter < limit.to_i
end

#within_queue_limits? ⇒ `Boolean`

Returns true if the queue count is calculated to be still within limits when complete

# File 'lib/crawl.rb', line 48

# Reports whether more links may be queued without exceeding the crawl limit.
#
# * With :crawl_limit_by_page the queue is never limited, because the
#   mime type is not known until the content has been retrieved.
# * With a positive :crawl_limit, queued plus crawled must stay below it.
# * Otherwise there is no limit.
#
# @return [Boolean]
def within_queue_limits?
  return true if @options[:crawl_limit_by_page]

  limit = @options[:crawl_limit].to_i
  return true unless limit > 0

  (queue_counter + crawl_counter) < limit
end

Class: CobwebModule::Crawl

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Crawl

Instance Method Details

#already_crawled?(link = @options[:url]) ⇒ Boolean

#already_handled?(link) ⇒ Boolean

#already_queued?(link) ⇒ Boolean

#already_running?(link) ⇒ Boolean

#cancelled? ⇒ Boolean

#content ⇒ Object

#crawled_base_url ⇒ Object

#debug_ap(value) ⇒ Object

#debug_puts(value) ⇒ Object

#finish ⇒ Object

#finished? ⇒ Boolean

#finished_processing ⇒ Object

#first_to_finish? ⇒ Boolean

#lock(key, &block) ⇒ Object

#process(&block) ⇒ Object

#process_links(&block) ⇒ Object

#redis ⇒ Object

#retrieve ⇒ Object

#set_first_to_finish ⇒ Object

#statistics ⇒ Object

#to_be_processed? ⇒ Boolean

#update_counters ⇒ Object

#update_queues ⇒ Object

#within_crawl_limits? ⇒ Boolean

#within_process_limits? ⇒ Boolean

#within_queue_limits? ⇒ Boolean