Class: CobwebModule::Crawl
- Inherits: Object
- Defined in: lib/crawl.rb
Instance Method Summary
- #already_crawled?(link = @options[:url]) ⇒ Boolean
  Returns true if the URL requested is already in the crawled queue.
- #already_handled?(link) ⇒ Boolean
- #already_queued?(link) ⇒ Boolean
- #already_running?(link) ⇒ Boolean
- #content ⇒ Object
- #crawled_base_url ⇒ Object
- #debug_ap(value) ⇒ Object
- #debug_puts(value) ⇒ Object
- #finish ⇒ Object
- #finished? ⇒ Boolean
- #finished_processing ⇒ Object
- #first_to_finish? ⇒ Boolean
- #initialize(options = {}) ⇒ Crawl constructor
  A new instance of Crawl.
- #lock(key, &block) ⇒ Object
- #process(&block) ⇒ Object
- #process_links(&block) ⇒ Object
- #redis ⇒ Object
- #retrieve ⇒ Object
- #set_first_to_finish ⇒ Object
- #statistics ⇒ Object
- #to_be_processed? ⇒ Boolean
- #update_counters ⇒ Object
- #update_queues ⇒ Object
- #within_crawl_limits? ⇒ Boolean
  Returns true if the crawl count is within limits.
- #within_process_limits? ⇒ Boolean
  Returns true if the processed count is within limits.
- #within_queue_limits? ⇒ Boolean
  Returns true if the queue count is calculated to still be within limits when complete.
Constructor Details
#initialize(options = {}) ⇒ Crawl
# File 'lib/crawl.rb', line 4

def initialize(options={})
  @options = HashUtil.deep_symbolize_keys(options)

  setup_defaults

  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@options[:crawl_id]}", :redis => RedisConnection.new(@options[:redis_options]))
  @stats = Stats.new(@options)
  @debug = @options[:debug]
  @first_to_finish = false
end
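A minimal construction sketch. The option values are illustrative only, and a reachable Redis server is assumed since the constructor immediately opens a namespaced connection:

require 'cobweb'

# :url, :crawl_id and :debug are option keys this class reads; other
# keys fall back to setup_defaults. The values below are examples.
crawl = CobwebModule::Crawl.new(
  :url      => "http://example.com/",
  :crawl_id => "abc123",
  :debug    => true
)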
Instance Method Details
#already_crawled?(link = @options[:url]) ⇒ Boolean
Returns true if the URL requested is already in the crawled queue.
# File 'lib/crawl.rb', line 17

def already_crawled?(link=@options[:url])
  @redis.sismember "crawled", link
end
#already_handled?(link) ⇒ Boolean
29 30 31 |
# File 'lib/crawl.rb', line 29 def already_handled?(link) already_crawled?(link) || already_queued?(link) || already_running?(link) end |
#already_queued?(link) ⇒ Boolean
# File 'lib/crawl.rb', line 21

def already_queued?(link)
  @redis.sismember "queued", link
end
#already_running?(link) ⇒ Boolean
# File 'lib/crawl.rb', line 25

def already_running?(link)
  @redis.sismember "currently_running", link
end
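The three membership predicates map directly onto the Redis sets "crawled", "queued" and "currently_running", and #already_handled? is simply their disjunction. A sketch with a hypothetical link, reusing the crawl instance from the constructor example:

link = "http://example.com/page"

crawl.already_queued?(link)   # waiting in the "queued" set?
crawl.already_running?(link)  # being fetched right now?
crawl.already_crawled?(link)  # fetched on a previous pass?
crawl.already_handled?(link)  # true if any of the above is true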
#content ⇒ Object
# File 'lib/crawl.rb', line 144

def content
  raise "Content is not available" if @content.nil?
  CobwebModule::CrawlObject.new(@content, @options)
end
#crawled_base_url ⇒ Object
# File 'lib/crawl.rb', line 247

def crawled_base_url
  @redis.get("crawled_base_url")
end
#debug_ap(value) ⇒ Object
# File 'lib/crawl.rb', line 280

def debug_ap(value)
  ap(value) if @options[:debug]
end
#debug_puts(value) ⇒ Object
# File 'lib/crawl.rb', line 284

def debug_puts(value)
  puts(value) if @options[:debug]
end
#finish ⇒ Object
# File 'lib/crawl.rb', line 216

def finish
  debug_puts ""
  debug_puts "========================================================================"
  debug_puts "finished crawl on #{@options[:url]}"
  print_counters
  debug_puts "========================================================================"
  debug_puts ""

  set_first_to_finish
  @stats.end_crawl(@options)
end
#finished? ⇒ Boolean
# File 'lib/crawl.rb', line 195

def finished?
  print_counters
  debug_puts @stats.get_status
  if @stats.get_status == CobwebCrawlHelper::FINISHED
    debug_puts "Already Finished!"
  end
  # if there's nothing left queued or the crawled limit has been reached and we're not still processing something
  if @options[:crawl_limit].nil? || @options[:crawl_limit] == 0
    if queue_counter == 0 && @redis.smembers("currently_running").empty?
      debug_puts "queue_counter is 0 and currently_running is empty so we're done"
      #finished
      return true
    end
  elsif (queue_counter == 0 || process_counter >= @options[:crawl_limit].to_i) && @redis.smembers("currently_running").empty?
    #finished
    debug_puts "queue_counter: #{queue_counter}, @redis.smembers(\"currently_running\").empty?: #{@redis.smembers("currently_running").empty?}, process_counter: #{process_counter}, @options[:crawl_limit].to_i: #{@options[:crawl_limit].to_i}"
    return true
  end
  false
end
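Since finished? only reads counters and set membership, it is safe to poll from outside the worker processes. A hypothetical wait loop, again using the crawl from the constructor example:

# Block until the crawl reports completion, then read its statistics.
sleep 0.5 until crawl.finished?
puts crawl.statistics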
#finished_processing ⇒ Object
# File 'lib/crawl.rb', line 191

def finished_processing
  @redis.srem "currently_running", @options[:url]
end
#first_to_finish? ⇒ Boolean
# File 'lib/crawl.rb', line 243

def first_to_finish?
  @first_to_finish
end
#lock(key, &block) ⇒ Object
# File 'lib/crawl.rb', line 259

def lock(key, &block)
  debug_puts "REQUESTING LOCK [#{key}]"
  set_nx = @redis.setnx("#{key}_lock", "locked")
  debug_puts "LOCK:#{key}:#{set_nx}"
  while !set_nx
    debug_puts "===== WAITING FOR LOCK [#{key}] ====="
    sleep 0.01
    set_nx = @redis.setnx("#{key}_lock", "locked")
  end
  debug_puts "RECEIVED LOCK [#{key}]"
  @redis.expire("#{key}_lock", 30)
  begin
    result = yield
  ensure
    @redis.del("#{key}_lock")
    #debug_puts "LOCK RELEASED [#{key}]"
  end
  result
end
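This is a SETNX spin lock with a 30-second expiry as a safety net against a holder dying before the ensure block runs; note the expiry is only set after SETNX succeeds, so a process dying between those two calls would leave the lock stuck, a trade-off of this minimal pattern. Locks are scoped by key, so a caller-chosen key will not contend with the gem's internal ones. A hypothetical critical section:

crawl.lock("my-counter") do
  # read-modify-write made atomic with respect to other holders of
  # the "my-counter" lock
  value = crawl.redis.get("my-counter").to_i
  crawl.redis.set("my-counter", value + 1)
end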
#process(&block) ⇒ Object
# File 'lib/crawl.rb', line 175

def process(&block)
  lock("process-count") do
    if @options[:crawl_limit_by_page]
      if content.mime_type.match("text/html")
        increment_process_counter
      end
    else
      increment_process_counter
    end
    #@redis.sadd "queued", @options[:url]
  end

  yield if block_given?
  @redis.incr("crawl_job_enqueued_count")
end
#process_links(&block) ⇒ Object
# File 'lib/crawl.rb', line 101

def process_links &block
  # set the base url if this is the first page
  set_base_url @redis

  @cobweb_links = CobwebLinks.new(@options)
  if within_queue_limits?
    document_links = ContentLinkParser.new(@options[:url], content.body, @options).all_links(:valid_schemes => [:http, :https])

    #get rid of duplicate links in the same page.
    document_links.uniq!

    # select the link if its internal
    internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }

    # reject the link if we've crawled it or queued it
    internal_links.reject! { |link| already_handled?(link) }

    lock("internal-links") do
      internal_links.each do |link|
        if within_queue_limits? && !already_handled?(link)
          if status != CobwebCrawlHelper::CANCELLED
            yield link if block_given?
            unless link.nil?
              @redis.sadd "queued", link
              increment_queue_counter
            end
          else
            debug_puts "Cannot enqueue new content as crawl has been cancelled."
          end
        end
      end
    end

    if @options[:store_inbound_links]
      document_links.each do |link|
        uri = URI.parse(link).normalize
        @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
      end
    end
  end
end
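The block receives each new internal link that survives de-duplication and the queue limits; actually scheduling that link for retrieval is left to the caller. A sketch (the enqueue step is hypothetical):

crawl.process_links do |link|
  # hand the link to whatever queueing backend drives the crawl
  # (e.g. a Resque or Sidekiq job); shown here as a log line only
  puts "queued #{link}"
end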
#redis ⇒ Object
# File 'lib/crawl.rb', line 255

def redis
  @redis
end
#retrieve ⇒ Object
# File 'lib/crawl.rb', line 60

def retrieve
  unless already_running? @options[:url]
    unless already_crawled? @options[:url]
      update_queues
      if within_crawl_limits?
        @redis.sadd("currently_running", @options[:url])
        @stats.update_status("Retrieving #{@options[:url]}...")
        @content = Cobweb.new(@options).get(@options[:url], @options)
        update_counters

        if @options[:url] == @redis.get("original_base_url")
          @redis.set("crawled_base_url", @content[:base_url])
        end

        if content.permitted_type?
          ## update statistics
          @stats.update_statistics(@content)
          return true
        end
      else
        puts "======================================="
        puts "OUTWITH CRAWL LIMITS"
        puts "======================================="
        decrement_queue_counter
      end
    else
      puts "======================================="
      puts "ALREADY CRAWLED"
      puts "======================================="
      decrement_queue_counter
    end
  else
    debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
    debug_ap @redis.smembers("currently_running")
    decrement_queue_counter
  end
  false
end
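Taken together, retrieve, process_links, process and finished_processing make up one worker iteration: fetch the URL if limits allow, queue its internal links, count it as processed, then remove it from "currently_running". A loose sketch of that cycle, not a verbatim copy of the gem's own crawl job:

if crawl.retrieve
  crawl.process_links do |link|
    puts "queued #{link}"
  end
  if crawl.to_be_processed?
    crawl.process do
      puts crawl.content.mime_type  # content is available after retrieve
    end
  end
end
crawl.finished_processing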
#set_first_to_finish ⇒ Object
# File 'lib/crawl.rb', line 228

def set_first_to_finish
  @redis.watch("first_to_finish") do
    if !@redis.exists("first_to_finish")
      @redis.multi do
        debug_puts "set first to finish"
        @first_to_finish = true
        @redis.set("first_to_finish", 1)
      end
    else
      @redis.unwatch
    end
  end
end
#statistics ⇒ Object
# File 'lib/crawl.rb', line 251

def statistics
  @stats.get_statistics
end
#to_be_processed? ⇒ Boolean
# File 'lib/crawl.rb', line 171

def to_be_processed?
  !finished? && within_process_limits? && !@redis.sismember("queued", @options[:url])
end
#update_counters ⇒ Object
# File 'lib/crawl.rb', line 160

def update_counters
  if @options[:crawl_limit_by_page]
    if content.mime_type.match("text/html")
      increment_crawl_counter
    end
  else
    increment_crawl_counter
  end
  decrement_queue_counter
end
#update_queues ⇒ Object
# File 'lib/crawl.rb', line 149

def update_queues
  lock("update_queues") do
    #@redis.incr "inprogress"
    # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
    @redis.srem "queued", @options[:url]
    @redis.sadd "crawled", @options[:url]

    # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
  end
end
#within_crawl_limits? ⇒ Boolean
Returns true if the crawl count is within limits.
# File 'lib/crawl.rb', line 34

def within_crawl_limits?
  @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
end
#within_process_limits? ⇒ Boolean
Returns true if the processed count is within limits.
# File 'lib/crawl.rb', line 39

def within_process_limits?
  @options[:crawl_limit].nil? || process_counter < @options[:crawl_limit].to_i
end
#within_queue_limits? ⇒ Boolean
Returns true if the queue count is calculated to still be within limits when complete.
# File 'lib/crawl.rb', line 44

def within_queue_limits?
  # if we are limiting by page we can't limit the queue size as we don't know the mime type until retrieved
  if @options[:crawl_limit_by_page]
    return true
  # if a crawl limit is set, limit queue size to crawled + queue
  elsif @options[:crawl_limit].to_i > 0
    (queue_counter + crawl_counter) < @options[:crawl_limit].to_i
  # no crawl limit set so always within queue limit
  else
    true
  end
end
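A standalone worked example of the limit arithmetic, with plain locals standing in for the Redis-backed counters:

crawl_limit   = 100  # @options[:crawl_limit]
crawl_counter = 60   # pages already crawled
queue_counter = 40   # links currently queued

within_crawl = crawl_limit.nil? || crawl_counter < crawl_limit.to_i
within_queue = (queue_counter + crawl_counter) < crawl_limit.to_i

puts within_crawl  # => true  (60 < 100, retrieval may continue)
puts within_queue  # => false (40 + 60 is not < 100, stop queueing)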