Method: CobwebModule::Crawl#retrieve

Defined in:
lib/crawl.rb

#retrieve ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/crawl.rb', line 64

# Attempts to fetch the page at @options[:url] and record statistics for it.
#
# Checks are made in this order, and the first one that fails ends the call:
#   1. the crawl has been cancelled             -> prints a banner
#   2. the url is already being processed       -> debug output, queue counter decremented
#   3. the url has already been crawled         -> prints a banner, queue counter decremented
#   4. (after update_queues) outside the limits -> prints a banner, queue counter decremented
#
# When every check passes, the url is added to the "currently_running" redis
# set, the page is fetched through Cobweb into @content, the counters are
# updated, and - if the url equals the stored "original_base_url" - the
# fetched :base_url is stored under "crawled_base_url".
#
# @return [Boolean] true only when the page was fetched and
#   content.permitted_type? holds (statistics are updated in that case);
#   false in every other case.
def retrieve
  rule = "======================================="

  if cancelled?
    puts rule, "CRAWL CANCELLED", rule
    return false
  end

  if already_running? @options[:url]
    debug_puts "\n\nDETECTED DUPLICATE JOB for #{@options[:url]}\n"
    debug_ap @redis.smembers("currently_running")
    decrement_queue_counter
    return false
  end

  if already_crawled? @options[:url]
    puts rule, "ALREADY CRAWLED", rule
    decrement_queue_counter
    return false
  end

  # Queues are refreshed before the limits are evaluated.
  update_queues

  unless within_crawl_limits?
    puts rule, "OUTWITH CRAWL LIMITS", rule
    decrement_queue_counter
    return false
  end

  # Mark the url as in flight before fetching it.
  @redis.sadd("currently_running", @options[:url])
  @stats.update_status("Retrieving #{@options[:url]}...")
  @content = Cobweb.new(@options).get(@options[:url], @options)
  update_counters

  # Remember the base url reported for the crawl's starting url.
  @redis.set("crawled_base_url", @content[:base_url]) if @options[:url] == @redis.get("original_base_url")

  return false unless content.permitted_type?

  @stats.update_statistics(@content)
  true
end