Class: Stats

Inherits:
Object
  • Object
show all
Defined in:
lib/stats.rb

Overview

The Stats class is the main statistics hub for monitoring crawls. Statistics can either be viewed through the Sinatra interface or returned from the CobwebCrawler.crawl method or block.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Stats

Sets up redis usage for statistics



8
9
10
11
12
13
# File 'lib/stats.rb', line 8

# Builds the redis connections used for crawl statistics.
#
# Creates a plain Redis client from options[:redis_options], a Mutex, and a
# Redis::Namespace wrapper whose namespace is built from Cobweb.version and
# options[:crawl_id].
#
# @param options [Hash] reads :redis_options and :crawl_id; when
#   :redis_options is absent it is set to {} on the hash that was passed in
def initialize(options)
  # Callers receive the defaulted key back on their own hash.
  unless options.has_key?(:redis_options)
    options[:redis_options] = {}
  end

  @full_redis = Redis.new(options[:redis_options])
  @lock = Mutex.new

  namespace = "cobweb-#{Cobweb.version}-#{options[:crawl_id]}"
  @redis = Redis::Namespace.new(namespace, :redis => @full_redis)
end

Instance Attribute Details

#redisObject (readonly)

Returns the value of attribute redis.



5
6
7
# File 'lib/stats.rb', line 5

# Reader for the @redis instance variable.
def redis
  @redis
end

Instance Method Details

#end_crawl(options, cancelled = false) ⇒ Object

Removes the crawl from the running crawls and updates status



28
29
30
31
32
33
34
35
36
37
# File 'lib/stats.rb', line 28

# Records the end of the crawl in the "statistics" hash.
#
# Writes "current_status" (CANCELLED or FINISHED) followed by
# "crawl_finished_at". The calls that would remove the crawl id from
# "cobweb_crawls" and delete "crawl_details" are commented out, so neither is
# touched here.
#
# @param options [Hash] crawl options; not read by the active code
# @param cancelled [Boolean] true records CANCELLED, otherwise FINISHED
# @return [Object] the result of the final HSET call
def end_crawl(options, cancelled=false)
  #@full_redis.srem "cobweb_crawls", options[:crawl_id]
  final_status = cancelled ? CobwebCrawlHelper::CANCELLED : CobwebCrawlHelper::FINISHED
  @redis.hset "statistics", "current_status", final_status

  # Stringify explicitly rather than handing the client a DateTime object:
  # clients that stringify arguments store the same text, while clients that
  # only accept primitive argument types would otherwise raise.
  @redis.hset "statistics", "crawl_finished_at", DateTime.now.to_s
  #@redis.del "crawl_details"
end

#get_crawledObject



39
40
41
# File 'lib/stats.rb', line 39

# Returns the members of the "crawled" set from the namespaced redis.
def get_crawled
  @redis.smembers("crawled")
end

#get_statisticsObject

Returns the statistics hash



155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/stats.rb', line 155

# Loads the "statistics" hash from redis with symbolized keys.
#
# The :status_counts and :mime_counts fields are JSON-decoded when present;
# when absent they are left unset.
#
# @return [Hash] the statistics hash
def get_statistics
  raw = @redis.hgetall("statistics")
  statistics = HashUtil.deep_symbolize_keys(raw)

  [:status_counts, :mime_counts].each do |field|
    encoded = statistics[field]
    statistics[field] = JSON.parse(encoded) unless encoded.nil?
  end

  statistics
end

#get_statusObject

Returns the current status of the crawl



177
178
179
# File 'lib/stats.rb', line 177

# Reads the "current_status" field of the "statistics" hash.
def get_status
  @redis.hget("statistics", "current_status")
end


43
44
45
46
# File 'lib/stats.rb', line 43

# Returns the members of the "inbound_links_<digest>" set, where <digest> is
# the MD5 hex digest of the string form of the URL after UriHelper.parse.
#
# @param url [Object] value accepted by UriHelper.parse
def inbound_links_for(url)
  digest = Digest::MD5.hexdigest(UriHelper.parse(url).to_s)
  @redis.smembers("inbound_links_#{digest}")
end

#set_totalsObject

Sets totals for the end of the crawl (Not Used)



182
183
184
185
# File 'lib/stats.rb', line 182

# Fetches the statistics hash and puts the members of the "crawled" set into
# it under :crawled. Nothing is written back to redis here.
#
# @return [Array] the "crawled" set members that were assigned
def set_totals
  totals = get_statistics
  totals.store(:crawled, @redis.smembers("crawled"))
end

#start_crawl(options) ⇒ Object

Sets up the crawl in statistics



16
17
18
19
20
21
22
23
24
25
# File 'lib/stats.rb', line 16

# Registers the crawl and marks it as starting.
#
# When options[:crawl_id] is not yet a member of the global "cobweb_crawls"
# set, it is added and every option is copied (stringified) into the
# "crawl_details" hash. In all cases "crawl_started_at" and "current_status"
# (STARTING) are then written to the "statistics" hash.
#
# @param options [Hash] crawl options; :crawl_id identifies the crawl
# @return [Object] the result of the final HSET call
def start_crawl(options)
  unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
    @full_redis.sadd "cobweb_crawls", options[:crawl_id]
    options.each do |key, value|
      @redis.hset "crawl_details", key, value.to_s
    end
  end

  # Stringify explicitly rather than handing the client a DateTime object:
  # clients that stringify arguments store the same text, while clients that
  # only accept primitive argument types would otherwise raise.
  @redis.hset "statistics", "crawl_started_at", DateTime.now.to_s
  @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
end

#update_statistics(content, crawl_counter = @redis.scard("crawled").to_i, queue_counter = @redis.scard("queued").to_i) ⇒ Object

Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/stats.rb', line 49

# Folds one crawl result into the running statistics and writes them to redis.
#
# Under @lock this reads the current statistics via get_statistics, updates
# the response-time and length aggregates, page/asset counters, redirect
# total, per-mime-type and per-status-code tallies, bumps the time-based
# counters through increment_time_stat, and stores every field in the
# "statistics" hash with one HMSET.
#
# @param content [Hash] crawl result; reads :response_time, :length,
#   :mime_type, :status_code and :redirect_through
# @param crawl_counter [Integer] defaults to the size of the "crawled" set
# @param queue_counter [Integer] defaults to the size of the "queued" set
# @return [Hash] the updated statistics; :mime_counts and :status_counts hold
#   JSON strings
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
  @lock.synchronize do
    # @statistics is still assigned, but the method works on and returns a
    # local so the caller gets the hash computed while the lock was held.
    stats = @statistics = get_statistics

    # Coerce once so nil or String inputs cannot raise further down.
    response_time = content[:response_time].to_f
    length = content[:length].to_i
    mime_type = content[:mime_type].to_s
    status_code = content[:status_code].to_i

    # response time aggregates
    stats[:average_response_time] =
      if stats.has_key? :average_response_time
        ((stats[:average_response_time].to_f * crawl_counter) + response_time) / (crawl_counter + 1)
      else
        response_time
      end
    if stats[:maximum_response_time].nil? || response_time > stats[:maximum_response_time].to_f
      stats[:maximum_response_time] = response_time
    end
    if stats[:minimum_response_time].nil? || response_time < stats[:minimum_response_time].to_f
      stats[:minimum_response_time] = response_time
    end

    # content length aggregates (integer arithmetic, as before)
    stats[:average_length] =
      if stats.has_key? :average_length
        ((stats[:average_length].to_i * crawl_counter) + length) / (crawl_counter + 1)
      else
        length
      end
    if stats[:maximum_length].nil? || length > stats[:maximum_length].to_i
      stats[:maximum_length] = length
    end
    if stats[:minimum_length].nil? || length < stats[:minimum_length].to_i
      stats[:minimum_length] = length
    end

    # HTML/XHTML responses count as pages, everything else as assets
    if mime_type.include?("text/html") || mime_type.include?("application/xhtml+xml")
      stats[:page_count] = stats[:page_count].to_i + 1
      stats[:page_size] = stats[:page_size].to_i + length
      increment_time_stat("pages_count")
    else
      stats[:asset_count] = stats[:asset_count].to_i + 1
      stats[:asset_size] = stats[:asset_size].to_i + length
      increment_time_stat("assets_count")
    end

    redirects = content[:redirect_through]
    stats[:total_redirects] = stats[:total_redirects].to_i + (redirects.nil? ? 0 : redirects.count)

    stats[:crawl_counter] = crawl_counter
    stats[:queue_counter] = queue_counter
    stats[:total_length] = stats[:total_length].to_i + length

    # per-mime-type tally, stored as JSON
    mime_counts = stats[:mime_counts] || {}
    mime_counts[mime_type] = mime_counts[mime_type].to_i + 1
    stats[:mime_counts] = mime_counts.to_json

    # record mime categories stats
    mime_category = %w[text application audio image message model multipart video].find do |prefix|
      mime_type.cobweb_starts_with? prefix
    end
    increment_time_stat("mime_#{mime_category}_count") if mime_category

    # per-status-code tally, stored as JSON. Keys are Strings because the
    # tally read back from JSON has String keys; Symbol keys never matched
    # them, so counts could not accumulate.
    status_counts = stats[:status_counts] || {}
    status_key = status_code.to_s
    status_counts[status_key] = status_counts[status_key].to_i + 1

    # record statistics by status type ("status|_400_count" and
    # "status|_500_count" carried a stray "|", unlike "status_200_count")
    case status_code
    when 200..299 then increment_time_stat("status_200_count")
    when 400..499 then increment_time_stat("status_400_count")
    when 500..599 then increment_time_stat("status_500_count")
    end

    stats[:status_counts] = status_counts.to_json

    ## time based statistics
    increment_time_stat("minute_totals", "minute", 60)

    # Pass fields as arguments instead of building Ruby source and evaluating
    # it: values originate from crawled servers, and the evaluated form
    # dropped quotes, mangled backslashes and could execute injected code.
    @redis.hmset("statistics", *stats.flat_map { |key, value| [key.to_s, value.to_s] })

    stats
  end
end

#update_status(status) ⇒ Object

Sets the current status of the crawl



172
173
174
# File 'lib/stats.rb', line 172

# Writes +status+ to the "current_status" field of the "statistics" hash,
# unless get_status currently reports CobwebCrawlHelper::CANCELLED.
#
# @param status [Object] the status value to store
# @return [Object, nil] the HSET result, or nil when the write was skipped
def update_status(status)
  return if get_status == CobwebCrawlHelper::CANCELLED

  @redis.hset("statistics", "current_status", status)
end