Class: Stats

Inherits:
Object
  • Object
show all
Defined in:
lib/stats.rb

Overview

The Stats class is the main statistics hub for monitoring crawls. Statistics can either be viewed through the Sinatra interface or returned from the CobwebCrawler.crawl method or block.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Stats

Sets up redis usage for statistics



8
9
10
11
12
# File 'lib/stats.rb', line 8

# Opens the redis connections used for statistics.
#
# options - Hash; :redis_options is handed to Redis.new and :crawl_id is
#           interpolated into the namespace name. When :redis_options is
#           absent, an empty hash is stored under that key on the hash the
#           caller passed in.
#
# @full_redis is the plain connection; @redis wraps it in a Redis::Namespace
# named "cobweb-<Cobweb.version>-<crawl_id>".
def initialize(options)
  options[:redis_options] = {} if !options.key?(:redis_options)

  @full_redis = Redis.new(options[:redis_options])

  namespace_name = "cobweb-#{Cobweb.version}-#{options[:crawl_id]}"
  @redis = Redis::Namespace.new(namespace_name, :redis => @full_redis)
end

Instance Attribute Details

#redisObject (readonly)

Returns the value of attribute redis.



5
6
7
# File 'lib/stats.rb', line 5

# Read-only accessor for the @redis instance variable (the namespaced
# connection that initialize assigns).
def redis; @redis; end

Instance Method Details

#end_crawl(options, cancelled = false) ⇒ Object

Updates the crawl status to cancelled or finished. (The call that would remove the crawl from the running crawls is currently commented out in the source.)



26
27
28
29
30
31
32
33
34
# File 'lib/stats.rb', line 26

# Records the final status of the crawl in the "statistics" hash.
#
# options   - accepted for interface compatibility; not read by the live code
#             (its only use is in the disabled cleanup noted below).
# cancelled - when truthy the status written is CobwebCrawlHelper::CANCELLED,
#             otherwise CobwebCrawlHelper::FINISHED.
#
# Returns whatever @redis.hset returns.
def end_crawl(options, cancelled=false)
  # Cleanup that is disabled in this method:
  #   @full_redis.srem "cobweb_crawls", options[:crawl_id]
  #   @redis.del "crawl_details"
  final_status = cancelled ? CobwebCrawlHelper::CANCELLED : CobwebCrawlHelper::FINISHED
  @redis.hset("statistics", "current_status", final_status)
end

#get_crawledObject



36
37
38
# File 'lib/stats.rb', line 36

# Returns the members of the "crawled" set held in the namespaced redis.
def get_crawled
  @redis.smembers("crawled")
end

#get_statisticsObject

Returns the statistics hash



150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# File 'lib/stats.rb', line 150

# Loads the "statistics" hash from redis, symbolizes its keys via
# HashUtil.deep_symbolize_keys and caches the result in @statistics.
#
# The :status_counts and :mime_counts fields are JSON text in redis; each one
# that is present is replaced by its JSON.parse result. A field that is absent
# (nil) is left untouched.
#
# Returns the @statistics hash.
def get_statistics
  @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))

  [:status_counts, :mime_counts].each do |field|
    encoded = @statistics[field]
    @statistics[field] = JSON.parse(encoded) unless encoded.nil?
  end

  @statistics
end

#get_statusObject

Returns the current status of the crawl



172
173
174
# File 'lib/stats.rb', line 172

# Reads the "current_status" field of the "statistics" hash.
def get_status
  @redis.hget("statistics", "current_status")
end


40
41
42
# File 'lib/stats.rb', line 40

# Returns the members of the inbound-links set recorded for +url+.
#
# url   - String; the set name is "inbound_links_" followed by the MD5 hex
#         digest of this value.
# redis - connection to query; defaults to the instance's namespaced @redis.
#         Previously this argument was accepted but ignored (the body always
#         used @redis); it is now honoured. Calls that omit it behave exactly
#         as before.
def inbound_links_for(url, redis=@redis)
  redis.smembers("inbound_links_#{Digest::MD5.hexdigest(url)}")
end

#set_totalsObject

Sets totals for the end of the crawl (Not Used)



177
178
179
180
# File 'lib/stats.rb', line 177

# Fetches the statistics via get_statistics and stores the members of the
# "crawled" set under its :crawled key. Nothing is written back to redis here.
#
# Returns the list of crawled members (the value of the assignment).
def set_totals
  totals = get_statistics
  totals[:crawled] = @redis.smembers("crawled")
end

#start_crawl(options) ⇒ Object

Sets up the crawl in statistics



15
16
17
18
19
20
21
22
23
# File 'lib/stats.rb', line 15

# Registers the crawl and marks it as starting.
#
# When options[:crawl_id] is not yet a member of the "cobweb_crawls" set on the
# un-namespaced connection, it is added and every option is copied (as a
# string) into the namespaced "crawl_details" hash. In all cases the
# "current_status" field of "statistics" is set to CobwebCrawlHelper::STARTING.
#
# Returns whatever the final @redis.hset returns.
def start_crawl(options)
  already_registered = @full_redis.sismember("cobweb_crawls", options[:crawl_id])

  if !already_registered
    @full_redis.sadd("cobweb_crawls", options[:crawl_id])
    options.keys.each { |name| @redis.hset("crawl_details", name, options[name].to_s) }
  end

  @redis.hset("statistics", "current_status", CobwebCrawlHelper::STARTING)
end

#update_statistics(content, crawl_counter = @redis.scard("crawled").to_i, queue_counter = @redis.scard("queued").to_i) ⇒ Object

Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/stats.rb', line 45

# Folds one fetched resource into the crawl statistics and persists them.
#
# Loads the current statistics through get_statistics, updates the response
# time and length aggregates, the page/asset counters, the redirect total, the
# per-mime-type and per-status-code counts, bumps the time-based counters
# through increment_time_stat, and finally writes every field back to the
# "statistics" hash in redis.
#
# content       - Hash describing the fetched resource. The keys read here are
#                 :response_time, :length, :mime_type, :status_code and
#                 :redirect_through (nil, or an object responding to #count).
# crawl_counter - Integer; defaults to the cardinality of the "crawled" set.
# queue_counter - Integer; defaults to the cardinality of the "queued" set.
#
# Returns the updated @statistics hash. Its :mime_counts and :status_counts
# entries are JSON strings at that point.
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
  @statistics = get_statistics

  # Coerce the inputs once so every use below sees the same value and type.
  response_time = content[:response_time].to_f
  length        = content[:length].to_i
  mime_type     = content[:mime_type].to_s # a missing mime type is counted under ""
  status_code   = content[:status_code].to_i

  # Response time: running mean weighted by crawl_counter, plus extremes.
  # The previous values come from the hash just loaded by get_statistics.
  if @statistics.key?(:average_response_time)
    previous_total = @statistics[:average_response_time].to_f * crawl_counter
    @statistics[:average_response_time] = (previous_total + response_time) / (crawl_counter + 1)
  else
    @statistics[:average_response_time] = response_time
  end
  if @statistics[:maximum_response_time].nil? || response_time > @statistics[:maximum_response_time].to_f
    @statistics[:maximum_response_time] = response_time
  end
  if @statistics[:minimum_response_time].nil? || response_time < @statistics[:minimum_response_time].to_f
    @statistics[:minimum_response_time] = response_time
  end

  # Content length: same scheme, in integer arithmetic.
  if @statistics.key?(:average_length)
    previous_total = @statistics[:average_length].to_i * crawl_counter
    @statistics[:average_length] = (previous_total + length) / (crawl_counter + 1)
  else
    @statistics[:average_length] = length
  end
  if @statistics[:maximum_length].nil? || length > @statistics[:maximum_length].to_i
    @statistics[:maximum_length] = length
  end
  if @statistics[:minimum_length].nil? || length < @statistics[:minimum_length].to_i
    @statistics[:minimum_length] = length
  end

  # HTML/XHTML responses count as pages, everything else as assets.
  if mime_type.include?("text/html") || mime_type.include?("application/xhtml+xml")
    @statistics[:page_count] = @statistics[:page_count].to_i + 1
    @statistics[:page_size] = @statistics[:page_size].to_i + length
    increment_time_stat("pages_count")
  else
    @statistics[:asset_count] = @statistics[:asset_count].to_i + 1
    @statistics[:asset_size] = @statistics[:asset_size].to_i + length
    increment_time_stat("assets_count")
  end

  # Always store a numeric total, starting at 0. The old nil check ran after
  # .to_i and so could never initialise the field.
  redirects = content[:redirect_through]
  @statistics[:total_redirects] = @statistics[:total_redirects].to_i + (redirects.nil? ? 0 : redirects.count)

  @statistics[:crawl_counter] = crawl_counter
  @statistics[:queue_counter] = queue_counter
  @statistics[:total_length] = @statistics[:total_length].to_i + length

  # Per-mime-type tally, kept as JSON text in redis.
  mime_counts = @statistics[:mime_counts] || {}
  mime_counts[mime_type] = mime_counts[mime_type].to_i + 1
  @statistics[:mime_counts] = mime_counts.to_json

  # Time-based counter for the top-level mime category, first match wins.
  mime_category = %w[text application audio image message model multipart video].find do |category|
    mime_type.cobweb_starts_with? category
  end
  increment_time_stat("mime_#{mime_category}_count") unless mime_category.nil?

  # Per-status tally. get_statistics decodes it with JSON.parse, which yields
  # String keys, so the lookup key must be a String too; a Symbol never matched
  # and the tally could not grow past 1. The key is also computed before both
  # branches now, so the very first status is no longer filed under nil.
  status_counts = @statistics[:status_counts] || {}
  status_key = status_code.to_s
  status_counts[status_key] = status_counts[status_key].to_i + 1

  # Time-based counters by status class.
  # NOTE(review): the 4xx/5xx counter names contain a "|" while the 2xx one
  # does not. They are kept verbatim because the readers of these counters are
  # not visible from this method; confirm whether this is a typo.
  if status_code >= 200 && status_code < 300
    increment_time_stat("status_200_count")
  elsif status_code >= 400 && status_code < 500
    increment_time_stat("status|_400_count")
  elsif status_code >= 500 && status_code < 600
    increment_time_stat("status|_500_count")
  end

  @statistics[:status_counts] = status_counts.to_json

  ## time based statistics
  increment_time_stat("minute_totals", "minute", 60)

  # Pass field/value pairs straight to hmset. The previous implementation built
  # Ruby source text and instance_eval'd it, which dropped apostrophes,
  # reinterpreted backslashes and evaluated text derived from crawled responses.
  @redis.hmset("statistics", *@statistics.flat_map { |field, value| [field.to_s, value.to_s] })

  @statistics
end

#update_status(status) ⇒ Object

Sets the current status of the crawl



167
168
169
# File 'lib/stats.rb', line 167

# Writes +status+ to the "current_status" field of the "statistics" hash,
# unless the status currently stored equals CobwebCrawlHelper::CANCELLED.
#
# Returns the result of @redis.hset, or nil when the write is skipped.
def update_status(status)
  return nil if get_status == CobwebCrawlHelper::CANCELLED

  @redis.hset("statistics", "current_status", status)
end