Class: Stats

Inherits:
Object
  • Object
show all
Defined in:
lib/stats.rb

Overview

The Stats class is the main statistics hub for monitoring crawls. Statistics can either be viewed through the Sinatra interface or returned from the CobwebCrawler.crawl method or block.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Stats

Sets up redis usage for statistics



8
9
10
11
12
13
14
15
16
17
# File 'lib/stats.rb', line 8

# Sets up redis usage for statistics.
#
# Uses options[:redis] as the connection when it is given; otherwise a new
# Redis connection is opened from options[:redis_options], which is set to {}
# on the passed-in hash when the key is missing. @redis wraps that connection
# in a namespace built from Cobweb.version and options[:crawl_id].
def initialize(options)
  options[:redis_options] = {} unless options.key?(:redis_options)
  @full_redis = options[:redis] || Redis.new(options[:redis_options])
  @lock = Mutex.new
  namespace = "cobweb-#{Cobweb.version}-#{options[:crawl_id]}"
  @redis = Redis::Namespace.new(namespace, :redis => @full_redis)
end

Instance Attribute Details

#redis ⇒ Object (readonly)

Returns the value of attribute redis.



5
6
7
# File 'lib/stats.rb', line 5

# Read-only accessor: returns the value held in the @redis instance variable.
def redis
  @redis
end

Instance Method Details

#end_crawl(options, cancelled = false) ⇒ Object

Removes the crawl from the running crawls and updates status



32
33
34
35
36
37
38
39
40
41
# File 'lib/stats.rb', line 32

# Marks the crawl as ended in the "statistics" hash.
#
# Writes "current_status" (CobwebCrawlHelper::CANCELLED when +cancelled+ is
# truthy, CobwebCrawlHelper::FINISHED otherwise) and then stamps
# "crawl_finished_at" with the current DateTime. +options+ is accepted but not
# read here. Returns the result of the final hset call.
def end_crawl(options, cancelled=false)
  # Left disabled: @full_redis.srem "cobweb_crawls", options[:crawl_id]
  final_status = cancelled ? CobwebCrawlHelper::CANCELLED : CobwebCrawlHelper::FINISHED
  @redis.hset("statistics", "current_status", final_status)
  # Left disabled: @redis.del "crawl_details"
  @redis.hset("statistics", "crawl_finished_at", DateTime.now)
end

#get_crawled ⇒ Object



43
44
45
# File 'lib/stats.rb', line 43

# Returns the members of the "crawled" set as reported by @redis.smembers.
def get_crawled
  @redis.smembers("crawled")
end

#get_statistics ⇒ Object

Returns the statistics hash



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/stats.rb', line 159

# Returns the statistics hash.
#
# Loads the "statistics" redis hash, converts its keys with
# HashUtil.deep_symbolize_keys, and replaces the :status_counts and
# :mime_counts entries with their JSON-parsed values when they are present.
# Entries that are absent are left absent.
def get_statistics
  stats = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
  [:status_counts, :mime_counts].each do |field|
    encoded = stats[field]
    stats[field] = JSON.parse(encoded) unless encoded.nil?
  end
  stats
end

#get_status ⇒ Object

Returns the current status of the crawl



181
182
183
# File 'lib/stats.rb', line 181

# Returns the current status of the crawl: the "current_status" field of the
# "statistics" redis hash.
def get_status
  @redis.hget("statistics", "current_status")
end


47
48
49
50
# File 'lib/stats.rb', line 47

# Returns the members of the inbound-links set for +url+.
#
# The set key is "inbound_links_" followed by the MD5 hex digest of the
# normalized form of the url as produced by UriHelper.parse(url).normalize.
def inbound_links_for(url)
  normalized = UriHelper.parse(url).normalize.to_s
  digest = Digest::MD5.hexdigest(normalized)
  @redis.smembers("inbound_links_#{digest}")
end

#set_totals ⇒ Object

Sets totals for the end of the crawl (Not Used)



186
187
188
189
# File 'lib/stats.rb', line 186

# Sets totals for the end of the crawl (Not Used).
#
# Loads the statistics hash and stores the members of the "crawled" set under
# :crawled in that local hash. Nothing is written back to redis here; the
# return value is the list of crawled members.
def set_totals
  totals = get_statistics
  crawled_members = @redis.smembers("crawled")
  totals[:crawled] = crawled_members
end

#start_crawl(options) ⇒ Object

Sets up the crawl in statistics



20
21
22
23
24
25
26
27
28
29
# File 'lib/stats.rb', line 20

# Sets up the crawl in statistics.
#
# When options[:crawl_id] is not yet a member of the "cobweb_crawls" set it is
# added, and every option is copied (as a string) into the "crawl_details"
# hash. In all cases "crawl_started_at" is stamped with the current DateTime
# and "current_status" is set to CobwebCrawlHelper::STARTING.
def start_crawl(options)
  crawl_id = options[:crawl_id]
  unless @full_redis.sismember("cobweb_crawls", crawl_id)
    @full_redis.sadd("cobweb_crawls", crawl_id)
    options.each_pair do |name, value|
      @redis.hset("crawl_details", name, value.to_s)
    end
  end
  @redis.hset("statistics", "crawl_started_at", DateTime.now)
  @redis.hset("statistics", "current_status", CobwebCrawlHelper::STARTING)
end

#update_statistics(content, crawl_counter = @redis.scard("crawled").to_i, queue_counter = @redis.scard("queued").to_i) ⇒ Object

Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/stats.rb', line 53

# Folds the data for one crawled resource into the crawl statistics, writes the
# result to the redis "statistics" hash and returns the updated statistics hash.
#
# @param content [Hash] data for the crawled resource; reads :response_time,
#   :length, :mime_type, :status_code and :redirect_through
# @param crawl_counter [Integer] weight given to the stored averages; defaults
#   to the cardinality of the "crawled" set
# @param queue_counter [Integer] stored as :queue_counter; defaults to the
#   cardinality of the "queued" set
# @return [Hash] the updated statistics; :mime_counts and :status_counts hold
#   JSON-encoded strings
def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
  @lock.synchronize {
    @statistics = get_statistics

    response_time = content[:response_time].to_f
    length = content[:length].to_i
    # to_s / to_i keep a resource without a mime type or status code from raising
    mime_type = content[:mime_type].to_s
    status_code = content[:status_code].to_i

    # running averages: stored average weighted by crawl_counter, plus this item
    if @statistics.has_key? :average_response_time
      @statistics[:average_response_time] = ((@statistics[:average_response_time].to_f * crawl_counter) + response_time) / (crawl_counter + 1)
    else
      @statistics[:average_response_time] = response_time
    end
    @statistics[:maximum_response_time] = response_time if @statistics[:maximum_response_time].nil? || response_time > @statistics[:maximum_response_time].to_f
    @statistics[:minimum_response_time] = response_time if @statistics[:minimum_response_time].nil? || response_time < @statistics[:minimum_response_time].to_f

    if @statistics.has_key? :average_length
      # integer arithmetic: the average length is kept as a whole number
      @statistics[:average_length] = ((@statistics[:average_length].to_i * crawl_counter) + length) / (crawl_counter + 1)
    else
      @statistics[:average_length] = length
    end
    @statistics[:maximum_length] = length if @statistics[:maximum_length].nil? || length > @statistics[:maximum_length].to_i
    @statistics[:minimum_length] = length if @statistics[:minimum_length].nil? || length < @statistics[:minimum_length].to_i

    # html / xhtml responses count as pages, everything else as assets
    if mime_type.include?("text/html") || mime_type.include?("application/xhtml+xml")
      @statistics[:page_count] = @statistics[:page_count].to_i + 1
      @statistics[:page_size] = @statistics[:page_size].to_i + length
      increment_time_stat("pages_count")
    else
      @statistics[:asset_count] = @statistics[:asset_count].to_i + 1
      @statistics[:asset_size] = @statistics[:asset_size].to_i + length
      increment_time_stat("assets_count")
    end

    redirects = content[:redirect_through]
    @statistics[:total_redirects] = @statistics[:total_redirects].to_i + (redirects.nil? ? 0 : redirects.count)

    @statistics[:crawl_counter] = crawl_counter
    @statistics[:queue_counter] = queue_counter
    @statistics[:total_length] = @statistics[:total_length].to_i + length

    mime_counts = @statistics[:mime_counts] || {}
    mime_counts[mime_type] = mime_counts[mime_type].to_i + 1
    @statistics[:mime_counts] = mime_counts.to_json

    # record mime categories stats
    mime_category = %w[text application audio image message model multipart video].find do |prefix|
      mime_type.cobweb_starts_with? prefix
    end
    increment_time_stat("mime_#{mime_category}_count") if mime_category

    # Keys are strings because the stored counts come back from JSON.parse with
    # string keys; a symbol key would never match an existing entry.
    status_counts = @statistics[:status_counts] || {}
    status_key = status_code.to_s
    status_counts[status_key] = status_counts[status_key].to_i + 1

    # record statistics by status type
    case status_code
    when 200...300 then increment_time_stat("status_200_count")
    when 400...500 then increment_time_stat("status_400_count")
    when 500...600 then increment_time_stat("status_500_count")
    end

    @statistics[:status_counts] = status_counts.to_json

    ## time based statistics
    increment_time_stat("minute_totals", "minute", 60)

    # Pass fields and values to redis as data. Values originate from crawled
    # responses, so they must never be interpolated into evaluated Ruby source.
    @redis.hmset "statistics", *@statistics.map { |field, value| [field.to_s, value.to_s] }.flatten
  }
  @statistics
end

#update_status(status) ⇒ Object

Sets the current status of the crawl



176
177
178
# File 'lib/stats.rb', line 176

# Sets the current status of the crawl.
#
# Does nothing (and returns nil) when the current status is already
# CobwebCrawlHelper::CANCELLED; otherwise writes +status+ to the
# "current_status" field of the "statistics" hash.
def update_status(status)
  return if get_status == CobwebCrawlHelper::CANCELLED

  @redis.hset("statistics", "current_status", status)
end