Class: SimpleCrawler::Crawler

Inherits:
Object
Defined in:
lib/simplecrawler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Crawler



37
38
39
40
41
42
43
44
45
# File 'lib/simplecrawler.rb', line 37

# Create a crawler rooted at +url+.
#
# Parses the URL into @site_uri (normalising an empty path to "/"),
# sets up the visited/queue bookkeeping structures, and queues the
# root URI as the first document to fetch.
def initialize(url)
  # Default: skip loading page data for binary files into Document#data.
  @load_binary_data = false
  @site_uri = URI.parse(url)
  @site_uri.path = "/" if @site_uri.path == ""
  @visited = {}  # request_uri => false when queued, true once fetched
  @queue = []    # request_uris waiting to be fetched
  @current_count = 0
  add_uri(@site_uri)
end

Instance Attribute Details

#current_countObject

Returns the value of attribute current_count.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: number of URIs queued so far during this crawl.
def current_count; @current_count; end

#include_patternsObject

Returns the value of attribute include_patterns.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional regexp source strings a URI must match to be crawled.
def include_patterns; @include_patterns; end

#load_binary_dataObject

Returns the value of attribute load_binary_data.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: whether page data for binary files is loaded into Document#data.
def load_binary_data; @load_binary_data; end

#maxcountObject

Returns the value of attribute maxcount.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional cap on the number of URIs to queue (nil = unlimited).
def maxcount; @maxcount; end

#queueObject

Returns the value of attribute queue.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: request URIs waiting to be fetched.
def queue; @queue; end

#site_uriObject

Returns the value of attribute site_uri.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: root URI of the site being crawled.
def site_uri; @site_uri; end

#skip_patternsObject

Returns the value of attribute skip_patterns.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: optional regexp source strings; matching URIs are skipped.
def skip_patterns; @skip_patterns; end

#user_agentObject

Returns the value of attribute user_agent.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: user agent string for the crawler.
def user_agent; @user_agent; end

#visitedObject

Returns the value of attribute visited.



35
36
37
# File 'lib/simplecrawler.rb', line 35

# Reader: hash of request URIs seen this crawl (value true once fetched).
def visited; @visited; end

Instance Method Details

#add_uri(uri) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/simplecrawler.rb', line 105

# Queue +uri+ for crawling unless skip_uri? rejects it.
#
# Accepts either a URI object or a String (stripped and parsed). Only
# the request URI (path + query) is stored, since every queued URI
# shares @site_uri's scheme and host.
#
# NOTE(review): #request_uri exists only on absolute HTTP(S) URIs, so a
# relative URI raises NoMethodError here; callers (queue_local_links)
# rescue and skip such links — confirm this is the intended behaviour.
def add_uri(uri)
  uri = URI.parse(uri.strip) if uri.is_a?(String)

  return if skip_uri?(uri)

  @queue.push uri.request_uri
  @current_count += 1
  @visited[uri.request_uri] = false
  log("   Added #{uri}")
end

#crawlObject

Initiate crawling.



184
185
186
187
188
189
190
191
192
# File 'lib/simplecrawler.rb', line 184

# Initiate crawling. Fetches each queued URI in turn, yields the
# resulting document to the caller's block, queues any same-site links
# it contains, then marks the URI as visited.
def crawl()
  until @queue.empty?
    request_uri = @queue.shift
    doc = get_doc(request_uri)
    yield doc
    queue_local_links(doc)
    @visited[request_uri] = true
  end
end

#get_doc(request_uri) ⇒ Object



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/simplecrawler.rb', line 121

# Fetch the document addressed by +request_uri+ (a path + query String
# relative to @site_uri) and return it as a Document.
#
# Page data is read only for markup mime types unless @load_binary_data
# is set. HTTP error statuses raised by open-uri (message like
# "404 Not Found") are captured into doc.http_status instead of
# raising; any other error is re-raised.
def get_doc(request_uri)
  doc = Document.new
  begin
    log("   Getting #{request_uri}")
    request_uri = URI.parse(request_uri)

    # Resolve against the crawl root so scheme and host are present.
    uri = @site_uri.clone
    uri.path = request_uri.path
    uri.query = request_uri.query
    doc.uri = uri
    doc.fetched_at = Time.now

    log("Opening #{uri}")

    # Block form guarantees the underlying IO is closed even on error
    # (the original leaked the handle).
    open(uri) do |file|
      mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"]

      # to_s guards against a missing Content-Type header: previously a
      # nil mime_type crashed on #downcase and the error was re-raised.
      if MARKUP_MIME_TYPES.include?(mime_type.to_s.downcase) || @load_binary_data
        log("Loading data from #{uri}")
        doc.data = file.read
      else
        log("Skipping data for #{uri}")
        doc.data = nil
      end

      doc.headers = file.meta
      doc.http_status = file.status
    end

  rescue => error
    log("Error fetching #{uri}: #{error.message}")
    # open-uri's OpenURI::HTTPError message begins with the three-digit
    # status code, e.g. "404 Not Found".
    if error.message[0..2] =~ /\d\d\d/ then
      doc.http_status = [error.message[0..2], error.message[3..-1]]
      return doc
    else
      raise error
    end
  end
  return doc
end

#log(message) ⇒ Object

Override this method for your own logging needs.



49
50
51
# File 'lib/simplecrawler.rb', line 49

# Write +message+ to standard output, one line per call.
# Override this method for your own logging needs.
def log(message)
  $stdout.puts message
end


#queue_local_links(doc) ⇒ Object

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/simplecrawler.rb', line 163

# Extract anchor hrefs from doc.data and queue each as a candidate URI.
# Documents with no data (binary/skipped fetches) are ignored;
# unparseable hrefs are silently skipped.
def queue_local_links(doc)
  return if doc.data.nil?

  log("Queuing links for #{doc.uri}")
  # Allow for asp.net bastard-sized viewstate attributes...
  Hpricot.buffer_size = 524288

  # Parse into a separate local rather than reassigning the +doc+
  # parameter, which the original shadowed.
  parsed = Hpricot(doc.data)
  parsed.search("a[@href]").each do |link|
    href = link.attributes["href"]
    next unless href.length > 0

    begin
      add_uri(URI.parse(href))
    rescue
      # Unparseable or unsupported href — skip this link.
    end
  end
  nil
end

#skip_uri?(uri) ⇒ Boolean

Check if a path should be ignored because it matches a skip pattern or is already visited.



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/simplecrawler.rb', line 54

# Check if a URI should be ignored: maxcount reached, foreign host,
# bare fragment identifier, already visited/queued, matching a skip
# pattern, or (when include patterns are set) matching none of them.
def skip_uri?(uri)
  # Stop queuing once the optional document limit is reached.
  return true if @maxcount && @current_count >= @maxcount

  # Only crawl URIs belonging to this site.
  return true unless uri.relative? || uri.host == @site_uri.host

  # Skip pure fragment identifiers (e.g. "#content"). The .to_s guard
  # fixes a NoMethodError when the fragment is nil.
  return true if uri.request_uri.length == 0 && uri.fragment.to_s.length > 0

  # Skip anything already fetched in this crawl or queued for crawling.
  return true if @visited.has_key?(uri.request_uri) || @queue.include?(uri.request_uri)

  # Skip URIs matching any skip pattern.
  if @skip_patterns
    return true if @skip_patterns.any? { |pat| Regexp.new(pat).match(uri.request_uri) }
  end

  # When include patterns are given, skip URIs matching none of them.
  if @include_patterns
    return true if @include_patterns.none? { |pat| Regexp.new(pat).match(uri.request_uri) }
  end

  false
end