Class: SimpleCrawler::Crawler
- Inherits:
-
Object
- Object
- SimpleCrawler::Crawler
- Defined in:
- lib/simplecrawler.rb
Instance Attribute Summary collapse
-
#current_count ⇒ Object
Returns the value of attribute current_count.
-
#include_patterns ⇒ Object
Returns the value of attribute include_patterns.
-
#load_binary_data ⇒ Object
Returns the value of attribute load_binary_data.
-
#maxcount ⇒ Object
Returns the value of attribute maxcount.
-
#queue ⇒ Object
Returns the value of attribute queue.
-
#site_uri ⇒ Object
Returns the value of attribute site_uri.
-
#skip_patterns ⇒ Object
Returns the value of attribute skip_patterns.
-
#user_agent ⇒ Object
Returns the value of attribute user_agent.
-
#visited ⇒ Object
Returns the value of attribute visited.
Instance Method Summary collapse
- #add_uri(uri) ⇒ Object
-
#crawl ⇒ Object
Initiate crawling.
- #get_doc(request_uri) ⇒ Object
-
#initialize(url) ⇒ Crawler
constructor
A new instance of Crawler.
-
#log(message) ⇒ Object
Override this method for your own logging needs.
- #queue_local_links(doc) ⇒ Object
-
#skip_uri?(uri) ⇒ Boolean
Check if a path should be ignored because it matches a skip pattern or is already visited.
Constructor Details
#initialize(url) ⇒ Crawler
37 38 39 40 41 42 43 44 45 |
# File 'lib/simplecrawler.rb', line 37 def initialize(url) @load_binary_data = false #default, skip loading of pagedata for binary files into Document.data @site_uri = URI.parse(url) @site_uri.path = "/" if @site_uri.path == "" @visited = Hash.new @queue = Array.new @current_count = 0 add_uri(@site_uri) end |
Instance Attribute Details
#current_count ⇒ Object
Returns the value of attribute current_count.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def current_count @current_count end |
#include_patterns ⇒ Object
Returns the value of attribute include_patterns.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def include_patterns @include_patterns end |
#load_binary_data ⇒ Object
Returns the value of attribute load_binary_data.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def load_binary_data @load_binary_data end |
#maxcount ⇒ Object
Returns the value of attribute maxcount.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def maxcount @maxcount end |
#queue ⇒ Object
Returns the value of attribute queue.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def queue @queue end |
#site_uri ⇒ Object
Returns the value of attribute site_uri.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def site_uri @site_uri end |
#skip_patterns ⇒ Object
Returns the value of attribute skip_patterns.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def skip_patterns @skip_patterns end |
#user_agent ⇒ Object
Returns the value of attribute user_agent.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def user_agent @user_agent end |
#visited ⇒ Object
Returns the value of attribute visited.
35 36 37 |
# File 'lib/simplecrawler.rb', line 35 def visited @visited end |
Instance Method Details
#add_uri(uri) ⇒ Object
105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/simplecrawler.rb', line 105 def add_uri(uri) if uri.class == String uri = URI.parse(uri.strip) end unless skip_uri?(uri) @queue.push uri.request_uri @current_count = @current_count + 1 @visited[uri.request_uri] = false log(" Added #{uri}") end end |
#crawl ⇒ Object
Initiate crawling.
184 185 186 187 188 189 190 191 192 |
# File 'lib/simplecrawler.rb', line 184 def crawl() while (!@queue.empty?) uri = @queue.shift current_doc = get_doc(uri) yield current_doc queue_local_links(current_doc) @visited[uri] = true end end |
#get_doc(request_uri) ⇒ Object
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# File 'lib/simplecrawler.rb', line 121 def get_doc(request_uri) doc = Document.new begin log(" Getting #{request_uri}") request_uri = URI.parse(request_uri) uri = @site_uri.clone uri.path = request_uri.path #if request_uri.path.to_s != "/" uri.query = request_uri.query doc.uri = uri doc.fetched_at = Time.now log("Opening #{uri}") file = open(uri) mime_type = file.meta["content-type"].split(";")[0] if file.meta["content-type"] if MARKUP_MIME_TYPES.include?(mime_type.downcase) or @load_binary_data log("Loading data from #{uri}") doc.data = file.read else log("Skipping data for #{uri}") doc.data = nil end doc.headers = file.meta doc.http_status = file.status rescue => error log("Error fetching #{uri}: #{error.message}") if error.message[0..2] =~ /\d\d\d/ then doc.http_status = [error.message[0..2], error.message[3..-1]] return doc else raise error end end return doc end |
#log(message) ⇒ Object
Override this method for your own logging needs.
49 50 51 |
# File 'lib/simplecrawler.rb', line 49 def log(message) puts message end |
#queue_local_links(doc) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/simplecrawler.rb', line 163 def queue_local_links(doc) return if doc.data == nil log("Queuing links for #{doc.uri}") Hpricot.buffer_size = 524288 #Allow for asp.net bastard-sized viewstate attributes... doc = Hpricot(doc.data) links = doc.search("a[@href]") for link in links if link.attributes["href"].length > 0 then begin uri = URI.parse(link.attributes["href"]) add_uri(uri) rescue #skip this link end end end doc = nil end |
#skip_uri?(uri) ⇒ Boolean
Check if a path should be ignored because it matches a skip pattern or is already visited.
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/simplecrawler.rb', line 54 def skip_uri?(uri) #Check if maxcount is reached if @maxcount if @current_count >= @maxcount return true end end #Check if path belongs to site unless (uri.relative? or uri.host == @site_uri.host) return true end #Check if fragment identifier (e.g. #content) if uri.request_uri.length == 0 and uri.fragment.length > 0 return true end #Check if uri already visited in this crawl or if it is queued for crawling if @visited.has_key?(uri.request_uri) or @queue.include?(uri.request_uri) return true end #Check if uri is in a skip pattern if @skip_patterns for skip_pattern in @skip_patterns re = Regexp.new(skip_pattern) if re.match(uri.request_uri) return true end end end #Check if uri is in at least one of the include patterns if @include_patterns match_found = false for include_pattern in @include_patterns re = Regexp.new(include_pattern) if re.match(uri.request_uri) match_found = true end end return true unless match_found end return false end |