Class: Staticizer::Crawler
Inherits: Object
Defined in: lib/staticizer/crawler.rb
Instance Attribute Summary
- #output_dir ⇒ Object
  Returns the value of attribute output_dir.
- #url_queue ⇒ Object (readonly)
  Returns the value of attribute url_queue.
Instance Method Summary
- #add_url(url, info = {}) ⇒ Object
- #add_urls(urls, info = {}) ⇒ Object
- #crawl ⇒ Object
- #extract_css_urls(css, base_uri) ⇒ Object
- #extract_hrefs(doc, base_uri) ⇒ Object
- #extract_images(doc, base_uri) ⇒ Object
- #extract_links(doc, base_uri) ⇒ Object
- #extract_scripts(doc, base_uri) ⇒ Object
- #initialize(initial_page, opts = {}) ⇒ Crawler (constructor)
  A new instance of Crawler.
- #log_level ⇒ Object
- #log_level=(level) ⇒ Object
- #make_absolute(base_uri, href) ⇒ Object
- #process_body(body, uri, opts) ⇒ Object
- #process_redirect(url, destination_url) ⇒ Object
  When a redirect is hit, it is saved as a meta refresh page (TODO: for AWS S3 hosting we could create an S3 redirect instead).
- #process_success(response, parsed_uri) ⇒ Object
- #process_url(url, info) ⇒ Object
  Fetch a URI and save it to disk.
- #save_page(response, uri) ⇒ Object
- #save_page_to_aws(response, uri) ⇒ Object
- #save_page_to_disk(response, uri) ⇒ Object
Constructor Details
#initialize(initial_page, opts = {}) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/staticizer/crawler.rb', line 12

def initialize(initial_page, opts = {})
  if initial_page.nil?
    raise ArgumentError, "Initial page required"
  end

  @opts = opts.dup
  @url_queue = []
  @processed_urls = []
  @output_dir = @opts[:output_dir] || File.expand_path("crawl/")
  @log = @opts[:logger] || Logger.new(STDOUT)
  @log.level = @opts[:log_level] || Logger::INFO

  if @opts[:aws]
    bucket_name = @opts[:aws].delete(:bucket_name)
    Aws.config.update(opts[:aws])
    @s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
  end

  if @opts[:valid_domains].nil?
    uri = URI.parse(initial_page)
    @opts[:valid_domains] ||= [uri.host]
  end

  if @opts[:process_body]
    @process_body = @opts[:process_body]
  end

  add_url(initial_page)
end
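For reference, a minimal usage sketch. The host, output directory, and domain list below are illustrative only; the option keys are the ones read in #initialize above.

  require 'staticizer'
  require 'logger'

  crawler = Staticizer::Crawler.new(
    "http://example.com",
    :output_dir    => "/tmp/example.com-static",   # defaults to crawl/ under the working directory
    :log_level     => Logger::DEBUG,               # forwarded to the internal logger
    :valid_domains => ["example.com", "www.example.com"]
  )
  crawler.crawl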
Instance Attribute Details
#output_dir ⇒ Object
Returns the value of attribute output_dir.
# File 'lib/staticizer/crawler.rb', line 10

def output_dir
  @output_dir
end
#url_queue ⇒ Object (readonly)
Returns the value of attribute url_queue.
# File 'lib/staticizer/crawler.rb', line 9

def url_queue
  @url_queue
end
Instance Method Details
#add_url(url, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 91

def add_url(url, info = {})
  if @opts[:filter_url]
    url = @opts[:filter_url].call(url, info)
    return if url.nil?
  else
    regex = "(#{@opts[:valid_domains].join(")|(")})"
    return if url !~ %r{^https?://#{regex}}
  end

  url = url.sub(/#.*$/,'') # strip off any fragments
  return if @url_queue.index {|u| u[0] == url } || @processed_urls.include?(url)
  @url_queue << [url, info]
end
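Queueing decisions can also be delegated entirely to a :filter_url callback, which replaces the domain check above: returning nil drops the URL, returning a string queues that (possibly rewritten) URL. A sketch, with illustrative URL patterns:

  crawler = Staticizer::Crawler.new(
    "http://example.com",
    :filter_url => lambda { |url, info|
      if url =~ %r{/private/}
        nil                           # drop anything under a hypothetical /private/ area
      else
        url.sub(/\?utm_[^#]*\z/, "")  # queue the URL with tracking parameters stripped
      end
    }
  )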
#add_urls(urls, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 80

def add_urls(urls, info = {})
  urls.compact.uniq.each {|url| add_url(url, info.dup) }
end
#crawl ⇒ Object
# File 'lib/staticizer/crawler.rb', line 50

def crawl
  @log.info("Starting crawl")
  while(@url_queue.length > 0)
    url, info = @url_queue.shift
    @processed_urls << url
    process_url(url, info)
  end
  @log.info("Finished crawl")
end
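Extra entry points can be queued before the crawl starts; the loop then runs until the queue is empty. A small sketch with illustrative URLs:

  crawler = Staticizer::Crawler.new("http://example.com")
  crawler.add_urls([
    "http://example.com/sitemap.html",
    "http://example.com/archive/"
  ])
  crawler.crawl   # processes the queue until it is empty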
#extract_css_urls(css, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 76

def extract_css_urls(css, base_uri)
  css.scan(/url\(\s*['"]?(.+?)['"]?\s*\)/).map {|src| make_absolute(base_uri, src[0]) }
end
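The scan picks up url(...) references with or without quotes and resolves them against the stylesheet's URI. An illustration with made-up values:

  crawler = Staticizer::Crawler.new("http://example.com")
  css = ".a { background: url('/img/bg.png') } .b { background: url(logo.gif) }"
  crawler.extract_css_urls(css, "http://example.com/css/site.css")
  # => ["http://example.com/img/bg.png", "http://example.com/css/logo.gif"]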
#extract_hrefs(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 60

def extract_hrefs(doc, base_uri)
  doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_images(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 64

def extract_images(doc, base_uri)
  doc.xpath("//img/@src").map {|src| make_absolute(base_uri, src) }
end
#extract_links(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 68

def extract_links(doc, base_uri)
  doc.xpath("//link/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_scripts(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 72

def extract_scripts(doc, base_uri)
  doc.xpath("//script/@src").map {|src| make_absolute(base_uri, src) }
end
#log_level ⇒ Object
# File 'lib/staticizer/crawler.rb', line 42

def log_level
  @log.level
end
#log_level=(level) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 46

def log_level=(level)
  @log.level = level
end
#make_absolute(base_uri, href) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 84

def make_absolute(base_uri, href)
  URI::join(base_uri, href).to_s
rescue StandardError => e
  @log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
  return nil
end
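Relative references are resolved with URI::join; anything that cannot be parsed is logged and returns nil, which add_urls later drops via compact. For example:

  crawler = Staticizer::Crawler.new("http://example.com")
  crawler.make_absolute("http://example.com/blog/", "../about.html")
  # => "http://example.com/about.html"
  crawler.make_absolute("http://example.com/", "not a valid href")
  # => nil (the error is logged)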
#process_body(body, uri, opts) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 205

def process_body(body, uri, opts)
  if @process_body
    body = @process_body.call(body, uri, opts)
  end
  body
end
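The optional :process_body callback runs on every body just before it is written (to disk or S3), which makes it a convenient hook for rewriting markup. A sketch; the link rewrite shown is illustrative:

  crawler = Staticizer::Crawler.new(
    "http://example.com",
    :process_body => lambda { |body, uri, opts|
      body.gsub("http://example.com/", "/")   # make internal links relative
    }
  )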
#process_redirect(url, destination_url) ⇒ Object
When a redirect is hit, it is saved as a meta refresh page. TODO: for AWS S3 hosting we could create an S3 redirect instead.
# File 'lib/staticizer/crawler.rb', line 200

def process_redirect(url, destination_url)
  body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
  save_page(body, url)
end
#process_success(response, parsed_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 176

def process_success(response, parsed_uri)
  url = parsed_uri.to_s
  if @opts[:filter_process]
    return if @opts[:filter_process].call(response, parsed_uri)
  end

  case response['content-type']
  when /css/
    save_page(response, parsed_uri)
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
  when /html/
    save_page(response, parsed_uri)
    doc = Nokogiri::HTML(response.body)
    add_urls(extract_links(doc, url), {:type_hint => "link"})
    add_urls(extract_scripts(doc, url), {:type_hint => "script"})
    add_urls(extract_images(doc, url), {:type_hint => "image"})
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
    add_urls(extract_hrefs(doc, url), {:type_hint => "href"}) unless @opts[:single_page]
  else
    save_page(response, parsed_uri)
  end
end
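A :filter_process callback can veto individual responses: if it returns a truthy value, the response is neither saved nor scanned for further URLs. A sketch; the content-type rule is illustrative:

  crawler = Staticizer::Crawler.new(
    "http://example.com",
    :filter_process => lambda { |response, uri|
      response['content-type'] !~ /html|css|javascript|image/   # truthy => skip this response
    }
  )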
#process_url(url, info) ⇒ Object
Fetch a URI and save it to disk
# File 'lib/staticizer/crawler.rb', line 213

def process_url(url, info)
  @http_connections ||= {}
  parsed_uri = URI(url)

  @log.debug "Fetching #{parsed_uri}"

  # Attempt to use an already open Net::HTTP connection
  key = parsed_uri.host + parsed_uri.port.to_s
  connection = @http_connections[key]
  if connection.nil?
    connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
    @http_connections[key] = connection
  end

  request = Net::HTTP::Get.new(parsed_uri.request_uri)
  begin
    connection.request(request) do |response|
      case response
      when Net::HTTPSuccess
        process_success(response, parsed_uri)
      when Net::HTTPRedirection
        redirect_url = response['location']
        @log.debug "Processing redirect to #{redirect_url}"
        process_redirect(parsed_uri, redirect_url)
        add_url(redirect_url)
      else
        @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
      end
    end
  rescue OpenSSL::SSL::SSLError => e
    @log.error "SSL Error #{e.message} fetching url #{url}"
  rescue Errno::ECONNRESET => e
    @log.error "Error #{e.class}:#{e.message} fetching url #{url}"
  end
end
#save_page(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 105

def save_page(response, uri)
  return if @opts[:skip_write]
  if @opts[:aws]
    save_page_to_aws(response, uri)
  else
    save_page_to_disk(response, uri)
  end
end
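Because :skip_write is checked first, the crawler can be run as a fetch-only dry run that exercises every link without writing any output. A sketch:

  crawler = Staticizer::Crawler.new("http://example.com", :skip_write => true)
  crawler.crawl   # fetches and parses everything, writes nothing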
#save_page_to_aws(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 158

def save_page_to_aws(response, uri)
  key = uri.path
  key += "?#{uri.query}" if uri.query
  key = key.gsub(%r{^/},"")
  key = "index.html" if key == ""
  # Upload this file directly to AWS::S3
  opts = {:acl => "public-read"}
  opts[:content_type] = response['content-type'] rescue "text/html"
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
  if response.respond_to?(:read_body)
    body = process_body(response.read_body, uri, opts)
    @s3_bucket.object(key).put(opts.merge(body: body))
  else
    body = process_body(response, uri, opts)
    @s3_bucket.object(key).put(opts.merge(body: body))
  end
end
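When the :aws option is given to the constructor, :bucket_name selects the target bucket and every other key is passed straight to Aws.config.update, so standard aws-sdk settings such as region and credentials can be supplied there. A sketch; the bucket name and credential sources are placeholders:

  crawler = Staticizer::Crawler.new(
    "http://example.com",
    :aws => {
      :bucket_name       => "my-static-site",
      :region            => "us-east-1",
      :access_key_id     => ENV["AWS_ACCESS_KEY_ID"],
      :secret_access_key => ENV["AWS_SECRET_ACCESS_KEY"]
    }
  )
  crawler.crawl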
#save_page_to_disk(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 114

def save_page_to_disk(response, uri)
  path = uri.path
  path += "?#{uri.query}" if uri.query

  path_segments = path.scan(%r{[^/]*/})
  filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path

  current = @output_dir
  FileUtils.mkdir_p(current) unless File.exist?(current)

  # Create all the directories necessary for this file
  path_segments.each do |segment|
    current = File.join(current, "#{segment}").sub(%r{/$},'')
    if File.file?(current)
      # If we are trying to create a directory and there already is a file
      # with the same name add a .d to the file since we can't create
      # a directory and file with the same name in the file system
      dirfile = current + ".d"
      FileUtils.mv(current, dirfile)
      FileUtils.mkdir(current)
      FileUtils.cp(dirfile, File.join(current, "/index.html"))
    elsif !File.exists?(current)
      FileUtils.mkdir(current)
    end
  end

  body = response.respond_to?(:read_body) ? response.read_body : response
  body = process_body(body, uri, {})
  outfile = File.join(current, "/#{filename}")
  if filename == ""
    indexfile = File.join(outfile, "/index.html")
    @log.info "Saving #{indexfile}"
    File.open(indexfile, "wb") {|f| f << body }
  elsif File.directory?(outfile)
    dirfile = outfile + ".d"
    @log.info "Saving #{dirfile}"
    File.open(dirfile, "wb") {|f| f << body }
    FileUtils.cp(dirfile, File.join(outfile, "/index.html"))
  else
    @log.info "Saving #{outfile}"
    File.open(outfile, "wb") {|f| f << body }
  end
end
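Putting these rules together, a hypothetical crawl with output_dir set to /tmp/crawl would map URLs to files roughly as follows (paths illustrative):

  http://example.com/              -> /tmp/crawl/index.html
  http://example.com/css/app.css   -> /tmp/crawl/css/app.css
  http://example.com/blog          -> /tmp/crawl/blog
  http://example.com/blog/post1    -> /tmp/crawl/blog/post1
                                      (the existing "blog" file is first moved to
                                       "blog.d" and copied to "blog/index.html")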