Class: Staticizer::Crawler
- Inherits: Object
- Defined in: lib/staticizer/crawler.rb
Instance Attribute Summary
- #output_dir ⇒ Object
  Returns the value of attribute output_dir.
- #url_queue ⇒ Object (readonly)
  Returns the value of attribute url_queue.
Instance Method Summary
- #add_url(url, info = {}) ⇒ Object
- #add_urls(urls, info = {}) ⇒ Object
- #crawl ⇒ Object
- #extract_css_urls(css, base_uri) ⇒ Object
- #extract_hrefs(doc, base_uri) ⇒ Object
- #extract_images(doc, base_uri) ⇒ Object
- #extract_links(doc, base_uri) ⇒ Object
- #extract_scripts(doc, base_uri) ⇒ Object
- #initialize(initial_page, opts = {}) ⇒ Crawler (constructor)
  A new instance of Crawler.
- #log_level ⇒ Object
- #log_level=(level) ⇒ Object
- #make_absolute(base_uri, href) ⇒ Object
- #process_redirect(url, destination_url) ⇒ Object
  If we hit a redirect we save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect?
- #process_success(response, parsed_uri) ⇒ Object
- #process_url(url, info) ⇒ Object
  Fetch a URI and save it to disk.
- #save_page(response, uri) ⇒ Object
- #save_page_to_aws(response, uri) ⇒ Object
- #save_page_to_disk(response, uri) ⇒ Object
Constructor Details
#initialize(initial_page, opts = {}) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/staticizer/crawler.rb', line 12

def initialize(initial_page, opts = {})
  if initial_page.nil?
    raise ArgumentError, "Initial page required"
  end

  @opts = opts.dup
  @url_queue = []
  @processed_urls = []
  @output_dir = @opts[:output_dir] || File.expand_path("crawl/")
  @log = @opts[:logger] || Logger.new(STDOUT)
  @log.level = @opts[:log_level] || Logger::INFO

  if @opts[:aws]
    bucket_name = @opts[:aws].delete(:bucket_name)
    AWS.config(opts[:aws])
    @s3_bucket = AWS::S3.new.buckets[bucket_name]
    @s3_bucket.acl = :public_read
  end

  if @opts[:valid_domains].nil?
    uri = URI.parse(initial_page)
    @opts[:valid_domains] ||= [uri.host]
  end

  add_url(initial_page)
end
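A minimal usage sketch based only on the options read above (:output_dir, :log_level, :valid_domains); the site, directory, and hosts are placeholders:

require 'logger'
require 'staticizer'   # assuming the gem's top-level require

crawler = Staticizer::Crawler.new("http://example.com",
  :output_dir    => "/tmp/example_crawl",                # defaults to ./crawl/ when omitted
  :log_level     => Logger::WARN,                        # applied to the internal logger
  :valid_domains => ["example.com", "www.example.com"]   # hosts the crawler is allowed to follow
)
crawler.crawl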
Instance Attribute Details
#output_dir ⇒ Object
Returns the value of attribute output_dir.
# File 'lib/staticizer/crawler.rb', line 10

def output_dir
  @output_dir
end
#url_queue ⇒ Object (readonly)
Returns the value of attribute url_queue.
# File 'lib/staticizer/crawler.rb', line 9

def url_queue
  @url_queue
end
Instance Method Details
#add_url(url, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 86

def add_url(url, info = {})
  if @opts[:filter_url]
    url = @opts[:filter_url].call(url, info)
    return if url.nil?
  else
    regex = "(#{@opts[:valid_domains].join(")|(")})"
    return if url !~ %r{^https?://#{regex}}
  end

  url = url.sub(/#.*$/, '') # strip off any fragments
  return if @url_queue.index {|u| u[0] == url } || @processed_urls.include?(url)
  @url_queue << [url, info]
end
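When the crawler is constructed with the :filter_url option, the callback above replaces the :valid_domains check entirely; a sketch with a hypothetical filter that restricts the crawl to a /blog section and drops query strings:

crawler = Staticizer::Crawler.new("http://example.com/blog/",
  :filter_url => lambda {|url, info|
    # Return the (possibly rewritten) URL to queue it, or nil to skip it.
    url.sub(/\?.*\z/, "") if url =~ %r{\Ahttps?://example\.com/blog}
  }
)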
#add_urls(urls, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 76

def add_urls(urls, info = {})
  urls.compact.uniq.each {|url| add_url(url, info.dup) }
end
#crawl ⇒ Object
# File 'lib/staticizer/crawler.rb', line 46

def crawl
  @log.info("Starting crawl")
  while(@url_queue.length > 0)
    url, info = @url_queue.shift
    @processed_urls << url
    process_url(url, info)
  end
  @log.info("Finished crawl")
end
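#crawl simply drains the queue that #add_url and #add_urls fill, so additional seed URLs can be queued before it runs; a small sketch with placeholder URLs:

crawler = Staticizer::Crawler.new("http://example.com")
crawler.add_urls([
  "http://example.com/sitemap.html",
  "http://example.com/unlinked-landing-page.html"
])
crawler.crawl   # processes the initial page plus the extra seeds, following links as it goes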
#extract_css_urls(css, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 72

def extract_css_urls(css, base_uri)
  css.scan(/url\(['"]?(.+?)['"]?\)/).map {|src| make_absolute(base_uri, src[0]) }
end
#extract_hrefs(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 56

def extract_hrefs(doc, base_uri)
  doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_images(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 60

def extract_images(doc, base_uri)
  doc.xpath("//img/@src").map {|src| make_absolute(base_uri, src) }
end
#extract_links(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 64

def extract_links(doc, base_uri)
  doc.xpath("//link/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_scripts(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 68

def extract_scripts(doc, base_uri)
  doc.xpath("//script/@src").map {|src| make_absolute(base_uri, src) }
end
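The extract_* helpers above take a parsed Nokogiri document (extract_css_urls takes a raw CSS string instead) and resolve whatever they find against base_uri via #make_absolute; a rough sketch of the intent, with placeholder markup:

require 'nokogiri'

crawler = Staticizer::Crawler.new("http://example.com")
doc = Nokogiri::HTML('<a href="/about">About</a> <img src="logo.png"> <link href="/css/site.css">')

crawler.extract_hrefs(doc, "http://example.com/")    # <a href> targets, made absolute
crawler.extract_images(doc, "http://example.com/")   # <img src> values, made absolute
crawler.extract_links(doc, "http://example.com/")    # <link href> values, made absolute
crawler.extract_css_urls("body { background: url('/bg.png') }", "http://example.com/")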
#log_level ⇒ Object
# File 'lib/staticizer/crawler.rb', line 38

def log_level
  @log.level
end
#log_level=(level) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 42

def log_level=(level)
  @log.level = level
end
#make_absolute(base_uri, href) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 80

def make_absolute(base_uri, href)
  URI::join(base_uri, href).to_s
rescue StandardError => e
  @log.error "Could not make absolute '#{base_uri}' - '#{href}' - #{e}"
end
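A quick illustration of the wrapper above: it defers to URI::join and logs rather than raising when a value cannot be resolved (URLs are placeholders):

crawler = Staticizer::Crawler.new("http://example.com")
crawler.make_absolute("http://example.com/blog/post.html", "../css/site.css")
# => "http://example.com/css/site.css"
crawler.make_absolute("http://example.com/", "http://bad host/")
# logs "Could not make absolute ..." instead of raising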
#process_redirect(url, destination_url) ⇒ Object
If we hit a redirect we save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect?
# File 'lib/staticizer/crawler.rb', line 187

def process_redirect(url, destination_url)
  body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
  save_page(body, url)
end
#process_success(response, parsed_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 167

def process_success(response, parsed_uri)
  url = parsed_uri.to_s
  case response['content-type']
  when /css/
    save_page(response, parsed_uri)
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
  when /html/
    save_page(response, parsed_uri)
    doc = Nokogiri::HTML(response.body)
    add_urls(extract_links(doc, url), {:type_hint => "link"})
    add_urls(extract_scripts(doc, url), {:type_hint => "script"})
    add_urls(extract_images(doc, url), {:type_hint => "image"})
    add_urls(extract_hrefs(doc, url), {:type_hint => "href"}) unless @opts[:single_page]
  else
    save_page(response, parsed_uri)
  end
end
#process_url(url, info) ⇒ Object
Fetch a URI and save it to disk
# File 'lib/staticizer/crawler.rb', line 193

def process_url(url, info)
  @http_connections ||= {}
  parsed_uri = URI(url)

  @log.debug "Fetching #{parsed_uri}"

  # Attempt to use an already open Net::HTTP connection
  key = parsed_uri.host + parsed_uri.port.to_s
  connection = @http_connections[key]
  if connection.nil?
    connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    @http_connections[key] = connection
  end

  request = Net::HTTP::Get.new(parsed_uri.request_uri)
  connection.request(request) do |response|
    case response
    when Net::HTTPSuccess
      process_success(response, parsed_uri)
    when Net::HTTPRedirection
      redirect_url = response['location']
      @log.debug "Processing redirect to #{redirect_url}"
      process_redirect(parsed_uri, redirect_url)
      add_url(redirect_url)
    else
      @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
    end
  end
end
#save_page(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 100

def save_page(response, uri)
  if @opts[:aws]
    save_page_to_aws(response, uri)
  else
    save_page_to_disk(response, uri)
  end
end
#save_page_to_aws(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 151

def save_page_to_aws(response, uri)
  key = uri.path
  key += "?#{uri.query}" if uri.query
  key = key.gsub(%r{^/}, "")
  key = "index.html" if key == ""
  # Upload this file directly to AWS::S3
  opts = {:acl => :public_read}
  opts[:content_type] = response['content-type'] rescue "text/html"
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
  if response.respond_to?(:read_body)
    @s3_bucket.objects[key].write(response.read_body, opts)
  else
    @s3_bucket.objects[key].write(response, opts)
  end
end
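The S3 path is only taken when the crawler was constructed with the :aws option; a sketch of that wiring, assuming the v1 aws-sdk gem (AWS.config / AWS::S3) that the code above calls into, with placeholder credentials:

crawler = Staticizer::Crawler.new("http://example.com",
  :aws => {
    :bucket_name       => "my-static-mirror",   # removed from the hash and used to pick the bucket
    :access_key_id     => "AKIA...",            # remaining keys are handed to AWS.config
    :secret_access_key => "..."
  }
)
crawler.crawl   # pages are uploaded via save_page_to_aws instead of written to disk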
#save_page_to_disk(response, uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 108

def save_page_to_disk(response, uri)
  path = uri.path
  path += "?#{uri.query}" if uri.query

  path_segments = path.scan(%r{[^/]*/})
  filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path

  current = @output_dir
  FileUtils.mkdir_p(current) unless File.exist?(current)

  # Create all the directories necessary for this file
  path_segments.each do |segment|
    current = File.join(current, "#{segment}").sub(%r{/$}, '')
    if File.file?(current)
      # If we are trying to create a directory and there already is a file
      # with the same name, add a .d to the file, since we can't create
      # a directory and a file with the same name in the file system
      dirfile = current + ".d"
      FileUtils.mv(current, dirfile)
      FileUtils.mkdir(current)
      FileUtils.cp(dirfile, File.join(current, "/index.html"))
    elsif !File.exists?(current)
      FileUtils.mkdir(current)
    end
  end

  body = response.respond_to?(:read_body) ? response.read_body : response
  outfile = File.join(current, "/#{filename}")
  if filename == ""
    indexfile = File.join(outfile, "/index.html")
    @log.info "Saving #{indexfile}"
    File.open(indexfile, "wb") {|f| f << body }
  elsif File.directory?(outfile)
    dirfile = outfile + ".d"
    @log.info "Saving #{dirfile}"
    File.open(dirfile, "wb") {|f| f << body }
    FileUtils.cp(dirfile, File.join(outfile, "/index.html"))
  else
    @log.info "Saving #{outfile}"
    File.open(outfile, "wb") {|f| f << body }
  end
end
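A sketch of how URLs end up on disk under this scheme, inferred from the code above and assuming the default crawl/ output directory:
- http://example.com/ -> crawl/index.html
- http://example.com/css/site.css -> crawl/css/site.css
- http://example.com/about -> crawl/about
- http://example.com/about/team -> crawl/about/team (an existing crawl/about file is first moved to crawl/about.d and copied to crawl/about/index.html)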