Class: Staticizer::Crawler
- Inherits: Object
- Defined in: lib/staticizer/crawler.rb
Instance Method Summary
- #add_url(url, info = {}) ⇒ Object
- #add_urls(urls, info = {}) ⇒ Object
- #crawl ⇒ Object
- #extract_css_urls(css, base_uri) ⇒ Object
- #extract_hrefs(doc, base_uri) ⇒ Object
- #extract_images(doc, base_uri) ⇒ Object
- #extract_links(doc, base_uri) ⇒ Object
- #extract_scripts(doc, base_uri) ⇒ Object
- #extract_videos(doc, base_uri) ⇒ Object
- #initialize(initial_page, opts = {}) ⇒ Crawler (constructor)
  A new instance of Crawler.
- #make_absolute(base_uri, href) ⇒ Object
- #process_redirect(url, destination_url, opts = {}) ⇒ Object
  If we hit a redirect, save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect.
- #process_success(response, parsed_uri) ⇒ Object
- #process_url(url, info) ⇒ Object
  Fetch a URI and save it to disk.
- #save_page(response, uri, opts = {}) ⇒ Object
- #save_page_to_aws(response, uri, opts = {}) ⇒ Object
- #save_page_to_disk(response, uri, opts = {}) ⇒ Object
Constructor Details
#initialize(initial_page, opts = {}) ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/staticizer/crawler.rb', line 9

def initialize(initial_page, opts = {})
  if initial_page.nil?
    raise ArgumentError, "Initial page required"
  end

  @opts = opts.dup
  @url_queue = []
  @processed_urls = []
  @opts[:output_dir] ||= File.expand_path("crawl/")
  @log = @opts[:logger] || Logger.new(STDOUT)
  @log.level = @opts[:log_level] || Logger::INFO

  if @opts[:aws]
    bucket_name = @opts[:aws].delete(:bucket_name)
    AWS.config(opts[:aws])
    @s3_bucket = AWS::S3.new.buckets[bucket_name]
    @s3_bucket.acl = :public_read
  end

  if @opts[:valid_domains].nil?
    uri = URI.parse(initial_page)
    @opts[:valid_domains] ||= [uri.host]
  end

  add_url(initial_page)
end
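A minimal construction sketch. The option keys all appear in the constructor above; the example values and the require path are assumptions, not something documented here:

require 'staticizer'
require 'logger'

# Hypothetical values; only the option keys come from the constructor above.
crawler = Staticizer::Crawler.new(
  "https://example.com/",
  :output_dir    => "/tmp/example_crawl",   # defaults to File.expand_path("crawl/")
  :valid_domains => ["example.com", "www.example.com"],
  :logger        => Logger.new(STDOUT),
  :log_level     => Logger::INFO
)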
Instance Method Details
#add_url(url, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 99

def add_url(url, info = {})
  if @opts[:filter_url]
    url = @opts[:filter_url].call(url, info)
    return if url.nil?
  else
    regex = "(#{@opts[:valid_domains].join(")|(")})"
    return if url !~ %r{^https?://#{regex}}
  end

  url = url.sub(/#.*$/, '') # strip off any fragments
  return if @url_queue.index {|u| u[0] == url } || @processed_urls.include?(url)
  @url_queue << [url, info]
end
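The :filter_url option seen above replaces the default valid-domains check. A sketch, assuming you want to drop some URLs and normalize the rest (the proc body is illustrative only):

# The proc receives (url, info) and must return the URL to enqueue, or nil to skip it.
filter = lambda do |url, info|
  return nil if url.nil? || url =~ %r{/admin/}   # skip anything under /admin/
  url.sub(%r{/$}, "")                            # normalize away trailing slashes
end

crawler = Staticizer::Crawler.new("https://example.com/", :filter_url => filter)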
#add_urls(urls, info = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 79

def add_urls(urls, info = {})
  urls.compact.uniq.each {|url| add_url(url, info.dup) }
end
#crawl ⇒ Object
# File 'lib/staticizer/crawler.rb', line 35

def crawl
  @log.info("Starting crawl")
  while(@url_queue.length > 0)
    url, info = @url_queue.shift
    @processed_urls << url
    process_url(url, info)
  end
  @log.info("Finished crawl")
end
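Putting the pieces together: the constructor already enqueues the initial page, #add_url/#add_urls can seed more, and #crawl drains the queue via #process_url. A sketch (URLs are hypothetical):

crawler = Staticizer::Crawler.new("https://example.com/")
crawler.add_urls(["https://example.com/sitemap.xml"])  # optional extra seeds
crawler.crawl                                          # runs until the queue is empty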
#extract_css_urls(css, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 70

def extract_css_urls(css, base_uri)
  css.scan(/url\(([^)]+)\)/).map do |src|
    path = src[0]
    # URLs in CSS can be wrapped in " or ', e.g. url("http://something/") - strip the quotes
    path = path.strip.gsub(/^['"]/, "").gsub(/['"]$/, "")
    make_absolute(base_uri, path)
  end
end
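For illustration, given a stylesheet fragment the scan above yields the absolutized url(...) references (the stylesheet, URLs, and crawler instance below are hypothetical):

css = <<~CSS
  body { background: url("/img/bg.png"); }
  .logo { background-image: url('https://cdn.example.com/logo.svg'); }
CSS
crawler.extract_css_urls(css, URI("https://example.com/css/site.css"))
# => ["https://example.com/img/bg.png", "https://cdn.example.com/logo.svg"]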
#extract_hrefs(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 54

def extract_hrefs(doc, base_uri)
  doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_images(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 58

def extract_images(doc, base_uri)
  doc.xpath("//img/@src").map {|src| make_absolute(base_uri, src) }
end
#extract_links(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 62

def extract_links(doc, base_uri)
  doc.xpath("//link/@href").map {|href| make_absolute(base_uri, href) }
end
#extract_scripts(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 66

def extract_scripts(doc, base_uri)
  doc.xpath("//script/@src").map {|src| make_absolute(base_uri, src) }
end
#extract_videos(doc, base_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 45

def extract_videos(doc, base_uri)
  doc.xpath("//video").map do |video|
    sources = video.xpath("//source/@src").map {|src| make_absolute(base_uri, src) }
    poster = video.attributes["poster"].to_s
    poster = make_absolute(base_uri, poster)
    [poster, sources]
  end.flatten.uniq.compact
end
#make_absolute(base_uri, href) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 83

def make_absolute(base_uri, href)
  if href.to_s =~ /^https?/i
    # The URI is already absolute, so leave it alone apart from converting
    # spaces to + (otherwise the fetch will fail)
    href.to_s.gsub(" ", "+")
  else
    dup_uri = base_uri.dup
    # Drop the query params, otherwise they would be carried into the joined URI
    dup_uri.query = nil
    URI::join(dup_uri.to_s, href).to_s
  end
rescue StandardError => e
  @log.error "Could not make absolute #{dup_uri} - #{href}"
  nil
end
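A few illustrative calls (URLs are hypothetical). Note that the query string of the base URI is dropped before joining, and already-absolute URLs only have spaces converted to +:

base = URI("https://example.com/photos/index.html?page=2")
crawler.make_absolute(base, "../css/site.css")         # => "https://example.com/css/site.css"
crawler.make_absolute(base, "https://example.com/a b") # => "https://example.com/a+b"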
#process_redirect(url, destination_url, opts = {}) ⇒ Object
If we hit a redirect, save the redirect as a meta refresh page. TODO: for AWS S3 hosting we could instead create a redirect.
# File 'lib/staticizer/crawler.rb', line 210

def process_redirect(url, destination_url, opts = {})
  body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
  save_page(body, url, opts)
end
#process_success(response, parsed_uri) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 186

def process_success(response, parsed_uri)
  url = parsed_uri.to_s
  case response['content-type']
  when /css/
    save_page(response, parsed_uri, no_query: true)
    add_urls(extract_css_urls(response.body, parsed_uri), {:type_hint => "css_url"})
  when /html/
    body = response.body
    save_page(body.gsub("https://www.canaan.com", ""), parsed_uri)
    doc = Nokogiri::HTML(body)
    add_urls(extract_videos(doc, parsed_uri), {:type_hint => "video"})
    add_urls(extract_links(doc, parsed_uri), {:type_hint => "link"})
    add_urls(extract_scripts(doc, parsed_uri), {:type_hint => "script"})
    add_urls(extract_images(doc, parsed_uri), {:type_hint => "image"})
    add_urls(extract_hrefs(doc, parsed_uri), {:type_hint => "href"})
    # Extract inline style="background-image:url('https://...')" style URLs
    add_urls(extract_css_urls(body, parsed_uri), {:type_hint => "css_url"})
  else
    save_page(response, parsed_uri, no_query: true)
  end
end
#process_url(url, info) ⇒ Object
Fetch a URI and save it to disk
# File 'lib/staticizer/crawler.rb', line 216

def process_url(url, info)
  @http_connections ||= {}
  parsed_uri = URI(url)

  @log.debug "Fetching #{parsed_uri}"

  # Attempt to reuse an already open Net::HTTP connection
  key = parsed_uri.host + parsed_uri.port.to_s
  connection = @http_connections[key]
  if connection.nil?
    connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    connection.use_ssl = true if parsed_uri.scheme == "https"
    @http_connections[key] = connection
  end

  request = Net::HTTP::Get.new(parsed_uri.request_uri)
  connection.request(request) do |response|
    case response
    when Net::HTTPSuccess
      process_success(response, parsed_uri)
    when Net::HTTPRedirection
      redirect_url = response['location']
      @log.debug "Processing redirect to #{redirect_url}"
      process_redirect(parsed_uri, redirect_url)
      add_url(redirect_url)
    else
      @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
    end
  end
end
#save_page(response, uri, opts = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 113

def save_page(response, uri, opts = {})
  if @opts[:aws]
    save_page_to_aws(response, uri, opts)
  else
    save_page_to_disk(response, uri, opts)
  end
end
#save_page_to_aws(response, uri, opts = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 170

def save_page_to_aws(response, uri, opts = {})
  key = uri.path
  key += "?#{uri.query}" if uri.query
  key = key.gsub(%r{^/}, "")
  key = "index.html" if key == ""
  # Upload this file directly to AWS::S3
  opts = {:acl => :public_read}
  opts[:content_type] = response['content-type'] rescue "text/html"
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
  if response.respond_to?(:read_body)
    @s3_bucket.objects[key].write(response.read_body, opts)
  else
    @s3_bucket.objects[key].write(response, opts)
  end
end
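A configuration sketch for the S3 path. The constructor's use of AWS.config and AWS::S3 suggests the v1 aws-sdk gem; the credential keys below are standard aws-sdk v1 options and the bucket name is hypothetical, neither is defined by this class:

crawler = Staticizer::Crawler.new(
  "https://example.com/",
  :aws => {
    :bucket_name       => "my-static-site",   # removed from the hash before AWS.config is called
    :access_key_id     => ENV["AWS_ACCESS_KEY_ID"],
    :secret_access_key => ENV["AWS_SECRET_ACCESS_KEY"]
  }
)
crawler.crawl   # pages are written to the bucket via #save_page_to_aws instead of to disk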
#save_page_to_disk(response, uri, opts = {}) ⇒ Object
# File 'lib/staticizer/crawler.rb', line 121

def save_page_to_disk(response, uri, opts = {})
  path = uri.path
  path += "?#{uri.query}" if uri.query && !opts[:no_query] && !@opts[:no_query]

  path_segments = path.scan(%r{[^/]*/})
  filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path

  current = @opts[:output_dir]
  FileUtils.mkdir_p(current) unless File.exist?(current)

  # Create all the directories necessary for this file
  path_segments.each do |segment|
    current = File.join(current, "#{segment}").sub(%r{/$}, '')
    if File.file?(current)
      # If we are trying to create a directory and a file already exists with
      # the same name, move the file to <name>.d, since the file system cannot
      # hold a directory and a file with the same name
      dirfile = current + ".d"
      FileUtils.mv(current, dirfile)
      FileUtils.mkdir(current)
      FileUtils.cp(dirfile, File.join(current, "/index.html"))
    elsif !File.exist?(current)
      FileUtils.mkdir(current)
    end
  end

  body = response.respond_to?(:read_body) ? response.read_body : response
  body = @opts[:process_body].call(body, uri, opts) if @opts[:process_body]

  outfile = File.join(current, "/#{filename}")
  if filename == ""
    indexfile = File.join(outfile, "/index.html")
    return if opts[:no_overwrite] && File.exist?(indexfile)
    @log.info "Saving #{indexfile}"
    File.open(indexfile, "wb") {|f| f << body }
  elsif File.directory?(outfile)
    dirfile = outfile + ".d"
    outfile = File.join(outfile, "/index.html")
    return if opts[:no_overwrite] && File.exist?(outfile)
    @log.info "Saving #{dirfile}"
    File.open(dirfile, "wb") {|f| f << body }
    FileUtils.cp(dirfile, outfile)
  else
    return if opts[:no_overwrite] && File.exist?(outfile)
    @log.info "Saving #{outfile}"
    File.open(outfile, "wb") {|f| f << body }
  end
end
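If pages need rewriting before they are written out, the :process_body hook called above receives the body, the URI, and the save options, and must return the (possibly modified) body. A sketch that rewrites absolute links to root-relative ones; the substitution itself is illustrative:

crawler = Staticizer::Crawler.new(
  "https://example.com/",
  :output_dir   => "/tmp/example_crawl",
  :process_body => lambda { |body, uri, opts|
    body.gsub("https://example.com", "")   # make internal links root-relative
  }
)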