Class: BlackStack::Bots::Scraper
- Inherits:
-
Object
- Object
- BlackStack::Bots::Scraper
- Defined in:
- lib/scraper.rb
Instance Attribute Summary collapse
-
#browser ⇒ Object
Returns the value of attribute browser.
-
#domain ⇒ Object
Returns the value of attribute domain.
-
#links ⇒ Object
Returns the value of attribute links.
-
#links_processed ⇒ Object
Auxiliary array of links that have already had their outbound links extracted.
-
#load_wait_time ⇒ Object
Returns the value of attribute load_wait_time.
-
#stop_scraping_at_match_number ⇒ Object
Returns the value of attribute stop_scraping_at_match_number.
-
#stop_scraping_at_page_number ⇒ Object
Returns the value of attribute stop_scraping_at_page_number.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
Instance Method Summary collapse
-
#find_keywords(a, stop_at = 25, stop_on_first_link_found = false, l = nil) ⇒ Object
Searches each collected link's page text for the given keywords.
-
#get_links(stop_at = 100, l = nil) ⇒ Object
Discovers the pages of the domain and stores their URLs in #links.
-
#get_links_from_sitemap(stop_at = 100, l = nil) ⇒ Object
Collects page links by crawling the domain's robots.txt and sitemap files.
-
#get_links_from_url(url, l = nil) ⇒ Object
internal use only.
-
#initialize(init_domain, timeout, h) ⇒ Scraper
constructor
A new instance of Scraper.
Constructor Details
#initialize(init_domain, timeout, h) ⇒ Scraper
Returns a new instance of Scraper.
8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/scraper.rb', line 8

# Build a scraper bound to one domain.
#
# init_domain - domain to crawl (used for robots.txt / same-domain checks).
# timeout     - navigation timeout in seconds; nil falls back to 10.
# h           - options hash (currently unused by this constructor).
def initialize(init_domain, timeout, h)
  # target site and navigation timeout (10s when none supplied)
  self.domain  = init_domain
  self.timeout = timeout || 10
  # crawl tuning: post-load JS render wait, page cap, match cap
  self.load_wait_time = 3
  self.stop_scraping_at_page_number = 25
  self.stop_scraping_at_match_number = 1
  #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  # link accumulators start out empty
  self.links = []
  self.links_processed = []
  # fresh browser session for all navigation
  self.browser = BlackStack::Bots::Browser.new
end
Instance Attribute Details
#browser ⇒ Object
Returns the value of attribute browser.
4 5 6 |
# File 'lib/scraper.rb', line 4 def browser @browser end |
#domain ⇒ Object
Returns the value of attribute domain.
4 5 6 |
# File 'lib/scraper.rb', line 4 def domain @domain end |
#links ⇒ Object
Returns the value of attribute links.
4 5 6 |
# File 'lib/scraper.rb', line 4 def links @links end |
#links_processed ⇒ Object
Auxiliary array of links that have already had their outbound links extracted
6 7 8 |
# File 'lib/scraper.rb', line 6 def links_processed @links_processed end |
#load_wait_time ⇒ Object
Returns the value of attribute load_wait_time.
4 5 6 |
# File 'lib/scraper.rb', line 4 def load_wait_time @load_wait_time end |
#stop_scraping_at_match_number ⇒ Object
Returns the value of attribute stop_scraping_at_match_number.
4 5 6 |
# File 'lib/scraper.rb', line 4 def stop_scraping_at_match_number @stop_scraping_at_match_number end |
#stop_scraping_at_page_number ⇒ Object
Returns the value of attribute stop_scraping_at_page_number.
4 5 6 |
# File 'lib/scraper.rb', line 4 def stop_scraping_at_page_number @stop_scraping_at_page_number end |
#timeout ⇒ Object
Returns the value of attribute timeout.
4 5 6 |
# File 'lib/scraper.rb', line 4 def timeout @timeout end |
Instance Method Details
#find_keywords(a, stop_at = 25, stop_on_first_link_found = false, l = nil) ⇒ Object
Searches each collected link's page text for the given keywords
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
# File 'lib/scraper.rb', line 180 def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil) pages = [] l = BlackStack::DummyLogger.new(nil) if l.nil? # iterate the links j = 0 self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link| j += 1 break if j > stop_at l.logs "#{j.to_s}. find_keywords (#{link})... " begin # get the page browser.goto link sleep(self.load_wait_time) # wait 10 seconds for javascript content to load # get page body content in plain text title = browser.title s = browser.body.text # add the link to the results of no-keyword hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] } pages << hpage # iterate the keywords i = 0 match = false a.each { |k| # find the keyword match = ( s =~ /#{Regexp.escape(k)}/i ) hpage['keywords'] << k if match # count the number of links with match # break if only 1 link is needed if match i += 1 break if stop_on_first_link_found end # if } # each break if match && stop_on_first_link_found l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords rescue Net::ReadTimeout => e l.logf "Timeout Error: #{e.message}".red l.logs "Restarting browser..." browser.close if browser self.browser = BlackStack::Bots::Browser.new() l.done rescue => e l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links end } # each # return pages end |
#get_links(stop_at = 100, l = nil) ⇒ Object
Discovers the pages of the domain and stores their URLs in #links
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# File 'lib/scraper.rb', line 158

# Discover page links for the configured domain and accumulate them in
# #links. Link discovery is currently driven entirely by the sitemaps.
#
# stop_at - upper bound on the number of links to collect (default 100).
# l       - logger; a DummyLogger is used when nil.
def get_links(stop_at=100, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  # sitemap crawling is the only active discovery strategy
  get_links_from_sitemap(stop_at, l)
=begin
  # NOTE(review): disabled breadth-first crawl kept for reference — it walked
  # every discovered page with get_links_from_url until no new links appeared
  # or stop_at was reached.
  # working with root url
  url = "http://#{self.domain}/"
  self.links << url if self.links.select { |link| link == url }.empty?
  # iterate until I have discovered all the links
  while self.links.size != self.links_processed.size && stop_at >= self.links.size
    # iterate the links who are not in links_processed
    self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
      # get the links from the url
      self.get_links_from_url(link, l)
      # add the link to the list of processed links
      self.links_processed << link
    }
  end # while
=end
end
#get_links_from_sitemap(stop_at = 100, l = nil) ⇒ Object
Collects page links by crawling the domain's robots.txt and sitemap files
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/scraper.rb', line 20

# Crawl the domain's robots.txt for "Sitemap:" declarations, then walk the
# sitemap tree (nested .xml sitemaps included), appending every non-.xml
# <loc> entry to self.links.
#
# stop_at - cap on how many NEW links to collect this call (default 100).
# l       - logger; a DummyLogger is used when nil.
#
# The crawl stops when all sitemaps are processed, the link cap is reached,
# or 3 consecutive navigation timeouts occur.
def get_links_from_sitemap(stop_at=100, l=nil)
  # fix: default to a no-op logger like the sibling methods do — previously
  # calling this with the default l=nil raised NoMethodError on l.logs.
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  max_allowed_timeout_errors = 3
  timeout_errors = 0
  # absolute cap: current link count plus the requested budget
  max_links = self.links.size + stop_at
  l.logs "Scrape sitemaps... "
  begin
    l.logs "get_sitemaps from #{self.domain}... "
    # download the robots.txt
    url = "http://#{domain}/robots.txt"
    # get the content of robots.txt from url
    browser.goto url
    s = browser.text
    # collect "Sitemap: <url>" declarations (case-insensitive)
    sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
    processed = []
    to_process = sitemaps - processed
    l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green
    # keep going while sitemaps remain, the timeout budget holds,
    # and the link cap has not been reached
    while to_process.size > 0 && timeout_errors < max_allowed_timeout_errors && max_links >= self.links.size
      to_process.each { |b|
        l.logs "go to #{b}... "
        begin
          browser.goto b
          l.done
          l.logs "parsing #{b}... "
          s = browser.text
          # extract all URLs
          doc = Nokogiri::HTML(s)
          l.done
          # <loc> entries ending in .xml are nested sitemaps — queue them
          l.logs "get_sitemaps from #{b}... "
          sitemaps += doc.xpath('//loc').map(&:text).select { |s| s =~ /\.xml$/ }.map { |s| s.downcase }
          sitemaps.uniq!
          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green
          # every other <loc> entry is a page link
          l.logs "get_links from #{b}..."
          self.links += doc.xpath('//loc').map(&:text).select { |s| s !~ /\.xml$/ }.map { |s| s.downcase }
          self.links.uniq!
          l.logf self.links.size == 0 ? 'no links found'.yellow : "#{self.links.size} links found".green
          # add the sitemap to the list of processed sitemaps
          processed << b
          # a successful fetch resets the consecutive-timeout counter
          timeout_errors = 0
          # break if I exceeded the limit of links
          break if max_links <= self.links.size
        rescue Net::ReadTimeout => e
          # the browser is stuck — replace it and spend one unit of the budget
          l.logf "Timeout Error: #{e.message}".red
          l.logs "Restarting browser..."
          browser.close if browser
          self.browser = BlackStack::Bots::Browser.new()
          l.done
          timeout_errors += 1
          break if timeout_errors >= max_allowed_timeout_errors
        rescue => e
          # log a trimmed one-line error and move on to the next sitemap
          l.logf "Error: #{e.message.split("\n").first[0..100]}".red
        end
      }
      # update the list of sitemaps to process
      processed.uniq!
      to_process = sitemaps - processed
    end
    l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green
  rescue Net::ReadTimeout => e
    # timeout while fetching robots.txt — restart the browser and give up
    l.logf "Timeout Error: #{e.message}".red
    l.logs "Restarting browser..."
    browser.close if browser
    self.browser = BlackStack::Bots::Browser.new()
    l.done
  rescue => e
    l.logf "Error: #{e.message.split("\n").first[0..100]}".red
  end
end
#get_links_from_url(url, l = nil) ⇒ Object
Internal use only: extracts same-domain links from a single page.
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/scraper.rb', line 111

# Internal: navigate to +url+, harvest the anchors on the page, and append
# the normalized same-domain http(s) links (fragment and querystring
# stripped, deduplicated, not already known) to self.links.
#
# url - page to visit; leading/trailing whitespace is trimmed.
# l   - logger; a DummyLogger is used when nil.
def get_links_from_url(url, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  l.logs "get_links (#{url})... "
  aux = []
  begin
    # trim url
    url = url.strip
    # get domain of the url using open-uri
    domain = URI.parse(url).host
    # visit the page and give client-side JavaScript time to render
    # (load_wait_time seconds, 3 by default — not 10 as previously stated)
    browser.goto url
    sleep(self.load_wait_time)
    # get the links to the pages of the website
    aux = browser.links.map(&:href)
    # remove non-string elements
    aux = aux.select { |link| link.is_a?(String) }
    # strip fragment from the links
    aux = aux.map { |link| !link.nil? && link.split('#').first }
    # strip querystring from the links
    aux = aux.map { |link| !link.nil? && link.split('?').first }
    # keep only http:// or https:// links
    aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
    # keep only links from the same domain.
    # fix: escape the host before interpolating it into a regex — unescaped,
    # each "." matched any character (e.g. "exampleXcom" passed the filter).
    # to_s preserves the old match-everything behavior when host is nil.
    aux = aux.select { |link| !link.nil? && link =~ /#{Regexp.escape(domain.to_s)}/ }
    # remove nil values
    aux = aux.compact
    # remove duplications
    aux = aux.uniq
    # filter links who already are in the list
    a = aux.size
    aux = aux.select { |link| !self.links.include?(link) }
    b = aux.size
    l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)"
  rescue Net::ReadTimeout => e
    # a hung page load leaves the browser unusable — replace it
    l.logf "Timeout Error: #{e.message}".red
    l.logs "Restarting browser..."
    browser.close if browser
    self.browser = BlackStack::Bots::Browser.new()
    l.done
  rescue => e
    # log a trimmed one-line error; aux keeps whatever was gathered so far
    l.logf "Error: #{e.message.split("\n").first[0..100]}".red
  end
  self.links += aux
end