Class: BlackStack::Bots::Scraper
- Inherits:
-
Object
- Object
- BlackStack::Bots::Scraper
- Defined in:
- lib/scraper.rb
Instance Attribute Summary collapse
-
#domain ⇒ Object
Returns the value of attribute domain.
-
#links ⇒ Object
Returns the value of attribute links.
-
#links_processed ⇒ Object
Auxiliary array of the links whose pages have already been crawled for further links.
-
#load_wait_time ⇒ Object
Returns the value of attribute load_wait_time.
-
#stop_scraping_at_match_number ⇒ Object
Returns the value of attribute stop_scraping_at_match_number.
-
#stop_scraping_at_page_number ⇒ Object
Returns the value of attribute stop_scraping_at_page_number.
-
#timeout ⇒ Object
Returns the value of attribute timeout.
Instance Method Summary collapse
-
#find_keywords(a, stop_at = 25, stop_on_first_link_found = false, l = nil) ⇒ Object
Visits the collected links and records which of the given keywords appear on each page.
-
#get_links(stop_at = 10, l = nil) ⇒ Object
Crawls the domain starting at its root URL to discover links, then adds the links listed in its sitemaps.
-
#get_links_from_sitemap(l = nil) ⇒ Object
Reads the domain's robots.txt to find its sitemaps and adds the URLs they list to #links.
-
#get_links_from_url(url, l = nil) ⇒ Object
internal use only.
-
#initialize(init_domain, timeout, h) ⇒ Scraper
constructor
A new instance of Scraper.
Constructor Details
#initialize(init_domain, timeout, h) ⇒ Scraper
Returns a new instance of Scraper.
8 9 10 11 12 13 14 15 16 17 |
# File 'lib/scraper.rb', line 8
#
# Builds a scraper for one domain, with empty link lists and default limits.
#
# @param init_domain [Object] stored as +domain+; the crawling methods
#   interpolate it into "http://<domain>/..." URLs
# @param timeout [Object, nil] stored as +timeout+; 10 is used when nil or false
# @param h [Object] accepted but not read by this constructor
def initialize(init_domain, timeout, h)
  # crawl state: nothing discovered and nothing processed yet
  self.links = []
  self.links_processed = []

  # target and limits
  self.domain = init_domain
  self.timeout = timeout || 10
  self.load_wait_time = 3
  self.stop_scraping_at_page_number = 25
  self.stop_scraping_at_match_number = 1
  #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
end
Instance Attribute Details
#domain ⇒ Object
Returns the value of attribute domain.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +domain+ attribute (assigned from +init_domain+ by the constructor).
#
# @return [Object] the current value of @domain
def domain
  @domain
end
#links ⇒ Object
Returns the value of attribute links.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +links+ attribute (initialized to an empty array by the constructor).
#
# @return [Object] the current value of @links
def links
  @links
end
#links_processed ⇒ Object
Auxiliary array of the links whose pages have already been crawled for further links.
6 7 8 |
# File 'lib/scraper.rb', line 6
#
# Reader for the +links_processed+ attribute: the links that have already
# been crawled for further links.
#
# @return [Object] the current value of @links_processed
def links_processed
  @links_processed
end
#load_wait_time ⇒ Object
Returns the value of attribute load_wait_time.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +load_wait_time+ attribute (the constructor sets it to 3;
# the crawling methods pass it to +sleep+ after loading a page).
#
# @return [Object] the current value of @load_wait_time
def load_wait_time
  @load_wait_time
end
#stop_scraping_at_match_number ⇒ Object
Returns the value of attribute stop_scraping_at_match_number.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +stop_scraping_at_match_number+ attribute (the constructor sets it to 1).
#
# @return [Object] the current value of @stop_scraping_at_match_number
def stop_scraping_at_match_number
  @stop_scraping_at_match_number
end
#stop_scraping_at_page_number ⇒ Object
Returns the value of attribute stop_scraping_at_page_number.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +stop_scraping_at_page_number+ attribute (the constructor sets it to 25).
#
# @return [Object] the current value of @stop_scraping_at_page_number
def stop_scraping_at_page_number
  @stop_scraping_at_page_number
end
#timeout ⇒ Object
Returns the value of attribute timeout.
4 5 6 |
# File 'lib/scraper.rb', line 4
#
# Reader for the +timeout+ attribute (the constructor defaults it to 10; the
# sitemap reader passes it to Timeout::timeout).
#
# @return [Object] the current value of @timeout
def timeout
  @timeout
end
Instance Method Details
#find_keywords(a, stop_at = 25, stop_on_first_link_found = false, l = nil) ⇒ Object
Visits the collected links and records which of the given keywords appear on each page.
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/scraper.rb', line 106
#
# Visits the collected links (skipping URLs ending in .pdf/.jpg/.jpeg/.gif)
# and records which of the given keywords appear in each page's body text.
#
# @param a [Array<String>] keywords; matched literally and case-insensitively
# @param stop_at [Integer] maximum number of links to visit
# @param stop_on_first_link_found [Boolean] when true, stop testing keywords
#   and stop visiting links as soon as one keyword matches
# @param l [#logs, #logf, nil] logger; a BlackStack::DummyLogger is used when nil
# @return [Array<Hash>] one hash per page that loaded, with the keys
#   'page_url', 'page_title', 'page_html' and 'keywords'
def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  pages = []
  candidates = self.links.reject { |link| link =~ /\.(pdf|jpe?g|gif)$/i }
  candidates.each_with_index { |link, index|
    j = index + 1
    break if j > stop_at
    l.logs "#{j}. find_keywords (#{link})... "
    # reset per link so a failing Browser.new never re-closes the previous browser
    browser = nil
    begin
      # get the page
      browser = BlackStack::Bots::Browser.new()
      browser.goto link
      sleep(self.load_wait_time) # give javascript content time to load
      # get page body content in plain text
      title = browser.title
      s = browser.body.text
      # every visited page is reported, even when no keyword matches
      hpage = {
        'page_url' => link.downcase,
        'page_title' => title,
        'page_html' => browser.body.html,
        'keywords' => []
      }
      pages << hpage
      # collect the keywords present in the page
      a.each { |k|
        next unless s =~ /#{Regexp.escape(k)}/i
        hpage['keywords'] << k
        break if stop_on_first_link_found
      }
      i = hpage['keywords'].size
      l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green
      # the log line is closed above before leaving the loop early
      break if i > 0 && stop_on_first_link_found
    rescue Net::ReadTimeout => e
      l.logf "Timeout Error: #{e.message}".red
    rescue => e
      l.logf "Error: #{e.message.to_s.split("\n").first.to_s[0..100]}".red
    ensure
      browser.close if browser
    end
  }
  pages
end
#get_links(stop_at = 10, l = nil) ⇒ Object
Crawls the domain starting at its root URL to discover links, then adds the links listed in its sitemaps.
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/scraper.rb', line 87
#
# Discovers the links of the domain: starts at "http://<domain>/", crawls
# every link that has not been processed yet, and finally adds the links
# listed in the sitemaps.
#
# @param stop_at [Integer] crawling rounds stop once more than this many links are known
# @param l [#logs, #logf, nil] logger; a BlackStack::DummyLogger is used when nil
# @return [Object] whatever get_links_from_sitemap returns
def get_links(stop_at=10, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  # working with root url
  root = "http://#{self.domain}/"
  self.links << root unless self.links.include?(root)
  # Crawl in rounds until nothing is pending. Pending links are computed
  # directly, because comparing the sizes of the two lists can skip work
  # (or never terminate) when the lists hold different entries.
  while stop_at >= self.links.size
    pending = (self.links - self.links_processed).uniq
    break if pending.empty?
    pending.each { |link|
      # get the links from the url
      self.get_links_from_url(link, l)
      # add the link to the list of processed links
      self.links_processed << link
    }
  end
  # get links from the sitemap
  self.get_links_from_sitemap(l)
end
#get_links_from_sitemap(l = nil) ⇒ Object
Reads the domain's robots.txt to find its sitemaps and adds the URLs they list to #links.
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/scraper.rb', line 19
#
# Downloads "http://<domain>/robots.txt", extracts its "Sitemap:" entries and
# appends the URLs of every sitemap to +links+ (de-duplicated). Any error is
# logged, not raised.
#
# @param l [#logs, #logf, nil] logger; a BlackStack::DummyLogger is used when nil
# @return [Object] the return value of the final +l.logf+ call
def get_links_from_sitemap(l=nil)
  # the default logger is nil, so fall back like the sibling methods do
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  l.logs "Scrape sitemaps... "
  begin
    # download the robots.txt; the block form closes the handle
    url = "http://#{domain}/robots.txt"
    s = Timeout::timeout(self.timeout) { URI.open(url, &:read) }
    # Extract the sitemap URLs. The directive name is matched
    # case-insensitively, but the URL itself keeps its case (paths may be
    # case-sensitive); directives without a URL are ignored.
    sitemaps = s.split("\n").map { |line| line.strip[/\Asitemap:\s*(\S.*)\z/i, 1] }.compact.uniq
    sitemaps.each { |b|
      parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
      self.links += Timeout::timeout(self.timeout) { parser.to_a }
      self.links.uniq!
    }
    l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green
  rescue => e
    l.logf "Error: #{e.message.to_s.split("\n").first.to_s[0..100]}".red
  end
end
#get_links_from_url(url, l = nil) ⇒ Object
internal use only
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/scraper.rb', line 41
#
# Internal use only. Loads +url+ in a browser, collects the hrefs of its
# links, and appends to +links+ those that are http(s) URLs on the same host
# (or a subdomain of it) and are not known yet. Fragments and query strings
# are dropped first. Errors are logged, not raised, and add nothing.
#
# @param url [String] page to load; surrounding whitespace is ignored
# @param l [#logs, #logf, nil] logger; a BlackStack::DummyLogger is used when nil
# @return [Array] the updated +links+ array
def get_links_from_url(url, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  l.logs "get_links (#{url})... "
  # only filled once the page was fully processed, so a failure half-way
  # through never leaks unfiltered links into self.links
  fresh = []
  browser = nil
  begin
    url = url.strip
    host = URI.parse(url).host
    raise ArgumentError, "URL has no host: #{url}" if host.nil?
    # http(s) URL whose host is +host+ or one of its subdomains; the host is
    # escaped and anchored so it cannot match elsewhere in the URL
    on_site = %r{\Ahttps?://([^/]+\.)?#{Regexp.escape(host)}(?=[:/]|\z)}i
    # visit the page
    browser = BlackStack::Bots::Browser.new()
    browser.goto url
    sleep(self.load_wait_time) # give javascript content time to load
    # collect hrefs, ignoring non-string values
    hrefs = browser.links.map(&:href).select { |href| href.is_a?(String) }
    # drop the fragment and the query string (everything from the first # or ?)
    hrefs = hrefs.map { |href| href[/\A[^#?]*/] }
    found = hrefs.select { |href| href =~ on_site }.uniq
    # keep only the links that are not in the list yet
    fresh = found.reject { |href| self.links.include?(href) }
    l.logf "done".green + " (#{found.size} links found, #{fresh.size} new, #{self.links.size + fresh.size} total)"
  rescue Net::ReadTimeout => e
    l.logf "Timeout Error: #{e.message}".red
  rescue => e
    l.logf "Error: #{e.message.to_s.split("\n").first.to_s[0..100]}".red
  ensure
    browser.close if browser
  end
  self.links += fresh
end