Module: CrawlerSample

Included in: String
Defined in: lib/crawler_sample.rb,
            lib/crawler_sample/version.rb

Constant Summary
- VERSION = "0.0.1"
Instance Attribute Summary

- #crawl_stop_count ⇒ Object
  Maximum number of pages to fetch before #crawl force-stops; #crawl defaults it to 1000 when unset or non-positive.
- #deep_flg ⇒ Object
  Flag presumably controlling deep (recursive) crawling; not referenced in the source shown here.
- #delay ⇒ Object
  Seconds to sleep before each request; read and written via #crawl_delay and #crawl_delay=.
- #exclude_urls ⇒ Object
  URLs already visited, kept out of the crawl queue.
- #stop_flg ⇒ Object
  When true, the #crawl loop stops; set by #crawl_is_force_stop.
- #target_urls ⇒ Object
  Queue of URLs still waiting to be crawled.
- #top_url ⇒ Object
  The crawl's entry-point URL; not referenced in the source shown here.
Instance Method Summary
- #crawl(url = nil) ⇒ Object
- #crawl_delay ⇒ Object
- #crawl_delay=(delay = nil) ⇒ Object
- #crawl_from_url(url = nil) {|site_contents| ... } ⇒ Object
- #crawl_is_force_stop ⇒ Object
- #scrape(url, option = {}) ⇒ Object
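Since CrawlerSample is included into String, its methods are available on every String instance, which also carries the crawler state (target_urls, delay, and so on). A minimal usage sketch; the seed URL and block body are illustrative assumptions, not part of the gem:

  require "crawler_sample"

  # Any String can act as the crawler, because the module is mixed into String.
  "bot".crawl("http://example.com") do |site_contents|
    # site_contents is a Nokogiri::HTML document for each fetched page
    puts site_contents.css("title").text
  end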
Instance Attribute Details
#crawl_stop_count ⇒ Object
Maximum number of pages to fetch before #crawl force-stops; #crawl defaults it to 1000 when unset or non-positive.

# File 'lib/crawler_sample.rb', line 9
def crawl_stop_count
  @crawl_stop_count
end

#deep_flg ⇒ Object
Flag presumably controlling deep (recursive) crawling; not referenced in the source shown here.

# File 'lib/crawler_sample.rb', line 9
def deep_flg
  @deep_flg
end

#delay ⇒ Object
Seconds to sleep before each request; read and written via #crawl_delay and #crawl_delay=.

# File 'lib/crawler_sample.rb', line 9
def delay
  @delay
end

#exclude_urls ⇒ Object
URLs already visited, kept out of the crawl queue.

# File 'lib/crawler_sample.rb', line 9
def exclude_urls
  @exclude_urls
end

#stop_flg ⇒ Object
When true, the #crawl loop stops; set by #crawl_is_force_stop.

# File 'lib/crawler_sample.rb', line 9
def stop_flg
  @stop_flg
end

#target_urls ⇒ Object
Queue of URLs still waiting to be crawled.

# File 'lib/crawler_sample.rb', line 9
def target_urls
  @target_urls
end

#top_url ⇒ Object
The crawl's entry-point URL; not referenced in the source shown here.

# File 'lib/crawler_sample.rb', line 9
def top_url
  @top_url
end
Instance Method Details
#crawl(url = nil) ⇒ Object
# File 'lib/crawler_sample.rb', line 11

def crawl(url=nil)
  raise "URL is Blank" if url.nil?
  self.target_urls  = [url]
  self.exclude_urls = []
  target_scheme = URI.parse(url).scheme
  target_host   = URI.parse(url).host
  error_cnt      = 0
  crawl_page_cnt = 0
  self.crawl_stop_count = 1000 if self.crawl_stop_count.to_i <= 0
  loop do
    begin
      break if self.target_urls.empty? || self.stop_flg == true
      url = self.target_urls.pop
      self.exclude_urls << url
      begin
        p "SuccessURL #{url}"
        site_contents = self.scrape(url)
        crawl_page_cnt += 1
      rescue
        p "ErrorURL #{url}"
        error_cnt += 1  # count the failed fetch and skip link extraction
        next
      end
      site_contents.search("a").each do |anc|
        # Skip anchors whose href cannot be parsed as a URI
        begin; URI.parse(anc["href"]).host; rescue; next; end
        # Skip links to images, archives, and other binary files
        next unless anc["href"].scan(/\.(jpg|jpeg|png|gif|bmp|zip|exe|pdf|lzh)/i).empty?
        # Exclude links whose host differs from the target host
        next if URI.parse(anc["href"]).host && target_host != URI.parse(anc["href"]).host
        # Normalize the href to an absolute path on the target host
        anc["href"] = URI.parse(anc["href"]).path if URI.parse(anc["href"]).host
        anc["href"] = anc["href"].gsub(/\/\.{1,2}/, "")
        anc["href"] = "/#{anc["href"]}" if anc["href"][0] != "/"
        self.target_urls << "#{target_scheme}://#{target_host}#{anc["href"]}".gsub(/\/\.{1,2}/, "")
        self.target_urls = (self.target_urls - self.exclude_urls).uniq
      end
      yield site_contents
    rescue => e
      error_cnt += 1
      p "error #{error_cnt} #{e}"
      next
    end
    crawl_is_force_stop if error_cnt > 200 || crawl_page_cnt > self.crawl_stop_count
  end
end
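#crawl keeps a queue in target_urls, only follows links on the same host as the seed URL, skips hrefs that look like images or other binary files, and force-stops after crawl_stop_count pages or more than 200 errors. A bounded crawl might look like this (receiver string, limit, and URL are illustrative assumptions):

  crawler = "bot"
  crawler.crawl_stop_count = 50   # force-stop after 50 fetched pages instead of 1000
  crawler.crawl("http://example.com") do |site_contents|
    site_contents.search("a").each { |anc| puts anc["href"] }
  end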
#crawl_delay ⇒ Object
# File 'lib/crawler_sample.rb', line 68

def crawl_delay
  return self.delay.nil? ? 1 : self.delay
end
#crawl_delay=(delay = nil) ⇒ Object
# File 'lib/crawler_sample.rb', line 64

def crawl_delay=(delay=nil)
  self.delay = delay.nil? ? 1 : delay
end
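These two accessors wrap #delay with a default of one second. A short sketch (the receiver and value are illustrative):

  c = "bot"
  c.crawl_delay       # => 1, because delay is unset
  c.crawl_delay = 3   # #scrape will now sleep 3 seconds before each request
  c.crawl_delay       # => 3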
#crawl_from_url(url = nil) {|site_contents| ... } ⇒ Object
# File 'lib/crawler_sample.rb', line 57

def crawl_from_url(url=nil)
  raise "URL is Blank" if url.nil?
  site_contents = self.scrape(url)
  return if site_contents.nil?
  yield site_contents
end
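#crawl_from_url fetches and yields a single page without following any links. A sketch (URL and selector are illustrative assumptions):

  "bot".crawl_from_url("http://example.com/about") do |site_contents|
    puts site_contents.css("h1").text
  end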
#crawl_is_force_stop ⇒ Object
# File 'lib/crawler_sample.rb', line 53

def crawl_is_force_stop
  self.stop_flg = true
end
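Because #crawl checks stop_flg at the top of every iteration, this method can be called from inside the crawl block to end a crawl early. A sketch (the page limit is an illustrative assumption):

  crawler = "bot"
  pages = 0
  crawler.crawl("http://example.com") do |site_contents|
    pages += 1
    crawler.crawl_is_force_stop if pages >= 10   # the loop exits on its next pass
  end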
#scrape(url, option = {}) ⇒ Object
# File 'lib/crawler_sample.rb', line 72

def scrape(url, option={})
  # self.delay = option[:delay].present? ? option[:delay] : 1
  sleep self.crawl_delay  # polite pause before every request
  html = open(url, "r:binary", "User-Agent" => "Blue Field 0.5.0.1").read
  return Nokogiri::HTML(html.toutf8, nil, 'utf-8')
rescue
  raise "FailedScrape: #{url}"  # fetch or parse failure
end
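#scrape sleeps for #crawl_delay seconds, fetches the page with open-uri, and returns it as a UTF-8 Nokogiri::HTML document; the option hash is currently unused. A sketch (the URL is an illustrative assumption):

  doc = "bot".scrape("http://example.com")
  puts doc.css("title").text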