Module: CrawlerSample

Included in:
String
Defined in:
lib/crawler_sample.rb,
lib/crawler_sample/version.rb

Constant Summary

VERSION = "0.0.1"


Instance Attribute Details

#crawl_stop_count ⇒ Object

Maximum number of pages to fetch in one #crawl run; #crawl falls back to 1000 when this is nil or not positive.

# File 'lib/crawler_sample.rb', line 9

def crawl_stop_count
  @crawl_stop_count
end

#deep_flg ⇒ Object

Returns the value of attribute deep_flg.

# File 'lib/crawler_sample.rb', line 9

def deep_flg
  @deep_flg
end

#delay ⇒ Object

Seconds to sleep between HTTP requests; accessed through #crawl_delay and #crawl_delay=, which default it to 1.

# File 'lib/crawler_sample.rb', line 9

def delay
  @delay
end

#exclude_urls ⇒ Object

URLs that have already been visited and are excluded from further crawling.

# File 'lib/crawler_sample.rb', line 9

def exclude_urls
  @exclude_urls
end

#stop_flg ⇒ Object

When true, the #crawl loop stops at its next iteration; set by #crawl_is_force_stop.

# File 'lib/crawler_sample.rb', line 9

def stop_flg
  @stop_flg
end

#target_urls ⇒ Object

Queue of URLs still waiting to be crawled.

# File 'lib/crawler_sample.rb', line 9

def target_urls
  @target_urls
end

#top_url ⇒ Object

Returns the value of attribute top_url.

# File 'lib/crawler_sample.rb', line 9

def top_url
  @top_url
end

Instance Method Details

#crawl(url = nil) ⇒ Object



Crawls the site starting at url, yielding each fetched page (a Nokogiri::HTML document) to the given block. Only links on the same host as url are followed.
# File 'lib/crawler_sample.rb', line 11

def crawl(url=nil)
  raise "URL is Blank" if url.nil?
  self.target_urls = [url]
  self.exclude_urls = []
  target_scheme = URI.parse(url).scheme
  target_host = URI.parse(url).host
  error_cnt = 0
  crawl_page_cnt = 0
  # Fall back to a 1000-page limit when no stop count is configured
  self.crawl_stop_count = 1000 if self.crawl_stop_count.to_i <= 0
  loop do
    begin
      break if self.target_urls.empty? || self.stop_flg == true
      url = self.target_urls.pop
      self.exclude_urls << url
      begin
        site_contents = self.scrape(url)
        crawl_page_cnt += 1
        p "SuccessURL #{url}"
      rescue
        p "ErrorURL #{url}"
        error_cnt += 1
        next
      end
      site_contents.search("a").each do |anc|
        # Skip hrefs that URI cannot parse
        begin; URI.parse(anc["href"]).host; rescue; next; end
        # Skip links to binary assets
        next unless anc["href"].scan(/\.(jpg|jpeg|png|gif|bmp|zip|exe|pdf|lzh)/i).empty?
        # Exclude links whose host differs from the target host
        next if URI.parse(anc["href"]).host && target_host != URI.parse(anc["href"]).host
        # Normalize to an absolute path on the target host and queue it
        anc["href"] = URI.parse(anc["href"]).path if URI.parse(anc["href"]).host
        anc["href"] = anc["href"].gsub(/\/\.{1,2}/, "")
        anc["href"] = "/#{anc["href"]}" if anc["href"][0] != "/"
        self.target_urls << "#{target_scheme}://#{target_host}#{anc["href"]}".gsub(/\/\.{1,2}/, "")
        self.target_urls = (self.target_urls - self.exclude_urls).uniq
      end
      yield site_contents
    rescue => e
      error_cnt += 1
      p "error #{error_cnt} #{e}"
      next
    end
    # Force-stop after too many errors or once the page limit is hit
    crawl_is_force_stop if error_cnt > 200 || crawl_page_cnt > self.crawl_stop_count
  end
end
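
A minimal usage sketch, assuming the gem is required and relying on CrawlerSample being included in String (as noted above); the URL and limits are illustrative:

require "crawler_sample"

crawler = String.new
crawler.crawl_delay = 2        # seconds between requests
crawler.crawl_stop_count = 50  # force-stop after 50 pages
crawler.crawl("http://example.com/") do |page|
  # page is the Nokogiri::HTML document returned by #scrape
  puts page.title
end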

#crawl_delay ⇒ Object



Returns the delay between requests, defaulting to 1 second when none has been set.
# File 'lib/crawler_sample.rb', line 68

def crawl_delay
  # Default to a 1-second delay when none has been configured
  self.delay.nil? ? 1 : self.delay
end

#crawl_delay=(delay = nil) ⇒ Object



Sets the delay between requests; nil resets it to 1 second.
# File 'lib/crawler_sample.rb', line 64

def crawl_delay=(delay=nil)
  # Store the given delay, falling back to 1 second when nil
  self.delay = delay.nil? ? 1 : delay
end
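
A short sketch of the default and an explicit setting (values illustrative):

crawler = String.new
crawler.crawl_delay      #=> 1 (no delay configured yet)
crawler.crawl_delay = 5
crawler.crawl_delay      #=> 5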

#crawl_from_url(url = nil) {|site_contents| ... } ⇒ Object

Fetches a single page and yields it to the block; links are not followed.

Yields:

  • (site_contents)

# File 'lib/crawler_sample.rb', line 57

def crawl_from_url(url=nil)
  raise "URL is Blank" if url.nil?
  # Fetch one page and hand it to the caller's block
  site_contents = self.scrape(url)
  return if site_contents.nil?
  yield site_contents
end
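
A sketch of fetching one page, assuming an illustrative URL:

"".crawl_from_url("http://example.com/about") do |page|
  puts page.css("h1").first
end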

#crawl_is_force_stop ⇒ Object



Sets the stop flag so that a running #crawl loop halts at its next iteration.
# File 'lib/crawler_sample.rb', line 53

def crawl_is_force_stop
  # Raise the stop flag checked at the top of the #crawl loop
  self.stop_flg = true
end
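
A sketch of stopping early from inside a #crawl block, assuming an illustrative ten-page cap:

crawler = String.new
pages = []
crawler.crawl("http://example.com/") do |page|
  pages << page
  crawler.crawl_is_force_stop if pages.size >= 10
end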

#scrape(url, option = {}) ⇒ Object



Fetches url and returns it as a Nokogiri::HTML document, pausing #crawl_delay seconds first; raises FaildScrape on failure.
# File 'lib/crawler_sample.rb', line 72

def scrape(url, option={})
  # Wait between requests so the target host is not hammered
  sleep self.crawl_delay
  # open-uri's Kernel#open fetches the URL as binary; kconv's
  # String#toutf8 normalizes it to UTF-8 before Nokogiri parses it.
  # The option hash is not read here.
  html = open(url, "r:binary", "User-Agent" => "Blue Field 0.5.0.1").read
  Nokogiri::HTML(html.toutf8, nil, 'utf-8')
rescue
  raise FaildScrape
end
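
A sketch of a one-off fetch, assuming an illustrative URL; FaildScrape is raised if the fetch or parse fails:

doc = "".scrape("http://example.com/")
doc.css("a").each { |a| puts a["href"] }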