Module: CrawlerProcess

Included in:
ContentCrawler::Crawler
Defined in:
lib/content_crawler/crawler_process.rb

Instance Method Summary

Instance Method Details

#audio_video_collection(audio_video_detail, options = {}) ⇒ Object

To get the src and type attributes of the given audio/video elements



# File 'lib/content_crawler/crawler_process.rb', line 112

def audio_video_collection(audio_video_detail, options={})
  auvid_collection = []
  audio_video_detail.each do |auvid|
    hash = {}
    hash[:src] = auvid.attributes["src"].value.strip
    hash[:type] = auvid.attributes["type"].value.strip
    auvid_collection << hash
  end
  collection_attr(auvid_collection, options)
end
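
Example (a minimal sketch, assuming the elements come from a Nokogiri document and each one carries both src and type attributes; page_html is a hypothetical variable):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
media = doc.css("audio source, video source")
audio_video_collection(media, {:format => "only_srcs"})   # e.g. ["clip.mp4", "intro.webm"]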

#check_local_dir(image_store_dir) ⇒ Object

To ensure the local directory used to store images exists, creating it when necessary



# File 'lib/content_crawler/crawler_process.rb', line 83

def check_local_dir(image_store_dir)
  # Default to ~/crawled_images when no directory is given
  image_store_dir = "#{Dir.home}/crawled_images" if image_store_dir.nil?
  Dir.mkdir(image_store_dir) unless Dir.exist?(image_store_dir)
  image_store_dir
end
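
Example (a minimal sketch; paths are illustrative):

check_local_dir(nil)                # => "#{Dir.home}/crawled_images", created if missing
check_local_dir("/tmp/my_images")   # returns the path unchanged, creating the directory if needed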

#close_browserObject

To close the browser and destroy the headless session, if present



# File 'lib/content_crawler/crawler_process.rb', line 164

def close_browser
  @browser.close unless @browser.nil?
  @headless.destroy unless @headless.nil?
end
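
Example (a minimal sketch; only meaningful when one of the Selenium crawlers was selected at initialization):

close_browser   # quits the Watir browser and tears down the headless display, when present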

#collection_attr(collection, options) ⇒ Object

To filter a collection of element hashes down to particular attributes, based on options[:format]



# File 'lib/content_crawler/crawler_process.rb', line 144

def collection_attr(collection, options)
  collection = [collection].flatten.compact.uniq
  case options[:format]
  when "srcs_types", "texts_values", "texts_srcs", "texts_hrefs"
    collection
  when "only_srcs"
    collection.map { |collobjt| collobjt[:src] }.compact
  when "only_types"
    collection.map { |collobjt| collobjt[:type] }.compact
  when "only_values"
    collection.map { |collobjt| collobjt[:value] }.compact
  when "only_texts"
    collection.map { |collobjt| collobjt[:text] }.compact
  when "only_hrefs"
    collection.map { |collobjt| collobjt[:href] }.compact
  else
    collection
  end
end
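
Example (a minimal sketch; the hashes below are hypothetical stand-ins for parsed elements):

links = [{:text => "Home", :href => "/"}, {:text => "About", :href => "/about"}]
collection_attr(links, {:format => "only_hrefs"})   # => ["/", "/about"]
collection_attr(links, {:format => "only_texts"})   # => ["Home", "About"]
collection_attr(links, {:format => "texts_hrefs"})  # => the deduplicated hashes themselves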

#collection_links(parser_links, options = {}) ⇒ Object

To get the href and text of the given anchor tags



# File 'lib/content_crawler/crawler_process.rb', line 45

def collection_links(parser_links, options={})
  links = []
  parser_links = [parser_links].flatten.uniq
  parser_links.each do |link|
    data = {}
    data[:href] = link.attributes["href"].nil? ? " " : link.attributes["href"].value.strip
    data[:text] = link.text.nil? ? " " : link.text.strip
    links << data
  end
  collection_attr(links, options)
end
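
Example (a minimal sketch; page_html is a hypothetical variable holding fetched HTML):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
collection_links(doc.css("a"), {:format => "only_hrefs"})   # e.g. ["/", "/about"]
collection_links(doc.css("a"))                              # [{:href => "...", :text => "..."}, ...]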

#datalist_collection(datalist_detail, options = {}) ⇒ Object

To get the value attributes of the given datalist options



# File 'lib/content_crawler/crawler_process.rb', line 134

def datalist_collection(datalist_detail, options={})
  datalists = []
  datalist_detail.each do |datalist|
    hash = {}
    hash[:value] = datalist.attributes["value"].value.strip
    datalists << hash
  end
  collection_attr(datalists, options)
end
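
Example (a minimal sketch; assumes every option inside the datalist carries a value attribute; page_html is hypothetical):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
datalist_collection(doc.css("datalist option"), {:format => "only_values"})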

#iframe_embed_collection(ifrm_embd_detail, options = {}) ⇒ Object

To get the src values of the given iframe/embed attribute nodes



# File 'lib/content_crawler/crawler_process.rb', line 102

def iframe_embed_collection(ifrm_embd_detail, options={})
  ifrm_embds = []
  ifrm_embd_detail.each do |ifrmembd|
    hash = {}
    hash[:src] = ifrmembd.value.strip
    ifrm_embds << hash
  end
  collection_attr(ifrm_embds, options)
end
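
Example (a minimal sketch; note that the method reads each item's value directly, so it expects src attribute nodes rather than whole elements; page_html is hypothetical):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
srcs = doc.xpath("//iframe/@src | //embed/@src")
iframe_embed_collection(srcs, {:format => "only_srcs"})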

#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Object

Initialize the crawler process



# File 'lib/content_crawler/crawler_process.rb', line 11

def initialize(crawler, base_url, options={:timeout=>300, :user_agent=>nil})
  @base_url = base_url
  case crawler
  when "selenium_webdriver_with_headless"
    @headless = Headless.new
    @headless.start
    watir_web_browser(options[:timeout])
  when "selenium_webdriver_without_headless"
    watir_web_browser(options[:timeout])
  when "mechanize_parser"
    mechanize_parser(options[:user_agent])
  else
    puts "Please select one of the parsers (selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) to crawl content"
  end
end
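
Example (a minimal sketch, assuming the including class ContentCrawler::Crawler passes these arguments straight through to this initializer; the URL is illustrative):

crawler = ContentCrawler::Crawler.new("mechanize_parser", "https://example.com",
                                      {:timeout => 300, :user_agent => "Mac Safari"})
# or, to drive a real (headless) Firefox instead:
crawler = ContentCrawler::Crawler.new("selenium_webdriver_with_headless", "https://example.com")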

#mechanize_parser(user_agent = nil) ⇒ Object

To build the Mechanize agent used for fetching and parsing pages



# File 'lib/content_crawler/crawler_process.rb', line 35

def mechanize_parser(user_agent=nil)
  if user_agent.nil?
    # Relax SSL checking when no user agent alias is given
    @agent = Mechanize.new do |a|
      a.ssl_version = 'SSLv3'
      a.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
  else
    @agent = Mechanize.new { |agent| agent.user_agent_alias = user_agent }
  end
  #@page = @agent.get(@base_url).parser
  @agent
end
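
Example (a minimal sketch; the alias must be one of Mechanize's AGENT_ALIASES, and the URL is illustrative):

agent = mechanize_parser("Mac Safari")
doc   = agent.get("https://example.com").parser   # Nokogiri document for the *_collection helpers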

#object_collection(object_detail, options = {}) ⇒ Object

To get the text and value of the given object elements



# File 'lib/content_crawler/crawler_process.rb', line 123

def object_collection(object_detail, options={})
  objects = []
  object_detail.each do |object|
    hash = {}
    hash[:text] = object.text.strip
    hash[:value] = object.value.strip
    objects << hash
  end
  collection_attr(objects, options)
end
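
Example (a minimal sketch; each item only needs to respond to text and value, shown here with a hypothetical Struct stand-in):

Item = Struct.new(:text, :value)
object_collection([Item.new(" Video player ", " player.swf ")], {:format => "only_values"})
# => ["player.swf"]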

#select_collection(select_detail, options = {}) ⇒ Object

To get select tag details



# File 'lib/content_crawler/crawler_process.rb', line 91

def select_collection(select_detail, options={})
  selects = []
  select_detail.each do |select|
    hash = {}
    hash[:text] = select.text.strip
    hash[:value] = select.attributes["value"].text.strip
    selects << hash
  end
  collection_attr(selects, options)
end
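
Example (a minimal sketch; assumes every option carries a value attribute; page_html is hypothetical):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
select_collection(doc.css("select option"), {:format => "texts_values"})
# => [{:text => "India", :value => "IN"}, ...]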

#store_remote_image(image_detail, image_store_dir) ⇒ Object

To download the given remote images and store them in a local directory



# File 'lib/content_crawler/crawler_process.rb', line 57

def store_remote_image(image_detail, image_store_dir)
  image_store_dir = check_local_dir(image_store_dir)
  remote_image_urls = iframe_embed_collection(image_detail, {:format => "only_srcs"})
  local_images = []
  remote_image_urls.each do |image_url|
    # Resolve relative paths against the base URL
    image_url = "#{@base_url}#{image_url}" unless image_url.include?("http")
    url = URI.parse(image_url)
    response = Net::HTTP.get_response(url)
    if response.is_a?(Net::HTTPSuccess)
      http = Net::HTTP.new(url.host, url.port)
      http.use_ssl = true if url.scheme == "https"
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.start do
        # Write the image body into the local store directory
        http.request_get(url.path) do |res|
          File.open("#{image_store_dir}/#{File.basename(url.path)}", 'wb') do |file|
            file.write(res.body)
          end
        end
      end
      local_images << "#{image_store_dir}/#{File.basename(url.path)}"
    end
  end
  local_images
end
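
Example (a minimal sketch; like iframe_embed_collection, the method expects src attribute nodes; page_html is hypothetical, and nil selects the default ~/crawled_images directory):

require "nokogiri"

doc = Nokogiri::HTML(page_html)
store_remote_image(doc.xpath("//img/@src"), nil)   # downloads the images and returns their local paths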

#watir_web_browser(timeout) ⇒ Object

To open a Watir (Selenium WebDriver) Firefox browser with the given HTTP timeout and navigate to the base URL



# File 'lib/content_crawler/crawler_process.rb', line 27

def watir_web_browser(timeout)
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.timeout = timeout
  @browser = Watir::Browser.new :firefox, :http_client => client
  @browser.goto(@base_url)
  @browser
end
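
Example (a minimal sketch; initialize normally calls this for you when one of the Selenium crawlers is chosen):

browser = watir_web_browser(300)   # Firefox via Selenium with a 300 second HTTP timeout
browser.html                       # rendered page source, ready for parsing
close_browser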