Module: CrawlerProcess
- Included in:
- ContentCrawler::Crawler
- Defined in:
- lib/content_crawler/crawler_process.rb
Instance Method Summary collapse
-
#audio_video_collection(audio_video_detail, options = {}) ⇒ Object
To get audio video details.
-
#check_local_dir(image_store_dir) ⇒ Object
To save images in dir.
-
#close_browser ⇒ Object
Close the browser.
-
#collection_attr(collection, options) ⇒ Object
To get particular attribute.
-
#collection_links(parser_links, options = {}) ⇒ Object
To get the anchor tag details.
-
#datalist_collection(datalist_detail, options = {}) ⇒ Object
To get datalist details.
-
#iframe_embed_collection(ifrm_embd_detail, options = {}) ⇒ Object
To get iframe links.
-
#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Object
Initialize the crawler process.
-
#mechanize_parser(user_agent = nil) ⇒ Object
Mechanize parser.
-
#object_collection(object_detail, options = {}) ⇒ Object
To get object details.
-
#select_collection(select_detail, options = {}) ⇒ Object
To get select tag details.
-
#store_remote_image(image_detail, image_store_dir) ⇒ Object
To get image.
-
#watir_web_browser(timeout) ⇒ Object
Web driver watir browser, which will be opening a browser.
Instance Method Details
#audio_video_collection(audio_video_detail, options = {}) ⇒ Object
To get audio video details
112 113 114 115 116 117 118 119 120 121 |
# File 'lib/content_crawler/crawler_process.rb', line 112

# Collect src and type attributes from audio/video nodes, then filter the
# result through collection_attr according to options[:format].
# NOTE(review): the rendered source had the `options` parameter name stripped;
# restored as `options` to match the documented signature.
def audio_video_collection(audio_video_detail, options = {})
  auvid_collection = []
  audio_video_detail.each do |auvid|
    hash = {}
    hash[:src] = auvid.attributes["src"].value.strip
    hash[:type] = auvid.attributes["type"].value.strip
    auvid_collection << hash
  end
  collection_attr(auvid_collection, options)
end
#check_local_dir(image_store_dir) ⇒ Object
To save images in dir
83 84 85 86 87 88 89 |
# File 'lib/content_crawler/crawler_process.rb', line 83

# Ensure a local directory exists for storing crawled images.
# Falls back to "#{Dir.home}/crawled_images" when image_store_dir is nil,
# creates the directory if missing, and returns the directory path.
def check_local_dir(image_store_dir)
  image_store_dir = "#{Dir.home}/crawled_images" if image_store_dir.nil?
  Dir.mkdir(image_store_dir) unless Dir.exist?(image_store_dir)
  image_store_dir
end
#close_browser ⇒ Object
Close the browser
164 165 166 167 |
# File 'lib/content_crawler/crawler_process.rb', line 164

# Close the Watir browser and destroy the headless display, when present.
# Safe to call when neither was started (@browser / @headless nil).
def close_browser
  @browser.close unless @browser.nil?
  @headless.destroy unless @headless.nil?
end
#collection_attr(collection, options) ⇒ Object
To get particular attribute
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/content_crawler/crawler_process.rb', line 144

# Filter a collection of attribute hashes according to options[:format].
# Pair formats ("srcs_types", "texts_values", ...) return the whole hashes;
# "only_*" formats project a single key and drop nils; any other format
# returns the flattened, nil-free, de-duplicated collection unchanged.
# NOTE(review): the rendered source had the `options` parameter name
# stripped; restored to match the documented signature.
def collection_attr(collection, options)
  collection = [collection].flatten.compact.uniq
  case options[:format]
  when "srcs_types", "texts_values", "texts_srcs", "texts_hrefs"
    collection
  when "only_srcs"
    collection.map{|collobjt| collobjt[:src]}.compact
  when "only_types"
    collection.map{|collobjt| collobjt[:type]}.compact
  when "only_values"
    collection.map{|collobjt| collobjt[:value]}.compact
  when "only_texts"
    collection.map{|collobjt| collobjt[:text]}.compact
  when "only_hrefs"
    collection.map{|collobjt| collobjt[:href]}.compact
  else
    collection
  end
end
#collection_links(parser_links, options = {}) ⇒ Object
To get the anchor tag details
45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/content_crawler/crawler_process.rb', line 45

# Collect href and text from anchor nodes (missing values become " "),
# then filter through collection_attr per options[:format].
# NOTE(review): the rendered source had the `options` parameter name
# stripped; restored to match the documented signature.
def collection_links(parser_links, options = {})
  links = Array.new
  parser_links = [parser_links].flatten.uniq
  parser_links.each do |link|
    data = {}
    data[:href] = link.attributes["href"].nil? ? " " : link.attributes["href"].value.strip
    data[:text] = link.text.nil? ? " " : link.text.strip
    links << data
  end
  collection_attr(links, options)
end
#datalist_collection(datalist_detail, options = {}) ⇒ Object
To get datalist details
134 135 136 137 138 139 140 141 142 |
# File 'lib/content_crawler/crawler_process.rb', line 134

# Collect the value attribute from datalist option nodes, then filter
# through collection_attr per options[:format].
# NOTE(review): the rendered source had the `options` parameter name
# stripped; restored to match the documented signature.
def datalist_collection(datalist_detail, options = {})
  datalists = []
  datalist_detail.each do |datalist|
    hash = {}
    hash[:value] = datalist.attributes["value"].value.strip
    datalists << hash
  end
  collection_attr(datalists, options)
end
#iframe_embed_collection(ifrm_embd_detail, options = {}) ⇒ Object
To get iframe links
102 103 104 105 106 107 108 109 110 |
# File 'lib/content_crawler/crawler_process.rb', line 102

# Collect src values from iframe/embed attribute nodes (each node responds
# to #value), then filter through collection_attr per options[:format].
# NOTE(review): the rendered source stripped both the method name and the
# `options` parameter; the name is restored from the module's method summary
# (#iframe_embed_collection).
def iframe_embed_collection(ifrm_embd_detail, options = {})
  ifrm_embds = []
  ifrm_embd_detail.each do |ifrmembd|
    hash = {}
    hash[:src] = ifrmembd.value.strip
    ifrm_embds << hash
  end
  collection_attr(ifrm_embds, options)
end
#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Object
Initialize the crawler process
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/content_crawler/crawler_process.rb', line 11

# Initialize the crawler process for the chosen backend.
# crawler selects one of: "selenium_webdriver_with_headless",
# "selenium_webdriver_without_headless", "mechanize_parser".
# options[:timeout] is passed to the Selenium HTTP client;
# options[:user_agent] selects a Mechanize user-agent alias.
# NOTE(review): the rendered source had the `options` identifiers stripped;
# restored to match the documented signature.
def initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil})
  @base_url = base_url
  case crawler
  when "selenium_webdriver_with_headless"
    @headless = Headless.new
    @headless.start
    watir_web_browser(options[:timeout])
  when "selenium_webdriver_without_headless"
    watir_web_browser(options[:timeout])
  when "mechanize_parser"
    mechanize_parser(options[:user_agent])
  else
    puts "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) to crawl content"
  end
end
#mechanize_parser(user_agent = nil) ⇒ Object
Mechanize parser
35 36 37 38 39 40 41 42 43 |
# File 'lib/content_crawler/crawler_process.rb', line 35

# Build and memoize a Mechanize agent, optionally with a user-agent alias.
# WARNING(review): the nil-user_agent branch forces ssl_version 'SSLv3' and
# OpenSSL::SSL::VERIFY_NONE via multiple assignment — SSLv3 is obsolete and
# certificate verification is disabled; revisit before crawling untrusted
# hosts. Preserved as-is to keep behavior unchanged.
def mechanize_parser(user_agent=nil)
  if user_agent.nil?
    @agent = Mechanize.new{|a| a.ssl_version, a.verify_mode = 'SSLv3', OpenSSL::SSL::VERIFY_NONE}
  else
    @agent = Mechanize.new{|agent| agent.user_agent_alias = user_agent}
  end
  #@page = @agent.get(@base_url).parser
  @agent
end
#object_collection(object_detail, options = {}) ⇒ Object
To get object details
123 124 125 126 127 128 129 130 131 132 |
# File 'lib/content_crawler/crawler_process.rb', line 123

# Collect text and value from object nodes, then filter through
# collection_attr per options[:format].
# NOTE(review): the rendered source had the `options` parameter name
# stripped; restored to match the documented signature.
def object_collection(object_detail, options = {})
  objects = []
  object_detail.each do |object|
    hash = {}
    hash[:text] = object.text.strip
    hash[:value] = object.value.strip
    objects << hash
  end
  collection_attr(objects, options)
end
#select_collection(select_detail, options = {}) ⇒ Object
To get select tag details
91 92 93 94 95 96 97 98 99 100 |
# File 'lib/content_crawler/crawler_process.rb', line 91

# Collect text and value-attribute text from select/option nodes, then
# filter through collection_attr per options[:format].
# NOTE(review): the rendered source had the `options` parameter name
# stripped; restored to match the documented signature.
def select_collection(select_detail, options = {})
  selects = []
  select_detail.each do |select|
    hash = {}
    hash[:text] = select.text.strip
    hash[:value] = select.attributes["value"].text.strip
    selects << hash
  end
  collection_attr(selects, options)
end
#store_remote_image(image_detail, image_store_dir) ⇒ Object
To get image
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/content_crawler/crawler_process.rb', line 57

# Download each remote image referenced by image_detail into image_store_dir
# (created via check_local_dir when needed) and return the local file paths.
# Relative URLs are prefixed with @base_url; TLS verification is disabled.
# NOTE(review): the rendered source stripped the method name before
# `(image_detail, {:format => "only_srcs"})`; restored as
# iframe_embed_collection, the only visible helper that extracts :src from
# attribute nodes — confirm against the gem source.
def store_remote_image(image_detail, image_store_dir)
  image_store_dir = check_local_dir(image_store_dir)
  remote_image_urls = iframe_embed_collection(image_detail, {:format => "only_srcs"})
  local_images = []
  remote_image_urls.each do |image_url|
    image_url = "#{@base_url}#{image_url}" if not image_url.include?("http")
    url = URI.parse(image_url)
    response = Net::HTTP.get_response(url)
    if response.is_a?(Net::HTTPSuccess)
      http = Net::HTTP.new(url.host, url.port)
      http.use_ssl = true if url.scheme == "https"
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.start do
        http.request_get(url.path) do |res|
          File.open("#{image_store_dir}/#{File.basename(url.path)}",'wb') do |file|
            file.write(res.body)
          end
        end
      end
      local_image = "#{image_store_dir}/#{File.basename(url.path)}"
      local_images << local_image
    end
  end
  local_images
end
#watir_web_browser(timeout) ⇒ Object
Web driver watir browser, which will be opening a browser
27 28 29 30 31 32 33 |
# File 'lib/content_crawler/crawler_process.rb', line 27

# Open a Firefox Watir browser with the given Selenium HTTP client timeout,
# navigate to @base_url, and return the browser instance (kept in @browser).
def watir_web_browser(timeout)
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.timeout = timeout
  @browser = Watir::Browser.new :firefox, :http_client => client
  @browser.goto(@base_url)
  @browser
end