Class: ContentCrawler::Crawler
- Inherits:
-
Object
- Object
- ContentCrawler::Crawler
show all
- Includes:
- CrawlerProcess
- Defined in:
- lib/content_crawler.rb
Instance Method Summary
collapse
-
#close_browser ⇒ Object
-
#get_audio_video_elements(xpath = nil, options = {}) ⇒ Object
-
#get_datalist_elements(xpath = nil, options = {}) ⇒ Object
-
#get_iframe_embed_elements(xpath = nil, options = {}) ⇒ Object
-
#get_link_elements(xpath = nil, options = {}) ⇒ Object
-
#get_object_elements(xpath = nil, options = {}) ⇒ Object
-
#get_parser_page(crawl_url = nil) ⇒ Object
-
#get_remote_image(xpath = nil, image_store_dir = nil) ⇒ Object
-
#get_select_elements(xpath = nil, options = {}) ⇒ Object
-
#get_simple_text(xpath = nil) ⇒ Object
-
#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Crawler
constructor
A new instance of Crawler.
Methods included from CrawlerProcess: #audio_video_collection, #check_local_dir, #collection_attr, #collection_links, #datalist_collection, #iframe_embed_collection, #mechanize_parser, #object_collection, #select_collection, #store_remote_image, #watir_web_browser
Constructor Details
#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Crawler
Returns a new instance of Crawler.
13
14
15
|
# File 'lib/content_crawler.rb', line 13
# Creates a Crawler by delegating construction to the inherited initializer.
#
# The default options hash is only used when the caller omits +options+
# entirely; a caller-supplied hash is forwarded as given and is not merged
# with the defaults here.
#
# @param crawler forwarded unchanged (accepted values are decided by the
#   inherited initializer, which is not visible here)
# @param base_url forwarded unchanged
# @param options [Hash] forwarded unchanged; defaults to
#   +{:timeout=>300, :user_agent=>nil}+
def initialize(crawler, base_url, options={:timeout=>300, :user_agent=>nil})
# Bare super re-sends crawler, base_url and options exactly as received.
super
end
|
Instance Method Details
#close_browser ⇒ Object
60
61
62
|
# File 'lib/content_crawler.rb', line 60
# Delegates to the inherited +close_browser+ and returns whatever it returns.
# NOTE(review): presumably the inherited method shuts down the browser session
# held in @browser — confirm against the included module.
def close_browser
# Bare super: no arguments exist, so none are forwarded.
super
end
|
#get_audio_video_elements(xpath = nil, options = {}) ⇒ Object
48
49
50
|
# File 'lib/content_crawler.rb', line 48
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +audio_video_collection+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +audio_video_collection+
# @return [Object, nil] the result of +audio_video_collection+, or nil when
#   +xpath+ is nil
def get_audio_video_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  audio_video_collection(matched_nodes, options)
end
|
#get_datalist_elements(xpath = nil, options = {}) ⇒ Object
56
57
58
|
# File 'lib/content_crawler.rb', line 56
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +datalist_collection+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +datalist_collection+
# @return [Object, nil] the result of +datalist_collection+, or nil when
#   +xpath+ is nil
def get_datalist_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  datalist_collection(matched_nodes, options)
end
|
#get_iframe_embed_elements(xpath = nil, options = {}) ⇒ Object
44
45
46
|
# File 'lib/content_crawler.rb', line 44
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +iframe_embed_collection+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +iframe_embed_collection+
# @return [Object, nil] the result of +iframe_embed_collection+, or nil when
#   +xpath+ is nil
def get_iframe_embed_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  iframe_embed_collection(matched_nodes, options)
end
|
#get_link_elements(xpath = nil, options = {}) ⇒ Object
32
33
34
|
# File 'lib/content_crawler.rb', line 32
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +collection_links+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +collection_links+
# @return [Object, nil] the result of +collection_links+, or nil when
#   +xpath+ is nil
def get_link_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  collection_links(matched_nodes, options)
end
|
#get_object_elements(xpath = nil, options = {}) ⇒ Object
52
53
54
|
# File 'lib/content_crawler.rb', line 52
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +object_collection+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +object_collection+
# @return [Object, nil] the result of +object_collection+, or nil when
#   +xpath+ is nil
def get_object_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  object_collection(matched_nodes, options)
end
|
#get_parser_page(crawl_url = nil) ⇒ Object
17
18
19
20
21
22
23
24
25
26
|
# File 'lib/content_crawler.rb', line 17
# Loads +crawl_url+ with whichever parser is configured and stores the parsed
# document in @page.
#
# When @browser is set it takes precedence: the browser navigates to the URL
# and its HTML is parsed with Nokogiri. Otherwise, when @agent is set, the
# page is fetched through the agent and its parser is used.
#
# @param crawl_url [String, nil] address of the page to load
# @return [Object, String] the document assigned to @page, or a usage message
#   (a String, not an exception) when +crawl_url+ is nil or neither @browser
#   nor @agent is set
def get_parser_page(crawl_url=nil)
  # Nothing can be fetched without a URL and at least one configured parser.
  if crawl_url.nil? || (@browser.nil? && @agent.nil?)
    return "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content"
  end

  if @browser.nil?
    @page = @agent.get(crawl_url).parser
  else
    @browser.goto(crawl_url)
    @page = Nokogiri::HTML(@browser.html)
  end
end
|
#get_remote_image(xpath = nil, image_store_dir = nil) ⇒ Object
36
37
38
|
# File 'lib/content_crawler.rb', line 36
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +image_store_dir+, to +store_remote_image+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param image_store_dir [Object, nil] passed through untouched to
#   +store_remote_image+
# @return [Object, nil] the result of +store_remote_image+, or nil when
#   +xpath+ is nil
def get_remote_image(xpath=nil, image_store_dir=nil)
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  store_remote_image(matched_nodes, image_store_dir)
end
|
#get_select_elements(xpath = nil, options = {}) ⇒ Object
40
41
42
|
# File 'lib/content_crawler.rb', line 40
# Evaluates +xpath+ against the current page (@page) and hands the matched
# nodes, together with +options+, to +select_collection+.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @param options [Hash] passed through untouched to +select_collection+
# @return [Object, nil] the result of +select_collection+, or nil when
#   +xpath+ is nil
def get_select_elements(xpath=nil, options={})
  return if xpath.nil?

  matched_nodes = @page.xpath(xpath)
  select_collection(matched_nodes, options)
end
|
#get_simple_text(xpath = nil) ⇒ Object
28
29
30
|
# File 'lib/content_crawler.rb', line 28
# Returns the text of the nodes matched by +xpath+ on the current page
# (@page), with leading and trailing whitespace removed.
#
# @param xpath [String, nil] XPath expression; when nil nothing is queried
# @return [String, nil] the stripped text, or nil when +xpath+ is nil
def get_simple_text(xpath=nil)
  return if xpath.nil?

  raw_text = @page.xpath(xpath).text
  raw_text.strip
end
|