Class: ContentCrawler::Crawler

Inherits:

Object

Object
ContentCrawler::Crawler

show all

Includes: CrawlerProcess

Defined in: lib/content_crawler.rb

Instance Method Summary collapse

Methods included from CrawlerProcess

#audio_video_collection, #check_local_dir, #collection_attr, #collection_links, #datalist_collection, #iframe_embed_collection, #mechanize_parser, #object_collection, #select_collection, #store_remote_image, #watir_web_browser

Constructor Details

#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ `Crawler`

Returns a new instance of Crawler.



13
14
15

# File 'lib/content_crawler.rb', line 13

# Builds a Crawler by handing every argument, unchanged, to the next
# +initialize+ in the ancestor chain (not visible in this file section).
#
# @param crawler  parser/driver selector, forwarded as-is
# @param base_url base URL, forwarded as-is
# @param options [Hash] forwarded as-is; when omitted it defaults to
#   +{:timeout=>300, :user_agent=>nil}+
def initialize(crawler, base_url, options={:timeout=>300, :user_agent=>nil})
    # Explicit argument list; passes the same three values a bare `super` would.
    super(crawler, base_url, options)
end

Instance Method Details

#close_browser ⇒ `Object`



60
61
62

# File 'lib/content_crawler.rb', line 60

# Delegates to the +close_browser+ implementation further up the ancestor
# chain (not visible in this file section) and returns whatever it returns.
def close_browser
    # No parameters here, so `super()` forwards exactly what bare `super` did.
    super()
end

#get_audio_video_elements(xpath = nil, options = {}) ⇒ `Object`



48
49
50

# File 'lib/content_crawler.rb', line 48

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +audio_video_collection+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +audio_video_collection+
# @return the result of +audio_video_collection+, or nil when +xpath+ is nil
def get_audio_video_elements(xpath=nil, options={})
    # Nothing to look up without an expression.
    return if xpath.nil?

    audio_video_collection(@page.xpath(xpath), options)
end

#get_datalist_elements(xpath = nil, options = {}) ⇒ `Object`



56
57
58

# File 'lib/content_crawler.rb', line 56

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +datalist_collection+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +datalist_collection+
# @return the result of +datalist_collection+, or nil when +xpath+ is nil
def get_datalist_elements(xpath=nil, options={})
    unless xpath.nil?
        matched_nodes = @page.xpath(xpath)
        datalist_collection(matched_nodes, options)
    end
end

#get_iframe_embed_elements(xpath = nil, options = {}) ⇒ `Object`



44
45
46

# File 'lib/content_crawler.rb', line 44

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +iframe_embed_collection+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +iframe_embed_collection+
# @return the result of +iframe_embed_collection+, or nil when +xpath+ is nil
def get_iframe_embed_elements(xpath=nil, options={})
    # Bail out early when no expression was supplied.
    return nil if xpath.nil?

    iframe_embed_collection(@page.xpath(xpath), options)
end

#get_link_elements(xpath = nil, options = {}) ⇒ `Object`



32
33
34

# File 'lib/content_crawler.rb', line 32

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +collection_links+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +collection_links+
# @return the result of +collection_links+, or nil when +xpath+ is nil
def get_link_elements(xpath=nil, options={})
    if xpath.nil?
        nil
    else
        link_nodes = @page.xpath(xpath)
        collection_links(link_nodes, options)
    end
end

#get_object_elements(xpath = nil, options = {}) ⇒ `Object`



52
53
54

# File 'lib/content_crawler.rb', line 52

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +object_collection+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +object_collection+
# @return the result of +object_collection+, or nil when +xpath+ is nil
def get_object_elements(xpath=nil, options={})
    return if xpath.nil?

    object_nodes = @page.xpath(xpath)
    object_collection(object_nodes, options)
end

#get_parser_page(crawl_url = nil) ⇒ `Object`

# File 'lib/content_crawler.rb', line 17

# Loads +crawl_url+ and stores the parsed document in @page.
#
# Resolution order:
# 1. If @browser is set, navigate it to the URL and parse its HTML with
#    Nokogiri.
# 2. Otherwise, if @agent is set, fetch the URL through it and keep the
#    response's +parser+.
# 3. Otherwise (or whenever +crawl_url+ is nil) nothing is loaded and a
#    help message String is RETURNED (not raised).
#
# @param crawl_url URL to load; nil yields the help message
# @return the new value of @page on success, otherwise the help String
def get_parser_page(crawl_url=nil)
    usage_hint = "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) and pass the crawl_url to crawl content"

    # Every loading branch needs a URL, so handle its absence once up front.
    return usage_hint if crawl_url.nil?

    # The browser takes precedence over the agent when both are present.
    unless @browser.nil?
        @browser.goto(crawl_url)
        return @page = Nokogiri::HTML(@browser.html)
    end

    return usage_hint if @agent.nil?

    @page = @agent.get(crawl_url).parser
end

#get_remote_image(xpath = nil, image_store_dir = nil) ⇒ `Object`



36
37
38

# File 'lib/content_crawler.rb', line 36

# Runs +xpath+ against the current @page and passes the matches, together
# with +image_store_dir+, to +store_remote_image+.
#
# @param xpath           XPath expression given to +@page.xpath+; nil means
#   "do nothing"
# @param image_store_dir forwarded untouched to +store_remote_image+
#   (nil is passed through as nil)
# @return the result of +store_remote_image+, or nil when +xpath+ is nil
def get_remote_image(xpath=nil, image_store_dir=nil)
    return if xpath.nil?

    image_nodes = @page.xpath(xpath)
    store_remote_image(image_nodes, image_store_dir)
end

#get_select_elements(xpath = nil, options = {}) ⇒ `Object`



40
41
42

# File 'lib/content_crawler.rb', line 40

# Runs +xpath+ against the current @page and passes the matches, together
# with +options+, to +select_collection+.
#
# @param xpath   XPath expression given to +@page.xpath+; nil means "do nothing"
# @param options [Hash] forwarded untouched to +select_collection+
# @return the result of +select_collection+, or nil when +xpath+ is nil
def get_select_elements(xpath=nil, options={})
    unless xpath.nil?
        select_collection(@page.xpath(xpath), options)
    end
end

#get_simple_text(xpath = nil) ⇒ `Object`



28
29
30

# File 'lib/content_crawler.rb', line 28

# Returns the text of whatever +xpath+ matches on the current @page, with
# leading and trailing whitespace stripped.
#
# @param xpath XPath expression given to +@page.xpath+; nil means "do nothing"
# @return the stripped result of +text+ on the match set, or nil when
#   +xpath+ is nil
def get_simple_text(xpath=nil)
    return if xpath.nil?

    matched_nodes = @page.xpath(xpath)
    raw_text = matched_nodes.text
    raw_text.strip
end

Class: ContentCrawler::Crawler

Instance Method Summary collapse

Methods included from CrawlerProcess

Constructor Details

#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ Crawler

Instance Method Details

#close_browser ⇒ Object

#get_audio_video_elements(xpath = nil, options = {}) ⇒ Object

#get_datalist_elements(xpath = nil, options = {}) ⇒ Object

#get_iframe_embed_elements(xpath = nil, options = {}) ⇒ Object

#get_link_elements(xpath = nil, options = {}) ⇒ Object

#get_object_elements(xpath = nil, options = {}) ⇒ Object

#get_parser_page(crawl_url = nil) ⇒ Object

#get_remote_image(xpath = nil, image_store_dir = nil) ⇒ Object

#get_select_elements(xpath = nil, options = {}) ⇒ Object

#get_simple_text(xpath = nil) ⇒ Object

#initialize(crawler, base_url, options = {:timeout=>300, :user_agent=>nil}) ⇒ `Crawler`

#close_browser ⇒ `Object`

#get_audio_video_elements(xpath = nil, options = {}) ⇒ `Object`

#get_datalist_elements(xpath = nil, options = {}) ⇒ `Object`

#get_iframe_embed_elements(xpath = nil, options = {}) ⇒ `Object`

#get_link_elements(xpath = nil, options = {}) ⇒ `Object`

#get_object_elements(xpath = nil, options = {}) ⇒ `Object`

#get_parser_page(crawl_url = nil) ⇒ `Object`

#get_remote_image(xpath = nil, image_store_dir = nil) ⇒ `Object`

#get_select_elements(xpath = nil, options = {}) ⇒ `Object`

#get_simple_text(xpath = nil) ⇒ `Object`