Class: VideoGrabber::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/video_grabber/scraper.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ Scraper

Returns a new instance of Scraper.



7
8
9
10
11
12
13
14
15
16
# File 'lib/video_grabber/scraper.rb', line 7

# Builds a scraper from a configuration object.
#
# Every setting is read once from +config+ and stored in an instance variable
# of (almost) the same name; the hash exposed by #browser_parameters is
# assembled last.
#
# @param config [#keep_browser_open, #url, #timeout, #headless_enabled,
#   #firefox_extension_path, #html_attributes, #browser] source of settings
def initialize(config)
  @keep_browser_open = config.keep_browser_open
  @url = config.url
  @timeout = config.timeout
  @headless_enabled = config.headless_enabled
  @firefox_extension_path = config.firefox_extension_path
  @html_attributes = config.html_attributes
  # Note the rename: the config calls it `browser`, the scraper `browser_type`.
  @browser_type = config.browser

  # Built after all settings above are stored. browser_http_client is defined
  # outside this block; it presumably reads some of them (e.g. @timeout).
  http_client = browser_http_client
  @browser_parameters = { http_client: http_client }
end

Instance Attribute Details

#browser ⇒ Object (readonly)

Returns the value of attribute browser.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @browser, the object that #start, #stop and #fetch_videos talk to.
# The constructor does not assign @browser, so this returns nil until another
# method sets it (presumably open_browser, which is not shown here).
def browser
  @browser
end

#browser_parameters ⇒ Object (readonly)

Returns the value of attribute browser_parameters.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @browser_parameters, which the constructor sets to a hash holding
# a single :http_client entry.
def browser_parameters
  @browser_parameters
end

#browser_type ⇒ Object (readonly)

Returns the value of attribute browser_type.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @browser_type, which the constructor copies from config.browser.
def browser_type
  @browser_type
end

#firefox_extension_path ⇒ Object (readonly)

Returns the value of attribute firefox_extension_path.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @firefox_extension_path, which the constructor copies from
# config.firefox_extension_path.
def firefox_extension_path
  @firefox_extension_path
end

#headless_enabled ⇒ Object (readonly)

Returns the value of attribute headless_enabled.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @headless_enabled, which the constructor copies from
# config.headless_enabled.
def headless_enabled
  @headless_enabled
end

#html_attributes ⇒ Object (readonly)

Returns the value of attribute html_attributes.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @html_attributes, which the constructor copies from
# config.html_attributes.
def html_attributes
  @html_attributes
end

#keep_browser_open ⇒ Object (readonly)

Returns the value of attribute keep_browser_open.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @keep_browser_open (copied from config.keep_browser_open).
# #fetch_videos calls #stop after scraping unless this value is truthy.
def keep_browser_open
  @keep_browser_open
end

#timeout ⇒ Object (readonly)

Returns the value of attribute timeout.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @timeout, which the constructor copies from config.timeout.
def timeout
  @timeout
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



4
5
6
# File 'lib/video_grabber/scraper.rb', line 4

# Reader for @url (copied from config.url); #start passes it to browser.goto.
def url
  @url
end

Instance Method Details

#fetch_videos ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/video_grabber/scraper.rb', line 30

# Collects the markup of every <video> element found on the current page.
#
# Three sources are merged, in this order:
# 1. the elements returned by browser.videos;
# 2. <video> nodes found after HTML-unescaping the markup of each <iframe>;
# 3. a plain-text scan of the HTML-unescaped page source, which also catches
#    elements that only exist inside escaped markup.
#
# Each result has its whitespace collapsed to single spaces; blank entries and
# duplicates are dropped. Unless #keep_browser_open is truthy the browser is
# closed (via #stop) once the page has been read.
#
# @return [Array<String>, Object] whatever add_attributes returns when that is
#   truthy, otherwise the list of <video> markup strings
# @raise [::VideoGrabber::BrowserIsClosed] when Watir raises while talking to
#   the browser
def fetch_videos
  closing_tag = '</video>'

  links_list = browser.videos.map(&:html)

  # Read the page source once so both scans below work on the same snapshot.
  page_html = browser.html

  iframe_videos = ::Nokogiri::HTML(page_html).xpath('//iframe').flat_map do |iframe_node|
    ::Nokogiri::HTML(::CGI.unescapeHTML(iframe_node.to_s)).xpath('.//video').map(&:to_s)
  end

  # Text scan: every chunk after a '<video' marker that still contains a
  # closing tag is cut right after its LAST '</video>'.
  candidate_chunks = ::CGI.unescapeHTML(page_html).split('<video').select { |chunk| chunk.include?(closing_tag) }
  raw_videos = candidate_chunks.map do |chunk|
    # Re-attach exactly the delimiter removed by split (no extra space), so
    # '<video>' is not turned into '<video >'.
    fragment = "<video#{chunk}"
    # rindex instead of split/join: String#split drops trailing empty fields,
    # which used to reduce a chunk ending in '</video>' to a bare '</video>'.
    fragment[0, fragment.rindex(closing_tag) + closing_tag.length]
  end

  stop unless keep_browser_open

  links_list = (links_list + iframe_videos + raw_videos)
               .map { |markup| markup.split.join(' ') }
               .reject(&:empty?)
               .uniq

  add_attributes(links_list) || links_list
rescue ::Watir::Exception::Error
  raise ::VideoGrabber::BrowserIsClosed, 'Please restart the scraper (scraper_instance.start), or keep the browser open'
end

#start ⇒ Object



18
19
20
21
22
23
24
# File 'lib/video_grabber/scraper.rb', line 18

# Opens the browser and navigates it to #url.
#
# @return [self] the scraper itself, so calls can be chained
# @raise [::VideoGrabber::TimeOut] when a ::Net::ReadTimeout occurs while
#   opening the browser or loading the page; #stop is called first
def start
  begin
    open_browser
    browser.goto(url)
  rescue ::Net::ReadTimeout
    # Close the browser before reporting the timeout to the caller.
    stop
    raise ::VideoGrabber::TimeOut
  end

  self
end

#stop ⇒ Object



26
27
28
# File 'lib/video_grabber/scraper.rb', line 26

# Closes the browser, if there is one.
#
# @browser is not assigned by the constructor, so #stop can be reached before
# any browser exists (for example from #start's timeout handler). In that case
# this is a no-op instead of a NoMethodError on nil.
#
# @return [Object, nil] the result of browser.close, or nil when no browser
#   has been set
def stop
  browser.close if browser
end