Class: WebScrapingAI::HtmlApi

Inherits:
Object
  • Object
show all
Defined in:
lib/webscraping_ai/api/html_api.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(api_client = ApiClient.default) ⇒ HtmlApi

Returns a new instance of HtmlApi.



19
20
21
# File 'lib/webscraping_ai/api/html_api.rb', line 19

# Creates the API object and stores the client in @api_client.
#
# @param api_client the client object to keep; when the argument is omitted,
#   the result of ApiClient.default is used instead
def initialize(api_client = ApiClient.default)
  @api_client = api_client
end

Instance Attribute Details

#api_client ⇒ Object

Returns the value of attribute api_client.



17
18
19
# File 'lib/webscraping_ai/api/html_api.rb', line 17

# Reader for the @api_client instance variable.
#
# @return [Object] the value currently held in @api_client
def api_client
  @api_client
end

Instance Method Details

#get_page(url, opts = {}) ⇒ ScrappedPage

Get page HTML by URL (renders JS in Chrome and uses rotating proxies)

Parameters:

  • url (String)

    URL of the page to get

  • opts (Hash) (defaults to: {})

    the optional parameters

Options Hash (opts):

  • :selector (String)

    CSS selector to get a part of the page (null by default, returns whole page HTML)

  • :outer_html (Boolean)

    Return outer HTML of the selected element (false by default, returns inner HTML)

  • :proxy (String)

    Proxy country code, for geotargeting (US by default)

  • :disable_js (Boolean)

    Disable JS execution (false by default)

  • :inline_css (Boolean)

    Inline included CSS files to make page viewable on other domains (false by default)

Returns:

  • (ScrappedPage)


31
32
33
34
# File 'lib/webscraping_ai/api/html_api.rb', line 31

# Get page HTML by URL.
#
# Delegates to #get_page_with_http_info with the same arguments and keeps
# only the first element of its result, dropping the status code and headers.
#
# @param url [String] URL of the page to get
# @param opts [Hash] the optional parameters, passed through unchanged
# @option opts [String] :selector CSS selector to get a part of the page
# @option opts [Boolean] :outer_html Return outer HTML of the selected element
# @option opts [String] :proxy Proxy country code, for geotargeting
# @option opts [Boolean] :disable_js Disable JS execution
# @option opts [Boolean] :inline_css Inline included CSS files
# @return [ScrappedPage]
def get_page(url, opts = {})
  # Single-target destructuring: take the first element, ignore the rest.
  page, = get_page_with_http_info(url, opts)
  page
end

#get_page_with_http_info(url, opts = {}) ⇒ Array<(ScrappedPage, Integer, Hash)>

Get page HTML by URL (renders JS in Chrome and uses rotating proxies)

Parameters:

  • url (String)

    URL of the page to get

  • opts (Hash) (defaults to: {})

    the optional parameters

Options Hash (opts):

  • :selector (String)

    CSS selector to get a part of the page (null by default, returns whole page HTML)

  • :outer_html (Boolean)

    Return outer HTML of the selected element (false by default, returns inner HTML)

  • :proxy (String)

    Proxy country code, for geotargeting (US by default)

  • :disable_js (Boolean)

    Disable JS execution (false by default)

  • :inline_css (Boolean)

    Inline included CSS files to make page viewable on other domains (false by default)

Returns:

  • (Array<(ScrappedPage, Integer, Hash)>)

    ScrappedPage data, response status code and response headers



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/webscraping_ai/api/html_api.rb', line 45

# Get page HTML by URL, returning the data together with the HTTP status
# code and response headers.
#
# Issues a GET on '/' through @api_client.call_api. Caller-supplied
# :query_params and :header_params hashes are copied before this method adds
# to them, so the caller's objects are never modified.
#
# @param url [String] URL of the page to get
# @param opts [Hash] the optional parameters
# @option opts [String] :selector CSS selector to get a part of the page
# @option opts [Boolean] :outer_html Return outer HTML of the selected element
# @option opts [String] :proxy Proxy country code, for geotargeting
# @option opts [Boolean] :disable_js Disable JS execution
# @option opts [Boolean] :inline_css Inline included CSS files
# @option opts [Hash] :query_params extra query parameters to send
# @option opts [Hash] :header_params extra request headers to send
# @option opts [Hash] :form_params form parameters (defaults to {})
# @option opts [Object] :body request body (defaults to nil)
# @option opts [String] :return_type return type name (defaults to 'ScrappedPage')
# @option opts [Array<String>] :auth_names auth scheme names (defaults to ['api_key'])
# @return [Array<(ScrappedPage, Integer, Hash)>] ScrappedPage data, response
#   status code and response headers
# @raise [ArgumentError] if client-side validation is enabled and url is nil
def get_page_with_http_info(url, opts = {})
  if @api_client.config.debugging
    @api_client.config.logger.debug 'Calling API: HtmlApi.get_page ...'
  end

  # verify the required parameter 'url' is set
  if @api_client.config.client_side_validation && url.nil?
    raise ArgumentError, "Missing the required parameter 'url' when calling HtmlApi.get_page"
  end

  # query parameters -- work on a copy so a hash passed in by the caller is
  # left untouched (and a frozen one does not raise)
  query_params = (opts[:query_params] || {}).dup
  query_params[:url] = url
  # nil check (not truthiness) so an explicit `false` is still forwarded
  %i[selector outer_html proxy disable_js inline_css].each do |name|
    query_params[name] = opts[name] unless opts[name].nil?
  end

  # header parameters -- copied for the same reason as query_params
  header_params = (opts[:header_params] || {}).dup
  # HTTP header 'Accept' (if needed)
  header_params['Accept'] = @api_client.select_header_accept(['application/json'])

  new_options = opts.merge(
    :header_params => header_params,
    :query_params => query_params,
    :form_params => opts[:form_params] || {},
    :body => opts[:body],
    :auth_names => opts[:auth_names] || ['api_key'],
    :return_type => opts[:return_type] || 'ScrappedPage'
  )

  data, status_code, headers = @api_client.call_api(:GET, '/', new_options)
  if @api_client.config.debugging
    @api_client.config.logger.debug "API called: HtmlApi#get_page\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
  end
  [data, status_code, headers]
end