Class: Carbon::UtilitiesApi

Inherits:
Object
  • Object
show all
Defined in:
lib/carbon_ruby_sdk/api/utilities_api.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(api_client = ApiClient.default) ⇒ UtilitiesApi

Returns a new instance of UtilitiesApi.



15
16
17
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 15

# Builds a new UtilitiesApi bound to the given API client.
#
# @param api_client [ApiClient] client used to issue HTTP requests
#   (defaults to the shared ApiClient.default instance)
def initialize(api_client = ApiClient.default)
  # Keep a reference so every endpoint helper can reuse the same client.
  @api_client = api_client
end

Instance Attribute Details

#api_clientObject

Returns the value of attribute api_client.



13
14
15
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 13

# Reader for the API client this instance was constructed with.
#
# @return [Object] the value of attribute api_client
def api_client
  @api_client
end

Instance Method Details

#fetch_urls(url:, extra: {}) ⇒ Object

Fetch Urls

Extracts all URLs from a webpage.

Args:

url (str): URL of the webpage

Returns:

FetchURLsResponse: A response object with a list of URLs extracted from the webpage and the webpage content.

Parameters:

  • url (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



31
32
33
34
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 31

# Fetch Urls
#
# Extracts all URLs from a webpage.
#
# @param url [String] URL of the webpage
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def fetch_urls(url:, extra: {})
  fetch_urls_with_http_info_impl(url, extra).data
end

#fetch_urls_with_http_info(url:, extra: {}) ⇒ Object

Fetch Urls

Extracts all URLs from a webpage.

Args:

url (str): URL of the webpage

Returns:

FetchURLsResponse: A response object with a list of URLs extracted from the webpage and the webpage content.

Parameters:

  • url (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



48
49
50
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 48

# Fetch Urls
#
# Extracts all URLs from a webpage, returning the full HTTP response
# wrapper (status, headers, data) rather than just the data.
#
# @param url [String] URL of the webpage
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def fetch_urls_with_http_info(url:, extra: {})
  # Delegate straight to the generated implementation.
  fetch_urls_with_http_info_impl(url, extra)
end

#fetch_webpage(url:, extra: {}) ⇒ Object

Fetch Urls V2

Parameters:

  • url (String)
  • body (FetchURLsRequest)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



122
123
124
125
126
127
128
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 122

# Fetch Urls V2
#
# @param url [String] URL of the webpage to fetch
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def fetch_webpage(url:, extra: {})
  request_body = {}
  # Only include the field when the caller actually supplied it.
  request_body[:url] = url unless url == SENTINEL
  fetch_webpage_with_http_info_impl(request_body, extra).data
end

#fetch_webpage_with_http_info(url:, extra: {}) ⇒ Object

Fetch Urls V2

Parameters:

  • url (String)
  • body (FetchURLsRequest)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



135
136
137
138
139
140
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 135

# Fetch Urls V2
#
# Same as #fetch_webpage but returns the full HTTP response wrapper.
#
# @param url [String] URL of the webpage to fetch
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def fetch_webpage_with_http_info(url:, extra: {})
  request_body = {}
  # Only include the field when the caller actually supplied it.
  request_body[:url] = url unless url == SENTINEL
  fetch_webpage_with_http_info_impl(request_body, extra)
end

#fetch_youtube_transcripts(id:, raw: false, extra: {}) ⇒ Object

Fetch Youtube Transcripts

Fetches english transcripts from YouTube videos.

Args:

id (str): The ID of the YouTube video. 
raw (bool): Whether to return the raw transcript or not. Defaults to False.

Returns:

dict: A dictionary with the transcript of the YouTube video.

Parameters:

  • id (String)
  • raw (Boolean) (defaults to: false)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



223
224
225
226
227
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 223

# Fetch Youtube Transcripts
#
# Fetches english transcripts from YouTube videos.
#
# @param id [String] the ID of the YouTube video
# @param raw [Boolean] whether to return the raw transcript (defaults to false)
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def fetch_youtube_transcripts(id:, raw: false, extra: {})
  # Use a non-mutating merge: the original `extra[:raw] = raw` modified the
  # caller-supplied hash in place, leaking the :raw key back to the caller.
  extra = extra.merge(raw: raw) if raw != SENTINEL
  api_response = fetch_youtube_transcripts_with_http_info_impl(id, extra)
  api_response.data
end

#fetch_youtube_transcripts_with_http_info(id:, raw: false, extra: {}) ⇒ Object

Fetch Youtube Transcripts

Fetches english transcripts from YouTube videos.

Args:

id (str): The ID of the YouTube video. 
raw (bool): Whether to return the raw transcript or not. Defaults to False.

Returns:

dict: A dictionary with the transcript of the YouTube video.

Parameters:

  • id (String)
  • raw (Boolean) (defaults to: false)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



243
244
245
246
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 243

# Fetch Youtube Transcripts
#
# Fetches english transcripts from YouTube videos, returning the full
# HTTP response wrapper.
#
# @param id [String] the ID of the YouTube video
# @param raw [Boolean] whether to return the raw transcript (defaults to false)
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def fetch_youtube_transcripts_with_http_info(id:, raw: false, extra: {})
  # Use a non-mutating merge: the original `extra[:raw] = raw` modified the
  # caller-supplied hash in place, leaking the :raw key back to the caller.
  extra = extra.merge(raw: raw) if raw != SENTINEL
  fetch_youtube_transcripts_with_http_info_impl(id, extra)
end

#process_sitemap(url:, extra: {}) ⇒ Object

Sitemap

Retrieves all URLs from a sitemap, which can subsequently be utilized with our `web_scrape` endpoint.

<!--Args:

url (str): URL of the sitemap

Returns:

dict: A dictionary with a list of URLs extracted from the sitemap.-->

Parameters:

  • url (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



328
329
330
331
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 328

# Sitemap
#
# Retrieves all URLs from a sitemap, which can subsequently be used with
# the `web_scrape` endpoint.
#
# @param url [String] URL of the sitemap
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def process_sitemap(url:, extra: {})
  process_sitemap_with_http_info_impl(url, extra).data
end

#process_sitemap_with_http_info(url:, extra: {}) ⇒ Object

Sitemap

Retrieves all URLs from a sitemap, which can subsequently be utilized with our `web_scrape` endpoint.

<!--Args:

url (str): URL of the sitemap

Returns:

dict: A dictionary with a list of URLs extracted from the sitemap.-->

Parameters:

  • url (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



345
346
347
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 345

# Sitemap
#
# Same as #process_sitemap but returns the full HTTP response wrapper.
#
# @param url [String] URL of the sitemap
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def process_sitemap_with_http_info(url:, extra: {})
  # Delegate straight to the generated implementation.
  process_sitemap_with_http_info_impl(url, extra)
end

#scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: SENTINEL, url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, generate_chunks_only: false, store_file_only: false, use_premium_proxies: false, extra: {}) ⇒ Object

Scrape Sitemap

Extracts all URLs from a sitemap and performs a web scrape on each of them.

Args:

sitemap_url (str): URL of the sitemap

Returns:

dict: A response object with the status of the scraping job message.

Parameters:

  • url (String)
  • tags (Hash<String, Tags1>) (defaults to: SENTINEL)
  • max_pages_to_scrape (Integer) (defaults to: SENTINEL)
  • chunk_size (Integer) (defaults to: 1500)
  • chunk_overlap (Integer) (defaults to: 20)
  • skip_embedding_generation (Boolean) (defaults to: false)
  • enable_auto_sync (Boolean) (defaults to: false)
  • generate_sparse_vectors (Boolean) (defaults to: false)
  • prepend_filename_to_chunks (Boolean) (defaults to: false)
  • html_tags_to_skip (Array<String>) (defaults to: SENTINEL)
  • css_classes_to_skip (Array<String>) (defaults to: SENTINEL)
  • css_selectors_to_skip (Array<String>) (defaults to: SENTINEL)
  • embedding_model (EmbeddingGenerators) (defaults to: SENTINEL)
  • url_paths_to_include (Array<String>) (defaults to: SENTINEL)

    URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input

  • url_paths_to_exclude (Array<String>) (defaults to: SENTINEL)

    URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input

  • urls_to_scrape (Array<String>) (defaults to: SENTINEL)

    You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.

  • download_css_and_media (Boolean) (defaults to: false)

    Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.

  • generate_chunks_only (Boolean) (defaults to: false)

    If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.

  • store_file_only (Boolean) (defaults to: false)

    If this flag is enabled, the file will be stored with Carbon, but no processing will be done.

  • use_premium_proxies (Boolean) (defaults to: false)

    If the default proxies are blocked and not returning results, this flag can be enabled to use alternate proxies (residential and office). Scrapes might take longer to finish with this flag enabled.

  • body (SitemapScrapeRequest)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 446

# Scrape Sitemap
#
# Extracts all URLs from a sitemap and performs a web scrape on each of them.
#
# @param url [String] URL of the sitemap
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def scrape_sitemap(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: SENTINEL, url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, generate_chunks_only: false, store_file_only: false, use_premium_proxies: false, extra: {})
  # Collect every candidate field (in the same key order as before), then
  # drop the ones the caller left at the SENTINEL placeholder.
  candidate_fields = {
    tags: tags,
    url: url,
    max_pages_to_scrape: max_pages_to_scrape,
    chunk_size: chunk_size,
    chunk_overlap: chunk_overlap,
    skip_embedding_generation: skip_embedding_generation,
    enable_auto_sync: enable_auto_sync,
    generate_sparse_vectors: generate_sparse_vectors,
    prepend_filename_to_chunks: prepend_filename_to_chunks,
    html_tags_to_skip: html_tags_to_skip,
    css_classes_to_skip: css_classes_to_skip,
    css_selectors_to_skip: css_selectors_to_skip,
    embedding_model: embedding_model,
    url_paths_to_include: url_paths_to_include,
    url_paths_to_exclude: url_paths_to_exclude,
    urls_to_scrape: urls_to_scrape,
    download_css_and_media: download_css_and_media,
    generate_chunks_only: generate_chunks_only,
    store_file_only: store_file_only,
    use_premium_proxies: use_premium_proxies
  }
  sitemap_scrape_request = candidate_fields.reject { |_field, value| value == SENTINEL }
  scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra).data
end

#scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: SENTINEL, url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, generate_chunks_only: false, store_file_only: false, use_premium_proxies: false, extra: {}) ⇒ Object

Scrape Sitemap

Extracts all URLs from a sitemap and performs a web scrape on each of them.

Args:

sitemap_url (str): URL of the sitemap

Returns:

dict: A response object with the status of the scraping job message.

Parameters:

  • url (String)
  • tags (Hash<String, Tags1>) (defaults to: SENTINEL)
  • max_pages_to_scrape (Integer) (defaults to: SENTINEL)
  • chunk_size (Integer) (defaults to: 1500)
  • chunk_overlap (Integer) (defaults to: 20)
  • skip_embedding_generation (Boolean) (defaults to: false)
  • enable_auto_sync (Boolean) (defaults to: false)
  • generate_sparse_vectors (Boolean) (defaults to: false)
  • prepend_filename_to_chunks (Boolean) (defaults to: false)
  • html_tags_to_skip (Array<String>) (defaults to: SENTINEL)
  • css_classes_to_skip (Array<String>) (defaults to: SENTINEL)
  • css_selectors_to_skip (Array<String>) (defaults to: SENTINEL)
  • embedding_model (EmbeddingGenerators) (defaults to: SENTINEL)
  • url_paths_to_include (Array<String>) (defaults to: SENTINEL)

    URL subpaths or directories that you want to include. For example if you want to only include URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input

  • url_paths_to_exclude (Array<String>) (defaults to: SENTINEL)

    URL subpaths or directories that you want to exclude. For example if you want to exclude URLs that start with /questions in stackoverflow.com, you will add /questions/ in this input

  • urls_to_scrape (Array<String>) (defaults to: SENTINEL)

    You can submit a subset of URLs from the sitemap that should be scraped. To get the list of URLs, you can check out /process_sitemap endpoint. If left empty, all URLs from the sitemap will be scraped.

  • download_css_and_media (Boolean) (defaults to: false)

    Whether the scraper should download css and media from the page (images, fonts, etc). Scrapes might take longer to finish with this flag enabled, but the success rate is improved.

  • generate_chunks_only (Boolean) (defaults to: false)

    If this flag is enabled, the file will be chunked and stored with Carbon, but no embeddings will be generated. This overrides the skip_embedding_generation flag.

  • store_file_only (Boolean) (defaults to: false)

    If this flag is enabled, the file will be stored with Carbon, but no processing will be done.

  • use_premium_proxies (Boolean) (defaults to: false)

    If the default proxies are blocked and not returning results, this flag can be enabled to use alternate proxies (residential and office). Scrapes might take longer to finish with this flag enabled.

  • body (SitemapScrapeRequest)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 505

# Scrape Sitemap
#
# Extracts all URLs from a sitemap and performs a web scrape on each of
# them, returning the full HTTP response wrapper.
#
# @param url [String] URL of the sitemap
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def scrape_sitemap_with_http_info(url:, tags: SENTINEL, max_pages_to_scrape: SENTINEL, chunk_size: 1500, chunk_overlap: 20, skip_embedding_generation: false, enable_auto_sync: false, generate_sparse_vectors: false, prepend_filename_to_chunks: false, html_tags_to_skip: SENTINEL, css_classes_to_skip: SENTINEL, css_selectors_to_skip: SENTINEL, embedding_model: SENTINEL, url_paths_to_include: SENTINEL, url_paths_to_exclude: SENTINEL, urls_to_scrape: SENTINEL, download_css_and_media: false, generate_chunks_only: false, store_file_only: false, use_premium_proxies: false, extra: {})
  # Collect every candidate field (in the same key order as before), then
  # drop the ones the caller left at the SENTINEL placeholder.
  candidate_fields = {
    tags: tags,
    url: url,
    max_pages_to_scrape: max_pages_to_scrape,
    chunk_size: chunk_size,
    chunk_overlap: chunk_overlap,
    skip_embedding_generation: skip_embedding_generation,
    enable_auto_sync: enable_auto_sync,
    generate_sparse_vectors: generate_sparse_vectors,
    prepend_filename_to_chunks: prepend_filename_to_chunks,
    html_tags_to_skip: html_tags_to_skip,
    css_classes_to_skip: css_classes_to_skip,
    css_selectors_to_skip: css_selectors_to_skip,
    embedding_model: embedding_model,
    url_paths_to_include: url_paths_to_include,
    url_paths_to_exclude: url_paths_to_exclude,
    urls_to_scrape: urls_to_scrape,
    download_css_and_media: download_css_and_media,
    generate_chunks_only: generate_chunks_only,
    store_file_only: store_file_only,
    use_premium_proxies: use_premium_proxies
  }
  sitemap_scrape_request = candidate_fields.reject { |_field, value| value == SENTINEL }
  scrape_sitemap_with_http_info_impl(sitemap_scrape_request, extra)
end

#scrape_web(body:, extra: {}) ⇒ Object

Web Scrape

Conduct a web scrape on a given webpage URL. Our web scraper is fully compatible with JavaScript and supports recursion depth, enabling you to efficiently extract all content from the target website.

<!--Args:

scraping_requests (List[WebscrapeRequest]): A list of WebscrapeRequest objects.

Returns:

dict: A response object with the status of the scraping job message.-->

Parameters:

  • body (Array<WebscrapeRequest>)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



613
614
615
616
617
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 613

# Web Scrape
#
# Conducts a web scrape on the given webpage URLs.
#
# @param body [Array<WebscrapeRequest>] the scraping requests to submit
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def scrape_web(body:, extra: {})
  scrape_web_with_http_info_impl(body, extra).data
end

#scrape_web_with_http_info(body:, extra: {}) ⇒ Object

Web Scrape

Conduct a web scrape on a given webpage URL. Our web scraper is fully compatible with JavaScript and supports recursion depth, enabling you to efficiently extract all content from the target website.

<!--Args:

scraping_requests (List[WebscrapeRequest]): A list of WebscrapeRequest objects.

Returns:

dict: A response object with the status of the scraping job message.-->

Parameters:

  • body (Array<WebscrapeRequest>)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



632
633
634
635
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 632

# Web Scrape
#
# Same as #scrape_web but returns the full HTTP response wrapper.
#
# @param body [Array<WebscrapeRequest>] the scraping requests to submit
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def scrape_web_with_http_info(body:, extra: {})
  scrape_web_with_http_info_impl(body, extra)
end

#search_urls(query:, extra: {}) ⇒ Object

Search Urls

Perform a web search and obtain a list of relevant URLs.

As an illustration, when you perform a search for “content related to MRNA,” you will receive a list of links such as the following:

- https://tomrenz.substack.com/p/mrna-and-why-it-matters

- https://www.statnews.com/2020/11/10/the-story-of-mrna-how-a-once-dismissed-idea-became-a-leading-technology-in-the-covid-vaccine-race/

- https://www.statnews.com/2022/11/16/covid-19-vaccines-were-a-success-but-mrna-still-has-a-delivery-problem/

- https://joomi.substack.com/p/were-still-being-misled-about-how

Subsequently, you can submit these links to the web_scrape endpoint in order to retrieve the content of the respective web pages.

Args:

query (str): Query to search for

Returns:

FetchURLsResponse: A response object with a list of URLs for a given search query.

Parameters:

  • query (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



730
731
732
733
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 730

# Search Urls
#
# Performs a web search and returns a list of relevant URLs.
#
# @param query [String] the query to search for
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def search_urls(query:, extra: {})
  search_urls_with_http_info_impl(query, extra).data
end

#search_urls_with_http_info(query:, extra: {}) ⇒ Object

Search Urls

Perform a web search and obtain a list of relevant URLs.

As an illustration, when you perform a search for “content related to MRNA,” you will receive a list of links such as the following:

- https://tomrenz.substack.com/p/mrna-and-why-it-matters

- https://www.statnews.com/2020/11/10/the-story-of-mrna-how-a-once-dismissed-idea-became-a-leading-technology-in-the-covid-vaccine-race/

- https://www.statnews.com/2022/11/16/covid-19-vaccines-were-a-success-but-mrna-still-has-a-delivery-problem/

- https://joomi.substack.com/p/were-still-being-misled-about-how

Subsequently, you can submit these links to the web_scrape endpoint in order to retrieve the content of the respective web pages.

Args:

query (str): Query to search for

Returns:

FetchURLsResponse: A response object with a list of URLs for a given search query.

Parameters:

  • query (String)
  • extra (Hash) (defaults to: {})

    additional parameters to pass along through :header_params, :query_params, or parameter name



759
760
761
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 759

# Search Urls
#
# Same as #search_urls but returns the full HTTP response wrapper.
#
# @param query [String] the query to search for
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def search_urls_with_http_info(query:, extra: {})
  # Delegate straight to the generated implementation.
  search_urls_with_http_info_impl(query, extra)
end

#user_webpages(filters: SENTINEL, pagination: SENTINEL, order_by: SENTINEL, order_dir: SENTINEL, extra: {}) ⇒ Object

User Web Pages

Parameters:



836
837
838
839
840
841
842
843
844
845
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 836

# User Web Pages
#
# @param filters [Object] optional result filters (defaults to: SENTINEL)
# @param pagination [Object] optional pagination settings (defaults to: SENTINEL)
# @param order_by [Object] optional sort key (defaults to: SENTINEL)
# @param order_dir [Object] optional sort direction (defaults to: SENTINEL)
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the deserialized response data
def user_webpages(filters: SENTINEL, pagination: SENTINEL, order_by: SENTINEL, order_dir: SENTINEL, extra: {})
  # Include only the fields the caller actually supplied.
  candidate_fields = {
    filters: filters,
    pagination: pagination,
    order_by: order_by,
    order_dir: order_dir
  }
  user_web_pages_request = candidate_fields.reject { |_field, value| value == SENTINEL }
  user_webpages_with_http_info_impl(user_web_pages_request, extra).data
end

#user_webpages_with_http_info(filters: SENTINEL, pagination: SENTINEL, order_by: SENTINEL, order_dir: SENTINEL, extra: {}) ⇒ Object

User Web Pages

Parameters:



855
856
857
858
859
860
861
862
863
# File 'lib/carbon_ruby_sdk/api/utilities_api.rb', line 855

# User Web Pages
#
# Same as #user_webpages but returns the full HTTP response wrapper.
#
# @param filters [Object] optional result filters (defaults to: SENTINEL)
# @param pagination [Object] optional pagination settings (defaults to: SENTINEL)
# @param order_by [Object] optional sort key (defaults to: SENTINEL)
# @param order_dir [Object] optional sort direction (defaults to: SENTINEL)
# @param extra [Hash] additional parameters passed through to the request
# @return [Object] the API response object
def user_webpages_with_http_info(filters: SENTINEL, pagination: SENTINEL, order_by: SENTINEL, order_dir: SENTINEL, extra: {})
  # Include only the fields the caller actually supplied.
  candidate_fields = {
    filters: filters,
    pagination: pagination,
    order_by: order_by,
    order_dir: order_dir
  }
  user_web_pages_request = candidate_fields.reject { |_field, value| value == SENTINEL }
  user_webpages_with_http_info_impl(user_web_pages_request, extra)
end