Module: Instaview

Defined in: lib/instaview.rb,
lib/instaview/version.rb

Defined Under Namespace

Classes: Error

Constant Summary collapse

VERSION =

"0.2.0"

Class Method Summary collapse

Class Method Details

.cache_dir ⇒ `Object`

— Cache helpers —



110
111
112

# File 'lib/instaview.rb', line 110

# Resolves the directory in which cached scrape results are stored.
#
# The INSTAVIEW_CACHE_DIR environment variable wins when it holds a non-blank
# value; otherwise the location is ~/.cache/instaview. A blank override is
# ignored: joining an empty directory with a file name yields a path at the
# filesystem root, which is never what a cache location should be.
#
# @return [String] path of the cache directory (the directory is not created here)
def self.cache_dir
  override = ENV['INSTAVIEW_CACHE_DIR']
  return override unless override.nil? || override.strip.empty?

  File.join(Dir.home, ".cache", "instaview")
end

.cache_file_for(username) ⇒ `Object`

# File 'lib/instaview.rb', line 122

# Builds the path of the JSON cache file that belongs to a username.
#
# The username is stringified and every character other than an ASCII letter,
# digit, underscore, hyphen or dot is replaced by an underscore before the
# ".json" extension is appended.
#
# @param username [#to_s] account name used to derive the file name
# @return [String] path inside {cache_dir}
def self.cache_file_for(username)
  file_name = username.to_s.gsub(/[^a-zA-Z0-9_\-.]/, '_') + ".json"
  File.join(cache_dir, file_name)
end

.fetch_data_async(username, method: :selenium) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/instaview.rb', line 41

# Starts a background thread that scrapes data for a username.
#
# The thread's value is the scraper result. When data_found? accepts the
# result it is also written to the cache; cache write errors are swallowed so
# they never reach the caller.
#
# @param username [#to_s] account to look up
# @param method [Symbol] :simple_http selects the curl-based scraper; any
#   other value (including the default :selenium) selects the Selenium scraper
# @return [Thread] thread whose #value is the scraper result
# @raise [ArgumentError] when username is nil or blank
def self.fetch_data_async(username, method: :selenium)
  if username.nil? || username.to_s.strip.empty?
    raise ArgumentError, "username is required"
  end

  Thread.new do
    scraped = if :simple_http == method
                scrape_with_simple_http(username)
              else
                scrape_instagram_stories(username)
              end

    if data_found?(scraped)
      begin
        write_to_cache(username, scraped)
      rescue StandardError
        # Caching is best-effort: a failed write must not affect the result.
      end
    end

    scraped
  end
end

.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium) ⇒ `Object`

# File 'lib/instaview.rb', line 77

# Returns cached data for a username when a cache entry is accepted by
# read_from_cache, otherwise scrapes it and waits for the result.
#
# @param username [#to_s] account to look up
# @param max_age_hours [Numeric, #to_f] maximum cache age in hours; fractional
#   values are honoured (0.5 means 30 minutes)
# @param method [Symbol] scraper selector forwarded to fetch_data_async
# @return [Object] the cached value, or the value of the scraping thread
# @raise [ArgumentError] when username is nil or blank
def self.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium)
  # Validate before touching the cache so a blank name never maps to a cache file.
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  # to_f (rather than to_i) keeps fractional hours; round yields whole seconds.
  max_age_seconds = (max_age_hours.to_f * 3600).round
  cached = read_from_cache(username, max_age_seconds: max_age_seconds)
  return cached if cached

  # Thread#value joins the worker and returns its result.
  fetch_data_async(username, method: method).value
end

.getData(username = nil) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/instaview.rb', line 25

# Default data accessor: delegates to get_from_cache_or_async with a 12 hour
# maximum cache age.
#
# @param username [#to_s, nil] account to look up
# @return [Object] whatever get_from_cache_or_async returns
# @raise [ArgumentError] when username is nil or blank
def self.getData(username = nil)
  blank = username.nil? || username.to_s.strip.empty?
  raise ArgumentError, "username is required" if blank

  get_from_cache_or_async(username, max_age_hours: 12)
end

.load_from_cache_only(username, max_age_hours: 12) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/instaview.rb', line 95

# Reads data for a username from the cache without ever scraping.
#
# @param username [#to_s] account to look up
# @param max_age_hours [Numeric, #to_f] maximum cache age in hours; fractional
#   values are honoured (0.5 means 30 minutes)
# @return [Object, nil] whatever read_from_cache returns for that age limit
# @raise [ArgumentError] when username is nil or blank
def self.load_from_cache_only(username, max_age_hours: 12)
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  # to_f (rather than to_i) keeps fractional hours; round yields whole seconds.
  max_age_seconds = (max_age_hours.to_f * 3600).round
  read_from_cache(username, max_age_seconds: max_age_seconds)
end

.parseData ⇒ `Object`

# File 'lib/instaview.rb', line 460

# Downloads https://www.instaview.me/ and returns the text of every element
# carrying the "profile-media-list__item" class.
#
# @return [Array<String>] text content of each matching element (empty when none match)
def self.parseData
  # Using a third-party web app to get Instagram data; Nokogiri parses the HTML.
  # Both libraries are loaded lazily so they are only required when this method runs.
  require "nokogiri"
  require "open-uri"

  url = "https://www.instaview.me/"
  # The block form reads the body and closes the underlying IO afterwards.
  html = URI.open(url, &:read)
  doc = Nokogiri::HTML(html)

  # "profile-media-list__item" is a CSS class, so it has to be matched with a
  # class selector; an XPath such as //profile-media-list__item would look for
  # an element with that tag name instead.
  doc.css(".profile-media-list__item").map(&:text)
end

.read_from_cache(username, max_age_seconds: 43_200) ⇒ `Object`

# File 'lib/instaview.rb', line 136

# Loads the cached result for a username if a usable cache file exists.
#
# A cache entry is usable when the file exists, its modification time is at
# most max_age_seconds old, it contains valid JSON and data_found? accepts the
# parsed value. Hash results are tagged with cached: true so callers can tell
# they came from the cache.
#
# Reading is best-effort: a file that is malformed, disappears between the
# existence check and the read, or cannot be read at all counts as a miss.
#
# @param username [#to_s] account whose cache file is consulted
# @param max_age_seconds [Numeric] maximum accepted file age in seconds
# @return [Object, nil] parsed data with symbolized keys, or nil on a cache miss
def self.read_from_cache(username, max_age_seconds: 43_200)
  path = cache_file_for(username)
  return nil unless File.exist?(path)
  return nil if Time.now - File.mtime(path) > max_age_seconds

  data = JSON.parse(File.read(path), symbolize_names: true)
  return nil unless data_found?(data)

  # Annotate so callers can tell it came from cache.
  data[:cached] = true if data.is_a?(Hash)
  data
rescue JSON::ParserError, SystemCallError
  # SystemCallError covers the Errno::* family (file removed, unreadable, a directory, ...).
  nil
end

.scrape_instagram_stories(username = nil) ⇒ `Object`

# File 'lib/instaview.rb', line 201

# Scrapes media data for a username by driving headless Chrome (Selenium)
# against https://storiesig.info/ and parsing the resulting page with Nokogiri.
#
# Steps: open the site, type the username into the search form, submit it,
# wait for media list items to appear, then extract per-item details plus
# page-wide image and link information.
#
# @param username [String, nil] account to look up; ARGV[0] is used when nil
# @return [Hash] result with the keys :username, :method, :page_state,
#   :media_items_found, :media_items, :all_images, :download_links,
#   :error_message, :success and :debug_info
# @raise [Instaview::Error] when the search input does not show up within the
#   wait timeout, or when any other StandardError occurs while scraping
#   (re-raised with a "Selenium scraping failed: ..." message)
def self.scrape_instagram_stories(username = nil)
  # NOTE(review): if username and ARGV[0] are both nil, send_keys below
  # receives nil — confirm how Selenium reacts and whether callers rely on it.
  target_username = username || ARGV[0] # pass username as argument

  driver = nil
  begin
    # Setup Selenium WebDriver with headless Chrome
    options = Selenium::WebDriver::Chrome::Options.new
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-background-timer-throttling')
    options.add_argument('--disable-backgrounding-occluded-windows')
    options.add_argument('--disable-renderer-backgrounding')
    options.add_argument('--window-size=1920,1080')
    # NOTE(review): the debugging port and profile directory are fixed values
    # shared by every invocation; this may clash when several scrapes run at
    # the same time (e.g. from multiple threads) — confirm.
    options.add_argument('--remote-debugging-port=9222')
    options.add_argument('--user-data-dir=/tmp/chrome-user-data')

    # Try different Chrome/Chromium binaries
    chrome_paths = [
      "/snap/bin/chromium",
      "/usr/bin/chromium",
      "/usr/bin/chromium-browser",
      "/usr/bin/google-chrome"
    ]

    # First existing path wins; when none exists the binary option is left unset.
    chrome_binary = chrome_paths.find { |path| File.exist?(path) }
    options.binary = chrome_binary if chrome_binary

    driver = Selenium::WebDriver.for :chrome, options: options

    # 1) Go to StoriesIG homepage
    driver.navigate.to "https://storiesig.info/"
    sleep 2

    # 2) Find the specific search input for StoriesIG
    wait = Selenium::WebDriver::Wait.new(timeout: 10)

    # The wait block yields the element only once it is displayed; a timeout
    # is translated into an Instaview::Error.
    input_element = begin
      wait.until do
        element = driver.find_element(:css, 'input.search.search-form__input[placeholder*="username"]')
        element if element.displayed?
      end
    rescue Selenium::WebDriver::Error::TimeoutError
      raise Instaview::Error, "Search input not found with selector: input.search.search-form__input"
    end

    input_element.clear
    input_element.send_keys(target_username)

    # 3) Click the specific search button
    begin
      button_element = driver.find_element(:css, 'button.search-form__button')
      button_element.click
    rescue Selenium::WebDriver::Error::NoSuchElementError
      # No search button on the page: submit by sending the return key to the input instead.
      input_element.send_keys(:return)
    end

    # 4) Wait for results to load and check different possible outcomes
    sleep 3

    # Check for various possible page states
    # page_state ends up as one of: "media_found", "media_found_delayed",
    # "error_found" or "no_media".
    page_state = "unknown"
    error_message = nil

    # Check if media items loaded
    media_items = driver.find_elements(:css, 'li.profile-media-list__item')
    if media_items.length > 0
      page_state = "media_found"
    else
      # Check for error messages or other states
      sleep 2  # Give it more time
      media_items = driver.find_elements(:css, 'li.profile-media-list__item')

      if media_items.length > 0
        page_state = "media_found_delayed"
      else
        # Look for common error indicators
        # NOTE(review): `:contains(...)` is jQuery syntax rather than standard
        # CSS; presumably find_elements raises for those two selectors and the
        # rescue below skips them — confirm.
        error_selectors = [
          '.error', '.alert', '.warning',
          '[class*="error"]', '[class*="not-found"]',
          'p:contains("not found")', 'div:contains("error")'
        ]

        # The text of the first element matched by the first matching selector
        # becomes the error message.
        error_found = false
        error_selectors.each do |selector|
          begin
            error_elements = driver.find_elements(:css, selector)
            if error_elements.any?
              error_message = error_elements.first.text
              error_found = true
              break
            end
          rescue StandardError
            # Continue checking other selectors
          end
        end

        page_state = error_found ? "error_found" : "no_media"
      end
    end

    # 5) Extract media content from the specific structure
    # From here on the page is parsed offline with Nokogiri from a snapshot of
    # the page source.
    html = driver.page_source
    doc = Nokogiri::HTML(html)

    # Extract specific media items using the provided selector
    media_list_items = doc.css('li.profile-media-list__item')

    extracted_media = []
    media_list_items.each do |item|
      media_data = {}

      # Extract image source
      img_element = item.css('.media-content__image').first
      if img_element
        media_data[:image_url] = img_element['src']
        media_data[:alt_text] = img_element['alt']
      end

      # Extract caption
      caption_element = item.css('.media-content__caption').first
      media_data[:caption] = caption_element&.text&.strip

      # Extract download link
      download_element = item.css('a.button.button--filled.button__download').first
      media_data[:download_url] = download_element['href'] if download_element

      # Extract metadata
      like_element = item.css('.media-content__meta-like').first
      media_data[:likes] = like_element&.text&.strip

      time_element = item.css('.media-content__meta-time').first
      media_data[:time] = time_element&.text&.strip
      media_data[:time_title] = time_element['title'] if time_element

      # :caption, :likes and :time are always assigned (possibly nil), so
      # media_data is never empty at this point and every item is appended.
      extracted_media << media_data unless media_data.empty?
    end

    # Also extract any general images and links
    all_images = doc.css('img').map { |img| img['src'] }.compact.uniq.reject(&:empty?)
    all_links = doc.css('a').map { |link| link['href'] }.compact.uniq.reject(&:empty?)
    download_links = doc.css('a.button__download').map { |link| link['href'] }.compact.uniq

    result = {
      username: target_username,
      method: "selenium_storiesig",
      page_state: page_state,
      media_items_found: extracted_media.length,
      media_items: extracted_media,
      all_images: all_images.select { |img| img.start_with?('http') }.first(10), # Limit output
      download_links: download_links,
      error_message: error_message,
      success: extracted_media.length > 0,
      debug_info: {
        total_images: all_images.length,
        total_links: all_links.length,
      }
    }

    # Save screenshot for debugging if needed
    # Enabled by setting the INSTAVIEW_DEBUG environment variable to any value.
    if ENV['INSTAVIEW_DEBUG']
      screenshot_path = "/tmp/instaview_debug_#{Time.now.to_i}.png"
      driver.save_screenshot(screenshot_path)
      result[:debug_info][:screenshot_path] = screenshot_path
    end

    result
  rescue Instaview::Error
    # Already the gem's own error type: pass it through unchanged.
    raise
  rescue => e
    # Wrap every other StandardError in the gem's error type.
    raise Instaview::Error, "Selenium scraping failed: #{e.message}"
  ensure
    # Always shut the browser down; driver is nil when startup itself failed.
    driver&.quit
  end
end

.scrape_with_simple_http(username = nil) ⇒ `Object`

Raises:

(ArgumentError)

# File 'lib/instaview.rb', line 388

# Fetches the StoriesIG homepage with curl and reports basic page structure.
#
# The username is only validated and echoed back in the result; the request
# itself always targets https://storiesig.info/ and does not include it.
#
# @param username [#to_s, nil] account name; must not be nil or blank
# @return [Hash] result with the keys :username, :method, :forms_found,
#   :inputs_found, :sample_images and :message
# @raise [ArgumentError] when username is nil or blank
# @raise [Instaview::Error] when curl fails, returns no content, or any other
#   StandardError occurs while processing the page
def self.scrape_with_simple_http(username = nil)
  # Same blank check as the other entry points: whitespace-only names are
  # rejected and non-String values (e.g. Symbols) are stringified first.
  if username.nil? || username.to_s.strip.empty?
    raise ArgumentError, "Username is required for simple HTTP method"
  end

  # The explicit begin keeps the ArgumentError above outside the rescue
  # clauses, so it is not wrapped into an Instaview::Error.
  begin
    # Simple HTTP approach using curl
    curl_command = "curl -s -L -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 'https://storiesig.info/'"

    html_content = `#{curl_command}`

    unless $?.success? && !html_content.empty?
      raise Instaview::Error, "Curl command failed or returned empty content"
    end

    doc = Nokogiri::HTML(html_content)

    # Extract basic page information
    forms = doc.css('form')
    inputs = doc.css('input[type="text"], input[name*="user"]')

    # Look for any existing media or links (absolute http/https image URLs only)
    images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }

    {
      username: username,
      method: "simple_http_curl",
      forms_found: forms.length,
      inputs_found: inputs.length,
      sample_images: images.first(3),
      message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
    }
  rescue Instaview::Error
    raise
  rescue => e
    raise Instaview::Error, "HTTP scraping failed: #{e.message}"
  end
end

.test_connectivity ⇒ `Object`

# File 'lib/instaview.rb', line 434

# Smoke-test helper that verifies the gem is loaded.
#
# Performs no network access; it only reports static information.
#
# @return [Hash] :gem_name, :version (Instaview::VERSION), :methods_available
#   (names of the public entry points as strings) and :status
def self.test_connectivity
  available = %w[
    scrape_instagram_stories
    scrape_with_simple_http
    fetch_data_async
    get_from_cache_or_async
    load_from_cache_only
    getData
    test_connectivity
  ]

  {
    gem_name: "Instaview",
    version: Instaview::VERSION,
    methods_available: available,
    status: "OK"
  }
end

.write_to_cache(username, data) ⇒ `Object`

# File 'lib/instaview.rb', line 164

# Stores data for a username as pretty-printed JSON in the cache directory.
#
# The cache directory (including missing parents) is created first; an
# existing cache file for the same username is overwritten.
#
# @param username [#to_s] account the data belongs to
# @param data [Object] JSON-serializable value to store
# @return [true]
def self.write_to_cache(username, data)
  FileUtils.mkdir_p(cache_dir)

  target = cache_file_for(username)
  payload = JSON.pretty_generate(data)
  File.open(target, "w") { |io| io.write(payload) }

  true
end

Module: Instaview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.cache_dir ⇒ Object

.cache_file_for(username) ⇒ Object

.fetch_data_async(username, method: :selenium) ⇒ Object

.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium) ⇒ Object

.getData(username = nil) ⇒ Object

.load_from_cache_only(username, max_age_hours: 12) ⇒ Object

.parseData ⇒ Object

.read_from_cache(username, max_age_seconds: 43_200) ⇒ Object

.scrape_instagram_stories(username = nil) ⇒ Object

.scrape_with_simple_http(username = nil) ⇒ Object

.test_connectivity ⇒ Object

.write_to_cache(username, data) ⇒ Object

.cache_dir ⇒ `Object`

.cache_file_for(username) ⇒ `Object`

.fetch_data_async(username, method: :selenium) ⇒ `Object`

.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium) ⇒ `Object`

.getData(username = nil) ⇒ `Object`

.load_from_cache_only(username, max_age_hours: 12) ⇒ `Object`

.parseData ⇒ `Object`

.read_from_cache(username, max_age_seconds: 43_200) ⇒ `Object`

.scrape_instagram_stories(username = nil) ⇒ `Object`

.scrape_with_simple_http(username = nil) ⇒ `Object`

.test_connectivity ⇒ `Object`

.write_to_cache(username, data) ⇒ `Object`