Module: Instaview
- Defined in:
- lib/instaview.rb,
lib/instaview/version.rb
Defined Under Namespace
Classes: Error
Constant Summary collapse
- VERSION =
"0.2.0"
Class Method Summary collapse
-
.cache_dir ⇒ Object
— Cache helpers —.
- .cache_file_for(username) ⇒ Object
- .fetch_data_async(username, method: :selenium) ⇒ Object
- .get_from_cache_or_async(username, max_age_hours: 12, method: :selenium) ⇒ Object
- .getData(username = nil) ⇒ Object
- .load_from_cache_only(username, max_age_hours: 12) ⇒ Object
- .parseData ⇒ Object
- .read_from_cache(username, max_age_seconds: 43_200) ⇒ Object
- .scrape_instagram_stories(username = nil) ⇒ Object
- .scrape_with_simple_http(username = nil) ⇒ Object
- .test_connectivity ⇒ Object
- .write_to_cache(username, data) ⇒ Object
Class Method Details
.cache_dir ⇒ Object
— Cache helpers —
110 111 112 |
# File 'lib/instaview.rb', line 110
#
# Resolves the directory in which cached results are stored.
#
# The INSTAVIEW_CACHE_DIR environment variable takes precedence when it holds
# a non-blank value. A blank value is treated as unset: an empty string is
# truthy in Ruby, so it would otherwise be used as the directory and cache
# files would be joined onto "".
#
# @return [String] path of the cache directory
def self.cache_dir
  override = ENV['INSTAVIEW_CACHE_DIR']
  return override unless override.nil? || override.strip.empty?

  File.join(Dir.home, ".cache", "instaview")
end
.cache_file_for(username) ⇒ Object
122 123 124 125 |
# File 'lib/instaview.rb', line 122
#
# Builds the path of the JSON cache file for a username.
#
# Every character other than ASCII letters, digits, "_", "-" and "." is
# replaced with "_" before the name is joined onto the cache directory.
#
# @param username [#to_s] account name the cache entry belongs to
# @return [String] path of the cache file inside {cache_dir}
def self.cache_file_for(username)
  file_name = username.to_s.gsub(/[^a-zA-Z0-9_\-.]/, '_') + ".json"
  File.join(cache_dir, file_name)
end
.fetch_data_async(username, method: :selenium) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/instaview.rb', line 41
#
# Starts a background thread that scrapes data for a username and, when the
# result passes data_found?, stores it in the cache.
#
# @param username [#to_s] account to fetch; must not be nil or blank
# @param method [Symbol] :simple_http selects scrape_with_simple_http; any
#   other value (including the default :selenium) selects
#   scrape_instagram_stories
# @return [Thread] thread whose value is the scrape result
# @raise [ArgumentError] if username is nil or blank
def self.fetch_data_async(username, method: :selenium)
  if username.nil? || username.to_s.strip.empty?
    raise ArgumentError, "username is required"
  end

  Thread.new do
    fetched =
      case method
      when :simple_http then scrape_with_simple_http(username)
      else scrape_instagram_stories(username)
      end

    # Persist successful results; caching is best-effort.
    if data_found?(fetched)
      begin
        write_to_cache(username, fetched)
      rescue StandardError
        # A failed cache write must not affect the caller's result.
      end
    end

    fetched
  end
end
.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium) ⇒ Object
77 78 79 80 81 82 83 84 |
# File 'lib/instaview.rb', line 77
#
# Returns cached data for a username when a fresh entry exists; otherwise
# starts fetch_data_async and waits for that thread's result.
#
# Despite the name, this call blocks until the fetch thread finishes.
#
# @param username [#to_s] account to look up
# @param max_age_hours [Numeric, #to_f] maximum cache age in hours;
#   fractional values are honoured (0.5 means 30 minutes)
# @param method [Symbol] scrape strategy forwarded to fetch_data_async
# @return [Object] the cached data or the freshly fetched result
def self.get_from_cache_or_async(username, max_age_hours: 12, method: :selenium)
  # to_f (not to_i) so sub-hour TTLs are not truncated to zero seconds;
  # round keeps whole-hour inputs as the same Integer value as before.
  max_age_seconds = (max_age_hours.to_f * 3600).round

  cached = read_from_cache(username, max_age_seconds: max_age_seconds)
  return cached if cached

  # Thread#value joins the thread and returns the block's result.
  fetch_data_async(username, method: method).value
end
.getData(username = nil) ⇒ Object
25 26 27 28 29 |
# File 'lib/instaview.rb', line 25
#
# Default data accessor: delegates to get_from_cache_or_async with a
# 12-hour cache TTL.
#
# @param username [#to_s, nil] account to fetch; must not be nil or blank
# @return [Object] whatever get_from_cache_or_async returns
# @raise [ArgumentError] if username is nil or blank
def self.getData(username = nil)
  missing = username.nil? || username.to_s.strip.empty?
  raise ArgumentError, "username is required" if missing

  get_from_cache_or_async(username, max_age_hours: 12)
end
.load_from_cache_only(username, max_age_hours: 12) ⇒ Object
95 96 97 98 99 |
# File 'lib/instaview.rb', line 95
#
# Reads data for a username from the cache without triggering a fetch.
#
# @param username [#to_s] account to look up; must not be nil or blank
# @param max_age_hours [Numeric, #to_f] maximum cache age in hours;
#   fractional values are honoured (0.5 means 30 minutes)
# @return [Object, nil] whatever read_from_cache returns
# @raise [ArgumentError] if username is nil or blank
def self.load_from_cache_only(username, max_age_hours: 12)
  raise ArgumentError, "username is required" if username.nil? || username.to_s.strip.empty?

  # to_f (not to_i) so sub-hour TTLs are not truncated to zero seconds;
  # round keeps whole-hour inputs as the same Integer value as before.
  max_age_seconds = (max_age_hours.to_f * 3600).round
  read_from_cache(username, max_age_seconds: max_age_seconds)
end
.parseData ⇒ Object
460 461 462 463 464 465 466 467 468 469 470 471 |
# File 'lib/instaview.rb', line 460
#
# Downloads https://www.instaview.me/ and returns the text of every
# "profile-media-list__item" node found in the page.
#
# The previous XPath (//profile-media-list__item) only matched elements whose
# *tag name* is profile-media-list__item. Elsewhere in this file that token is
# used as a CSS class (li.profile-media-list__item), so elements carrying the
# class are matched as well; the tag-name form is kept for compatibility.
# NOTE(review): whether instaview.me actually uses this class is not visible
# from here — confirm against the live markup.
#
# @return [Array<String>] text content of each matched node
def self.parseData
  # Loaded lazily so the dependencies are only required when this path runs.
  require "nokogiri"
  require "open-uri"

  url = "https://www.instaview.me/"

  # Block form closes the downloaded stream once it has been parsed.
  doc = URI.open(url) { |page| Nokogiri::HTML(page) }

  doc.css("profile-media-list__item, .profile-media-list__item").map(&:text)
end
.read_from_cache(username, max_age_seconds: 43_200) ⇒ Object
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/instaview.rb', line 136
#
# Loads a cached result for a username if a usable, fresh entry exists.
#
# Any problem with the entry is reported as a cache miss (nil): missing
# file, entry older than max_age_seconds, invalid JSON, data rejected by
# data_found?, or a file-system error while reading (e.g. the file vanished
# or is unreadable). Reads are best-effort, as cache writes already are.
#
# @param username [#to_s] account whose cache entry is read
# @param max_age_seconds [Numeric] maximum accepted age of the cache file
# @return [Object, nil] parsed data with symbolized keys (Hashes are marked
#   with cached: true), or nil on a cache miss
def self.read_from_cache(username, max_age_seconds: 43_200)
  path = cache_file_for(username)
  return nil unless File.exist?(path)

  age = Time.now - File.mtime(path)
  return nil if age > max_age_seconds

  data = JSON.parse(File.read(path), symbolize_names: true)
  return nil unless data_found?(data)

  # Annotate so callers can tell the data came from the cache.
  data[:cached] = true if data.is_a?(Hash)
  data
rescue JSON::ParserError, SystemCallError
  # Corrupt or unreadable entries count as a miss rather than an error.
  nil
end
.scrape_instagram_stories(username = nil) ⇒ Object
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 |
# File 'lib/instaview.rb', line 201
#
# Scrapes media for a username from https://storiesig.info/ using a headless
# Chrome/Chromium session driven by Selenium, then parses the resulting page
# with Nokogiri.
#
# @param username [String, nil] account to look up; falls back to ARGV[0]
#   when nil
# @return [Hash] result with :username, :method, :page_state,
#   :media_items_found, :media_items, :all_images, :download_links,
#   :error_message, :success and :debug_info
# @raise [Instaview::Error] if the search input cannot be found or any other
#   error occurs while scraping
def self.scrape_instagram_stories(username = nil)
  target_username = username || ARGV[0] # pass username as argument
  driver = nil

  begin
    # Setup Selenium WebDriver with headless Chrome
    # NOTE(review): the debugging port and user-data-dir are fixed values, so
    # concurrent sessions would presumably share them — confirm if this method
    # is ever run in parallel.
    options = Selenium::WebDriver::Chrome::Options.new
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-background-timer-throttling')
    options.add_argument('--disable-backgrounding-occluded-windows')
    options.add_argument('--disable-renderer-backgrounding')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--remote-debugging-port=9222')
    options.add_argument('--user-data-dir=/tmp/chrome-user-data')

    # Try different Chrome/Chromium binaries; the first existing path wins.
    chrome_paths = [
      "/snap/bin/chromium",
      "/usr/bin/chromium",
      "/usr/bin/chromium-browser",
      "/usr/bin/google-chrome"
    ]

    chrome_binary = chrome_paths.find { |path| File.exist?(path) }
    options.binary = chrome_binary if chrome_binary

    driver = Selenium::WebDriver.for :chrome, options: options

    # 1) Go to StoriesIG homepage
    driver.navigate.to "https://storiesig.info/"
    sleep 2

    # 2) Find the specific search input for StoriesIG
    wait = Selenium::WebDriver::Wait.new(timeout: 10)

    input_element = begin
      wait.until do
        element = driver.find_element(:css, 'input.search.search-form__input[placeholder*="username"]')
        element if element.displayed?
      end
    rescue Selenium::WebDriver::Error::TimeoutError
      raise Instaview::Error, "Search input not found with selector: input.search.search-form__input"
    end

    input_element.clear
    input_element.send_keys(target_username)

    # 3) Click the specific search button; fall back to pressing Return.
    begin
      search_button = driver.find_element(:css, 'button.search-form__button')
      search_button.click
    rescue Selenium::WebDriver::Error::NoSuchElementError
      input_element.send_keys(:return)
    end

    # 4) Wait for results to load and check different possible outcomes
    sleep 3

    # Check for various possible page states
    page_state = "unknown"
    error_message = nil

    # Check if media items loaded
    media_items = driver.find_elements(:css, 'li.profile-media-list__item')
    if media_items.length > 0
      page_state = "media_found"
    else
      # Check for error messages or other states
      sleep 2 # Give it more time
      media_items = driver.find_elements(:css, 'li.profile-media-list__item')

      if media_items.length > 0
        page_state = "media_found_delayed"
      else
        # Look for common error indicators
        error_selectors = [
          '.error',
          '.alert',
          '.warning',
          '[class*="error"]',
          '[class*="not-found"]',
          'p:contains("not found")',
          'div:contains("error")'
        ]

        error_found = false
        error_selectors.each do |selector|
          begin
            error_elements = driver.find_elements(:css, selector)
            if error_elements.any?
              error_message = error_elements.first.text
              error_found = true
              break
            end
          rescue StandardError
            # Continue checking other selectors
          end
        end

        page_state = error_found ? "error_found" : "no_media"
      end
    end

    # 5) Extract media content from the specific structure
    html = driver.page_source
    doc = Nokogiri::HTML(html)

    # Extract specific media items using the provided selector
    media_list_items = doc.css('li.profile-media-list__item')
    extracted_media = []

    media_list_items.each do |item|
      media_data = {}

      # Extract image source
      img_element = item.css('.media-content__image').first
      if img_element
        media_data[:image_url] = img_element['src']
        media_data[:alt_text] = img_element['alt']
      end

      # Extract caption
      caption_element = item.css('.media-content__caption').first
      media_data[:caption] = caption_element&.text&.strip

      # Extract download link
      download_element = item.css('a.button.button--filled.button__download').first
      media_data[:download_url] = download_element['href'] if download_element

      # Extract metadata
      like_element = item.css('.media-content__meta-like').first
      media_data[:likes] = like_element&.text&.strip

      time_element = item.css('.media-content__meta-time').first
      media_data[:time] = time_element&.text&.strip
      media_data[:time_title] = time_element['title'] if time_element

      # :caption, :likes and :time are always assigned (possibly nil), so in
      # practice every list item is appended.
      extracted_media << media_data unless media_data.empty?
    end

    # Also extract any general images and links
    all_images = doc.css('img').map { |img| img['src'] }.compact.uniq.reject(&:empty?)
    all_links = doc.css('a').map { |link| link['href'] }.compact.uniq.reject(&:empty?)
    download_links = doc.css('a.button__download').map { |link| link['href'] }.compact.uniq

    result = {
      username: target_username,
      method: "selenium_storiesig",
      page_state: page_state,
      media_items_found: extracted_media.length,
      media_items: extracted_media,
      all_images: all_images.select { |img| img.start_with?('http') }.first(10), # Limit output
      download_links: download_links,
      error_message: error_message,
      success: extracted_media.length > 0,
      debug_info: {
        total_images: all_images.length,
        total_links: all_links.length,
      }
    }

    # Save screenshot for debugging if needed
    if ENV['INSTAVIEW_DEBUG']
      screenshot_path = "/tmp/instaview_debug_#{Time.now.to_i}.png"
      driver.save_screenshot(screenshot_path)
      result[:debug_info][:screenshot_path] = screenshot_path
    end

    result
  rescue Instaview::Error
    raise
  rescue => e
    raise Instaview::Error, "Selenium scraping failed: #{e.message}"
  ensure
    # Always shut the browser down, even when scraping failed.
    driver&.quit
  end
end
.scrape_with_simple_http(username = nil) ⇒ Object
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 |
# File 'lib/instaview.rb', line 388
#
# Fetches the https://storiesig.info/ homepage with curl and reports basic
# page structure (form/input counts and a few image URLs).
#
# The username is validated and echoed back in the result, but it is not part
# of the curl request.
#
# @param username [#to_s, nil] account name; must not be nil or blank
# @return [Hash] :username, :method, :forms_found, :inputs_found,
#   :sample_images and :message
# @raise [ArgumentError] if username is nil or blank
# @raise [Instaview::Error] if curl fails, returns no content, or any other
#   error occurs while fetching or parsing
def self.scrape_with_simple_http(username = nil)
  target_username = username
  # Same blank check as the other entry points: whitespace-only names are
  # rejected and non-String names no longer fail on a missing #empty?.
  if target_username.nil? || target_username.to_s.strip.empty?
    raise ArgumentError, "Username is required for simple HTTP method"
  end

  begin
    # Simple HTTP approach using curl
    curl_command = "curl -s -L -H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 'https://storiesig.info/'"
    html_content = `#{curl_command}`

    unless $?.success? && !html_content.empty?
      raise Instaview::Error, "Curl command failed or returned empty content"
    end

    doc = Nokogiri::HTML(html_content)

    # Extract basic page information
    forms = doc.css('form')
    inputs = doc.css('input[type="text"], input[name*="user"]')

    # Look for any existing media or links
    images = doc.css('img').map { |img| img['src'] }.compact.select { |src| src.start_with?('http') }

    {
      username: target_username,
      method: "simple_http_curl",
      forms_found: forms.length,
      inputs_found: inputs.length,
      sample_images: images.first(3),
      message: "Simple HTTP method using curl - shows page structure. For full automation use selenium method."
    }
  rescue Instaview::Error
    raise
  rescue => e
    raise Instaview::Error, "HTTP scraping failed: #{e.message}"
  end
end
.test_connectivity ⇒ Object
434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 |
# File 'lib/instaview.rb', line 434
#
# Smoke-test helper: returns a static description of the gem without doing
# any network or file access.
#
# @return [Hash] :gem_name, :version (Instaview::VERSION),
#   :methods_available (list of method names) and :status
def self.test_connectivity
  advertised_methods = [
    "scrape_instagram_stories",
    "scrape_with_simple_http",
    "fetch_data_async",
    "get_from_cache_or_async",
    "load_from_cache_only",
    "getData",
    "test_connectivity"
  ]

  summary = { gem_name: "Instaview" }
  summary[:version] = Instaview::VERSION
  summary[:methods_available] = advertised_methods
  summary[:status] = "OK"
  summary
end
.write_to_cache(username, data) ⇒ Object
164 165 166 167 168 |
# File 'lib/instaview.rb', line 164
#
# Stores data for a username as pretty-printed JSON in the cache directory,
# creating the directory first when it does not exist.
#
# @param username [#to_s] account the entry belongs to
# @param data [Object] JSON-serializable data to store
# @return [true] always true when no exception is raised
def self.write_to_cache(username, data)
  FileUtils.mkdir_p(cache_dir)

  destination = cache_file_for(username)
  payload = JSON.pretty_generate(data)
  File.open(destination, "w") { |io| io.write(payload) }

  true
end