Class: Mosquito::TweetScraper
- Defined in:
- lib/mosquito/scrapers/tweet_scraper.rb
Instance Method Summary collapse
Methods inherited from Scraper
#get_content_of_page_from_url_curl, #get_content_of_subpage_from_url, #initialize
Constructor Details
This class inherits a constructor from Mosquito::Scraper
Instance Method Details
#parse(id) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/mosquito/scrapers/tweet_scraper.rb', line 10

# Scrapes a single tweet from the Nitter instance configured in ENV["NITTER_URL"].
#
# Fields pulled from the DOM: text, images / video / GIF, date, number of likes
# and the author (via UserScraper). Hashtags are not extracted.
#
# @param id [String, Integer] tweet id, interpolated into the Nitter status URL
# @return [Hash] with keys :images, :video, :video_preview_image, :screenshot_file,
#   :text, :date, :number_of_likes, :user, :id, :language, :video_file_type
# @raise [Mosquito::NoTweetFoundError] if the page request fails with an HTTP error
#   or the returned page contains an "error-panel" element
def parse(id)
  Capybara.app_host = ENV["NITTER_URL"]

  begin
    # NOTE(review): the "jack" path segment is hard-coded; presumably Nitter resolves
    # the status by id regardless of the username — confirm.
    doc = Nokogiri::HTML(URI.open("#{ENV["NITTER_URL"]}/jack/status/#{id}"))
  rescue OpenURI::HTTPError
    raise Mosquito::NoTweetFoundError
  end

  raise Mosquito::NoTweetFoundError unless doc.xpath("//div[contains(@class, 'error-panel')]").empty?

  text = doc.xpath("//div[contains(@class, 'tweet-content media-body')]").first.content
  date = DateTime.parse(doc.xpath("//span[contains(@class, 'tweet-date')]").first.child["title"])
  # Replace the caller-supplied id with the last path segment of the page's canonical link.
  id = URI.parse(doc.xpath("//link[contains(@rel, 'canonical')]").first["href"]).path.split("/").last
  number_of_likes = doc.xpath("//span[contains(@class, 'tweet-stat')][last()]/div").first.content.delete(",").to_i
  language = "en" # Can't be determined through Nitter anymore; hard-coded to English.

  images = []
  videos = []
  video_preview_image = nil
  video_file_type = nil

  # Shared XPath prefix for every attachment lookup on the main tweet.
  attachments = "//div[contains(@class, 'main-tweet')]/div/div/div[contains(@class, 'attachments')]"

  # Single image (disabled):
  # image_url = doc.xpath("//div[contains(@class, 'main-tweet')]/div/div/div/div/div/a[contains(@class, 'still-image')]/@href").first&.content
  # images << Mosquito.retrieve_media("#{Capybara.app_host}#{image_url}") unless image_url.nil?

  # Slideshow
  nodes = doc.xpath("#{attachments}/div[contains(@class, 'gallery-row')]/div/a/@href")
  images.concat(nodes.map { |node| Mosquito.retrieve_media("#{Capybara.app_host}#{node.value}") })

  # Video
  nodes = doc.xpath("#{attachments}/div[contains(@class, 'gallery-video')]/div/video")
  unless nodes.empty?
    video_preview_image = Mosquito.retrieve_media("#{Capybara.app_host}#{nodes.first["poster"]}", extension: ".jpg")
    # ".//source" keeps the lookup inside this <video> node. A leading "//" is an
    # absolute path and would return the first <source> of the whole document for
    # every node.
    videos.concat(nodes.map { |node| Mosquito.retrieve_media(node.xpath(".//source").first["src"]) })
    video_file_type = "video" # This is always video now, since a gif isn't displayed differently
  end

  # GIF
  nodes = doc.xpath("#{attachments}/div[contains(@class, 'gallery-gif')]/div/video")
  unless nodes.empty?
    # Prefixed with the host like the video poster above.
    # NOTE(review): assumes the poster attribute is a host-relative path — confirm.
    video_preview_image = Mosquito.retrieve_media("#{Capybara.app_host}#{nodes.first["poster"]}", extension: ".jpg")
    # Relative lookup for the same reason as in the video branch.
    # NOTE(review): when no nested source is found this still requests the bare host URL,
    # as the original did — confirm whether that case should be skipped instead.
    videos.concat(nodes.map do |node|
      source_path = node.xpath(".//source[1]/source/@src").first&.content
      Mosquito.retrieve_media("#{Capybara.app_host}#{source_path}")
    end)
    video_file_type = "gif"
  end

  username = doc.xpath("//a[contains(@class, 'username')][1]/@href").first.value
  user = UserScraper.new.parse(username)

  screenshot_file = take_screenshot

  {
    images: images,
    video: videos,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: text,
    date: date,
    number_of_likes: number_of_likes,
    user: user,
    id: id,
    language: language,
    video_file_type: video_file_type
  }
end
#take_screenshot ⇒ Object
91 92 93 94 95 96 |
# File 'lib/mosquito/scrapers/tweet_scraper.rb', line 91

# Saves a screenshot of the current page into Mosquito's temp storage location
# and returns whatever save_screenshot returns.
#
# No overlay handling happens here; the method only takes the screenshot.
def take_screenshot
  # A random UUID keeps file names unique. The "instagram_screenshot_" prefix
  # is kept unchanged even though this scraper handles tweets.
  screenshot_path = "#{Mosquito.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png"
  save_screenshot(screenshot_path)
end