Class: Zorki::PostScraper
- Defined in:
- lib/zorki/scrapers/post_scraper.rb
Instance Method Summary
Methods inherited from Scraper
#get_content_of_subpage_from_url, #initialize
Constructor Details
This class inherits a constructor from Zorki::Scraper
Instance Method Details
#parse(id) ⇒ Object
# File 'lib/zorki/scrapers/post_scraper.rb', line 7

def parse(id)
  # Stuff we need to get from the DOM (implemented is starred):
  # - User *
  # - Text *
  # - Image * / Images * / Video *
  # - Date *
  # - Number of likes *
  # - Hashtags

  Capybara.app_host = "https://instagram.com"

  # video slideshows https://www.instagram.com/p/CY7KxwYOFBS/?utm_source=ig_embed&utm_campaign=loading
  login

  graphql_object = get_content_of_subpage_from_url(
    "https://www.instagram.com/p/#{id}/",
    "/graphql",
    "data,xdt_api__v1__media__shortcode__web_info,items"
  )
  graphql_object = graphql_object.first if graphql_object.kind_of?(Array)

  # For pages that have been marked as misinfo the structure is very different from a clean post.
  # If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
  # way of deeply nested stuff.
  #
  # First we check which one we're getting
  if graphql_object.has_key?("articleBody")
    # Let's just parse the images first
    images = graphql_object["image"].map do |image|
      Zorki.retrieve_media(image["url"])
    end

    text = graphql_object["articleBody"]
    username = graphql_object["author"]["identifier"]["value"]

    # 2021-04-01T17:07:10-07:00
    date = DateTime.strptime(graphql_object["dateCreated"], "%Y-%m-%dT%H:%M:%S%z")

    interactions = graphql_object["interactionStatistic"]
    number_of_likes = interactions.select do |x|
      x["interactionType"] == "http://schema.org/LikeAction"
    end.first["userInteractionCount"]

    unless graphql_object["video"].empty?
      video_url = graphql_object["video"].first["contentUrl"]
      video = Zorki.retrieve_media(video_url)
      video_preview_image_url = graphql_object["video"].first["thumbnailUrl"]
      video_preview_image = Zorki.retrieve_media(video_preview_image_url)
    end
  else
    # We need to see if this is a single image post or a slideshow. We do that
    # by looking for a single image; if it's not there, we assume the alternative.
    # debugger
    # graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
    unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
      # Check if there is a slideshow or not
      unless graphql_object["items"][0].has_key?("carousel_media") && !graphql_object["items"][0]["carousel_media"].nil?
        # Single image
        image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
        images = [Zorki.retrieve_media(image_url)]
      else
        # Slideshow
        images = graphql_object["items"][0]["carousel_media"].map do |media|
          Zorki.retrieve_media(media["image_versions2"]["candidates"][0]["url"])
        end
      end
    else
      # some of these I've seen in both ways, thus the commented-out lines
      # video_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["video_url"]
      video_url = graphql_object["items"][0]["video_versions"][0]["url"]
      video = Zorki.retrieve_media(video_url)

      # video_preview_image_url = graphql_object["entry_data"]["PostPage"].first["graphql"]["shortcode_media"]["display_resources"].last["src"]
      video_preview_image_url = graphql_object["items"][0]["image_versions2"]["candidates"][0]["url"]
      video_preview_image = Zorki.retrieve_media(video_preview_image_url)
    end

    unless graphql_object["items"][0]["caption"].nil?
      text = graphql_object["items"][0]["caption"]["text"]
    else
      text = ""
    end

    username = graphql_object["items"][0]["user"]["username"]
    date = DateTime.strptime(graphql_object["items"][0]["taken_at"].to_s, "%s")
    number_of_likes = graphql_object["items"][0]["like_count"]
  end

  screenshot_file = take_screenshot()

  # This has to run last since it switches pages
  user = User.lookup([username]).first

  {
    images: images,
    video: video,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: text,
    date: date,
    number_of_likes: number_of_likes,
    user: user,
    id: id
  }
end
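For orientation, here is a minimal usage sketch. It assumes the constructor inherited from Zorki::Scraper takes no arguments and that Zorki is configured with working Instagram credentials (the login call inside #parse must succeed); the shortcode is the one referenced in the comment inside #parse, and the keys are exactly those of the hash the method returns.

  scraper = Zorki::PostScraper.new
  result  = scraper.parse("CY7KxwYOFBS")

  result[:text]             # caption text, or "" when the post has no caption
  result[:images]           # media files retrieved via Zorki.retrieve_media
  result[:video]            # nil unless the post contains a video
  result[:date]             # DateTime parsed from "dateCreated" or "taken_at"
  result[:user]             # Zorki::User looked up from the post's username
  result[:screenshot_file]  # path to the PNG written by #take_screenshot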
#take_screenshot ⇒ Object
# File 'lib/zorki/scrapers/post_scraper.rb', line 114

def take_screenshot
  # First check if a post has a fact check overlay, if so, clear it.
  # The only issue is that this can take *awhile* to search. Not sure what to do about that
  # since it's Instagram's fault for having such a fucked up obfuscated hierarchy
  begin
    find_button("See Post").click
    sleep(0.1)
  rescue Capybara::ElementNotFound
    # Do nothing if the element is not found
  end

  # Take the screenshot and return it
  save_screenshot("#{Zorki.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
end
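The begin/rescue above is a standard Capybara pattern: try to click the overlay's dismiss control and swallow the failure when no overlay is present. A self-contained sketch of the same idea follows; the OverlayDismisser class, the dismiss method, and the one-second wait are illustrative assumptions, not part of Zorki, and a configured Capybara driver/session is assumed.

  require "capybara"
  require "capybara/dsl"

  class OverlayDismisser
    include Capybara::DSL

    # Click a button with the given label if it exists; do nothing otherwise.
    # The short wait keeps the "can take *awhile* to search" cost bounded.
    def dismiss(label)
      find_button(label, wait: 1).click
    rescue Capybara::ElementNotFound
      # No overlay on this page; carry on.
    end
  end

  # OverlayDismisser.new.dismiss("See Post")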