Module: UrlScraper
- Defined in:
  - lib/url_scraper.rb
  - lib/url_scraper/version.rb
Defined Under Namespace
Constant Summary collapse
- TYPES =
{ 'activity' => %w(activity sport), 'business' => %w(bar company cafe hotel restaurant), 'group' => %w(cause sports_league sports_team), 'organization' => %w(band government non_profit school university), 'person' => %w(actor athlete author director musician politician public_figure), 'place' => %w(city country landmark state_province), 'product' => %w(album book drink food game movie product song tv_show), 'website' => %w(blog website) }
- VERSION =
"0.0.5"
Class Method Summary collapse
- .fetch(uri, strict = true) ⇒ Object
  Fetch Open Graph data from the specified URI.
- .parse(html, strict = true, uri) ⇒ Object
Class Method Details
.fetch(uri, strict = true) ⇒ Object
Fetch Open Graph data from the specified URI. Makes an HTTP GET request and returns a UrlScraper::Object if there is data to be found, or false if there isn’t.
Pass false for the second argument if you want to see invalid data (i.e. data that is missing a required attribute).
28 29 30 31 32 |
# File 'lib/url_scraper.rb', line 28
#
# Download the document at +uri+ with an HTTP GET and hand its body to
# +parse+, forwarding +strict+ and +uri+ unchanged.
#
# @param uri the address to request; also passed to +parse+
# @param strict forwarded to +parse+ as-is
# @return whatever +parse+ returns, or +false+ when the request raises
#   RestClient::Exception or SocketError
def self.fetch(uri, strict = true)
  begin
    response = RestClient.get(uri)
    parse(response.body, strict, uri)
  rescue RestClient::Exception, SocketError
    # Request-level failures are reported to the caller as a plain false.
    false
  end
end
.parse(html, strict = true, uri) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/url_scraper.rb', line 34
#
# Build a UrlScraper::Object from an HTML document.
#
# Every <meta> tag whose +property+ starts with "og:" is copied onto the
# page, with dashes in the property name turned into underscores. When the
# Open Graph data leaves +title+, +description+ or +image+ unset, they are
# filled from the <title> element, the <meta name="description"> tag and the
# document's <img> tags respectively.
#
# @param html [String] HTML markup, handed to Nokogiri::HTML.parse
# @param strict [Boolean] currently ignored: the validation that used it is
#   commented out at the bottom of this method
# @param uri [String] address of the page; relative image sources are
#   resolved against it
# @return [UrlScraper::Object] the populated page; +image+ is always passed
#   through Array.wrap before returning
# @raise [URI::InvalidURIError] if +uri+ itself cannot be parsed while
#   resolving image sources
def self.parse(html, strict = true, uri)
  doc = Nokogiri::HTML.parse(html)
  page = UrlScraper::Object.new

  doc.css('meta').each do |meta|
    og_match = meta.attribute('property').to_s.match(/^og:(.+)$/i)
    next unless og_match

    page[og_match[1].gsub('-', '_')] = meta.attribute('content').to_s
  end

  if page.title.nil?
    title_node = doc.at_css('title')
    # Assign even when there is no <title>, so the key is set (to nil)
    # exactly as before.
    page.title = title_node ? title_node.text : nil
  end

  if page.description.nil?
    description_node = doc.at_css("meta[name='description']")
    page.description = description_node['content'] unless description_node.nil?
  end

  if page.image.nil?
    # URI.escape was removed in Ruby 3.0; the RFC 2396 parser still provides
    # the same escaping. Prefer the dedicated constant where it exists.
    escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
    base_uri = nil
    image_urls = []

    doc.css('img').each do |img|
      # Strip before the emptiness check so a whitespace-only src is skipped
      # instead of resolving to the page URL itself.
      src = img['src'].to_s.strip
      next if src.empty?

      escaped = escaper.escape(src)
      # escape characters that the URI escaper doesn't get
      escaped = escaped.gsub(/([{}|\^\[\]\@`])/) { |char| CGI.escape(char) }

      # Parsed lazily (only once an image needs it) and outside the rescue
      # below, so an invalid page URI still raises.
      base_uri ||= URI.parse(uri)

      begin
        image_urls << base_uri.merge(URI.parse(escaped)).to_s
      rescue URI::InvalidURIError
        # One malformed src should not abort the whole parse; skip it.
        next
      end
    end

    page.image = image_urls unless image_urls.empty?
  end

  # return false if page.keys.empty?
  # return false unless page.valid? if strict
  page.image = Array.wrap(page.image)
  page
end