Module: UrlScraper

Defined in:
lib/url_scraper.rb,
lib/url_scraper/version.rb

Defined Under Namespace

Classes: CLI, Engine, Object

Constant Summary

TYPES =
{
  'activity' => %w(activity sport),
  'business' => %w(bar company cafe hotel restaurant),
  'group' => %w(cause sports_league sports_team),
  'organization' => %w(band government non_profit school university),
  'person' => %w(actor athlete author director musician politician public_figure),
  'place' => %w(city country landmark state_province),
  'product' => %w(album book drink food game movie product song tv_show),
  'website' => %w(blog website)
}
VERSION =
"0.0.5"

Class Method Summary

.fetch(uri, strict = true) ⇒ Object
  Fetch Open Graph data from the specified URI.

.parse(html, strict = true, uri) ⇒ Object
  Parse Open Graph data out of an HTML document.

Class Method Details

.fetch(uri, strict = true) ⇒ Object

Fetch Open Graph data from the specified URI. Makes an HTTP GET request and returns an UrlScraper::Object if Open Graph data is found, or false if it isn't (or the request fails).

Pass false as the second argument if you want invalid data (i.e. data missing a required attribute) to be returned as well.
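
Example (a minimal usage sketch; the URL is illustrative, and the attribute readers come from the returned UrlScraper::Object):

require 'url_scraper'

object = UrlScraper.fetch('http://example.com/some-page')

if object
  object.title  # og:title, or the page <title> as a fallback
  object.image  # an Array of image URLs (see .parse below)
else
  # false: the request raised RestClient::Exception or SocketError
end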



# File 'lib/url_scraper.rb', line 28

def self.fetch(uri, strict = true)
  parse(RestClient.get(uri).body, strict, uri)
rescue RestClient::Exception, SocketError
  false
end

.parse(html, strict = true, uri) ⇒ Object

Parse Open Graph <meta> tags out of the given HTML and return an UrlScraper::Object. When og:title, og:description, or og:image are missing, falls back to the page <title>, the description meta tag, and <img> tags resolved against uri.


# File 'lib/url_scraper.rb', line 34

def self.parse(html, strict = true, uri)
  logger = Logger.new(STDOUT)
  doc = Nokogiri::HTML.parse(html)
  page = UrlScraper::Object.new
  doc.css('meta').each do |m|
    if m.attribute('property') && m.attribute('property').to_s.match(/^og:(.+)$/i)
      page[$1.gsub('-','_')] = m.attribute('content').to_s
    end
  end

  page.title = (doc.at_css('title').text rescue nil) if page.title.nil?
  if page.description.nil?
    page.description = doc.at_css("meta[name='description']")['content'] unless doc.at_css("meta[name='description']").nil?
  end
  if page.image.nil?
    image_array = []
    doc.css("img").each do |img|
      next if img["src"].to_s.empty?
      image = URI.escape(img["src"].strip)
      image = image.gsub(/([{}|\^\[\]\@`])/) {|s| CGI.escape(s)} # escape characters that URI.escape doesn't get
      image = URI.parse(uri).merge(URI.parse image.to_s).to_s
      image_array << image
    end
    page.image = image_array unless image_array.empty?
  end
  # return false if page.keys.empty?
  # return false unless page.valid? if strict
  page.image = Array.wrap(page.image)
  page
  # return doc
end
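
Example (a short usage sketch with inline HTML; the markup and URL are illustrative, and Nokogiri must be available). The uri argument is only used to resolve relative <img> paths when no og:image is present:

require 'url_scraper'

html = <<-HTML
  <html>
    <head>
      <title>Fallback Title</title>
      <meta property="og:title" content="Example Page" />
    </head>
    <body><img src="/logo.png" /></body>
  </html>
HTML

page = UrlScraper.parse(html, true, 'http://example.com/')
page.title  # => "Example Page" (from og:title)
page.image  # => ["http://example.com/logo.png"] (fallback <img>, resolved against uri)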