Class: GuardianScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/guardianscraper.rb

Overview

Scraper for the articles about the NSA docs in the Guardian

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ GuardianScraper

Returns a new instance of GuardianScraper.



7
8
9
# File 'lib/guardianscraper.rb', line 7

# Creates a scraper for a single article.
#
# @param url [String] address of the article; stored as-is in @url without
#   any validation or network access at construction time
def initialize(url)
  @url = url
end

Instance Method Details

#getArticle

Download the article and return its text and other data as a pretty-printed JSON string



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/guardianscraper.rb', line 12

# Downloads the article at @url and extracts its text and metadata.
#
# @return [String] pretty-printed JSON object with the keys headline,
#   description, date, author, published_by, caption, documents (array of
#   href values of the links inside the description block), text and plaintext
def getArticle
  # URI.open (provided by open-uri) is used instead of a bare `open`:
  # since Ruby 3.0 Kernel#open no longer fetches URLs. The block form closes
  # the downloaded stream as soon as the document has been parsed.
  html = URI.open(@url) { |io| Nokogiri::HTML(io) }

  # Query each shared node set once and reuse it below.
  description = html.css('div[itemprop="description"]')
  body_text = html.css("div#article-body-blocks").text

  article = {
    # The selector is an exact attribute match, trailing spaces included;
    # kept verbatim. NOTE(review): confirm it still matches the live markup.
    headline: html.css('h1[itemprop="name headline  "]').text,
    description: description.text,
    date: html.css('time[itemprop="datePublished"]').text,
    author: html.css("a.contributor").text,
    published_by: "The Guardian",
    caption: html.css("div.caption").text,
    # Links to documents referenced from the description block.
    documents: description.css("a").map { |link| link["href"] },
    # Both keys carry the same body text, as callers may read either one.
    text: body_text,
    plaintext: body_text
  }

  JSON.pretty_generate(article)
end