Class: EFFScraper

Inherits:

Object

Object
EFFScraper

show all

Defined in: lib/effscraper.rb

Instance Method Summary collapse

#initialize(url) ⇒ EFFScraper constructor

A new instance of EFFScraper.
#scrapeCase ⇒ Object

Scrapes all documents in case.
#scrapePage(html) ⇒ Object

Scrapes each page of documents.

Constructor Details

#initialize(url) ⇒ `EFFScraper`

Returns a new instance of EFFScraper.

# File 'lib/effscraper.rb', line 7

# Builds a scraper for one case.
#
# @param url address of the case page; it is opened later by #scrapeCase
def initialize(url)
  @url = url
  # Filled by #scrapePage with one hash per scraped document.
  @casearray = []
end

Instance Method Details

#scrapeCase ⇒ `Object`

Scrapes all documents in the case.

# File 'lib/effscraper.rb', line 13

# Scrapes every listing page of the case at @url and returns the collected
# documents as pretty-printed JSON.
#
# The first page is fetched from @url. Further pages are reached by following
# the "li.pager-next" link of the page scraped just before, prefixed with
# "https://eff.org". Each page is handed to #scrapePage, which appends to
# @casearray.
#
# @return [String] JSON.pretty_generate of @casearray
def scrapeCase
  # URI.open (from open-uri) replaces the bare open(url): Kernel#open stopped
  # handling URLs in Ruby 3.0. The block form closes the handle once parsed.
  load_page = lambda do |link|
    URI.open(link) { |io| Nokogiri::HTML(io) }
  end

  html = load_page.call(@url)

  # The page count is the third whitespace-separated token of the
  # "li.pager-current" text (presumably a label like "1 of 5" - confirm
  # against the live markup). No pager means a single page.
  pager = html.css("li.pager-current")[0]
  n = pager ? pager.text.split(" ")[2].to_i : 1
  # An unparsable pager label yields 0; the page already loaded is still scraped.
  n = 1 if n < 1

  scrapePage(html)

  (2..n).each do
    next_item = html.css("li.pager-next")[0]
    next_anchor = next_item && next_item.css("a")[0]
    # Stop cleanly when the pager advertises more pages than it links to.
    break unless next_anchor && next_anchor["href"]

    html = load_page.call("https://eff.org" + next_anchor["href"])
    scrapePage(html)
  end

  JSON.pretty_generate(@casearray)
end

#scrapePage(html) ⇒ `Object`

Scrapes each page of documents.

# File 'lib/effscraper.rb', line 38

# Scrapes one listing page and appends one hash per document to @casearray.
#
# For every "li" inside the first "div.view-content":
# * each link whose text is "[PDF]" is downloaded with wget; its href is
#   stored under :url and the last path segment under :path
# * :doc_date is the text of "span.date-display-single"
# * :title is the text of the second link in the item
# * UploadConvert supplies the metadata pairs and :text
#
# Items for which UploadConvert raises are skipped with a warning on stderr.
#
# @param html parsed page responding to #css (a Nokogiri document in #scrapeCase)
# @return [void]
def scrapePage(html)
  items = html.css("div.view-content")[0]
  return unless items # no document listing on this page

  items.css("li").each do |l|
    dochash = {}
    anchors = l.css("a")

    # Gets link to document and file
    anchors.each do |a|
      next unless a.text == "[PDF]"

      href = a["href"]
      next if href.nil? || href.strip.empty?

      dochash[:url] = href
      # Argument-list form: the scraped href never passes through a shell,
      # and "--" keeps a leading "-" from being read as a wget option.
      system("wget", "--", href.strip)
      dochash[:path] = href.split("/").last.to_s.strip
    end

    # Get date and title
    dochash[:doc_date] = l.css("span.date-display-single").text
    # The title is read from the second link (presumably the first is the
    # "[PDF]" link - confirm against the live markup); nil when absent.
    title_link = anchors[1]
    dochash[:title] = title_link ? title_link.text : nil

    # Extract metadata and text
    begin
      u = UploadConvert.new(dochash[:path])
      metadata = u.extractMetadataPDF
      metadata.each { |k, v| dochash[k] = v }
      dochash[:text] = u.detectPDFType
      @casearray.push(dochash)
    rescue StandardError => e
      # Best effort: one bad document must not abort the whole scrape, but
      # the failure is reported instead of being swallowed.
      warn "EFFScraper: skipping #{dochash[:url].inspect}: #{e.class}: #{e.message}"
    end
  end
end

Class: EFFScraper

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ EFFScraper

Instance Method Details

#scrapeCase ⇒ Object

#scrapePage(html) ⇒ Object

#initialize(url) ⇒ `EFFScraper`

#scrapeCase ⇒ `Object`

#scrapePage(html) ⇒ `Object`