Class: WLSearchScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/wlsearchscraper.rb

Instance Method Summary collapse

Constructor Details

#initialize(searchterms) ⇒ WLSearchScraper

Returns a new instance of WLSearchScraper.



6
7
8
9
# File 'lib/wlsearchscraper.rb', line 6

# Sets up a scraper for one search.
#
# @param searchterms [String] the terms to search for; stored unchanged in
#   @searchterms until #scrape builds the query from them
def initialize(searchterms)
  @resultlist = []
  @searchterms = searchterms
end

Instance Method Details

#cableParser(url) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/wlsearchscraper.rb', line 25

# Maps the title attribute of a metadata cell's first link to the key under
# which that cell's value is stored. "To" is not listed because its value is
# split into a list of recipients rather than stored as a single string.
CABLE_FIELD_KEYS = {
  "Date" => :date,
  "Canonical ID" => :id,
  "Original Classification" => :original_classification,
  "Current Classification" => :current_classification,
  "Handling Restrictions" => :handling_restrictions,
  "Character Count" => :character_count,
  "Executive Order" => :executive_order,
  "Locator" => :locator,
  "TAGS" => :tags,
  "Concepts" => :concepts,
  "Enclosure" => :enclosure,
  "Type" => :type,
  "Office Origin" => :office_origin,
  "Office Action" => :office_action,
  "Archive Status" => :archive_status,
  "From" => :from,
  "Markings" => :markings,
  "Linked documents or other documents with the same ID" => :linked_docs
}.freeze

# Downloads one cable page and extracts its metadata and body text.
#
# @param url [String] address of the cable page to fetch
# @return [Hash] metadata keyed by symbol (see CABLE_FIELD_KEYS); :to holds an
#   Array of stripped recipient strings; :content holds the text of the second
#   div whose class attribute is exactly "text-content" and is absent when the
#   page has fewer than two such divs
def cableParser(url)
  cablehash = {}
  # URI.open (open-uri) is used instead of Kernel#open: since Ruby 3.0
  # Kernel#open no longer fetches URLs, and it must not be handed strings
  # that were scraped from a remote page.
  html = Nokogiri::HTML(URI.open(url))

  # Metadata: each relevant cell is identified by the title of its first link.
  html.css("td").each do |cell|
    link = cell.css("a").first
    next unless link

    title = link["title"]
    if title == "To"
      # Recipients are separated by " | "; store them as a clean list.
      cablehash[:to] = cell.css("div[2]").text.split(" | ").map(&:strip)
    elsif (key = CABLE_FIELD_KEYS[title])
      cablehash[key] = cell.css("div[2]").text
    end
  end

  # Cable content: the second div whose class is exactly "text-content".
  body = html.css("div").select { |div| div["class"] == "text-content" }[1]
  cablehash[:content] = body.text if body

  cablehash
end

#scrape ⇒ Object

Returns a pretty-printed JSON string containing the parsed cables that match the search terms



12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/wlsearchscraper.rb', line 12

# Runs the search and parses every cable linked from the result page.
#
# Parsed cables are appended to @resultlist, so results accumulate if this is
# called more than once on the same instance.
#
# @return [String] pretty-printed JSON array of the parsed cables
def scrape
  # tr builds a new string, so the string the caller passed to the
  # constructor is left untouched (and may be frozen).
  query = @searchterms.tr(" ", "+")
  url = "https://search.wikileaks.org/advanced?q=#{query}" \
        "&exclude_words=&words_title_only=&words_content_only=&publication_type[]=3"
  # URI.open (open-uri) rather than Kernel#open, which no longer fetches URLs
  # since Ruby 3.0.
  html = Nokogiri::HTML(URI.open(url))

  html.css("h4").each do |heading|
    link = heading.css("a").first
    href = link && link["href"]
    # A heading without a usable link is not a search result; skip it
    # instead of failing on nil.
    next unless href

    @resultlist.push(cableParser(href))
  end

  JSON.pretty_generate(@resultlist)
end