Class: Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper.rb

Constant Summary collapse

@@all_topics =
[]

Class Method Summary collapse

Class Method Details

.all_topics ⇒ Object

Scrapes all main topics from the all-portals main page and creates all topic instances.


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/scraper.rb', line 32

# Scrapes the main topic headlines from Wikipedia's "Portal:Contents/Portals"
# page, records each new topic name in @@all_topics and creates a Topic for it.
#
# Names already present in @@all_topics are skipped, so calling this method
# again does not append duplicate names or build a second Topic for a name.
#
# @return [Array<String>] @@all_topics, including names from earlier calls
def self.all_topics
  # Opened through the URI object: Kernel#open stopped accepting URLs in Ruby 3.0.
  html = URI.parse("https://en.wikipedia.org/wiki/Portal:Contents/Portals").open
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end

  # Tag the container of each main topic headline (identified by its exact
  # inline style), except the "General reference" one.
  # NOTE(review): the title_container class is not read again in this method.
  title_style = "position: relative;border: 0px solid #A3BFB1;background: #CEF2E0;color: black;padding: .1em;text-align: center;font-weight: bold;font-size: 100%;margin-bottom: 0px;border-top: 1px solid #A3BFB1;border-bottom: 1px solid #A3BFB1;"
  doc.search("#mw-content-text div table table div").each do |div|
    next unless div['style'] == title_style

    div['class'] = "title_container" unless div.text.include?("General reference")
  end

  # Tag every main topic headline with the .headlines class, skipping the page
  # title and the "General reference" section.
  doc.search("h2 .mw-headline big").each do |headline|
    text = headline.text
    next if text == "Wikipedia's contents: Portals" || text.include?("General reference")

    headline['class'] = "headlines"
  end

  # Build the topic names: drop the "(see in all page types)" suffix and the
  # surrounding whitespace, then the last three characters of what is left.
  scraped = doc.search(".headlines").map do |headline|
    topic_name = headline.text.chomp("(see in all page types)").strip
    topic_name.slice!(-3..-1)
    topic_name
  end

  # Only names not seen before are stored and turned into Topic instances.
  fresh = scraped.uniq - @@all_topics
  @@all_topics.concat(fresh)
  fresh.each { |topic_name| Topic.new(topic_name) }
  @@all_topics
end

.get_portal_name(url) ⇒ Object


63
64
65
66
67
68
69
70
# File 'lib/scraper.rb', line 63

# Fetches the page at +url+ and returns the text of its <title> element.
#
# The URL is parsed with URI.parse and opened through the resulting URI
# object instead of Kernel#open. A string such as "| some command" is
# therefore rejected rather than run as a subprocess, and the call keeps
# working on Ruby 3.0+, where Kernel#open no longer accepts URLs.
#
# @param url [String] absolute URL of the portal page
# @return [String] text of the page's <title> element(s)
# @raise [URI::InvalidURIError] if +url+ cannot be parsed as a URI
# @raise [ArgumentError] if +url+ uses a scheme that open-uri cannot fetch
def self.get_portal_name(url)
  uri = URI.parse(url)
  # Only the URI classes extended by open-uri expose a public #open.
  raise ArgumentError, "not a fetchable URL: #{url}" unless uri.respond_to?(:open)

  html = uri.open
  puts "***Scraping Portal Name"
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end
  doc.search("title").text
end

.scrape_portals_page(name) ⇒ Object


4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/scraper.rb', line 4

# Picks a random portal link belonging to the main topic +name+ on Wikipedia's
# "Portal:Contents/Portals" page and returns it as an absolute URL.
#
# The random choice is made among the links whose href contains
# "/wiki/Portal:" only, so the returned link is always one of the portal
# links that were counted.
#
# @param name [String] a topic name stored in @@all_topics
# @return [String] absolute URL of a randomly chosen portal of that topic
# @raise [ArgumentError] if +name+ is not present in @@all_topics
# @raise [RuntimeError] if the page has no portal links for the topic
def self.scrape_portals_page(name)
  topic_index = @@all_topics.index(name)
  raise ArgumentError, "unknown topic: #{name.inspect}" if topic_index.nil?

  # Per the original author's note there are 11 main topics and 12 portal
  # link containers; the first container is skipped, hence the +1.
  choice_index = topic_index + 1

  # Opened through the URI object: Kernel#open stopped accepting URLs in Ruby 3.0.
  html = URI.parse("https://en.wikipedia.org/wiki/Portal:Contents/Portals").open
  doc = Nokogiri::HTML(html) do |config|
    config.noblanks
  end

  puts "***Scraping Portals Page"

  # Tag every portal links container (identified by its exact inline style)
  # with the portals-container class so the containers can be indexed by topic.
  container_style = "box-sizing: border-box; border: 0px solid #A3BFB1; border-bottom: 0px solid #A3BFB1;; border-top-width: 1px; vertical-align: top;background: #F5FFFA;opacity: 1; color: black; text-align: left; margin: 0 0 10px; padding: 1em;;padding-top: .3em;-moz-border-radius: 0; -webkit-border-radius: 0; border-radius: 0;"
  doc.search("div").each do |div|
    div['class'] = "portals-container" if div['style'] == container_style
  end

  container = doc.search(".portals-container")[choice_index]
  raise "no portal links container found for topic #{name.inspect}" if container.nil?

  # Keep only real portal links; anchors without an href are ignored.
  hrefs = container.search("a").map { |link| link['href'] }
  portal_paths = hrefs.select { |href| href && href.include?("/wiki/Portal:") }
  raise "no portal links found for topic #{name.inspect}" if portal_paths.empty?

  # Randomly select a sub-portal of the chosen main topic.
  "https://en.wikipedia.org" + portal_paths.sample
end