Module: OmniScrape

Defined in:
lib/omni_scrape.rb,
lib/omni_scrape/version.rb

Constant Summary collapse

VERSION =
"0.1.5.2"

Instance Method Summary collapse

Instance Method Details

#CrawlScrape(url, depth, sub_url) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/omni_scrape.rb', line 4

# Fetches +url+, then downloads every titled link found on that page and saves
# the HTML of each linked page into the current working directory.
#
# Only <a> elements that have both a +title+ and an +href+ are followed. All
# whitespace is removed from both attributes, and the characters ! : / - are
# removed from the title; links whose title or href is empty after that are
# ignored. An href starting with http:// or https:// is fetched as-is; any
# other href is appended to +sub_url+. Each page is written to "<title>.html".
#
# Links are processed in reverse document order, so when two links share a
# title the one that appears first on the page is written last and wins.
#
# Nokogiri and open-uri must already be loaded; this method does not require
# them itself.
#
# @param url [String] address (or local path) of the page whose links are collected
# @param depth [Object] accepted for interface compatibility; currently unused
# @param sub_url [String] prefix prepended to hrefs that are not absolute http(s) URLs
# @return [Array<String>] names of the files written, in the order they were written
# @raise [StandardError] whatever opening or parsing +url+ itself raises; a
#   failure on an individual link is reported with "got a 404" and skipped
def CrawlScrape(url, depth, sub_url)
  # Never hand these strings to Kernel#open or URI.open: both fall back to
  # spawning a subprocess when the string starts with "|", and hrefs come
  # straight from the scraped page. URI#open only fetches and File.open only
  # reads. The block form also closes the handle once parsing is done.
  load_document = lambda do |location|
    address = location.to_s
    if address.match?(%r{\A[a-z][a-z0-9+.-]*://}i)
      URI.parse(address).open { |io| Nokogiri::HTML(io) }
    else
      File.open(address) { |io| Nokogiri::HTML(io) }
    end
  end

  # Build [file-name stem, href] pairs for every anchor on the start page.
  links = load_document.call(url).css('a').map do |anchor|
    stem = anchor['title'].to_s.split.join.gsub(%r{[!:/-]}, '')
    [stem, anchor['href'].to_s.split.join]
  end
  links.reject! { |pair| pair.any?(&:empty?) }

  written = []
  links.reverse_each do |stem, href|
    # Anchor the scheme test at the start so https links count as absolute and
    # an href that merely contains "http://" somewhere does not.
    target = href.match?(%r{\Ahttps?://}i) ? href : "#{sub_url}#{href}"

    begin
      document = load_document.call(target)
    rescue StandardError
      # Any fetch or parse failure is reported with this message, not only
      # HTTP 404 responses.
      puts "got a 404"
      next
    end

    file_name = "#{stem}.html"
    File.open(file_name, 'w') { |file| file.puts(document) }
    written << file_name
  end

  written
end