# File 'lib/omni_scrape.rb', line 4
require 'nokogiri'
require 'open-uri'   # provides URI.open for fetching pages

def CrawlScrape(url, depth, sub_url)
  # depth is part of the public signature but is not used by this method.
  page  = Nokogiri::HTML(URI.open(url))
  links = page.css('a')

  titles = []
  hrefs  = []

  # Keep only links that carry both a title and an href, collapsing any
  # internal whitespace in each value.
  links.each do |link|
    next if link['title'].nil? || link['title'].empty?
    next if link['href'].nil?  || link['href'].empty?
    titles.push(link['title'].split.join)
    hrefs.push(link['href'].split.join)
  end

  # Drain both stacks in the same order (reversing document order) so the
  # two arrays stay aligned, echoing each value for visibility.
  refarr   = []
  titlearr = []
  until hrefs.empty?
    value = hrefs.pop
    puts value
    refarr.push(value)
  end
  until titles.empty?
    value = titles.pop
    puts value
    titlearr.push(value)
  end

  (0...titlearr.length).each do |i|
    # Absolute links are fetched as-is; everything else is resolved by
    # prepending sub_url.
    target = if refarr[i].start_with?('http://', 'https://')
               refarr[i]
             else
               sub_url + refarr[i]
             end

    begin
      pagina = Nokogiri::HTML(URI.open(target))
    rescue StandardError
      # Treat any fetch failure (404, timeout, bad URL) as a miss and
      # move on to the next link instead of retrying.
      puts "got a 404"
      next
    end

    # Build a filename from the link title, stripping characters that are
    # awkward or illegal in file names.
    finval = titlearr[i].gsub(/[!:\/-]/, '')
    next if finval.empty?
    puts finval

    File.open("#{finval}.html", 'w') { |f| f.puts pagina }
  end
end
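
For context, a minimal sketch of calling the method, assuming it is exposed on the OmniScrape module; the Wikipedia URLs and the depth value of 0 are illustrative, not taken from the gem's documentation:

require 'omni_scrape'

# Fetch every titled link on the page and save each linked page as
# "<title>.html" in the current working directory. CrawlScrape does not
# use the depth argument, so 0 is passed as a placeholder.
OmniScrape.CrawlScrape('https://en.wikipedia.org/wiki/Ruby', 0,
                       'https://en.wikipedia.org')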
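
One design note: the method resolves relative links by plain string concatenation (sub_url + href), which only works when sub_url lines up exactly with what the href expects. A sketch of a more robust alternative using Ruby's standard URI.join, shown here as a possible improvement rather than what the gem does:

require 'uri'

# URI.join resolves a relative href against a base URL per RFC 3986,
# handling root-relative ("/wiki/Perl") and absolute hrefs uniformly.
target = URI.join('https://en.wikipedia.org/wiki/Ruby', '/wiki/Perl').to_s
# => "https://en.wikipedia.org/wiki/Perl"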