Class: WebWordSorter

Inherits:
Object
  • Object
show all
Defined in:
lib/WebWordSorter.rb

Overview

Description

This class contains all the necessary methods to do the following: - Crawl most websites and return an array of their URLs. - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs. - Parse out all non-words and non-human-readable markup. - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version IS using recursive stooge sort!</b>

- Write the array of words to a file.

When used properly, this class will take all unique words on a website, parse them, sort them, and write them to a .txt file.

Instance Method Summary collapse

Instance Method Details

#crawler(url_base) ⇒ Object

Example

base = WebWordSorter.new base.crawler(“www.example.com”)



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/WebWordSorter.rb', line 38

# Crawls the site rooted at +url_base+ with Anemone and collects the URL of
# every page the crawl reports, while a progress bar shows the elapsed time
# and the URL currently being processed.
#
# @param url_base [#to_s] address handed to Anemone as the crawl start point
# @return [Array] the +page.url+ value of each crawled page, in the order the
#   +on_every_page+ callback fired
def crawler(url_base)
  pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a  %t')
  page_urls = []

  Anemone.crawl(url_base.to_s) do |anemone|
    anemone.on_every_page do |page|
      page_urls << page.url
      pg_crawl.title = page.url
      pg_crawl.increment
    end
  end

  # The bar starts at 20 and gains one step per page, so after a large crawl
  # its progress can exceed 100. Never set a total below the current progress:
  # ruby-progressbar is expected to reject that with an error, which would
  # discard the collected URLs -- NOTE(review): confirm against the gem version
  # in use.
  pg_crawl.total = [pg_crawl.progress, 100].max
  pg_crawl.finish

  page_urls
end

#pages_to_string(url_array) ⇒ Object

Example

base = WebWordSorter.new urls = [“www.test1.com”,“www.test2.com”,“www.test3.com”] base.pages_to_string(urls)



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/WebWordSorter.rb', line 76

# Downloads every reachable address in +url_array+ and concatenates the text
# extracted from each into one string.
#
# An address is fetched only when a HEAD request to it answers with status
# 200. Addresses whose string form ends in ".pdf" are read page by page with
# PDF::Reader; everything else is parsed with Nokogiri and has its <script>
# elements removed before the text is taken.
#
# NOTE(review): fetching a URL through Kernel#open depends on open-uri being
# loaded, and recent Rubies are understood to require URI.open instead --
# confirm which Ruby versions this file must support.
#
# @param url_array [Enumerable] addresses to fetch
# @return [String] the accumulated text; empty when nothing was fetched
def pages_to_string(url_array)
  pages_string = String.new

  url_array.each do |address|
    next unless Faraday.head(address).status == 200

    if address.to_s.end_with?('.pdf')
      # Block form closes the downloaded handle as soon as the pages are read;
      # the reader is used inside the block because it works on the open IO.
      open(address) do |pdf_file|
        PDF::Reader.new(pdf_file).pages.each do |page|
          # << appends in place instead of copying the whole buffer each time
          pages_string << page.text
        end
      end
    else
      data = open(address) { |html_file| Nokogiri::HTML(html_file) }

      # Take advantage of Nokogiri's script removal BEFORE converting to a
      # string.
      data.css('script').remove

      # Explicit text extraction; the previous `string + document` form relied
      # on the document's implicit String conversion, which is believed to
      # yield the same text content.
      pages_string << data.text
    end
  end

  pages_string
end

#parse_string(pre_parse_string) ⇒ Object

Example

base = WebWordSorter.new base.parse_string(“hello this is a test 1234 !@#$”)

will return: "hello this is a test"


123
124
125
126
127
128
129
130
131
# File 'lib/WebWordSorter.rb', line 123

# Reduces raw text to alphabetic words separated by single spaces.
#
# Hyphens are deleted outright (so hyphenated words are joined), every run of
# characters other than ASCII letters and spaces becomes one space, repeated
# spaces are collapsed, and leading/trailing spaces are trimmed so the result
# holds words only.
#
# @param pre_parse_string [String] text to clean
# @return [String] a new string; the argument itself is not modified
def parse_string(pre_parse_string)
  pre_parse_string
    .delete('-')
    .gsub(/[^a-zA-Z ]+/, ' ')
    .squeeze(' ')
    .strip
end

#save_array_to_file(file_name, array_to_save) ⇒ Object

Example

base = WebWordSorter.new base.save_array_to_file “file_name”, some_array



211
212
213
214
215
216
217
218
# File 'lib/WebWordSorter.rb', line 211

# Writes each element of +array_to_save+ on its own line to the file named
# +file_name+ with ".txt" appended. The file is opened in "w" mode, so an
# existing file of that name is overwritten.
#
# @param file_name [#to_s] path of the output file without the extension
# @param array_to_save [#each] items to write, one per line
# @return [Object] whatever +array_to_save.each+ returns (the array itself
#   for an Array)
def save_array_to_file(file_name, array_to_save)
  File.open("#{file_name}.txt", "w") do |out|
    array_to_save.each do |entry|
      out.puts(entry)
    end
  end
end

#spell_check(words) ⇒ Object

Example

base = WebWordSorter.new base.spell_check some_array



170
171
172
173
174
175
176
177
178
# File 'lib/WebWordSorter.rb', line 170

# Keeps only the words that appear in the bundled dictionary file
# (resources/words.txt next to this source file). Each dictionary line is
# stripped of surrounding whitespace before comparison; matching is exact and
# case-sensitive.
#
# @param words [Enumerable<String>] candidate words
# @return [Array<String>] the words found in the dictionary, in their
#   original order and with duplicates preserved
def spell_check(words)
  dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')

  # Hash keys give constant-time membership tests, so the cost is linear in
  # words + dictionary lines rather than their product.
  dictionary = {}
  File.foreach(dict_path) { |line| dictionary[line.strip] = true }

  words.select { |word| dictionary.key?(word) }
end

#split_uniq(pre_array_string) ⇒ Object

Example

base = WebWordSorter.new base.split_uniq some_string



145
146
147
148
149
150
151
152
153
154
155
# File 'lib/WebWordSorter.rb', line 145

# Splits +pre_array_string+ on whitespace and removes duplicate words,
# keeping the first occurrence of each.
#
# No separate strip step is needed: splitting on whitespace already yields
# tokens without surrounding whitespace.
#
# @param pre_array_string [String] whitespace-separated text
# @return [Array<String>] the distinct words in order of first appearance
def split_uniq(pre_array_string)
  pre_array_string.split.uniq
end

#stooge_sort(input_array) ⇒ Object

Example

base = WebWordSorter.new base.stooge_sort some_array



193
194
195
196
197
198
# File 'lib/WebWordSorter.rb', line 193

# Returns the result of calling +stoogesort+ on +input_array+.
#
# NOTE(review): +stoogesort+ is not defined in this block; presumably it is an
# Array extension supplied elsewhere in the project -- confirm.
#
# @param input_array [#stoogesort] collection to sort
# @return [Object] whatever +input_array.stoogesort+ returns
def stooge_sort(input_array)
  input_array.stoogesort
end