Module: Wriggler
- Defined in:
- lib/wriggler.rb,
lib/wriggler/version.rb
Constant Summary collapse
- VERSION =
"1.2.0"
Instance Attribute Summary collapse
-
#content ⇒ Object
readonly
Returns the value of attribute content.
-
#directory ⇒ Object
readonly
Returns the value of attribute directory.
Class Method Summary collapse
- .crawl(tags = [], directory = "") ⇒ Object
- .crawl_file(doc) ⇒ Object
- .gather_files ⇒ Object
- .is_HTML?(file) ⇒ Boolean
- .is_XML?(file) ⇒ Boolean
- .navigate_directory ⇒ Object
- .open_next_file(file) ⇒ Object
- .sanitize(text) ⇒ Object
- .set_HTML(file) ⇒ Object
- .set_XML(file) ⇒ Object
Instance Attribute Details
#content ⇒ Object (readonly)
Returns the value of attribute content.
6 7 8 |
# File 'lib/wriggler.rb', line 6 def content @content end |
#directory ⇒ Object (readonly)
Returns the value of attribute directory.
6 7 8 |
# File 'lib/wriggler.rb', line 6 def directory @directory end |
Class Method Details
.crawl(tags = [], directory = "") ⇒ Object
8 9 10 11 12 13 14 |
# File 'lib/wriggler.rb', line 8
# Entry point. Prepares an empty result Hash with one key per requested tag,
# records the directory to search, runs navigate_directory and returns the
# (by then populated) result Hash.
#
# tags      - Array of tag names to collect; each becomes a key of the result.
# directory - String path of the top-level directory to search.
#
# Returns the Hash stored in @content (tag => Array of collected content).
def self.crawl(tags = [], directory = "")
  # Build a fresh Array per tag so no two keys ever share the same object.
  @content = tags.each_with_object({}) { |tag, found| found[tag] = [] }
  @directory = directory # current top-level directory
  navigate_directory
  @content
end
.crawl_file(doc) ⇒ Object
64 65 66 67 68 69 70 71 72 73 |
# File 'lib/wriggler.rb', line 64
# Crawls one parsed document. For every tag key in @content it appends ONE
# Array to that key's list, holding the sanitized text of each node matching
# "//<tag>". When the tag does not occur in the document the appended Array
# is empty.
#
# doc - parsed document responding to #xpath (set_HTML / set_XML pass the
#       object returned by Nokogiri).
#
# Returns @content (the return value of Hash#each_key).
def self.crawl_file(doc)
  @content.each_key do |key|
    # Query once per tag; a tag that is not present yields no nodes, so the
    # map naturally produces [] without a separate emptiness check.
    texts = doc.xpath("//#{key}").map { |node| sanitize(node.text) }
    @content.fetch(key) << texts
  end
end
.gather_files ⇒ Object
22 23 24 25 26 27 28 29 |
# File 'lib/wriggler.rb', line 22
# Walks @directory and all of its subdirectories and hands every HTML or XML
# file it finds to open_next_file.
#
# Returns nil (the return value of Find.find when given a block).
def self.gather_files
  Find.find(@directory) do |path|
    # Find.find yields directories as well as files; a directory whose name
    # happens to look like "something.html" must not be opened as a document.
    next unless File.file?(path)

    open_next_file(path) if is_XML?(path) || is_HTML?(path)
  end
end
.is_HTML?(file) ⇒ Boolean
42 43 44 45 |
# File 'lib/wriggler.rb', line 42
# Tells whether +file+ names an HTML file, judged by its extension.
#
# The extension is compared as a whole (case-insensitively) rather than
# searching for "html" anywhere in the path, so names such as
# "myhtmlnotes.txt" or "page.html.bak" are not mistaken for HTML.
#
# file - String path of the file.
#
# Returns true when the extension is ".html", false otherwise.
def self.is_HTML?(file)
  File.extname(file).casecmp(".html").zero?
end
.is_XML?(file) ⇒ Boolean
47 48 49 50 |
# File 'lib/wriggler.rb', line 47
# Tells whether +file+ names an XML file, judged by its extension.
#
# The extension is compared as a whole (case-insensitively) rather than
# searching for "xml" anywhere in the path, so names such as
# "libxml2/readme.txt" or "feed.xml.bak" are not mistaken for XML.
#
# file - String path of the file.
#
# Returns true when the extension is ".xml", false otherwise.
def self.is_XML?(file)
  File.extname(file).casecmp(".xml").zero?
end
.navigate_directory ⇒ Object
16 17 18 19 20 |
# File 'lib/wriggler.rb', line 16
# Makes @directory the current working directory, then runs gather_files to
# collect all nested files from there.
#
# Returns whatever gather_files returns.
# Raises the SystemCallError from Dir.chdir when the directory cannot be entered.
def self.navigate_directory
  # Resolve to an absolute path BEFORE changing directory: gather_files
  # searches @directory again, and a relative path would then be looked up
  # inside itself. An empty string expands to the current directory.
  @directory = File.expand_path(@directory)
  Dir.chdir(@directory)
  gather_files
end
.open_next_file(file) ⇒ Object
31 32 33 34 35 36 37 38 39 40 |
# File 'lib/wriggler.rb', line 31
# Opens +file+ and passes the open handle to set_HTML or set_XML depending on
# the file's type (HTML is checked first).
#
# file - String path of the file to open.
#
# Returns the result of set_HTML / set_XML, or nil when the file is neither.
# Raises the SystemCallError from File.open when the file cannot be opened.
def self.open_next_file(file)
  # Block form so the handle is closed as soon as parsing is done, even when
  # parsing raises; otherwise every crawled file would leak a descriptor.
  File.open(file) do |handle|
    if is_HTML?(file)
      set_HTML(handle)
    elsif is_XML?(file)
      set_XML(handle)
    end
  end
end
.sanitize(text) ⇒ Object
75 76 77 78 |
# File 'lib/wriggler.rb', line 75
# Cleans up a piece of extracted text: every double quote is replaced with a
# single quote, leading whitespace is stripped, and one trailing line
# terminator is removed. Trailing spaces are left as they are.
#
# text - String to clean.
#
# Returns a new String; +text+ itself is not modified.
def self.sanitize(text)
  # tr does the one-character substitution without involving a regex.
  text.tr('"', "'").lstrip.chomp
end
.set_HTML(file) ⇒ Object
52 53 54 55 56 |
# File 'lib/wriggler.rb', line 52
# Parses +file+ as HTML with Nokogiri and crawls the resulting document.
#
# file - the HTML input to parse (open_next_file passes an open File).
#
# Returns whatever crawl_file returns.
def self.set_HTML(file)
  crawl_file(Nokogiri::HTML(file))
end
.set_XML(file) ⇒ Object
58 59 60 61 62 |
# File 'lib/wriggler.rb', line 58
# Parses +file+ as XML with Nokogiri and crawls the resulting document.
#
# file - the XML input to parse (open_next_file passes an open File).
#
# Returns whatever crawl_file returns.
def self.set_XML(file)
  crawl_file(Nokogiri::XML(file))
end