Module: Wriggler

Defined in:
lib/wriggler.rb,
lib/wriggler/version.rb

Constant Summary collapse

VERSION =
"1.2.0"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#content ⇒ Object (readonly)

Returns the value of attribute content.



6
7
8
# File 'lib/wriggler.rb', line 6

def content
  # Reader for the @content instance variable; returns whatever is currently
  # stored there (nil if it has not been assigned on the receiver).
  @content
end

#directory ⇒ Object (readonly)

Returns the value of attribute directory.



6
7
8
# File 'lib/wriggler.rb', line 6

def directory
  # Reader for the @directory instance variable; returns whatever is currently
  # stored there (nil if it has not been assigned on the receiver).
  @directory
end

Class Method Details

.crawl(tags = [], directory = "") ⇒ Object



8
9
10
11
12
13
14
# File 'lib/wriggler.rb', line 8

def self.crawl(tags=[], directory="")
  # Entry point: remember where to crawl, prepare one empty result list per
  # requested tag, walk the directory, and hand back the collected content.
  @directory = directory
  @content = tags.each_with_object({}) do |tag, buckets|
    buckets[tag] = []   # each tag gets its own array, never a shared one
  end

  navigate_directory
  @content
end

.crawl_file(doc) ⇒ Object



64
65
66
67
68
69
70
71
72
73
# File 'lib/wriggler.rb', line 64

def self.crawl_file(doc)
  # Crawl one parsed document (an object responding to #xpath, e.g. a Nokogiri
  # document) and record its text for every tag being tracked in @content.
  #
  # For each tag, exactly one array is appended to that tag's bucket: the
  # sanitized text of every matching node, or an empty array when the tag does
  # not occur in this document. The XPath query runs once per tag.
  #
  # Returns the @content hash.
  @content.each do |key, bucket|
    bucket << doc.xpath("//#{key}").map { |tag| sanitize(tag.text) }
  end
end

.gather_files ⇒ Object



22
23
24
25
26
27
28
29
# File 'lib/wriggler.rb', line 22

def self.gather_files
  # Walks @directory recursively and passes every HTML or XML file found to
  # open_next_file.
  #
  # Find.find yields directories as well as files, so anything that is not a
  # regular file is skipped; otherwise a directory whose name passes the
  # extension check would be opened as if it were a document.
  Find.find(@directory) do |path|
    next unless File.file?(path)

    open_next_file(path) if is_XML?(path) || is_HTML?(path)
  end
end

.is_HTML?(file) ⇒ Boolean

Returns:

  • (Boolean)


42
43
44
45
# File 'lib/wriggler.rb', line 42

def self.is_HTML?(file)
  # True when the path's extension is ".html" (case-insensitive), else false.
  #
  # Compares the real file extension rather than searching the whole path, so
  # names such as "html_dump/readme.txt" or "page.html.bak" are not mistaken
  # for HTML files.
  File.extname(file).downcase == ".html"
end

.is_XML?(file) ⇒ Boolean

Returns:

  • (Boolean)


47
48
49
50
# File 'lib/wriggler.rb', line 47

def self.is_XML?(file)
  # True when the path's extension is ".xml" (case-insensitive), else false.
  #
  # Compares the real file extension rather than searching the whole path, so
  # names such as "my_xml_notes.txt" or "feed.xml.gz" are not mistaken for
  # XML files.
  File.extname(file).downcase == ".xml"
end


.navigate_directory ⇒ Object



16
17
18
19
20
# File 'lib/wriggler.rb', line 16

def self.navigate_directory
  # Makes @directory the current working directory, then gathers all nested
  # files from there.
  #
  # The path is expanded to an absolute one *before* changing directory.
  # Otherwise a relative @directory (e.g. "docs") would be resolved a second
  # time from inside itself once the working directory has moved ("docs/docs").
  # An empty string expands to the current working directory.
  #
  # Side effect: the process working directory stays changed after this call.
  @directory = File.expand_path(@directory)
  Dir.chdir(@directory)
  gather_files
end

.open_next_file(file) ⇒ Object



31
32
33
34
35
36
37
38
39
40
# File 'lib/wriggler.rb', line 31

def self.open_next_file(file)
  # Opens the given path and hands the open handle to the HTML or XML loader,
  # depending on what the extension checks report.
  #
  # The block form of File.open guarantees the handle is closed once the loader
  # returns (or raises), so crawling many files does not leak descriptors.
  #
  # Returns the loader's result, or nil when the file is neither HTML nor XML.
  File.open(file) do |handle|
    if is_HTML?(file)
      set_HTML(handle)
    elsif is_XML?(file)
      set_XML(handle)
    end
  end
end

.sanitize(text) ⇒ Object



75
76
77
78
# File 'lib/wriggler.rb', line 75

def self.sanitize(text)
  # Swap every double quote for a single quote, then trim leading whitespace
  # and one trailing line terminator (other trailing whitespace is kept).
  requoted = text.gsub(/"/, "'")
  left_trimmed = requoted.lstrip
  left_trimmed.chomp
end

.set_HTML(file) ⇒ Object



52
53
54
55
56
# File 'lib/wriggler.rb', line 52

def self.set_HTML(file)
  # Parse the given input with Nokogiri's HTML parser and crawl the result.
  crawl_file(Nokogiri::HTML(file))
end

.set_XML(file) ⇒ Object



58
59
60
61
62
# File 'lib/wriggler.rb', line 58

def self.set_XML(file)
  # Parse the given input with Nokogiri's XML parser and crawl the result.
  crawl_file(Nokogiri::XML(file))
end