Class: Util::TextCleaner

Inherits:
Object show all
Defined in:
lib/utilities/text_cleaner.rb

Instance Method Summary collapse

Instance Method Details

#clean_controls(extracted_data) ⇒ Object

Cleans the control information in the passed-in extracted text.



11
12
13
14
15
16
17
18
19
20
# File 'lib/utilities/text_cleaner.rb', line 11

# Runs the extracted text through every control-cleaning stage in turn.
#
# Each stage is called with the result of the stage before it, in this order:
# isolate_controls_data, remove_section_header, remove_newline_in_controls,
# remove_special, remove_extra_space, remove_pagenum, separate_controls.
#
# @param extracted_data [Object] input handed to the first stage
# @return [Object] the value returned by the last stage (separate_controls)
def clean_controls(extracted_data)
  pipeline = %i[
    isolate_controls_data
    remove_section_header
    remove_newline_in_controls
    remove_special
    remove_extra_space
    remove_pagenum
    separate_controls
  ]

  # `send` is used so the stages can be invoked by name regardless of visibility.
  pipeline.reduce(extracted_data) { |text, stage| send(stage, text) }
end

#clean_data(data) ⇒ Object

Takes in the extracted text and returns the cleaned data by delegating to #clean_controls; this method itself does not write a file.



6
7
8
# File 'lib/utilities/text_cleaner.rb', line 6

# Cleaning entry point: passes +data+ to clean_controls and returns its result.
#
# @param data [Object] input forwarded unchanged to clean_controls
# @return [Object] whatever clean_controls returns
def clean_data(data)
  cleaned = clean_controls(data)
  cleaned
end

#isolate_controls_data(extracted_data) ⇒ Object

Removes everything before and after the controls



23
24
25
26
27
28
29
# File 'lib/utilities/text_cleaner.rb', line 23

# Extracts the portion of the text that holds the controls.
#
# The text is first normalised: a newline is forced after every "| P a g e"
# marker, each line is stripped, blank lines are dropped, and literal "???"
# sequences are removed. The result is then matched from a line starting with
# "1.1" up to (but not including) the text "Appendix:".
#
# @param extracted_data [String] raw extracted text
# @return [String] the matched controls section, or "" when nothing matches
def isolate_controls_data(extracted_data)
  controls_pattern = /^1\.1\s*[^\)]*?(?=\)$)(.*\n)*?(?=\s*Appendix:)/

  with_page_breaks = extracted_data.gsub(/\| P a g e+/, "| P a g e\n")
  kept_lines = with_page_breaks.split("\n").map(&:strip).reject(&:empty?)
  normalized = kept_lines.join("\n").gsub('???', '')

  # MatchData#to_s yields the whole match; nil.to_s yields "" when there is none.
  controls_pattern.match(normalized).to_s
end

#remove_extra_space(extracted_data) ⇒ Object



60
61
62
63
64
65
66
67
# File 'lib/utilities/text_cleaner.rb', line 60

# Collapses surplus whitespace in the extracted text.
#
# Steps, applied in order: triple newlines become one newline, a tab directly
# before a newline is dropped, all remaining tabs and carriage returns are
# deleted, every pair of whitespace characters becomes a single space, and
# finally double newlines not followed by a "digit.digits" line start are removed.
#
# @param extracted_data [String] text to tidy
# @return [String] text with the substitutions above applied
def remove_extra_space(extracted_data)
  extracted_data
    .gsub(/\n\n\n/, "\n")
    .gsub(/\t\n/, "\n")
    .delete("\t\r")
    .gsub(/\s\s/, ' ')
    .gsub(/(\n\n(?!^\d\.\d{1,}.*\n?.*?))/, '')
end

#remove_newline_in_controls(extracted_data) ⇒ Object

Removes newlines within a control.



45
46
47
48
# File 'lib/utilities/text_cleaner.rb', line 45

# Replaces a whitespace character followed by a newline with a single newline.
#
# The pattern also carries a lazy `.*?` guarded by a negative lookahead for the
# literal text "d."; when the newline is immediately followed by "d.", the "d"
# is consumed as part of the match and therefore removed as well.
#
# @param extracted_data [String] text to process
# @return [String] text with the substitution applied
def remove_newline_in_controls(extracted_data)
  trailing_space_newline = /\s\n.*?(?!d\.)/
  extracted_data.gsub(trailing_space_newline, "\n").to_s
end

#remove_pagenum(extracted_data) ⇒ Object

Removes all page numbers between the controls.



32
33
34
35
36
# File 'lib/utilities/text_cleaner.rb', line 32

# Strips page-number markers from the text.
#
# Two passes are made, in this order:
#   1. "<1-3 digits>|Page" and "<1-3 digits> | P a g e"
#   2. "<1-3 digits> | Page"
# The passes stay separate so the second one sees the output of the first.
#
# @param extracted_data [String] text containing page markers
# @return [String] text with the markers removed
def remove_pagenum(extracted_data)
  compact_or_spaced_marker = /(\d{1,3}\|Page|\d{1,3} \| P a g e)/
  padded_marker = /(\d{1,3} \| Page)/

  extracted_data.gsub(compact_or_spaced_marker, '').gsub(padded_marker, '').to_s
end

#remove_section_header(extracted_data) ⇒ Object

Removes section headers for each control



39
40
41
42
# File 'lib/utilities/text_cleaner.rb', line 39

# Replaces section-header blocks with a blank line.
#
# A header block starts at a whitespace character (not preceded by "•") that is
# followed by a newline, a single digit and another whitespace character; it
# runs lazily over the following lines until a "digit.digit" sequence is next.
# The whole block is replaced by two newlines.
#
# @param extracted_data [String] text containing section headers
# @return [String] text with each header block replaced by "\n\n"
def remove_section_header(extracted_data)
  header_block = /(?<!•)\s\n\d{1}\s.*(?:.*\n)*?(?=\d\.\d)/
  extracted_data.gsub(header_block, "\n\n").to_s
end

#remove_special(extracted_data) ⇒ Object



55
56
57
58
# File 'lib/utilities/text_cleaner.rb', line 55

def remove_special(extracted_data)
  extracted_data = extracted_data.gsub(/[]/, '')
  extracted_data.gsub(/[•]/, '')
end

#separate_controls(extracted_data) ⇒ Object

Adds a newline before each control to separate different controls.



51
52
53
# File 'lib/utilities/text_cleaner.rb', line 51

# Inserts a newline in front of every control heading.
#
# The pattern is a zero-width lookahead, so nothing is removed: a "\n" is
# inserted at each line start that (after optional whitespace) begins with
# "digit.digits" and whose matched text ends with ")" at the end of a line.
#
# @param extracted_data [String] cleaned text holding the controls
# @return [String] text with a newline inserted before each heading
def separate_controls(extracted_data)
  control_heading_start = /((?=^\s*?\d\.\d{1,}.*\n?.*?(?<=\)$)))/
  extracted_data.gsub(control_heading_start, "\n").to_s
end