Module: CxExtractor::Utils
- Included in:
- CxExtractor
- Defined in:
- lib/cx_extractor/utils.rb
Overview
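Utility methods for CxExtractor: HTML-to-text cleanup (#get_clean_text, #replace_special_char) and line-block distribution analysis (#line_block_distribute, #find_surge, #find_dive).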
Instance Method Summary
- #find_dive(block_distribution, surge_point) ⇒ Object
- #find_surge(block_distribution, start, threshold) ⇒ Object
- #get_clean_text(dom) ⇒ Object
- #line_block_distribute(lines) ⇒ Object
- #replace_special_char(str) ⇒ Object
Instance Method Details
#find_dive(block_distribution, surge_point) ⇒ Object
29 30 31 32 33 34 35 36 37 |
# File 'lib/cx_extractor/utils.rb', line 29
# Locates the end of a text block that starts at +surge_point+.
#
# Scans the indices after +surge_point+ for the first position where the
# block value and the one right after it are both zero, and reports the
# index just before that position.
#
# @param block_distribution [Array<Integer>] per-block character counts
# @param surge_point [Integer] index the scan starts after
# @return [Integer] index preceding the first pair of consecutive zero
#   blocks, or the last index of +block_distribution+ when no pair is found
def find_dive(block_distribution, surge_point)
  # The scan stops before the last two entries, matching the original bounds.
  scan_range = (surge_point + 1)...(block_distribution.size - 2)

  dive = scan_range.find do |index|
    block_distribution[index].zero? && block_distribution[index + 1].zero?
  end

  dive ? dive - 1 : block_distribution.size - 1
end
#find_surge(block_distribution, start, threshold) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/cx_extractor/utils.rb', line 17
# Locates the start of the next text block after +start+.
#
# A surge is the first index whose block value exceeds +threshold+ and
# that is followed, within the next three blocks, by at least one
# positive value.
#
# @param block_distribution [Array<Integer>] per-block character counts
# @param start [Integer] index the scan starts after
# @param threshold [Integer] value a block must exceed to count as a surge
# @return [Integer] index of the surge, or -1 when none is found
def find_surge(block_distribution, start, threshold)
  # The last three entries are never candidates: each candidate needs
  # three successors to look at.
  candidates = (start + 1)...(block_distribution.length - 3)

  surge = candidates.find do |index|
    block_distribution[index] > threshold &&
      block_distribution[(index + 1)..(index + 3)].any?(&:positive?)
  end

  surge || -1
end
#get_clean_text(dom) ⇒ Object
39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/cx_extractor/utils.rb', line 39
# Strips markup from an HTML string and returns the remaining text.
#
# Comments, <script> and <style> elements are replaced with a newline;
# anchor elements (including their link text) and all remaining tags are
# removed. The result is then passed through #replace_special_char.
# The caller's string is left untouched.
#
# @param dom [String] raw HTML
# @return [String] text with markup removed
def get_clean_text(dom)
  # dup rather than clone: clone keeps the frozen flag, which would make
  # the in-place gsub! calls below raise FrozenError on frozen input.
  html = dom.dup

  # /m lets `.` span newlines, replacing the slow `(.|\n)*?` construct.
  # remove html comment
  html.gsub!(/<!--.*?-->/m, "\n")
  # remove javascript
  html.gsub!(%r{<script.*?>.*?</script>}mi, "\n")
  # remove a -- `\s` requires whitespace after `<a`, so tags such as
  # <abbr> are not treated as anchors while `<a href=...>` is.
  html.gsub!(%r{<a\s.*?>.*?</a>}mi, '')
  # remove css
  html.gsub!(%r{<style.*?>.*?</style>}mi, "\n")
  # remove tag
  html.gsub!(/<.*?>/m, '')

  replace_special_char(html)
end
#line_block_distribute(lines) ⇒ Object
4 5 6 7 8 9 10 11 12 13 14 15 |
# File 'lib/cx_extractor/utils.rb', line 4
# Builds the block distribution for a list of lines: entry +i+ is the
# total length of the +balck_width+ consecutive lines starting at line +i+.
#
# NOTE(review): +balck_width+ is defined outside this method (presumably a
# misspelling of "block_width"); the name is kept so the lookup still works.
#
# @param lines [Array<#length>] lines of text
# @return [Array<Integer>] one summed length per full window; empty when
#   there are fewer lines than the window width
def line_block_distribute(lines)
  width = balck_width # read once instead of on every loop iteration
  line_lengths = lines.map(&:length)

  window_count = line_lengths.length - width + 1
  return [] unless window_count.positive?

  Array.new(window_count) { |first| line_lengths[first, width].sum }
end
#replace_special_char(str) ⇒ Object
54 55 56 57 58 59 60 61 |
# File 'lib/cx_extractor/utils.rb', line 54
# Replaces a few HTML entities / special characters and normalises line
# endings. The string is modified in place and also returned.
#
# Replacements (entity or the already-decoded character):
#   bullet            -> middle dot (U+00B7)
#   non-breaking space -> plain space
#   copyright sign    -> "@"
#   CRLF / CR         -> LF
#   &amp;             -> "&"
#
# @param str [String] text to clean (mutated)
# @return [String] the same +str+ object
def replace_special_char(str)
  # Non-ASCII characters are written as \u escapes so the source stays
  # immune to file-encoding mix-ups.
  str.gsub!(/&bull;|&#8226;|\u2022/, "\u00B7")
  str.gsub!(/&nbsp;|\u00A0/, ' ')
  str.gsub!(/&copy;|\u00A9/, '@')
  # Must be a Regexp: a String pattern is matched literally, so
  # "\r\n|\r" would never match real line endings.
  str.gsub!(/\r\n|\r/, "\n")
  # Decode &amp; last so "&amp;nbsp;" becomes "&nbsp;", not a space.
  str.gsub!('&amp;', '&')
  str
end