Module: CxExtractor::Utils

Included in:
CxExtractor
Defined in:
lib/cx_extractor/utils.rb

Overview

Utility methods for cx_extractor.

Instance Method Summary

Instance Method Details

#find_dive(block_distribution, surge_point) ⇒ Object



29
30
31
32
33
34
35
36
37
# File 'lib/cx_extractor/utils.rb', line 29

# Scans forward from just after +surge_point+ for the first position where
# two consecutive entries of +block_distribution+ are both zero.
#
# The scan stops before the last two indices of the array.
#
# @param block_distribution [Array<Integer>] per-block values to scan
# @param surge_point [Integer] index after which the scan begins
# @return [Integer] the index just before the first zero pair, or the last
#   index of +block_distribution+ when no such pair is found
def find_dive(block_distribution, surge_point)
  last_index = block_distribution.size - 1
  zero_pair_at = ((surge_point + 1)...(last_index - 1)).find do |pos|
    block_distribution[pos].zero? && block_distribution[pos + 1].zero?
  end
  zero_pair_at ? zero_pair_at - 1 : last_index
end

#find_surge(block_distribution, start, threshold) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
# File 'lib/cx_extractor/utils.rb', line 17

# Scans forward from just after +start+ for the first entry of
# +block_distribution+ that exceeds +threshold+ and is followed, within the
# next three positions, by at least one entry greater than zero.
#
# The scan stops three positions before the end so the look-ahead always
# stays inside the array.
#
# @param block_distribution [Array<Integer>] per-block values to scan
# @param start [Integer] index after which the scan begins
# @param threshold [Integer] value an entry must exceed to qualify
# @return [Integer] index of the first qualifying entry, or -1 if none
def find_surge(block_distribution, start, threshold)
  scan_end = block_distribution.length - 3
  surge_at = ((start + 1)...scan_end).find do |pos|
    block_distribution[pos] > threshold &&
      (1..3).any? { |step| block_distribution[pos + step] > 0 }
  end
  surge_at || -1
end

#get_clean_text(dom) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/cx_extractor/utils.rb', line 39

# Strips markup from an HTML string and returns the remaining text.
#
# The work is done on a copy, so +dom+ itself is left untouched. Comments,
# <script> blocks and <style> blocks are each replaced by a newline; matching
# anchors and all remaining tags are removed outright. The result is then
# passed through +replace_special_char+.
#
# @param dom [String] HTML source; may be frozen
# @return [String] the cleaned text
def get_clean_text(dom)
  # +dup+ returns an unfrozen copy, whereas +clone+ keeps the frozen state
  # and would make every +gsub!+ below raise FrozenError for frozen input.
  html = dom.dup
  # remove html comment
  html.gsub!(/<!--.*?(.|\n)*?-->/, "\n")
  # remove javascript
  html.gsub!(%r{<script.*?>.*?(.|\n)*?</script>}, "\n")
  # remove a (the whole element, link text included)
  # NOTE(review): the character class after "<a" holds tab, "|", newline,
  # CR and form feed but no space, so "<a href=...>" is not matched here and
  # only loses its tags in the generic step below - confirm this is intended.
  html.gsub!(%r{<a[\t|\n|\r|\f].*?>.*?</a>}, '')
  # remove css
  html.gsub!(%r{<style.*?>.*?(.|\n)*?</style>}, "\n")
  # remove tag
  html.gsub!(/<.*?(.|\n)*?>/, '')
  replace_special_char(html)
end

#line_block_distribute(lines) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
# File 'lib/cx_extractor/utils.rb', line 4

# Computes a sliding-window sum over the lengths of +lines+.
#
# Entry +i+ of the result is the combined length of +balck_width+
# consecutive lines starting at line +i+. +balck_width+ is resolved outside
# this method and is not defined in this block.
#
# @param lines [Array<String>] the lines to measure
# @return [Array<Integer>] one sum per window position; empty when there are
#   fewer lines than the window width
def line_block_distribute(lines)
  width = balck_width
  lengths = lines.map(&:length)
  window_count = lengths.size - width + 1
  (0...window_count).map do |first|
    (0...width).sum { |offset| lengths[first + offset] }
  end
end

#replace_special_char(str) ⇒ Object



54
55
56
57
58
59
60
61
# File 'lib/cx_extractor/utils.rb', line 54

# Replaces a handful of HTML entities and normalises line endings.
#
# The string is modified in place and the same object is returned, so the
# argument must not be frozen.
#
# @param str [String] text to clean up (mutated)
# @return [String] +str+ itself, after the replacements
def replace_special_char(str)
  # NOTE(review): this replacement text looks like a mis-encoded middle dot;
  # confirm against the repository before changing the literal.
  str.gsub!('&#8226;', 'ยท')
  str.gsub!('&amp;', '&')
  str.gsub!('&nbsp;', ' ')
  str.gsub!('&copy;', '@')
  # A String pattern is matched literally, so the alternation has to be a
  # Regexp for CRLF and lone CR to be turned into LF.
  str.gsub!(/\r\n?/, "\n")
  str
end