Module: Preprocess

Included in:
LicenseMatcher::RuleMatcher, LicenseMatcher::TFRubyMatcher
Defined in:
lib/license_matcher/preprocess.rb

Instance Method Summary collapse

Instance Method Details

#clean_html(html_doc) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/license_matcher/preprocess.rb', line 36

# Extracts the text of selected content elements from a parsed HTML document.
#
# The text of every element matched by the XPath query below is joined with
# single spaces, CDATA sections are replaced by a space and the result is
# stripped. When nothing is left after this filtering, the text of the whole
# <body> element is returned instead.
#
# @param html_doc [#xpath] parsed HTML document (parse_html passes the
#   result of Nokogiri.HTML)
# @return [String] extracted text without leading/trailing whitespace
def clean_html(html_doc)
  body_elements = html_doc.xpath(
    '//p | //h1 | //h2 | //h3 | //h4 | //h5 | //h6 | //em | //strong | //b | //td | //pre
    | //li[not(@id) and not(@class) and not(a)] | //section//section[@class="project-info"]
    | //blockquote | //textarea'
  ).to_a

  # extract text from each matched tag and separate the pieces by a space
  body_text = body_elements.map { |el| el.text.to_s }.join(' ')

  # remove XML CDATA sections such as the ones found on opensource.org pages;
  # strip afterwards so that a CDATA-only document counts as empty and still
  # triggers the whole-body fallback below
  body_text = body_text.gsub(/\<\!\[CDATA.+?\]\]\>/i, ' ').strip

  if body_text.empty?
    p "match_html: document didnt pass noise filter, will use whole body content"
    body_text = html_doc.xpath('//body').text.to_s.strip
  end

  body_text
end

#parse_html(html_text) ⇒ Object



59
60
61
62
63
64
65
66
# File 'lib/license_matcher/preprocess.rb', line 59

# Parses raw HTML text with Nokogiri after passing it through safe_encode.
#
# Only StandardError is rescued, so signals, SystemExit and other
# non-standard exceptions still propagate to the caller.
#
# @param html_text [#to_s] raw HTML markup
# @return [Object, nil] the value returned by Nokogiri.HTML, or nil when
#   parsing raised an error (the failure is reported through `log.error`)
def parse_html(html_text)
  Nokogiri.HTML(safe_encode(html_text))
rescue StandardError => e
  log.error "failed to parse html doc (#{e.class}: #{e.message}): \n #{html_text}"
  nil
end

#preprocess_html(html_text) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/license_matcher/preprocess.rb', line 21

# Turns raw HTML into plain text: the markup is parsed with parse_html and
# the resulting document is reduced to text by clean_html.
#
# @param html_text [Object] raw HTML markup, handed to parse_html
# @return [String] the text returned by clean_html, or an empty string when
#   parse_html returned nil/false
def preprocess_html(html_text)
  parsed = parse_html(html_text)
  return clean_html(parsed) if parsed

  # parsing failed: report it and hand back an empty text
  p "match_html: failed to parse html document\n#{html_text}"
  ""
end

#preprocess_text(text) ⇒ Object



4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# File 'lib/license_matcher/preprocess.rb', line 4

# Normalizes plain text: the input goes through safe_encode, then markdown
# links, the word "the" (any case), CDATA sections and XML comments are
# removed, and finally all whitespace runs are collapsed to a single space.
#
# @param text [Object] raw text, handed to safe_encode
# @return [String] cleaned text without leading/trailing whitespace
def preprocess_text(text)
  cleaned = safe_encode(text)
    .gsub(/\[.+?\]\(.+?\)/, ' ')        # markdown url tags: [label](target)
    .gsub(/\bTHE\b/i, '')               # spam word
    .gsub(/\<\!\[CDATA.*?\]\]\>/, ' ')  # well-formed CDATA sections
    .gsub(/\<\!--.+?--\>/, ' ')         # XML comments
    .gsub(/<\!\[CDATA.+?\]>/, ' ')      # CDATA closed by a single bracket

  cleaned.strip.gsub(/\s+/, ' ')
end

#safe_encode(txt) ⇒ Object



68
69
70
71
72
73
# File 'lib/license_matcher/preprocess.rb', line 68

# Converts any value to a UTF-8 string, treating its bytes as binary and
# dropping every byte that has no UTF-8 counterpart in that conversion
# (the replacement string is empty).
#
# @param txt [#to_s] value to convert
# @return [String] UTF-8 encoded text, or an empty string when the
#   conversion raised an error
def safe_encode(txt)
  txt.to_s.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
rescue StandardError => e
  # txt is deliberately not interpolated here: turning it into a string may
  # be exactly what failed, and doing it again would raise out of the handler
  p "Failed to encode text: #{e.class}: #{e.message}"
  ''
end