Module: Awestruct::ContextHelper

Defined in:
lib/awestruct/context_helper.rb

Instance Method Summary

Instance Method Details

#clean_html(str) ⇒ Object



11
12
13
# File 'lib/awestruct/context_helper.rb', line 11

# Returns a copy of +str+ in which every match of a one-character pattern is
# replaced by a plain space.
#
# NOTE(review): the pattern renders as a single whitespace character; it may
# be a literal non-breaking space (U+00A0) rather than an ordinary space.
# Confirm the actual byte in the file before changing this line.
#
# @param str [String] the HTML (or text) to clean
# @return [String] a new string; +str+ itself is not modified
def clean_html(str)
  str.gsub( / /, ' ' )
end

#close_tags(s) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/awestruct/context_helper.rb', line 19

# Lower-case names of HTML void elements. They never take a closing tag, so
# close_tags must not treat them as open elements.
CLOSE_TAGS_VOID_ELEMENTS = %w[
  area base br col embed hr img input link meta param source track wbr
].freeze

# Appends the closing tags needed to balance every element left open in +s+.
#
# Tags are discovered with a simple regex scan (this is not a real HTML
# parser). Comments, doctype/processing instructions, self-closing tags
# (<x/>) and void elements (<br>, <img>, ...) are ignored because they do not
# open an element that needs closing.
#
# @param s [String] an HTML fragment, possibly truncated
# @return [String] a new string: +s+ followed by any missing closing tags,
#   innermost element first
# @raise [RuntimeError] when a closing tag does not match the innermost open
#   element
def close_tags(s)
  open_tags = [] # innermost open element is last

  s.scan(/<\/?[^>]+>/).each do |tag|
    if tag[1] == '/'
      name = tag[2..-1][/\w+/]
      unless open_tags.last == name
        # Report the element that was actually expected, not a slice of the
        # closing tag's own name.
        raise "Malformed HTML expected #{open_tags.last} but got #{name} '#{s}'"
      end
      open_tags.pop
    else
      # <!-- ... -->, <!DOCTYPE ...> and <? ... ?> are not elements.
      next if tag[1] == '!' || tag[1] == '?'
      # <x ... /> closes itself.
      next if tag.end_with?('/>')

      name = tag[1..-1][/\w+/]
      next if name.nil? || CLOSE_TAGS_VOID_ELEMENTS.include?(name.downcase)

      open_tags.push(name)
    end
  end

  s + open_tags.reverse.map { |name| "</#{name}>" }.join
end

#fix_url(base_url, url) ⇒ Object



70
71
72
73
# File 'lib/awestruct/context_helper.rb', line 70

# Prefixes a site-root-relative URL with +base_url+.
#
# Only URLs that begin with a single "/" are rewritten. Absolute URLs,
# document-relative paths, and protocol-relative URLs ("//host/path") are
# returned untouched; prefixing the latter would produce a broken address.
# A nil +url+ is returned as nil.
#
# @param base_url [String] the site base, e.g. "http://example.com"
# @param url [String, nil] the URL found in the document
# @return [String, nil] the qualified URL, or +url+ unchanged
def fix_url(base_url, url)
  # \A anchors at the start of the string (not of any line); the negative
  # lookahead excludes protocol-relative "//..." references.
  return url unless url =~ %r{\A/(?!/)}

  "#{base_url}#{url}"
end

#fully_qualify_urls(base_url, text) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/awestruct/context_helper.rb', line 41

# Rewrites link targets in an HTML string by passing them through fix_url.
#
# The text is parsed with Oga; for every HTML element named "a" or "link" the
# "href" attribute, and for every "img" the "src" attribute, is replaced by
# fix_url(base_url, value) when the attribute is present. The document is
# then serialized with to_xml, and the result is re-tagged with the input's
# encoding if the two differ.
#
# If anything raises a StandardError, the error is logged, hints are written
# to $LOG, and the original +text+ is returned unchanged.
#
# @param base_url [String] prefix handed to fix_url
# @param text [String] the HTML to process
# @return [String] the rewritten markup, or +text+ on failure
def fully_qualify_urls(base_url, text)
  # Element name => attribute holding the URL to rewrite.
  url_attributes = { 'a' => 'href', 'link' => 'href', 'img' => 'src' }

  doc = Oga.parse_html text

  doc.each_node do |node|
    next unless node.is_a?(Oga::XML::Element) && node.html?

    attribute = url_attributes[node.name]
    next unless attribute

    current = node.get(attribute)
    node.set(attribute, fix_url(base_url, current)) if current
  end

  serialized = doc.to_xml
  serialized.force_encoding(text.encoding) unless serialized.encoding == text.encoding
  serialized
rescue => e
  Awestruct::ExceptionHelper.log_error e
  if $LOG.info?
    void_tags = Oga::XML::HTML_VOID_ELEMENTS.to_a.map(&:downcase).uniq.join(', ')
    $LOG.info %Q(If the error has to do with 'end of input' ensure none of the following tags have a closing tag:
#{void_tags})
  end
  $LOG.warn "Text being parsed:\n#{text}" if $LOG.warn?
  text # hand back the unparsed input so the offending markup can be inspected
end

#html_to_text(str) ⇒ Object



7
8
9
# File 'lib/awestruct/context_helper.rb', line 7

# Produces a plain-text rendering of an HTML string: everything that looks
# like a tag (<...>) is removed, then each "&nbsp;" entity becomes a space.
#
# @param str [String] the HTML to flatten
# @return [String] a new string without tags
def html_to_text(str)
  tagless = str.gsub(/<[^>]+>/, '')
  tagless.gsub(/&nbsp;/) { ' ' }
end

#summarize(text, numwords = 20, ellipsis = '...') ⇒ Object



37
38
39
# File 'lib/awestruct/context_helper.rb', line 37

# Builds a short excerpt of +text+: the first +numwords+ space-separated
# words, followed by +ellipsis+, with any elements left open closed again by
# close_tags.
#
# Because words are split on spaces, the cut can land inside a start tag
# that has attributes (e.g. after "<a" in "<a href=...>"). In that case the
# excerpt is extended word by word until the tag is complete, so that
# close_tags sees a whole tag instead of a dangling fragment.
#
# @param text [String] the (HTML) text to summarize
# @param numwords [Integer] how many space-separated words to keep
# @param ellipsis [String] marker appended after the kept words
# @return [String] the excerpt with balanced tags
def summarize(text, numwords=20, ellipsis='...')
  words = text.split(/ /)
  excerpt = words[0, numwords].join(' ')
  remaining = words.drop(numwords)

  # A "<" followed by a tag name with no ">" before the end of the excerpt
  # means the cut fell inside a tag; pull in words until it is finished.
  while !remaining.empty? && excerpt =~ /<\/?[a-zA-Z][^<>]*\z/
    excerpt = "#{excerpt} #{remaining.shift}"
  end

  close_tags(excerpt + ellipsis)
end

#without_images(str) ⇒ Object



15
16
17
# File 'lib/awestruct/context_helper.rb', line 15

# Strips <img ...> tags from +str+ and then unwraps anchors: each
# <a ...>text</a> whose content contains no "<" is replaced by its text.
#
# @param str [String] the HTML to process
# @return [String] a new string without image tags and simple links
def without_images(str)
  imageless = str.gsub(/<img[^>]+>/) { '' }
  imageless.gsub(/<a[^>]+>([^<]*)<\/a>/) { Regexp.last_match(1) }
end