Module: Dap::Filter::HTMLGhetto

Included in: FilterHTMLIframes, FilterHTMLLinks

Defined in: lib/dap/filter/http.rb

Overview

Quick-and-dirty HTML element extractor, used to work around memory issues with Nokogiri.

Instance Method Summary collapse

#extract_elements(data) ⇒ Object

Instance Method Details

#extract_elements(data) ⇒ `Object`

# File 'lib/dap/filter/http.rb', line 12

# Extracts a flat list of opening / self-closing HTML tags from +data+ using
# regular expressions instead of a real HTML parser.
#
# Closing tags and tags whose name does not start with an ASCII letter
# (comments, doctype, processing instructions) are skipped.
#
# @param data [#to_s] raw HTML; it is converted to UTF-8 with invalid and
#   undefined byte sequences dropped before scanning
# @return [Array<Hash>] one hash per tag, in document order. The tag name is
#   stored under the Symbol key +:name+; every attribute is stored under its
#   downcased String name with an entity-decoded String value (empty string
#   for attributes that have no value)
def extract_elements(data)
  @coder ||= HTMLEntities.new

  # Turns raw "name=value" tokens into a Hash of decoded attributes.
  build_attrs = lambda do |tokens|
    tokens.each_with_object({}) do |token, attrs|
      aname, avalue = token.split('=', 2).map { |x| x.to_s.strip }

      # A token without a name (e.g. a bare '' in the tag) is not an attribute
      next if aname.nil? || aname.empty?

      # Drop one leading and one trailing double quote left around the value
      attrs[aname.downcase] = @coder.decode(avalue.to_s.gsub(/\A"|"\z/, ''))
    end
  end

  html = data.to_s.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
  res = []

  html.scan(/<([^>]+)>/m).each do |captures|
    tag = captures.first

    # Skip closing tags
    next if tag.start_with?('/')

    # Get the name vs attributes
    name, astr = tag.split(/\s+/, 2)

    # "<br/>" has no whitespace before the slash, so it ends up in the name
    name = name.to_s.sub(%r{/\z}, '')

    # Skip non-alpha elements
    next unless name =~ /\A[a-zA-Z]/

    # Convert newlines to spaces & strip trailing />
    astr = astr.to_s.tr("\n", ' ').sub(%r{/\z}, '')

    attrs =
      begin
        build_attrs.call(Shellwords.shellwords(astr))
      rescue StandardError
        # Shellwords raises ArgumentError on unmatched quotes; split on
        # whitespace instead. Interrupt and other non-StandardError
        # exceptions are deliberately not caught here.
        build_attrs.call(astr.split(/\s+/))
      end

    res << { name: name }.merge(attrs)
  end

  res
end

Module: Dap::Filter::HTMLGhetto

Overview

Instance Method Summary collapse

Instance Method Details

#extract_elements(data) ⇒ Object

#extract_elements(data) ⇒ `Object`