Class: UnderOs::Parser::HTML
- Inherits: Object
  - Object
    - UnderOs::Parser::HTML
- Defined in:
- lib/under_os/parser/html.rb
Instance Method Summary
- #close_tag ⇒ Object
- #merge_data_attrs(hash) ⇒ Object
- #open_tag ⇒ Object
- #parse(html) ⇒ Object
- #parse_attrs_in(string) ⇒ Object
- #plain_text ⇒ Object
Instance Method Details
#close_tag ⇒ Object
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/under_os/parser/html.rb', line 44
#
# Tries to consume a closing tag at the start of `@chunk`.
#
# When the tag closes a node that is currently open, every node opened after
# it (i.e. left unclosed) is unwound: its children are handed over to its
# parent and its own `:children`/`:text` entries are removed. A closing tag
# that matches no open node is consumed and otherwise ignored, so it cannot
# tear down unrelated open nodes.
#
# Reads `@chunk` and `@stack`; updates `@stack` and `@node`.
#
# @return [Integer, nil] number of characters consumed, or nil when `@chunk`
#   does not start with a closing tag
def close_tag
  # Tag names may carry digits/hyphens after the first letter (`</h1>`).
  m = @chunk.match(/\A<\/([a-z][a-z0-9\-]*)\s*>/)
  return unless m

  if @stack.any? { |open| open[:tag] == m[1] }
    while (node = @stack.pop)
      break if node[:tag] == m[1]

      # `node` was never closed: move whatever it collected up one level.
      # A parent always exists here because the matching node is deeper
      # in the stack.
      parent = @stack.last
      parent[:children] = (parent[:children] || []) + (node[:children] || [])
      node.delete(:children)
      node.delete(:text)
    end
  end

  @node = @stack.last
  m[0].size
end
#merge_data_attrs(hash) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/under_os/parser/html.rb', line 83
#
# Folds every `data-*` entry of the given attributes hash into a nested
# `:data` hash. The `data-` prefix is stripped, the remainder is passed
# through `String#camelize` and symbolized to form the nested key.
#
# The hash is modified in place and also returned.
#
# @param hash [Hash] attribute name => value
# @return [Hash] the same hash object
def merge_data_attrs(hash)
  # Iterate over a snapshot of the keys since entries are deleted on the way.
  hash.keys.each do |key|
    name = key.to_s
    # Core `start_with?` instead of the `starts_with?` alias, which only
    # exists when an extension library defines it.
    next unless name.start_with?('data-')

    hash[:data] ||= {}
    value = hash.delete(key)
    # `\A` anchors at the start of the string (not of any line).
    # NOTE(review): `camelize` is not core Ruby; it is presumably provided by
    # the project's String extensions — confirm.
    data_key = name.sub(/\Adata-/, '').camelize
    hash[:data][data_key.to_sym] = value
  end

  hash
end
#open_tag ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
# File 'lib/under_os/parser/html.rb', line 26
#
# Tries to consume an opening tag at the start of `@chunk`.
#
# On a match a node hash with `:tag` and `:attrs` is created. It is appended
# to the `:children` of the innermost open node (whose `:text` is dropped),
# or to `@top` when no node is open, and is then pushed onto `@stack` and
# stored in `@node`.
#
# @return [Integer, nil] number of characters consumed, or nil when `@chunk`
#   does not start with an opening tag
def open_tag
  # Tag names start with a letter but may continue with digits or hyphens,
  # so `<h1>` yields the tag "h1" rather than "h" plus a stray "1".
  m = @chunk.match(/\A<([a-z][a-z0-9\-]*)([^>]*)>/)
  return unless m

  @node = {tag: m[1], attrs: parse_attrs_in(m[2])}

  if (parent = @stack.last)
    parent[:children] ||= []
    parent[:children] << @node
    parent.delete(:text) # it can have either text or children
  else
    @top << @node
  end

  @stack << @node
  m[0].size
end
#parse(html) ⇒ Object
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
# File 'lib/under_os/parser/html.rb', line 2
#
# Parses an HTML string into an array of top-level node hashes.
#
# Comments are removed and whitespace directly after `>` or before `<` is
# dropped before scanning. The input is then consumed chunk by chunk with
# `open_tag`, `close_tag` and `plain_text`, each of which returns the number
# of characters it consumed (or nil).
#
# Nodes that are still open when the input ends are treated as unclosed:
# their `:text` is removed and their children are moved up to the parent, or
# to the top-level list right after the node itself.
#
# @param html [String] the markup to parse
# @return [Array<Hash>] the top-level nodes
def parse(html)
  html = html.strip.gsub(/<\!--[\s\S]*?-->/, '').gsub(/>\s+/, '>').gsub(/\s+</, '<')

  [].tap do |top|
    @top   = top
    @stack = []
    @node  = nil

    i = 0
    while i < html.size
      @chunk = html.slice(i, html.size)
      # `|| 1`: when no matcher recognizes the chunk (e.g. a stray `<`),
      # skip one character instead of failing on `i += nil`.
      i += open_tag || close_tag || plain_text || 1
    end

    # closing all the missing tags, innermost first so that children
    # bubble up level by level instead of being discarded
    while (node = @stack.pop)
      orphans = node.delete(:children) || []
      node.delete(:text)

      if (parent = @stack.last)
        parent[:children] = (parent[:children] || []) + orphans
      else
        # Identity lookup: structurally equal nodes (two `<br>`) must not
        # be mistaken for one another.
        index = top.index { |candidate| candidate.equal?(node) }
        if index
          top.insert(index + 1, *orphans)
        else
          top.push(node, *orphans)
        end
      end
    end
  end
end
#parse_attrs_in(string) ⇒ Object
72 73 74 75 76 77 78 79 80 81 |
# File 'lib/under_os/parser/html.rb', line 72
#
# Extracts quoted `name="value"` / `name='value'` pairs from the attribute
# portion of a tag and returns them as a hash keyed by symbol, after passing
# the hash through `merge_data_attrs`.
#
# Value conversions:
# * a value equal to its own attribute name (`checked="checked"`) becomes true
# * the strings "true" / "false" become the corresponding booleans
# * everything else (including the empty string) stays a String
#
# @param string [String] the text between the tag name and the closing `>`
# @return [Hash] whatever `merge_data_attrs` returns for the collected pairs
def parse_attrs_in(string)
  attrs = {}

  # `*` after the first letter admits one-character names (`x="1"`), and the
  # lazy `.*?` admits empty values; with `.+?` an empty value swallowed its
  # own closing quote and ran on into the next attribute.
  string.scan(/([a-z][a-z_\-\d]*)=('|")(.*?)(\2)/).each do |name, _quote, raw|
    value = name == raw ? true : raw
    value = true  if value == 'true'
    value = false if value == 'false'
    attrs[name.to_sym] = value
  end

  merge_data_attrs(attrs)
end
#plain_text ⇒ Object
64 65 66 67 68 69 70 |
# File 'lib/under_os/parser/html.rb', line 64
#
# Consumes a run of plain text at the start of `@chunk`, up to (but not
# including) the next `<`, and stores it as `:text` on the innermost open
# node when there is one.
#
# The first character is always taken, even when it is a `<`: this method is
# presumably only tried after the tag matchers have declined the chunk (as in
# `i += open_tag || close_tag || plain_text`), in which case a leading `<` is
# literal text. Text is appended to any `:text` the node already carries so
# that a run split around such a `<` stays in one piece.
#
# @return [Integer, nil] number of characters consumed, or nil when `@chunk`
#   is empty
def plain_text
  # `/m` lets the leading `.` match a newline, like `[^<]` already does.
  m = @chunk.match(/\A.[^<]*/m)
  return unless m

  if (node = @stack.last)
    node[:text] = "#{node[:text]}#{m[0]}"
  end

  m[0].size
end