Class: Rumba::Crawler::Parser
- Inherits: Object
- Inheritance chain: Object → Rumba::Crawler::Parser
- Includes:
- Models
- Defined in:
- lib/rumba/crawler/parser.rb
Constant Summary collapse
- SK = ['css', 'root', 'regexp'] — service keys
Instance Method Summary collapse
- #create_object(name, node, template) ⇒ Object
- #get_content(node, template) ⇒ Object
- #get_node(doc, template) ⇒ Object
- #leaf_node?(template) ⇒ Boolean
- #parse_multi(doc, template) ⇒ Object
- #parse_node(doc, template, name) ⇒ Object
- #process(response, template) ⇒ Object
Instance Method Details
#create_object(name, node, template) ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/rumba/crawler/parser.rb', line 37
#
# Builds the object named +name+ and fills its attributes from +template+.
#
# The object is obtained by calling the method called +name+ on self (via
# +send+, with no arguments). Every template entry whose key is not listed in
# SK is then assigned through the matching "<key>=" writer: an Array value is
# resolved with parse_multi using its first element, any other value with
# parse_node.
#
# NOTE(review): +name+ and the attribute keys come from the template, so the
# template decides which methods get invoked through +send+ — presumably
# templates are trusted input; confirm with callers.
#
# @param name method name passed to +send+ to obtain the new object
# @param node node handed on to parse_multi / parse_node
# @param template [Hash] attribute names mapped to their sub-templates
# @return [Object] the populated object
def create_object(name, node, template)
  send(name).tap do |instance|
    template.each do |attribute, spec|
      # Service keys configure the lookup itself; they are not attributes.
      next if SK.include?(attribute)

      parsed = spec.is_a?(Array) ? parse_multi(node, spec.first) : parse_node(node, spec, attribute)
      instance.send("#{attribute}=", parsed)
    end
  end
end
#get_content(node, template) ⇒ Object
49 50 51 52 53 54 55 |
# File 'lib/rumba/crawler/parser.rb', line 49
#
# Extracts the text of +node+, optionally narrowed by the template's regexp.
#
# When +template+ is not a String and has a 'regexp' entry, that entry is
# interpolated into a case-insensitive pattern and the text of its first match
# in the node's content is returned ("" when nothing matches). Otherwise the
# node's whole content is returned.
#
# @param node [#content, nil] node to read; nil when the selector matched nothing
# @param template [Hash, String] leaf template, or a bare selector String
# @return [String, nil] extracted text, or nil when +node+ is nil
def get_content(node, template)
  # parse_node passes `get_node(...).first`, which is nil for an empty result;
  # report "no content" instead of raising NoMethodError on nil.
  return nil if node.nil?

  # A String template is a bare selector with no options. Indexing it with
  # ['regexp'] would be a substring search, so a selector such as 'div.regexp'
  # would wrongly be used as the pattern.
  pattern = template.is_a?(String) ? nil : template['regexp']
  return node.content unless pattern

  /#{pattern}/i.match(node.content).to_s
end
#get_node(doc, template) ⇒ Object
57 58 59 60 61 62 63 64 65 |
# File 'lib/rumba/crawler/parser.rb', line 57
#
# Runs the CSS lookup described by +template+.
#
# * String template: used directly as the selector against +doc+.
# * Template with a 'root' entry: the 'root' selector is applied to @doc (not
#   to +doc+), and the 'css' selector is applied to that result.
# * Any other template: the 'css' selector is applied to +doc+.
#
# @param doc [#css] node the lookup is relative to
# @param template [String, Hash] selector, or a Hash with 'css' and optional 'root'
# @return whatever the final +css+ call returns
def get_node(doc, template)
  return doc.css(template) if template.is_a?(String)

  root_selector = template['root']
  scope = root_selector ? @doc.css(root_selector) : doc
  scope.css(template['css'])
end
#leaf_node?(template) ⇒ Boolean
67 68 69 |
# File 'lib/rumba/crawler/parser.rb', line 67
#
# Tells whether +template+ describes a plain value rather than a nested object.
#
# A template is a leaf when it is a String, or when every one of its keys is
# listed in SK (so nothing but lookup options remains).
#
# @param template [String, Hash] template to inspect
# @return [Boolean] true for a leaf template
def leaf_node?(template)
  return true if template.is_a?(String)

  template.all? { |key, _| SK.include?(key) }
end
#parse_multi(doc, template) ⇒ Object
18 19 20 21 22 23 24 25 26 |
# File 'lib/rumba/crawler/parser.rb', line 18
#
# Builds one object per matched node for every entry of +template+.
#
# For each key/value pair, get_node is run with the value against +doc+, and
# create_object is called for every node it yields, using the key as the
# object name and the value as its template.
#
# @param doc node the lookups are relative to
# @param template [Hash] object names mapped to their templates
# @return [Array] the created objects, in iteration order
def parse_multi(doc, template)
  template.each_with_object([]) do |(object_name, spec), collected|
    get_node(doc, spec).each do |match|
      collected << create_object(object_name, match, spec)
    end
  end
end
#parse_node(doc, template, name) ⇒ Object
28 29 30 31 32 33 34 35 |
# File 'lib/rumba/crawler/parser.rb', line 28
#
# Resolves +template+ against +doc+ using only the first matching node.
#
# A leaf template (see leaf_node?) yields the node's content via get_content;
# any other template yields an object built by create_object under +name+.
#
# @param doc node the lookup is relative to
# @param template [String, Hash] template to resolve
# @param name object name passed to create_object for non-leaf templates
# @return the result of get_content or of create_object
def parse_node(doc, template, name)
  first_match = get_node(doc, template).first
  return get_content(first_match, template) if leaf_node?(template)

  create_object(name, first_match, template)
end
#process(response, template) ⇒ Object
8 9 10 11 12 13 14 15 16 |
# File 'lib/rumba/crawler/parser.rb', line 8
#
# Entry point: applies a JSON template to an HTML response.
#
# The template is decoded with JSON.parse and the response is parsed with
# Nokogiri::HTML; the resulting document is kept in @doc. An Array template is
# handed to parse_multi using its first element; any other template is handed
# to parse_node together with its first key as the object name.
#
# @param response HTML passed to Nokogiri::HTML
# @param template [String] JSON text describing what to extract
# @return the result of parse_multi or of parse_node
def process(response, template)
  spec = JSON.parse(template)
  @doc = Nokogiri::HTML(response)
  return parse_multi(@doc, spec.first) if spec.is_a?(Array)

  parse_node(@doc, spec, spec.keys.first)
end