Class: Rumba::Crawler::Parser

Inherits:
Object
  • Object
show all
Includes:
Models
Defined in:
lib/rumba/crawler/parser.rb

Constant Summary collapse

SK =

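Service keys: template keys reserved for parser directives ('css' and 'root' selectors, 'regexp' content filter). They are skipped when template entries are assigned to object attributes.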

['css', 'root', 'regexp']

Instance Method Summary collapse

Instance Method Details

#create_object(name, node, template) ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
# File 'lib/rumba/crawler/parser.rb', line 37

# Builds an object for one template entry and fills in its attributes.
#
# The object is obtained by calling the method called +name+ on the parser
# (no arguments). Every template entry whose key is not listed in SK is then
# assigned through the object's "<key>=" writer.
#
# @param name [String] name of the method that returns a fresh object
# @param node [Object] node the nested selectors are evaluated against
# @param template [Hash] attribute name => nested template; an Array value
#   is parsed with parse_multi (using its first element), anything else
#   with parse_node
# @return [Object] the populated object
def create_object(name, node, template)
  instance = send(name)
  template.each do |attribute, spec|
    # Service keys configure the lookup itself; they are not attributes.
    next if SK.include?(attribute)

    parsed = spec.is_a?(Array) ? parse_multi(node, spec.first) : parse_node(node, spec, attribute)
    instance.send("#{attribute}=", parsed)
  end
  instance
end

#get_content(node, template) ⇒ Object



49
50
51
52
53
54
55
# File 'lib/rumba/crawler/parser.rb', line 49

# Extracts the text of a node, optionally narrowed by the template's regexp.
#
# @param node [#content, nil] node whose text is read; nil when the selector
#   matched nothing
# @param template [String, Hash] a plain selector String, or a Hash that may
#   carry a 'regexp' entry
# @return [String, nil] nil when +node+ is nil; the first case-insensitive
#   match of the 'regexp' pattern ("" when nothing matches) when the template
#   has one; otherwise the full node content
def get_content(node, template)
  # A selector that matched nothing yields no node; report "no content"
  # instead of failing with NoMethodError on nil.
  return if node.nil?

  # String#[] performs a substring lookup, so a String selector that merely
  # contains "regexp" must not be mistaken for a template with a pattern.
  pattern = template['regexp'] unless template.is_a?(String)
  return node.content unless pattern

  /#{pattern}/i.match(node.content).to_s
end

#get_node(doc, template) ⇒ Object



57
58
59
60
61
62
63
64
65
# File 'lib/rumba/crawler/parser.rb', line 57

# Runs the template's CSS selector and returns whatever +css+ returns.
#
# @param doc [#css] node or document searched by default
# @param template [String, Hash] a bare CSS selector, or a Hash with a 'css'
#   selector and an optional 'root' selector
# @return [Object] result of the final +css+ call
def get_node(doc, template)
  return doc.css(template) if template.is_a?(String)

  # With a 'root' selector the search restarts from the document stored in
  # @doc rather than from the node passed in.
  root_selector = template['root']
  scope = root_selector ? @doc.css(root_selector) : doc
  scope.css(template['css'])
end

#leaf_node?(template) ⇒ Boolean

Returns:

  • (Boolean)


67
68
69
# File 'lib/rumba/crawler/parser.rb', line 67

# Tells whether a template describes plain content rather than a nested
# object: either a bare selector String, or a collection whose keys are all
# service keys (SK).
#
# @param template [String, Hash]
# @return [Boolean]
def leaf_node?(template)
  return true if template.is_a?(String)

  template.all? { |key, _| SK.include?(key) }
end

#parse_multi(doc, template) ⇒ Object



18
19
20
21
22
23
24
25
26
# File 'lib/rumba/crawler/parser.rb', line 18

# Builds one object per node matched by each entry of the template.
#
# @param doc [Object] node or document handed to get_node
# @param template [Hash] object name => nested template
# @return [Array] objects from create_object, in template order and, within
#   an entry, in the order the matched nodes are yielded
def parse_multi(doc, template)
  template.each_with_object([]) do |(key, value), collected|
    get_node(doc, value).each do |node|
      collected << create_object(key, node, value)
    end
  end
end

#parse_node(doc, template, name) ⇒ Object



28
29
30
31
32
33
34
35
# File 'lib/rumba/crawler/parser.rb', line 28

# Parses the first node matched by the template.
#
# @param doc [Object] node or document handed to get_node
# @param template [String, Hash] selector or nested template
# @param name [String] object name passed to create_object for non-leaf
#   templates
# @return [Object] result of get_content for leaf templates, otherwise the
#   object built by create_object
def parse_node(doc, template, name)
  # Only the first match is used; it is nil when nothing matched.
  first_match = get_node(doc, template).first
  return get_content(first_match, template) if leaf_node?(template)

  create_object(name, first_match, template)
end

#process(response, template) ⇒ Object



8
9
10
11
12
13
14
15
16
# File 'lib/rumba/crawler/parser.rb', line 8

# Entry point: parses a response body according to a JSON template.
#
# The parsed document is stored in @doc, which get_node reads for
# 'root'-scoped selectors.
#
# @param response [String] markup handed to Nokogiri::HTML
# @param template [String] JSON text; an Array template is parsed with
#   parse_multi (using its first element), anything else with parse_node
#   under the name of its first key
# @return [Object] result of parse_multi or parse_node
def process(response, template)
  parsed_template = JSON.parse(template)
  @doc = Nokogiri::HTML(response)
  return parse_multi(@doc, parsed_template.first) if parsed_template.is_a?(Array)

  parse_node(@doc, parsed_template, parsed_template.keys.first)
end