Class: Mushy::ParseHtml

Inherits:
Flux
  • Object
show all
Defined in:
lib/mushy/fluxs/parse_html.rb

Instance Attribute Summary

Attributes inherited from Flux

#config, #flow, #id, #masher, #parent_fluxs, #subscribed_to, #type

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Flux

#convert_this_to_an_array, #convert_to_symbolized_hash, #execute, #execute_single_event, #group_these_results, #guard, #ignore_these_results, inherited, #initialize, #join_these_results, #limit_these_results, #merge_these_results, #model_these_results, #outgoing_split_these_results, #shape_these, #sort_these_results, #standardize_these

Constructor Details

This class inherits a constructor from Mushy::Flux

Class Method Details

.details ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/mushy/fluxs/parse_html.rb', line 7

# Describes this flux as a plain Hash: its name, title, group, a short
# description, the config fields it accepts (each with a description, a type
# and a default value), and worked examples pairing an input event and a
# config with the expected result.
#
# NOTE(review): presumably consumed by the Mushy builder/UI to render this
# flux and its documentation -- confirm against the callers of +details+.
#
# @return [Hash] the static description of the ParseHtml flux
def self.details
  {
    name: 'ParseHtml',
    title: 'Parse HTML',
    fluxGroup: { name: 'Web' },
    description: 'Extract data from HTML.',
    config: {
      path: {
        description: 'The path to the HTML in the incoming event.',
        type: 'text',
        value: 'body'
      },
      extract: {
        description: 'The form of the event that is meant to be pulled from this event.',
        type: 'keyvalue',
        value: { url: 'a|@href' }
      }
    },
    examples: {
      "Example 1" => {
        description: 'Pulling all links out of HTML.',
        input: {
          html: '<a href="one">First</a><a href="two">Second</a>'
        },
        config: {
          path: 'html',
          extract: {
            url: "a|@href",
            name: "a"
          }
        },
        result: [
          { url: 'one', name: 'First' },
          { url: 'two', name: 'Second' }
        ]
      },
      "Example 2" => {
        description: 'Pulling the contents of a single div.',
        input: {
          # The div must be closed with </div>; the example previously ended
          # in a stray </a>, which made the sample HTML malformed.
          html: "<div class=\"main\" data-this=\"you\">HEY</div>"
        },
        config: {
          path: 'html',
          extract: {
            content: "div.main",
            class: "div|@class",
            "data-this" => "div|@data-this"
          }
        },
        result: { content: 'HEY', class: 'main', "data-this" => "you" }
      }
    }
  }
end

Instance Method Details

#process(event, config) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/mushy/fluxs/parse_html.rb', line 62

# Pulls records out of the HTML stored in the event at +config[:path]+.
#
# Every entry of +config[:extract]+ maps an output key to a rule of the form
# "css" or "css|xpath". The CSS part selects nodes from the parsed document;
# the XPath part (default './node()', i.e. the node's children) is evaluated
# against each selected node and the result is converted to a String.
#
# The per-key match lists are then zipped by position into records. The
# number of records is the number of matches for the FIRST extract key; keys
# with fewer matches get nil for the missing positions, and surplus matches
# of other keys are not returned. Non-nil values are stripped of surrounding
# whitespace.
#
# @param event [Hash] the incoming event; read at +config[:path]+
# @param config [Hash] must provide +:path+; +:extract+ is the key => rule map
# @return [Array<SymbolizedHash>] one record per match of the first key, or
#   an empty Array when there is nothing to extract
def process(event, config)
  extract = config[:extract] || {}

  # With no extraction rules there is no "first key" to size the result by;
  # answer with no records instead of failing on a nil lookup further down.
  return [] if extract.keys.empty?

  doc = Nokogiri::HTML event[config[:path]]

  matches = extract.keys.each_with_object({}) do |key, found|
    css, value = extract[key].split('|')
    value ||= './node()'

    found[key] = doc.css(css).map { |x| x.xpath(value).to_s }
  end

  # The first key's matches decide how many records are produced.
  matches[matches.keys.first].each_index.map do |i|
    matches.keys.each_with_object(SymbolizedHash.new({})) do |key, record|
      # nil when this key matched fewer nodes than the first key did.
      value = matches[key][i]
      record[key] = value ? value.strip : value
    end
  end
end