Class: Mushy::ParseHtml
Instance Attribute Summary
Attributes inherited from Flux
#config, #flow, #id, #masher, #parent_fluxs, #subscribed_to, #type
Class Method Summary collapse
Instance Method Summary collapse
Methods inherited from Flux
#convert_this_to_an_array, #convert_to_symbolized_hash, #execute, #execute_single_event, #group_these_results, #guard, #ignore_these_results, inherited, #initialize, #join_these_results, #limit_these_results, #merge_these_results, #model_these_results, #outgoing_split_these_results, #shape_these, #sort_these_results, #standardize_these
Constructor Details
This class inherits a constructor from Mushy::Flux
Class Method Details
.details ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/mushy/fluxs/parse_html.rb', line 7
#
# Static metadata describing this flux.
#
# The returned Hash carries the flux's name, title, group and description,
# the configurable fields (`path` and `extract`) with their default values,
# and worked examples pairing an input event and config with the expected
# result.
#
# @return [Hash] the descriptor for the ParseHtml flux
def self.details
  {
    name: 'ParseHtml',
    title: 'Parse HTML',
    fluxGroup: { name: 'Web' },
    description: 'Extract data from HTML.',
    config: {
      path: {
        description: 'The path to the HTML in the incoming event.',
        type: 'text',
        value: 'body',
      },
      extract: {
        description: 'The form of the event that is meant to be pulled from this event.',
        type: 'keyvalue',
        # Each value is "css-selector|xpath"; the xpath part is optional.
        value: { url: 'a|@href' },
      }
    },
    examples: {
      "Example 1" => {
        description: 'Pulling all links out of HTML.',
        input: { html: '<a href="one">First</a><a href="two">Second</a>' },
        config: {
          path: 'html',
          extract: { url: "a|@href", name: "a" },
        },
        result: [
          { url: 'one', name: 'First' },
          { url: 'two', name: 'Second' }
        ]
      },
      "Example 2" => {
        description: 'Pulling the contents of a single div.',
        # The div is closed with </div> (the example previously closed it with </a>).
        input: { html: "<div class=\"main\" data-this=\"you\">HEY</div>" },
        config: {
          path: 'html',
          extract: {
            content: "div.main",
            class: "div|@class",
            "data-this" => "div|@data-this",
          },
        },
        result: { content: 'HEY', class: 'main', "data-this" => "you" },
      },
    }
  }
end
Instance Method Details
#process(event, config) ⇒ Object
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/mushy/fluxs/parse_html.rb', line 62
#
# Pull records out of the HTML stored in the event under config[:path].
#
# Every entry in config[:extract] maps an output key to a "css|xpath"
# string. The CSS part selects nodes in the parsed document; the XPath part
# is evaluated against each selected node and defaults to './node()' when
# omitted. The number of records returned equals the number of nodes matched
# by the FIRST extract key; other keys contribute nil where they have fewer
# matches. Non-nil values are stripped of surrounding whitespace.
#
# @param event [Hash] incoming event containing the HTML
# @param config [Hash] must provide :path; :extract may be empty or absent
# @return [Array] one SymbolizedHash per match, or [] when there is nothing
#   to extract
def process(event, config)
  extract = config[:extract] || {}
  keys = extract.keys

  # With no extract keys there is no "first" column to drive the record
  # count; return no records instead of calling a method on nil.
  return [] if keys.empty?

  doc = Nokogiri::HTML(event[config[:path]])

  # Column-oriented results: key => array of extracted strings.
  matches = keys.each_with_object({}) do |key, found|
    css, value = extract[key].split('|')
    value ||= './node()'
    found[key] = doc.css(css).map { |node| node.xpath(value).to_s }
  end

  # Pivot the columns into one record per index of the first column.
  matches[keys.first].each_index.map do |i|
    keys.each_with_object(SymbolizedHash.new({})) do |key, record|
      record[key] = matches[key][i]&.strip
    end
  end
end