Class: Importer::HtmlReader
- Inherits: DataReader
- Inheritance hierarchy: Object → DataReader → Importer::HtmlReader
- Defined in:
- lib/iron/import/html_reader.rb
Instance Attribute Summary
Attributes inherited from DataReader
Instance Method Summary
- #init_source(mode, source) ⇒ Object
- #initialize(importer) ⇒ HtmlReader (constructor): A new instance of HtmlReader.
- #load_raw(scopes, &block) ⇒ Object
Methods inherited from DataReader
#add_error, #add_exception, for_format, for_path, for_source, for_stream, is_stream?, #load, #load_each, #parse_value, path_from_stream, #supports?, #supports_file!, #supports_file?, #supports_stream!, #supports_stream?, verify_nokogiri!, verify_roo!
Constructor Details
#initialize(importer) ⇒ HtmlReader
Returns a new instance of HtmlReader.
5 6 7 8 9 10 |
# File 'lib/iron/import/html_reader.rb', line 5
#
# Builds a reader registered under the :html format.
#
# importer - passed straight through to the DataReader constructor.
#
# After the superclass is initialized, the reader declares support for both
# file and stream sources (via the inherited supports_file! and
# supports_stream! helpers) and clears its @tables instance variable.
def initialize(importer)
  super(importer, :html)

  supports_file!
  supports_stream!

  @tables = nil
end
Instance Method Details
#init_source(mode, source) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
# File 'lib/iron/import/html_reader.rb', line 12
#
# Parses the given source with Nokogiri and stores the document in @html.
#
# mode   - :stream to hand +source+ directly to Nokogiri::HTML, or :file to
#          open +source+ with File.open and parse the resulting file handle.
#          Any other value is reported through add_error.
# source - the stream (for :stream) or the path (for :file) to parse.
#
# Returns true when parsing produced a document, false otherwise. Failures are
# recorded via add_error / add_exception rather than raised.
def init_source(mode, source)
  case mode
  when :stream
    @html = Nokogiri::HTML(source)
  when :file
    # Block form guarantees the file handle is closed once parsing finishes.
    @html = File.open(source) { |f| Nokogiri::HTML(f) }
  else
    add_error("Unsupported HTML mode: #{mode}")
    return false
  end

  return true if @html

  add_error("Failed parsing of HTML")
  false
rescue StandardError => e
  # Only StandardError is trapped: rescuing Exception would also swallow
  # Interrupt, SystemExit, NoMemoryError, etc., which must reach the caller.
  add_exception(e)
  false
end
#load_raw(scopes, &block) ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# File 'lib/iron/import/html_reader.rb', line 34
#
# Extracts raw rows from tables in @html and offers each table to the block
# until the block accepts one.
#
# scopes - array of CSS selectors, each expected to match table nodes. When nil
#          or empty, every 'table' element in the document is searched.
# block  - called once per matched table with an array of rows; each row is an
#          array of stripped cell texts. A truthy return value stops the search.
#
# Each th/td child of a tr contributes its text, followed by (colspan - 1) nil
# entries so that spanned cells keep later columns aligned.
#
# Returns true when the block accepted a table. Otherwise the return value is
# whatever the iteration (or add_exception, on error) yields, so callers should
# only rely on the true case.
def load_raw(scopes, &block)
  # Default to searching all tables in the document
  selectors = scopes.nil? || scopes.empty? ? ['table'] : scopes

  # catch/throw lets us leave all of the nested loops as soon as a table is accepted
  catch(:found) do
    # Each selector should be a valid CSS selector
    selectors.each do |selector|
      @html.css(selector).each do |table_node|
        rows = table_node.css('tr').map do |row_node|
          row = []
          row_node.children.each do |cell_node|
            next unless %w[th td].include?(cell_node.name)

            row << cell_node.text.strip
            # Pad for colspan; a missing or non-numeric value converts to 0,
            # giving a negative count, so no padding is added.
            extra = cell_node.attr('colspan').to_i - 1
            extra.times { row << nil }
          end
          row
        end

        throw(:found, true) if block.call(rows)
      end
    end
  end
rescue StandardError => e
  # Best-effort: record ordinary failures instead of raising. Exception is
  # deliberately not rescued so Interrupt, SystemExit, etc. still propagate.
  add_exception(e)
end