Class: RDF::Tabular::Reader

Inherits:

Reader

Object
Reader
RDF::Tabular::Reader

show all

Includes:: Utils

Defined in:: lib/rdf/tabular/reader.rb

Overview

A Tabular Data to RDF parser in Ruby.

Author:

[Gregg Kellogg](http://greggkellogg.net/)

Instance Attribute Summary collapse

#input ⇒ :read readonly

Input open to read.
#metadata ⇒ Metadata readonly

Metadata associated with the CSV.

Instance Method Summary collapse

#each_statement(&block) ⇒ Object
#each_triple(&block) ⇒ Object
#initialize(input = $stdin, options = {}) {|reader| ... } ⇒ Reader constructor

Initializes the RDF::Tabular Reader instance.
#minimal? ⇒ Boolean
#prov? ⇒ Boolean
#to_atd(options = {}) ⇒ Hash

Return a hash representation of the annotated tabular data model for JSON serialization.
#to_hash(options = {}) ⇒ Hash, Array

Return a hash representation of the data for JSON serialization.
#to_json(options = {}) ⇒ String

Transform to JSON.

Methods included from Utils

debug, #depth

Constructor Details

#initialize(input = $stdin, options = {}) {|reader| ... } ⇒ `Reader`

Initializes the RDF::Tabular Reader instance.

Parameters:

input (Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>) (defaults to: $stdin) —

An opened file possibly JSON Metadata, or an Array used as an internalized array of arrays
options (Hash{Symbol => Object}) (defaults to: {}) —

any additional options (see `RDF::Reader#initialize`)

Options Hash (options):

:metadata (Metadata, Hash, String, RDF::URI) —

user-supplied metadata, merged on top of extracted metadata. If provided as a URL, Metadata is loaded from that location
:minimal (Boolean) —

includes only the information gleaned from the cells of the tabular data
:noProv (Boolean) —

do not output optional provenance information

Yields:

(reader) —

`self`

Yield Parameters:

reader (RDF::Reader)

Yield Returns:

(void) —

ignored

Raises:

(RDF::ReaderError) —

if the CSV document cannot be loaded

# File 'lib/rdf/tabular/reader.rb', line 37

# Initializes the RDF::Tabular Reader instance.
#
# @param input [Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>]
#   An opened file, possibly JSON Metadata, or an Array used as an internalized
#   array of arrays. A String is wrapped in a StringIO.
# @param options [Hash{Symbol => Object}]
#   any additional options (see `RDF::Reader#initialize`)
# @option options [Metadata, Hash, String, RDF::URI] :metadata
#   user-supplied metadata, merged on top of extracted metadata
# @option options [Boolean] :minimal
#   includes only the information gleaned from the cells of the tabular data
# @option options [Boolean] :noProv
#   do not output optional provenance information
# @yield [reader] `self` (the block is instance_eval'ed when its arity is 0)
# @yieldparam reader [RDF::Reader]
# @yieldreturn [void] ignored
# @raise [RDF::ReaderError] if the CSV document cannot be loaded
def initialize(input = $stdin, options = {}, &block)
  super do
    # Work out the base location: an explicit base wins, otherwise fall back to
    # whatever location information the input object exposes.
    @options[:base] ||= base_uri.to_s if base_uri
    @options[:base] ||= input.base_uri if input.respond_to?(:base_uri)
    @options[:base] ||= input.path if input.respond_to?(:path)
    @options[:base] ||= input.filename if input.respond_to?(:filename)
    # The base can still be nil here (e.g. String input); File.exist?(nil)
    # raises TypeError, so test the string form instead.
    if RDF::URI(@options[:base]).relative? && File.exist?(@options[:base].to_s)
      @options[:base] = "file:/#{File.expand_path(@options[:base])}"
    end

    @options[:depth] ||= 0

    debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}

    # Minimal implies noProv
    @options[:noProv] ||= @options[:minimal]

    @input = input.is_a?(String) ? StringIO.new(input) : input

    depth do
      # If input is JSON, then the input is the metadata.
      # The `+` must be escaped and the `ld+` prefix optional so that both
      # application/json and application/ld+json are recognised.
      if @options[:base] =~ /\.json(?:ld)?$/ ||
         @input.respond_to?(:content_type) && @input.content_type =~ %r(application/(?:ld\+)?json)
        @metadata = Metadata.new(@input, @options.merge(filenames: @options[:base]))
        # If @metadata is for a Table, merge with something empty to create a TableGroup metadata
        if @metadata.is_a?(TableGroup)
          @metadata.normalize!
        else
          @metadata = @metadata.merge(TableGroup.new({}))
        end
        @input = @metadata
      elsif @options[:no_found_metadata]
        # Metadata was supplied by the caller (nested reader):
        # extract embedded metadata and merge
        table_metadata = @options[:metadata]
        embedded_metadata = table_metadata.dialect.embedded_metadata(input, @options)
        @metadata = table_metadata.dup.merge!(embedded_metadata)
      else
        # HTTP flags: a `header=absent` content-type parameter turns off the header row
        if @input.respond_to?(:headers) &&
           @input.headers.fetch(:content_type, '').split(';').include?('header=absent')
          @options[:metadata] ||= Table.new(url: @options[:base])
          @options[:metadata].dialect.header = false
        end

        # It's tabular data. Find metadata and proceed as if it was specified in the first place
        @metadata = Metadata.for_input(@input, @options)
        @input = @metadata
      end

      debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}

      if block_given?
        case block.arity
          when 0 then instance_eval(&block)
          else block.call(self)
        end
      end
    end
  end
end

Instance Attribute Details

#input ⇒ `:read` (readonly)

Input open to read

Returns:

(:read)



20
21
22

# File 'lib/rdf/tabular/reader.rb', line 20

# Input open to read.
#
# @return [:read] whatever the constructor stored in @input
def input; @input; end

#metadata ⇒ `Metadata` (readonly)

Metadata associated with the CSV

Returns:

(Metadata)



15
16
17

# File 'lib/rdf/tabular/reader.rb', line 15

# Metadata associated with the CSV.
#
# @return [Metadata] whatever the constructor stored in @metadata
def metadata; @metadata; end

Instance Method Details

#each_statement(&block) ⇒ `Object`

See Also:

Reader#each_statement

# File 'lib/rdf/tabular/reader.rb', line 102

# Generates the RDF statements for the tabular data, passing each one to
# `add_statement`.
#
# With a block:
# * If `input` is a Metadata object it is validated and dispatched on its type:
#   - :TableGroup - emits group-level statements (unless minimal), opens a nested
#     Reader for every resource not marked `suppressOutput`, then emits provenance
#     statements when `prov?` is true.
#   - :Table - opens a nested Reader on the table's url.
#   This branch ends with a bare `return`, so the method returns nil.
# * Otherwise `input` is treated as the tabular data itself: table-level
#   statements are emitted, followed by statements for every row yielded by
#   `metadata.each_row(input)`.
#
# Without a block nothing is emitted.
#
# @yield the block is stored in @callback (presumably consumed by
#   `add_statement` - confirm there) and forwarded to nested readers
# @return [nil] from the Metadata branch
# @return [Enumerator] `enum_for(:each_statement)` in every other case, including
#   after the block has been driven over the tabular data
# @raise [RuntimeError] if the Metadata type is neither :TableGroup nor :Table
# @see Reader#each_statement
def each_statement(&block)
  if block_given?
    @callback = block

    # Captured up front so provenance can report when processing began
    start_time = Time.now

    # Construct metadata from that passed from file open, along with information from the file.
    if input.is_a?(Metadata)
      debug("each_statement: metadata") {input.inspect}

      # Validate metadata
      input.validate!

      depth do
        # Get Metadata to invoke and open referenced files
        case input.type
        when :TableGroup
          # Use resolved @id of TableGroup, if available
          table_group = input.id || RDF::Node.new
          add_statement(0, table_group, RDF.type, CSVW.TableGroup) unless minimal?

          # Common Properties: only keys containing ':' (prefixed names / IRIs) and :notes
          input.each do |key, value|
            next unless key.to_s.include?(':') || key == :notes
            input.common_properties(table_group, key, value) do |statement|
              add_statement(0, statement)
            end
          end unless minimal?

          # Each table is processed by its own nested reader, which is handed the
          # table metadata directly (no_found_metadata) and the resource to use as
          # the table subject
          input.each_resource do |table|
            next if table.suppressOutput
            table_resource = table.id || RDF::Node.new
            add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
            Reader.open(table.url, options.merge(
                format: :tabular,
                metadata: table,
                base: table.url,
                no_found_metadata: true,
                table_resource: table_resource
            )) do |r|
              r.each_statement(&block)
            end
          end

          # Provenance
          if prov?
            activity = RDF::Node.new
            add_statement(0, table_group, RDF::PROV.wasGeneratedBy, activity)
            add_statement(0, activity, RDF.type, RDF::PROV.Activity)
            add_statement(0, activity, RDF::PROV.wasAssociatedWith, RDF::URI("http://rubygems.org/gems/rdf-tabular"))
            add_statement(0, activity, RDF::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
            add_statement(0, activity, RDF::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))

            # One usage node covering every tabular data file that was read
            unless (urls = input.resources.map(&:url)).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::PROV.Usage)
              urls.each do |url|
                add_statement(0, usage, RDF::PROV.entity, RDF::URI(url))
              end
              add_statement(0, usage, RDF::PROV.hadRole, CSVW.csvEncodedTabularData)
            end

            # A second usage node covering the metadata file(s), if any
            unless Array(input.filenames).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::PROV.Usage)
              Array(input.filenames).each do |fn|
                add_statement(0, usage, RDF::PROV.entity, RDF::URI(fn))
              end
              add_statement(0, usage, RDF::PROV.hadRole, CSVW.tabularMetadata)
            end
          end
        when :Table
          Reader.open(input.url, options.merge(format: :tabular, metadata: input, base: input.url, no_found_metadata: true)) do |r|
            r.each_statement(&block)
          end
        else
          raise "Opened inappropriate metadata type: #{input.type}"
        end
      end
      # Metadata input is fully handled above; skip the row processing below
      return
    end

    # Output Table-Level RDF triples
    # An explicitly passed :table_resource (set by the TableGroup branch) takes precedence
    table_resource = options.fetch(:table_resource, (metadata.id || RDF::Node.new))
    unless minimal?
      add_statement(0, table_resource, RDF.type, CSVW.Table)
      add_statement(0, table_resource, CSVW.url, RDF::URI(metadata.url))
    end

    # Common Properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      metadata.common_properties(table_resource, key, value) do |statement|
        add_statement(0, statement)
      end
    end unless minimal?

    # Input is file containing CSV data.
    # Output ROW-Level statements
    last_row_num = 0
    metadata.each_row(input) do |row|
      # each_row may yield ready-made statements in addition to rows
      if row.is_a?(RDF::Statement)
        # May add additional comments
        row.subject = table_resource
        add_statement(last_row_num + 1, row)
        next
      end
      last_row_num = row.sourceNumber

      # Output row-level metadata
      row_resource = RDF::Node.new
      # Shared subject for all cells of this row that have no aboutUrl
      default_cell_subject = RDF::Node.new
      unless minimal?
        add_statement(row.sourceNumber, table_resource, CSVW.row, row_resource)
        add_statement(row.sourceNumber, row_resource, CSVW.rownum, row.number)
        add_statement(row.sourceNumber, row_resource, CSVW.url, row.id)
      end
      row.values.each_with_index do |cell, index|
        next if cell.column.suppressOutput # Skip ignored cells
        cell_subject = cell.aboutUrl || default_cell_subject
        # Without an explicit propertyUrl, derive one from the table url and column name
        propertyUrl = cell.propertyUrl || RDF::URI("#{metadata.url}##{cell.column.name}")
        add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?

        if cell.column.valueUrl
          # Column declares a valueUrl: emit only when this cell resolved one
          add_statement(row.sourceNumber, cell_subject, propertyUrl, cell.valueUrl) if cell.valueUrl
        elsif cell.column.ordered && cell.column.separator
          # Ordered, separated values become an RDF list; the list's own
          # rdf:type rdf:List statements are not emitted
          list = RDF::List[*Array(cell.value)]
          add_statement(row.sourceNumber, cell_subject, propertyUrl, list.subject)
          list.each_statement do |statement|
            next if statement.predicate == RDF.type && statement.object == RDF.List
            add_statement(row.sourceNumber, statement.subject, statement.predicate, statement.object)
          end
        else
          # One statement per value (a single value is wrapped by Array())
          Array(cell.value).each do |v|
            add_statement(row.sourceNumber, cell_subject, propertyUrl, v)
          end
        end
      end
    end
  end
  # Last expression on every path that did not hit the early `return` above
  enum_for(:each_statement)
end

#each_triple(&block) ⇒ `Object`

#minimal? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/rdf/tabular/reader.rb', line 544

# Whether the reader was asked for minimal output.
#
# @return [Boolean] the raw value of the :minimal option (nil when unset)
def minimal?; @options[:minimal]; end

#prov? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/rdf/tabular/reader.rb', line 545

# Whether provenance information should be output.
#
# @return [Boolean] true unless the :noProv option is truthy
def prov?; !(@options[:noProv]); end

#to_atd(options = {}) ⇒ `Hash`

Return a hash representation of the annotated tabular data model for JSON serialization

Parameters:

options (Hash{Symbol => Object}) (defaults to: {})

Returns:

(Hash)

# File 'lib/rdf/tabular/reader.rb', line 481

# Return a hash representation of the annotated tabular data model for JSON
# serialization.
#
# When `input` is Metadata, nested readers are opened for the referenced
# table(s) and their results assembled; otherwise `input` is read as the
# tabular data of a single table.
#
# @param options [Hash{Symbol => Object}] merged into the options of nested readers
# @return [Hash]
# @raise [RuntimeError] if the Metadata type is neither :TableGroup nor :Table
def to_atd(options = {})
  # Construct metadata from that passed from file open, along with information from the file.
  if input.is_a?(Metadata)
    debug("to_atd: metadata") {input.inspect}
    depth do
      # Get Metadata to invoke and open referenced files
      case input.type
      when :TableGroup
        table_group = input.to_atd

        input.each_resource do |table|
          Reader.open(table.url, options.merge(
            format:             :tabular,
            metadata:           table,
            base:               table.url,
            no_found_metadata:  true, # FIXME: remove
            noProv:             true
          )) do |r|
            # Distinct name so the `table` block parameter is not clobbered
            table_atd = r.to_atd(options)

            # Fill in columns and rows in table_group entry from returned table
            # NOTE(review): :resources is a Symbol key while the table entries use
            # String keys - confirm against Metadata#to_atd
            entry = table_group[:resources].detect {|tab| tab["url"] == table_atd["url"]}
            entry["columns"] = table_atd["columns"]
            entry["rows"] = table_atd["rows"]
          end
        end

        # Result is table_group
        table_group
      when :Table
        table = nil
        Reader.open(input.url, options.merge(
          format:             :tabular,
          metadata:           input,
          base:               input.url,
          no_found_metadata:  true,
          noProv:             true
        )) do |r|
          table = r.to_atd(options)
        end

        table
      else
        raise "Opened inappropriate metadata type: #{input.type}"
      end
    end
  else
    table = metadata.to_atd
    # These are the arrays held inside `table`, so appending below fills in the result
    rows, columns = table["rows"], table["columns"]

    # Input is file containing CSV data.
    # Collect each row, and record every cell id under its column
    metadata.each_row(input) do |row|
      rows << row.to_atd
      row.values.each_with_index do |cell, colndx|
        columns[colndx]["cells"] << cell.id
      end
    end
    table
  end
end

#to_hash(options = {}) ⇒ `Hash`, `Array`

Return a hash representation of the data for JSON serialization

Produces an array if run in minimal mode.

Parameters:

options (Hash{Symbol => Object}) (defaults to: {})

Returns:

(Hash, Array)

# File 'lib/rdf/tabular/reader.rb', line 321

# Return a hash representation of the data for JSON serialization.
#
# Produces an array if run in minimal mode.
#
# When `input` is Metadata, nested readers are opened for the referenced
# table(s); otherwise `input` is read as the tabular data of a single table.
#
# @param options [Hash{Symbol => Object}] merged into the options of nested readers
# @return [Hash, Array]
# @raise [RuntimeError] if the Metadata type is neither :TableGroup nor :Table,
#   or if a value expected to be nested is missing from its referring object
def to_hash(options = {})
  # Construct metadata from that passed from file open, along with information from the file.
  if input.is_a?(Metadata)
    debug("to_hash: metadata") {input.inspect}
    depth do
      # Get Metadata to invoke and open referenced files
      case input.type
      when :TableGroup
        # Validate metadata
        input.validate!

        tables = []
        table_group = {}
        table_group['@id'] = input.id.to_s if input.id

        # Common Properties
        input.each do |key, value|
          next unless key.to_s.include?(':') || key == :notes
          table_group[key] = input.common_properties(nil, key, value)
          table_group[key] = [table_group[key]] if key == :notes && !table_group[key].is_a?(Array)
        end

        table_group['table'] = tables

        input.each_resource do |table|
          next if table.suppressOutput
          Reader.open(table.url, options.merge(
            format:             :tabular,
            metadata:           table,
            base:               table.url,
            minimal:            minimal?,
            no_found_metadata:  true
          )) do |r|
            # Mutate `tables` in place (concat rather than +=) so that
            # table_group['table'] keeps referring to the same array
            case result = r.to_hash(options)
            when Array then tables.concat(result)
            when Hash  then tables << result
            end
          end
        end

        # Result is table_group or array
        minimal? ? tables : table_group
      when :Table
        table = nil
        Reader.open(input.url, options.merge(
          format:             :tabular,
          metadata:           input,
          base:               input.url,
          minimal:            minimal?,
          no_found_metadata:  true
        )) do |r|
          table = r.to_hash(options)
        end

        table
      else
        raise "Opened inappropriate metadata type: #{input.type}"
      end
    end
  else
    rows = []
    table = {}
    table['@id'] = metadata.id.to_s if metadata.id
    table['url'] = metadata.url.to_s

    # Use string values notes and common properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      table[key] = metadata.common_properties(nil, key, value)
      table[key] = [table[key]] if key == :notes && !table[key].is_a?(Array)
    end unless minimal?

    table.merge!("row" => rows)

    # Input is file containing CSV data.
    # Output ROW-Level statements
    metadata.each_row(input) do |row|
      if row.is_a?(RDF::Statement)
        # May add additional comments
        table['rdfs:comment'] ||= []
        table['rdfs:comment'] << row.object.to_s
        next
      end
      # Output row-level metadata
      # r: the row object; a: described objects keyed by subject;
      # values: bookkeeping of which object/property refers to each valueUrl
      r, a, values = {}, {}, {}
      r["url"] = row.id.to_s
      r["rownum"] = row.number

      row.values.each_with_index do |cell, index|
        column = metadata.tableSchema.columns[index]

        # Ignore suppressed columns
        next if column.suppressOutput

        # Skip valueUrl cells where the valueUrl is null
        next if cell.column.valueUrl && cell.valueUrl.nil?

        # Skip empty sequences
        next if !cell.column.valueUrl && cell.value.is_a?(Array) && cell.value.empty?

        # Cells without an aboutUrl share the object stored under 'null'
        subject = cell.aboutUrl || 'null'
        co = (a[subject.to_s] ||= {})
        co['@id'] = subject.to_s unless subject == 'null'
        prop = case cell.propertyUrl
        when RDF.type then '@type'
        when nil then column.name
        else
          # Compact the property to a term or prefixed name
          metadata.context.compact_iri(cell.propertyUrl, vocab: true)
        end

        value = case
        when prop == '@type'
          metadata.context.compact_iri(cell.valueUrl || cell.value, vocab: true)
        when cell.valueUrl
          # Remember who refers to this valueUrl so it can be nested below
          unless subject == cell.valueUrl
            values[cell.valueUrl.to_s] ||= {o: co, prop: prop, count: 0}
            values[cell.valueUrl.to_s][:count] += 1
          end
          cell.valueUrl.to_s
        when cell.value.is_a?(RDF::Literal::Numeric)
          cell.value.object
        when cell.value.is_a?(RDF::Literal::Boolean)
          cell.value.object
        else
          cell.value
        end

        # Add or merge value
        merge_compacted_value(co, prop, value)
      end

      # Check for nesting: an object referenced exactly once by another
      # object's valueUrl is embedded in place of the reference
      values.keys.each do |valueUrl|
        next unless a.has_key?(valueUrl)
        ref = values[valueUrl]
        co = ref[:o]
        prop = ref[:prop]
        next if ref[:count] != 1
        # Report the referring object's current value (previously this message
        # used an undefined local `o` and raised NameError instead)
        raise "Expected #{co[prop].inspect} to include #{valueUrl.inspect}" unless Array(co[prop]).include?(valueUrl)
        co[prop] = Array(co[prop]).map {|e| e == valueUrl ? a.delete(valueUrl) : e}
        co[prop] = co[prop].first if co[prop].length == 1
      end

      r["describes"] = a.values

      # Minimal mode flattens the described objects straight into the result
      if minimal?
        rows.concat(r["describes"])
      else
        rows << r
      end
    end

    minimal? ? table["row"] : table
  end
end

#to_json(options = {}) ⇒ `String`

Transform to JSON. Note that this must be run from within the reader context if the input is an open IO stream.

Examples:

outputting annotated CSV as JSON

result = nil
RDF::Tabular::Reader.open("etc/doap.csv") do |reader|
  result = reader.to_json
end
result #=> {...}

outputting annotated CSV as JSON from an in-memory structure

csv = %(
  GID,On Street,Species,Trim Cycle,Inventory Date
  1,ADDISON AV,Celtis australis,Large Tree Routine Prune,10/18/2010
  2,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
  3,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
).gsub(/^\s+/, '')
r = RDF::Tabular::Reader.new(csv)
r.to_json #=> {...}

Parameters:

options (Hash{Symbol => Object}) (defaults to: {}) —

may also be a JSON state

Options Hash (options):

:io (IO, StringIO) —

to output to file
:state (::JSON::State) —

used when dumping
:atd (Boolean) —

output Abstract Table representation instead

Returns:

(String)

# File 'lib/rdf/tabular/reader.rb', line 284

# Transform to JSON. Note that this must be run from within the reader context
# if the input is an open IO stream.
#
# @param options [Hash{Symbol => Object}, IO, StringIO, ::JSON::State]
#   options hash; may also be a JSON state, or an IO to write to
# @option options [IO, StringIO] :io to output to file
# @option options [::JSON::State] :state used when dumping
# @option options [Boolean] :atd output Abstract Table representation instead
# @return [String] the generated JSON when no IO is involved
# @return [IO, StringIO] the IO itself when the JSON was written to one
def to_json(options = {})
  io = case options
  when IO, StringIO then options
  when Hash then options[:io]
  end

  # Only consult generator state classes that are actually loaded; naming
  # ::JSON::Pure::Generator::State unconditionally raises NameError when the
  # pure-Ruby generator is absent.
  state_classes = [::JSON::State]
  state_classes << ::JSON::Ext::Generator::State if defined?(::JSON::Ext::Generator::State)
  state_classes << ::JSON::Pure::Generator::State if defined?(::JSON::Pure::Generator::State)

  json_state = if options.is_a?(Hash)
    if options.has_key?(:state)
      options[:state]
    elsif options.has_key?(:indent)
      # The options hash itself doubles as generator configuration
      options
    else
      ::JSON::LD::JSON_STATE
    end
  elsif state_classes.any? {|klass| options.is_a?(klass)}
    options
  else
    ::JSON::LD::JSON_STATE
  end
  options = {} unless options.is_a?(Hash)

  hash_fn = options[:atd] ? :to_atd : :to_hash
  options = options.merge(noProv: @options[:noProv])

  json = ::JSON.generate(send(hash_fn, options), json_state)
  return json unless io

  # Write the generated document directly instead of going through JSON.dump,
  # which required overwriting the process-wide JSON.dump_default_options.
  io.write(json)
  io
end

Class: RDF::Tabular::Reader

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Utils

Constructor Details

#initialize(input = $stdin, options = {}) {|reader| ... } ⇒ Reader

Instance Attribute Details

#input ⇒ :read (readonly)

#metadata ⇒ Metadata (readonly)

Instance Method Details

#each_statement(&block) ⇒ Object

#each_triple(&block) ⇒ Object

#minimal? ⇒ Boolean

#prov? ⇒ Boolean

#to_atd(options = {}) ⇒ Hash

#to_hash(options = {}) ⇒ Hash, Array

#to_json(options = {}) ⇒ String

Examples:

outputting annotated CSV as JSON

outputting annotated CSV as JSON from an in-memory structure

#initialize(input = $stdin, options = {}) {|reader| ... } ⇒ `Reader`

#input ⇒ `:read` (readonly)

#metadata ⇒ `Metadata` (readonly)

#each_statement(&block) ⇒ `Object`

#each_triple(&block) ⇒ `Object`

#minimal? ⇒ `Boolean`

#prov? ⇒ `Boolean`

#to_atd(options = {}) ⇒ `Hash`

#to_hash(options = {}) ⇒ `Hash`, `Array`

#to_json(options = {}) ⇒ `String`