Class: Sc::Fragment

Inherits:
Object
  • Object
show all
Includes:
RDF::NodeProxy
Defined in:
lib/scrappy/extractor/fragment.rb

Instance Method Summary collapse

Instance Method Details

#all_mappings(options = {}) ⇒ Object

Returns all mappings of a fragment by recursively processing all submappings.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/scrappy/extractor/fragment.rb', line 12

# Collects every mapping of this fragment, recursing into the subfragments
# of each mapping and attaching what they yield to the mapping's node.
#
# options is handed to #mappings unchanged; each subfragment receives the
# same options with :doc replaced by the doc of the enclosing mapping.
#
# Returns an array of { :node, :subfragments, :doc } hashes. A mapping is
# left out when any of its subfragments yields fewer submappings than its
# sc:min_cardinality or more than its sc:max_cardinality (when declared).
def all_mappings options={}
  kept = []

  mappings(options).each do |entry|
    parent   = entry[:node]
    children = entry[:subfragments]
    origin   = entry[:doc]
    valid    = true

    children.each do |child|
      # Work with the subfragment through its sc:Fragment proxy
      fragment = child.proxy(Node('sc:Fragment'))

      # Recurse, scoping the extraction to the doc of this mapping
      found = fragment.all_mappings(options.merge(:doc => origin))

      # Link every extracted subnode to the parent through each relation
      found.each do |hit|
        target = hit[:node]
        parent.graph << target if target.is_a?(RDF::Node)
        fragment.sc::relation.each { |predicate| parent[predicate] += [target] }
      end

      # Cardinality bounds only apply when the subfragment declares them
      lower = fragment.sc::min_cardinality.first
      valid = false if lower && found.size < lower.to_i
      upper = fragment.sc::max_cardinality.first
      valid = false if upper && found.size > upper.to_i
    end

    # Drop nodes with inconsistent relations, e.g. a sioc:Post without a
    # dc:title when the fragment requires sc:min_cardinality = 1
    kept << { :node => parent, :subfragments => children, :doc => origin } if valid
  end

  kept
end

#extract(options = {}) ⇒ Object

Extracts data out of a document and returns an array of nodes.



6
7
8
# File 'lib/scrappy/extractor/fragment.rb', line 6

# Runs the full extraction for this fragment and returns only the :node
# entry of each mapping produced by #all_mappings, in the same order.
def extract(options = {})
  found = all_mappings(options)
  found.collect { |entry| entry[:node] }
end

#mappings(options) ⇒ Object

Returns the mappings between this fragment and the RDF nodes it matches.



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/scrappy/extractor/fragment.rb', line 51

# Matches this fragment against a document and builds one mapping per match.
#
# options[:doc] is handed to every sc:selector of the fragment.
# options[:referenceable], when truthy, makes literal fragments return the
# built node (carrying the text as rdf:value) instead of a bare string, and
# attaches a source obtained from #reference to every node with size > 0.
#
# Returns an array of { :node, :subfragments, :doc } hashes. Matches for
# which #build_node returns nothing are left out.
def mappings options
  # Run every selector of the fragment against the given doc
  hits = sc::selector.map { |selector| graph.node(selector).select(options[:doc]) }.flatten

  results = []

  hits.each do |hit|
    referenceable = options[:referenceable]

    # Build the node from identifier selectors; no URI or bnode means no mapping
    subject = build_node(hit, referenceable)
    next unless subject

    # The result is either the node itself or a plain literal value
    if sc::type.include?(Node('rdf:Literal'))
      text   = hit[:value].to_s.strip
      result = text
      if referenceable
        subject.rdf::value = text
        subject.rdf::type += [Node('rdf:Literal')]
        result = subject
      end
    else
      # Describe the node: types (except rdf:Resource), superclasses, aliases
      sc::type.each do |type|
        subject.rdf::type += [type] if type != Node('rdf:Resource')
      end
      sc::superclass.each { |parent_class| subject.rdfs::subClassOf += [parent_class] }
      sc::sameas.each     { |equivalent|   subject.owl::sameAs += [equivalent] }
      result = subject
    end

    # Attach the source reference, copying the fragment's description onto it
    if referenceable && subject.size > 0
      origin                = reference(hit)
      origin.sc::type       = sc::type
      origin.sc::superclass = sc::superclass
      origin.sc::sameas     = sc::sameas
      origin.sc::relation   = sc::relation
      subject.graph        << origin
      subject.sc::source    = origin
    end

    # Hand back the result with the fragment's subfragments and the matched doc
    results << { :node => result, :subfragments => sc::subfragment, :doc => hit }
  end

  results
end