Class: Nokogiri::XML::Document

Inherits:
Node
  • Object
show all
Defined in:
lib/nokogiri/xml/document.rb,
lib/nokogiri/ffi/xml/document.rb,
ext/nokogiri/xml_document.c,
ext/nokogiri/html_document.c

Overview

Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri.XML()

For searching a Document, see Nokogiri::XML::Node#css and Nokogiri::XML::Node#xpath

Direct Known Subclasses

HTML::Document

Constant Summary

Constants inherited from Node

Node::ATTRIBUTE_DECL, Node::ATTRIBUTE_NODE, Node::CDATA_SECTION_NODE, Node::COMMENT_NODE, Node::DOCB_DOCUMENT_NODE, Node::DOCUMENT_FRAG_NODE, Node::DOCUMENT_NODE, Node::DOCUMENT_TYPE_NODE, Node::DTD_NODE, Node::ELEMENT_DECL, Node::ELEMENT_NODE, Node::ENTITY_DECL, Node::ENTITY_NODE, Node::ENTITY_REF_NODE, Node::HTML_DOCUMENT_NODE, Node::NAMESPACE_DECL, Node::NOTATION_NODE, Node::PI_NODE, Node::TEXT_NODE, Node::XINCLUDE_END, Node::XINCLUDE_START

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Node

#<=>, #==, #>, #[], #[]=, #accept, #add_namespace, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #each, #element?, #element_children, #elements, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #has_attribute?, #html?, #inner_html, #inner_html=, #inner_text, #internal_subset, #key?, #keys, #last_element_child, #line, #matches?, #name=, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #next, #next_element, #next_sibling, #node_name, #node_name=, node_properties, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #previous, #previous_element, #previous_sibling, #read_only?, #remove, #remove_attribute, #replace, #search, #serialize, #set_attribute, #swap, #text, #text?, #to_html, #to_s, #to_xhtml, #traverse, #type, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath

Methods included from PP::Node

#inspect, #pretty_print

Constructor Details

#initialize(*args) ⇒ Document

:nodoc:



39
40
41
42
# File 'lib/nokogiri/xml/document.rb', line 39

# Set up the Ruby-side state of a document: an empty list of parse errors
# and no decorators. Any arguments are accepted and ignored here.
def initialize(*args) # :nodoc:
  @errors, @decorators = [], nil
end

Instance Attribute Details

#cstructObject

Returns the value of attribute cstruct.



6
7
8
# File 'lib/nokogiri/ffi/xml/document.rb', line 6

# Reader for the @cstruct instance variable (assigned through cstruct= in
# wrap). Returns whatever was stored there, or nil if nothing was.
def cstruct
  @cstruct
end

#errorsObject

A list of Nokogiri::XML::SyntaxError found when parsing a document



37
38
39
# File 'lib/nokogiri/xml/document.rb', line 37

# Reader for the @errors instance variable, which initialize sets to an
# empty Array.
def errors
  @errors
end

Class Method Details

.new(version = default) ⇒ Object

Create a new document with version (defaults to “1.0”)



300
301
302
303
304
305
# File 'ext/nokogiri/xml_document.c', line 300

# Build a fresh document. The first argument, when given, is the XML version
# string handed to LibXML.xmlNewDoc; otherwise "1.0" is used. The wrapped
# document then has #initialize invoked with the original arguments and is
# returned.
def new(*args)
  wrap(LibXML.xmlNewDoc(args.first || "1.0")).tap do |document|
    document.send(:initialize, *args)
  end
end

.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) {|options| ... } ⇒ Object

Parse an XML file. string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

Yields:

  • (options)


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/nokogiri/xml/document.rb', line 19

# Parse an XML document from +string_or_io+.
#
# string_or_io:: a String, or any object responding to +read+ (IO, StringIO).
# url::          location of the document; when nil and the input responds
#                to +path+, that path is used.
# encoding::     encoding passed through to the reader.
# options::      an Integer bit mask or a ParseOptions object. An Integer is
#                wrapped in Nokogiri::XML::ParseOptions so the optional block
#                can adjust it before parsing.
#
# Yields the options object when a block is given. Returns the result of
# read_io / read_memory, or a new empty document for nil or empty input.
def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block)
  # Integer rather than Fixnum: Fixnum was folded into Integer and later
  # removed, so the old check raises NameError on current Rubies.
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? || string_or_io.empty?

  read_memory(string_or_io, url, encoding, options.to_i)
end

.read_io(io, url, encoding, options) ⇒ Object

Create a new document from an IO object



182
183
184
185
186
# File 'ext/nokogiri/xml_document.c', line 182

# Create a document by reading from +io+. The IO is adapted with
# IoCallbacks.reader and handed to LibXML.xmlReadIO together with +url+,
# +encoding+ and +options+; the call runs inside wrap_with_error_handling.
def self.read_io(io, url, encoding, options)
  wrap_with_error_handling {
    read_callback = IoCallbacks.reader(io)
    LibXML.xmlReadIO(read_callback, nil, nil, url, encoding, options)
  }
end

.read_memory(string, url, encoding, options) ⇒ Object

Create a new document from a String



232
233
234
235
236
# File 'ext/nokogiri/xml_document.c', line 232

# Create a document from the String +string+. The buffer, its size, +url+,
# +encoding+ and +options+ are passed to LibXML.xmlReadMemory inside
# wrap_with_error_handling.
def self.read_memory(string, url, encoding, options)
  wrap_with_error_handling do
    # The native reader needs the buffer size in bytes. String#length counts
    # characters, which under-reports multibyte input and truncates it.
    LibXML.xmlReadMemory(string, string.bytesize, url, encoding, options)
  end
end

.recursively_remove_namespaces_from_node(node) ⇒ Object



151
152
153
154
155
156
# File 'lib/nokogiri/ffi/xml/document.rb', line 151

# Clear the :ns field on +node+'s cstruct, then do the same for every node
# beneath it, depth first.
def recursively_remove_namespaces_from_node(node)
  node.cstruct[:ns] = nil
  node.children.each { |descendant| recursively_remove_namespaces_from_node(descendant) }
end

.wrap(doc_struct) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/nokogiri/ffi/xml/document.rb', line 108

# Wrap a native document in a Ruby object of the receiving class.
#
# +doc_struct+ may be an FFI::Pointer, which is cast to a
# LibXML::XmlDocument (a null pointer yields nil), or an already-built
# struct. The new object is allocated without calling .new, linked to the
# struct in both directions, given an empty node cache, and then has
# #initialize invoked with no arguments.
def wrap(doc_struct)
  if doc_struct.is_a?(FFI::Pointer)
    # cast native pointers up into a doc cstruct
    return nil if doc_struct.null?
    doc_struct = LibXML::XmlDocument.new(doc_struct)
  end

  allocate.tap do |document|
    document.cstruct = doc_struct
    document.cstruct.ruby_doc = document
    document.instance_variable_set(:@decorators, nil)
    document.instance_variable_set(:@node_cache, [])
    document.send(:initialize)
  end
end

.wrap_with_error_handling(&block) ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/nokogiri/ffi/xml/document.rb', line 127

# Run a parsing block with a structured error handler installed and wrap
# the document pointer it returns.
#
# The block must return a pointer-like object responding to +null?+.
# Errors reported while the block runs are collected into a list that is
# assigned to the returned document via +errors=+.
#
# Raises the wrapped last libxml error (SyntaxError.wrap) when the pointer
# is null and an error is available, or RuntimeError otherwise.
def wrap_with_error_handling(&block)
  error_list = []
  LibXML.xmlInitParser()
  LibXML.xmlResetLastError()
  LibXML.xmlSetStructuredErrorFunc(nil, SyntaxError.error_array_pusher(error_list))

  begin
    ptr = yield
  ensure
    # Always uninstall the handler, even when the block raises, so it is
    # not left registered after this call.
    LibXML.xmlSetStructuredErrorFunc(nil, nil)
  end

  if ptr.null?
    error = LibXML.xmlGetLastError()
    if error
      raise SyntaxError.wrap(error)
    else
      raise RuntimeError, "Could not parse document"
    end
  end

  document = wrap(ptr)
  document.errors = error_list
  return document
end

Instance Method Details

#add_child(child) ⇒ Object Also known as: <<



177
178
179
180
181
182
183
184
185
# File 'lib/nokogiri/xml/document.rb', line 177

# Add +child+ as the root of this document.
#
# Raises if the document already has a root. A document fragment is
# unwrapped: its single child is added in its place, and a fragment with
# more than one child is rejected. Anything else is passed straight to the
# superclass implementation.
def add_child(child)
  raise "Document already has a root node" if root
  return super unless child.type == Node::DOCUMENT_FRAG_NODE

  fragment_children = child.children
  raise "Document cannot have multiple root nodes" if fragment_children.size > 1
  super(fragment_children.first)
end

#cloneObject



158
# File 'lib/nokogiri/xml/document.rb', line 158

# Make #clone an alias of #dup, so both names run the same implementation.
alias :clone :dup

#collect_namespacesObject

Recursively get all namespaces from this node and its subtree and return them as a hash.

For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:hello="world" />
</root>

This method will return:

{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }

WARNING: this method will clobber duplicate names in the keys. For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:foo="baz" />
</root>

The hash returned will look like this: { 'xmlns:foo' => 'bar' }



116
117
118
119
120
# File 'lib/nokogiri/xml/document.rb', line 116

# Walk the tree with #traverse and fold each visited node's #namespaces
# hash into a single Hash. A key seen on a later-visited node overwrites
# the value from an earlier one.
def collect_namespaces
  collected = {}
  traverse do |node|
    collected.update(node.namespaces)
  end
  collected
end

#create_cdata(text) ⇒ Object

Create a CDATA element containing text



80
81
82
# File 'lib/nokogiri/xml/document.rb', line 80

# Build a Nokogiri::XML::CDATA node owned by this document. +text+ is
# converted with #to_s before being used as the content.
def create_cdata(text)
  cdata_content = text.to_s
  Nokogiri::XML::CDATA.new(self, cdata_content)
end

#create_element(name, *args, &block) ⇒ Object

Create an element with name, and optionally setting the content and attributes.

doc.create_element "div" # <div></div>
doc.create_element "div", :class => "container" # <div class='container'></div>
doc.create_element "div", "contents" # <div>contents</div>
doc.create_element "div", "contents", :class => "container" # <div class='container'>contents</div>
doc.create_element("div") { |node| node['class'] = "container" } # <div class='container'></div>


53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/nokogiri/xml/document.rb', line 53

# Create an element called +name+ in this document.
#
# Each extra argument is interpreted by type: a Hash supplies attributes
# (keys matching "xmlns" or "xmlns:prefix" become namespace definitions
# instead), and anything else becomes the element's content. The block, if
# any, is forwarded to Element.new.
def create_element(name, *args, &block)
  element = Nokogiri::XML::Element.new(name, self, &block)

  args.each do |arg|
    unless Hash === arg
      element.content = arg
      next
    end

    arg.each do |attr_name, attr_value|
      key = attr_name.to_s
      if key =~ /^xmlns(:\w+)?$/
        # "xmlns" has no prefix part, so the prefix is nil in that case
        prefix = key.split(":", 2)[1]
        element.add_namespace_definition prefix, attr_value
      else
        element[key] = attr_value.to_s
      end
    end
  end

  element
end

#create_entity(name, type, external_id, system_id, content) ⇒ Object

Create a new entity named name.

type is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl for more information.

external_id, system_id, and content set the External ID, System ID, and content respectively. All of these parameters are optional.



372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
# File 'ext/nokogiri/xml_document.c', line 372

# Declare a new entity called +name+ on this document via
# LibXML.xmlAddDocEntity and return it wrapped as a node.
#
# entity_type defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL;
# external_id, system_id and content default to nil.
#
# Raises the wrapped last libxml error when creation fails and an error is
# available, or RuntimeError otherwise.
def create_entity(name, entity_type=Nokogiri::XML::EntityDecl::INTERNAL_GENERAL,
                  external_id=nil, system_id=nil, content=nil)
  LibXML.xmlResetLastError()
  entity_ptr = LibXML.xmlAddDocEntity(cstruct, name, entity_type, external_id, system_id, content)
  return Node.wrap(LibXML::XmlEntity.new(entity_ptr)) unless entity_ptr.null?

  last_error = LibXML.xmlGetLastError()
  raise SyntaxError.wrap(last_error) if last_error
  raise RuntimeError, "Could not create entity"
end

#create_text_node(text, &block) ⇒ Object

Create a text node with text



75
76
77
# File 'lib/nokogiri/xml/document.rb', line 75

# Build a Nokogiri::XML::Text node owned by this document. +text+ is
# converted with #to_s; the optional block is forwarded to Text.new.
def create_text_node(text, &block)
  text_content = text.to_s
  Nokogiri::XML::Text.new(text_content, self, &block)
end

#decorate(node) ⇒ Object

Apply any decorators to node



149
150
151
152
153
154
155
# File 'lib/nokogiri/xml/document.rb', line 149

# Extend +node+ with every decorator module registered for a class the node
# is an instance of. Does nothing (and returns nil) when no decorators have
# been registered yet.
def decorate(node)
  return unless @decorators

  @decorators.each do |klass, modules|
    modules.each { |decorator| node.extend(decorator) } if node.is_a?(klass)
  end
end

#decorators(key) ⇒ Object

Get the list of decorators given key



123
124
125
126
# File 'lib/nokogiri/xml/document.rb', line 123

# Return the decorator list registered under +key+, creating the registry
# Hash and an empty list for the key on first use.
def decorators(key)
  registry = (@decorators ||= {})
  registry[key] ||= []
end

#documentObject

A reference to self



90
91
92
# File 'lib/nokogiri/xml/document.rb', line 90

# A document is its own owner document, so this returns self.
def document
  self
end

#dupObject

Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.



277
278
279
280
281
282
283
284
285
286
# File 'ext/nokogiri/xml_document.c', line 277

# Copy this document with LibXML.xmlCopyDoc. +deep+ is passed straight
# through to it and defaults to 1. Returns nil when the copy pointer is
# null, otherwise the copy wrapped in the receiver's class.
def dup(deep = 1)
  copy_ptr = LibXML.xmlCopyDoc(cstruct, deep)
  return nil if copy_ptr.null?

  # xmlCopyDoc does not preserve document type, so stamp the original's
  # type onto the copy before wrapping it.
  copy_struct = LibXML::XmlDocumentCast.new(copy_ptr)
  copy_struct[:type] = self.type

  self.class.wrap(copy_ptr)
end

#encodingObject

Get the encoding for this Document



152
153
154
155
# File 'ext/nokogiri/xml_document.c', line 152

# Read the encoding name from the :encoding field of cstruct. Returns nil
# when that pointer is null, otherwise the string it points to.
def encoding
  encoding_ptr = cstruct[:encoding]
  return nil if encoding_ptr.null?

  encoding_ptr.read_string
end

#encoding=(encoding) ⇒ Object

Set the encoding string for this Document



136
137
138
139
# File 'ext/nokogiri/xml_document.c', line 136

# Store +encoding+ in the :encoding field of cstruct, using the value
# returned by LibXML.xmlStrdup rather than the Ruby string itself.
def encoding= encoding
  # TODO: if :encoding is already set, then it's probably getting leaked.
  cstruct[:encoding] = LibXML.xmlStrdup(encoding)
end

#fragment(tags = nil) ⇒ Object

Create a Nokogiri::XML::DocumentFragment from tags. Returns an empty fragment if tags is nil.



168
169
170
# File 'lib/nokogiri/xml/document.rb', line 168

# Build a DocumentFragment for this document from +tags+ (nil by default),
# passing the current root node as the third constructor argument.
def fragment(tags = nil)
  context_node = self.root
  DocumentFragment.new(self, tags, context_node)
end

#nameObject

The name of this document. Always returns “document”



85
86
87
# File 'lib/nokogiri/xml/document.rb', line 85

# The node name of a document is the fixed string 'document'.
def name
  'document'
end

#namespacesObject

Get the hash of namespaces on the root Nokogiri::XML::Node



161
162
163
# File 'lib/nokogiri/xml/document.rb', line 161

# Namespaces declared on the root node, as returned by the root's own
# #namespaces. A document without a root yields an empty Hash.
def namespaces
  root_node = root
  return {} unless root_node

  root_node.namespaces
end

#remove_namespaces!Object

Remove all namespaces from all nodes in the document.

This could be useful for developers who either don’t understand namespaces or don’t care about them.

The following example shows a use case, and you can decide for yourself whether this is a good thing or not:

doc = Nokogiri::XML <<-EOXML
   <root>
     <car xmlns:part="http://general-motors.com/">
       <part:tire>Michelin Model XGV</part:tire>
     </car>
     <bicycle xmlns:part="http://schwinn.com/">
       <part:tire>I'm a bicycle tire!</part:tire>
     </bicycle>
   </root>
   EOXML

doc.xpath("//tire").to_s # => ""
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>"
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>"

doc.remove_namespaces!

doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>"
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => ""
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""

For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml/



352
353
354
# File 'ext/nokogiri/xml_document.c', line 352

# Strip namespaces from every node, starting at the root, by delegating to
# the class-level recursively_remove_namespaces_from_node helper.
#
# Returns the helper's result, or nil when the document has no root. That
# guard matters because the helper dereferences node.cstruct and would
# otherwise raise NoMethodError on nil for an empty document.
def remove_namespaces!
  root_node = root
  return nil unless root_node

  self.class.recursively_remove_namespaces_from_node(root_node)
end

#rootObject

Get the root node for this document.



117
118
119
120
# File 'ext/nokogiri/xml_document.c', line 117

# Look up the root element with LibXML.xmlDocGetRootElement. Returns nil
# when the pointer is null, otherwise the element wrapped as a Node.
def root
  root_ptr = LibXML.xmlDocGetRootElement(cstruct)
  return nil if root_ptr.null?

  Node.wrap(LibXML::XmlNode.new(root_ptr))
end

#root=Object

Set the root element on this document



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'ext/nokogiri/xml_document.c', line 73

# Set the root element of this document.
#
# Passing nil unlinks the current root (if any), marks it as still
# referenced by the document, and returns nil. A node whose cstruct[:doc]
# differs from this document's is first copied in with
# LibXML.xmlDocCopyNode, and the copy becomes the root. Returns the node
# that was installed.
#
# Raises RuntimeError when the node cannot be copied into this document.
def root= new_root
  if new_root.nil?
    old_root_ptr = LibXML.xmlDocGetRootElement(cstruct)
    if (! old_root_ptr.null?)
      old_root = Node.wrap(old_root_ptr)
      LibXML.xmlUnlinkNode(old_root.cstruct)
      old_root.cstruct.keep_reference_from_document!
    end
    return new_root
  end

  if new_root.cstruct[:doc] != cstruct[:doc]
    old_root_ptr = LibXML.xmlDocGetRootElement(cstruct)
    new_root_ptr = LibXML.xmlDocCopyNode(new_root.cstruct, cstruct, 1)
    # The comma is required: without it Ruby parses this as a call to a
    # method named RuntimeError and raises NoMethodError instead.
    raise RuntimeError, "Could not reparent node (xmlDocCopyNode)" if new_root_ptr.null?
    new_root = Node.wrap(new_root_ptr)
  end
  LibXML.xmlDocSetRootElement(cstruct, new_root.cstruct)
  # old_root_ptr is only set on the copy path above; nil otherwise
  if old_root_ptr && ! old_root_ptr.null?
    LibXML::XmlNode.new(old_root_ptr).keep_reference_from_document!
  end
  new_root
end

#slop!Object

Explore a document with shortcut methods.



138
139
140
141
142
143
144
145
# File 'lib/nokogiri/xml/document.rb', line 138

# Register the Slop decorator for XML::Node on this document (once only)
# and re-run decoration via decorate!. Returns self.
def slop!
  node_decorators = decorators(XML::Node)
  slop = Nokogiri::Decorators::Slop

  unless node_decorators.include?(slop)
    node_decorators << slop
    decorate!
  end

  self
end

#urlObject

Get the url name for this document.



57
58
59
# File 'ext/nokogiri/xml_document.c', line 57

# Return the value of the :URL field of cstruct, unmodified.
# NOTE(review): #encoding converts its field with read_string; this one does
# not — confirm the :URL field already yields a Ruby String.
def url
  cstruct[:URL]
end

#validateObject

Validate this Document against its DTD. Returns a list of errors on the document or nil when there is no DTD.



131
132
133
134
# File 'lib/nokogiri/xml/document.rb', line 131

# Validate this document against its internal subset (DTD). Returns nil
# when there is no internal subset, otherwise whatever the subset's
# #validate returns for this document.
def validate
  dtd = internal_subset
  dtd ? internal_subset.validate(self) : nil
end

#versionObject

Get the XML version for this Document



167
168
169
# File 'ext/nokogiri/xml_document.c', line 167

# Return the value of the :version field of cstruct, unmodified.
def version
  cstruct[:version]
end