Class: Nokogiri::XML::Document

Inherits:
Node
  • Object
show all
Defined in:
lib/nokogiri/xml/document.rb,
ext/nokogiri/xml_document.c,
ext/nokogiri/html_document.c

Overview

Nokogiri::XML::Document is the main entry point for dealing with XML documents. The Document is created by parsing an XML document. See Nokogiri.XML()

For searching a Document, see Nokogiri::XML::Node#css and Nokogiri::XML::Node#xpath

Direct Known Subclasses

HTML::Document

Constant Summary

Constants inherited from Node

Node::ATTRIBUTE_DECL, Node::ATTRIBUTE_NODE, Node::CDATA_SECTION_NODE, Node::COMMENT_NODE, Node::DOCB_DOCUMENT_NODE, Node::DOCUMENT_FRAG_NODE, Node::DOCUMENT_NODE, Node::DOCUMENT_TYPE_NODE, Node::DTD_NODE, Node::ELEMENT_DECL, Node::ELEMENT_NODE, Node::ENTITY_DECL, Node::ENTITY_NODE, Node::ENTITY_REF_NODE, Node::HTML_DOCUMENT_NODE, Node::NAMESPACE_DECL, Node::NOTATION_NODE, Node::PI_NODE, Node::TEXT_NODE, Node::XINCLUDE_END, Node::XINCLUDE_START

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Node

#<=>, #==, #>, #[], #[]=, #accept, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #children=, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #each, #element?, #element_children, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #html?, #inner_html, #inner_html=, #internal_subset, #key?, #keys, #last_element_child, #line, #matches?, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #next_element, #next_sibling, #node_name, #node_name=, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #previous_element, #previous_sibling, #read_only?, #remove_attribute, #replace, #search, #serialize, #swap, #text?, #to_html, #to_s, #to_xhtml, #traverse, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath

Methods included from PP::Node

#inspect, #pretty_print

Constructor Details

#initialize(*args) ⇒ Document

:nodoc:



39
40
41
42
# File 'lib/nokogiri/xml/document.rb', line 39

# Set up the per-document state: an empty list of parse errors and no
# decorators. Any arguments are accepted and ignored here.
def initialize(*args) # :nodoc:
  @decorators = nil
  @errors     = []
end

Instance Attribute Details

#errorsObject

A list of Nokogiri::XML::SyntaxError found when parsing a document



37
38
39
# File 'lib/nokogiri/xml/document.rb', line 37

# The list stored in @errors: the Nokogiri::XML::SyntaxError objects found
# when parsing the document.
def errors; @errors; end

Class Method Details

.new(version = default) ⇒ Object

Create a new document with version (defaults to “1.0”)



314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'ext/nokogiri/xml_document.c', line 314

/*
 * call-seq:
 *  new(version = default)
 *
 * Create a new document with +version+ (defaults to "1.0"). Every argument
 * received is also forwarded to the new object's +initialize+.
 */
static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE rest;
  VALUE version;
  VALUE rb_doc;
  xmlDocPtr doc;

  /* Collect all arguments; only the first one (the version) is used here. */
  rb_scan_args(argc, argv, "0*", &rest);

  version = rb_ary_entry(rest, (long)0);
  if (NIL_P(version)) {
    version = rb_str_new2("1.0");
  }

  doc    = xmlNewDoc((xmlChar *)StringValuePtr(version));
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);

  /* Run the Ruby-level initialize with the original argument list. */
  rb_obj_call_init(rb_doc, argc, argv);

  return rb_doc;
}

.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML) {|options| ... } ⇒ Object

Parse an XML file. string_or_io may be a String, or any object that responds to read and close, such as an IO or StringIO. url is the resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

Yields:

  • (options)


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/nokogiri/xml/document.rb', line 19

###
# Parse an XML document.
#
# +string_or_io+ may be a String, or any object that responds to +read+
# (it is then handed to read_io). +url+ defaults to the object's +path+ when
# it responds to +path+. +encoding+ and +options+ are passed through to the
# reader; +options+ may be an Integer or an object responding to +to_i+.
#
# When +options+ is an Integer it is wrapped in a
# Nokogiri::XML::ParseOptions, and the options are yielded to the block (if
# one is given) before parsing.
#
# Returns a blank document when +string_or_io+ is nil or empty.
def self.parse(string_or_io, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block)
  # Match on Integer rather than Fixnum: Fixnum is not defined on newer
  # Rubies, and Integer covers the same numeric flag values.
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options

  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory cannot cope with empty input, so hand back a blank document
  return new if string_or_io.nil? || string_or_io.empty?

  read_memory(string_or_io, url, encoding, options.to_i)
end

.read_io(io, url, encoding, options) ⇒ Object

Create a new document from an IO object



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'ext/nokogiri/xml_document.c', line 196

/*
 * call-seq:
 *  read_io(io, url, encoding, options)
 *
 * Create a new document from an IO object.
 *
 * +url+ and +encoding+ may be nil, in which case NULL is handed to
 * xmlReadIO. +options+ is converted with NUM2INT and passed as the parser
 * option flags. While parsing, Nokogiri_error_array_pusher is installed as
 * the structured error handler with a fresh array, which is then stored in
 * the document's @errors.
 *
 * If no document is produced, raises the last libxml2 error wrapped as a
 * syntax error, or a RuntimeError ("Could not parse document") when libxml2
 * reports no error.
 */
static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  /* Convert up front: NUM2INT may raise, and raising after the error handler
   * has been installed would leave it registered with error_list. */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = xmlReadIO(
      (xmlInputReadCallback)io_read_callback,
      (xmlInputCloseCallback)io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      c_options
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* There is no document to free here; just report the failure. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

.read_memory(string, url, encoding, options) ⇒ Object

Create a new document from a String



246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'ext/nokogiri/xml_document.c', line 246

/*
 * call-seq:
 *  read_memory(string, url, encoding, options)
 *
 * Create a new document from a String.
 *
 * +url+ and +encoding+ may be nil, in which case NULL is handed to
 * xmlReadMemory. +options+ is converted with NUM2INT and passed as the
 * parser option flags. While parsing, Nokogiri_error_array_pusher is
 * installed as the structured error handler with a fresh array, which is
 * then stored in the document's @errors.
 *
 * If no document is produced, raises the last libxml2 error wrapped as a
 * syntax error, or a RuntimeError ("Could not parse document") when libxml2
 * reports no error.
 */
static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  int len               = (int)RSTRING_LEN(string);
  /* Convert up front: NUM2INT may raise, and raising after the error handler
   * has been installed would leave it registered with error_list. */
  int c_options         = (int)NUM2INT(options);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  xmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);
  doc = xmlReadMemory(c_buffer, len, c_url, c_enc, c_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    /* There is no document to free here; just report the failure. */
    xmlErrorPtr error = xmlGetLastError();

    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

.wrap(document) ⇒ Object

JRuby only: wraps Java's org.w3c.dom.Document and returns a Nokogiri::XML::Document



211
212
213
214
# File 'lib/nokogiri/xml/document.rb', line 211

# Hand +document+ to wrapJavaDocument and return the result.
#
# Raises a RuntimeError ("JRuby only method") unless Nokogiri.jruby? is true.
def self.wrap(document)
  raise "JRuby only method" unless Nokogiri.jruby?

  wrapJavaDocument(document)
end

Instance Method Details

#add_child(child) ⇒ Object Also known as: <<



197
198
199
200
201
202
203
204
205
# File 'lib/nokogiri/xml/document.rb', line 197

# Add +child+ as the root of this document.
#
# When +child+ is a document fragment (its type is
# Node::DOCUMENT_FRAG_NODE), the fragment's first child is added instead.
#
# Raises a RuntimeError if the document already has a root, or if a fragment
# holding more than one child is given.
def add_child(child)
  raise "Document already has a root node" if root
  return super unless child.type == Node::DOCUMENT_FRAG_NODE

  fragment_children = child.children
  raise "Document cannot have multiple root nodes" if fragment_children.size > 1
  super(fragment_children.first)
end

#collect_namespacesObject

Recursively get all namespaces from this node and its subtree and return them as a hash.

For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:hello="world" />
</root>

This method will return:

{ 'xmlns:foo' => 'bar', 'xmlns:hello' => 'world' }

WARNING: this method will clobber duplicate names in the keys. For example, given this document:

<root xmlns:foo="bar">
  <bar xmlns:foo="baz" />
</root>

The hash returned will look like this: { 'xmlns:foo' => 'bar' }

Non-prefixed default namespaces (as in “xmlns=”) are not included in the hash.

Note that this is a very expensive operation in the current implementation, as it traverses the entire graph and also has to bring each node across the libxml bridge into a Ruby object.



123
124
125
126
127
# File 'lib/nokogiri/xml/document.rb', line 123

# Walk every node yielded by +traverse+ and merge each node's +namespaces+
# hash into a single hash, which is returned. Entries merged later overwrite
# earlier entries that share the same key.
def collect_namespaces
  collected = {}
  traverse do |node|
    collected.merge!(node.namespaces)
  end
  collected
end

#create_cdata(text) ⇒ Object

Create a CDATA element containing text



80
81
82
# File 'lib/nokogiri/xml/document.rb', line 80

# Build a Nokogiri::XML::CDATA for this document from +text+, which is
# converted with +to_s+ first.
def create_cdata(text)
  content = text.to_s
  Nokogiri::XML::CDATA.new(self, content)
end

#create_element(name, *args, &block) ⇒ Object

Create an element with name, and optionally setting the content and attributes.

doc.create_element "div" # <div></div>
doc.create_element "div", :class => "container" # <div class='container'></div>
doc.create_element "div", "contents" # <div>contents</div>
doc.create_element "div", "contents", :class => "container" # <div class='container'>contents</div>
doc.create_element("div") { |node| node['class'] = "container" } # <div class='container'></div>


53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/nokogiri/xml/document.rb', line 53

# Create an element named +name+ in this document, optionally setting its
# content and attributes from +args+.
#
# Each Hash in +args+ supplies attributes. Keys that are exactly "xmlns" or
# "xmlns:prefix" become namespace definitions (the prefix is nil for a plain
# "xmlns"); every other key is set as an attribute with both key and value
# converted via +to_s+. Any non-Hash argument is assigned as the element's
# content. The block, if given, is passed on to Nokogiri::XML::Element.new.
#
# Returns the new element.
def create_element(name, *args, &block)
  elm = Nokogiri::XML::Element.new(name, self, &block)
  args.each do |arg|
    case arg
    when Hash
      arg.each do |k, v|
        key = k.to_s
        # Anchor on the whole string (\A/\z); ^/$ match per line and would
        # treat a key such as "a\nxmlns" as a namespace declaration.
        if key =~ /\Axmlns(:\w+)?\z/
          ns_name = key.split(":", 2)[1]
          elm.add_namespace_definition(ns_name, v)
        else
          elm[key] = v.to_s
        end
      end
    else
      elm.content = arg
    end
  end
  elm
end

#create_entity(name, type, external_id, system_id, content) ⇒ Object

Create a new entity named name.

type is an integer representing the type of entity to be created, and it defaults to Nokogiri::XML::EntityDecl::INTERNAL_GENERAL. See the constants on Nokogiri::XML::EntityDecl for more information.

external_id, system_id, and content set the External ID, System ID, and content respectively. All of these parameters are optional.



386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
# File 'ext/nokogiri/xml_document.c', line 386

/*
 * call-seq:
 *  create_entity(name, type, external_id, system_id, content)
 *
 * Create a new entity named +name+ on this document via xmlAddDocEntity.
 *
 * +name+ is required; the other four arguments are optional. A nil +type+
 * falls back to XML_INTERNAL_GENERAL_ENTITY, and nil +external_id+,
 * +system_id+ or +content+ are passed to libxml2 as NULL.
 *
 * Returns the entity wrapped as a cNokogiriXmlEntityDecl node. When libxml2
 * returns no entity, raises its last error wrapped as a syntax error, or a
 * RuntimeError ("Could not create entity") if no error was recorded.
 */
static VALUE create_entity(int argc, VALUE *argv, VALUE self)
{
  VALUE name, type, external_id, system_id, content;
  xmlDocPtr doc;
  xmlEntityPtr entity;
  xmlErrorPtr error;

  Data_Get_Struct(self, xmlDoc, doc);

  /* One mandatory argument followed by up to four optional ones. */
  rb_scan_args(argc, argv, "14",
      &name, &type, &external_id, &system_id, &content);

  xmlResetLastError();
  entity = xmlAddDocEntity(
      doc,
      (xmlChar *)(NIL_P(name)        ? NULL                        : StringValuePtr(name)),
      (int)      (NIL_P(type)        ? XML_INTERNAL_GENERAL_ENTITY : NUM2INT(type)),
      (xmlChar *)(NIL_P(external_id) ? NULL                        : StringValuePtr(external_id)),
      (xmlChar *)(NIL_P(system_id)   ? NULL                        : StringValuePtr(system_id)),
      (xmlChar *)(NIL_P(content)     ? NULL                        : StringValuePtr(content))
    );

  if(entity != NULL)
    return Nokogiri_wrap_xml_node(cNokogiriXmlEntityDecl, (xmlNodePtr)entity);

  /* Creation failed: surface libxml2's error when it recorded one. */
  error = xmlGetLastError();
  if(error == NULL)
    rb_raise(rb_eRuntimeError, "Could not create entity");

  rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
  return Qnil;
}

#create_text_node(text, &block) ⇒ Object

Create a text node with text



75
76
77
# File 'lib/nokogiri/xml/document.rb', line 75

# Build a Nokogiri::XML::Text for this document from +text+ (converted with
# +to_s+), forwarding the optional block to the constructor.
def create_text_node(text, &block)
  string = text.to_s
  Nokogiri::XML::Text.new(string, self, &block)
end

#decorate(node) ⇒ Object

Apply any decorators to node



170
171
172
173
174
175
176
# File 'lib/nokogiri/xml/document.rb', line 170

# Extend +node+ with every module registered in @decorators under a class
# that +node+ is an instance of. Does nothing (and returns nil) when no
# decorators have been set up.
def decorate(node)
  return unless @decorators

  @decorators.each do |klass, modules|
    modules.each { |mod| node.extend(mod) } if node.is_a?(klass)
  end
end

#decorators(key) ⇒ Object

Get the list of decorators given key



130
131
132
133
# File 'lib/nokogiri/xml/document.rb', line 130

# Return the list of decorators registered under +key+, lazily creating both
# the decorator table and an empty list for +key+ when they do not exist yet.
def decorators(key)
  (@decorators ||= Hash.new)[key] ||= []
end

#documentObject

A reference to self



90
91
92
# File 'lib/nokogiri/xml/document.rb', line 90

# A document is its own document: returns self.
def document; self; end

#dupObject Also known as: clone

Copy this Document. An optional depth may be passed in, but it defaults to a deep copy. 0 is a shallow copy, 1 is a deep copy.



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'ext/nokogiri/xml_document.c', line 291

/*
 * call-seq:
 *  dup
 *
 * Copy this Document with xmlCopyDoc. An optional level may be passed in;
 * when no argument is given it defaults to 1. The copy keeps the original
 * document's type and is wrapped in the same Ruby class as the receiver.
 * Returns nil when the copy fails.
 */
static VALUE duplicate_node(int argc, VALUE *argv, VALUE self)
{
  VALUE level;
  xmlDocPtr doc;
  xmlDocPtr copy;
  int given;

  /* Zero arguments supplied means: use the default level of 1. */
  given = rb_scan_args(argc, argv, "01", &level);
  if(given == 0)
    level = INT2NUM((long)1);

  Data_Get_Struct(self, xmlDoc, doc);

  copy = xmlCopyDoc(doc, (int)NUM2INT(level));
  if(NULL == copy) return Qnil;

  copy->type = doc->type;
  return Nokogiri_wrap_xml_document(rb_obj_class(self), copy);
}

#encodingObject

Get the encoding for this Document



166
167
168
169
170
171
172
173
# File 'ext/nokogiri/xml_document.c', line 166

/*
 * call-seq:
 *  encoding
 *
 * Get the encoding for this Document, or nil when the underlying document
 * has no encoding set.
 */
static VALUE encoding(VALUE self)
{
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(doc->encoding)
    return NOKOGIRI_STR_NEW2(doc->encoding);

  return Qnil;
}

#encoding=(encoding) ⇒ Object

Set the encoding string for this Document



150
151
152
153
154
155
156
157
158
# File 'ext/nokogiri/xml_document.c', line 150

/*
 * call-seq:
 *  encoding= encoding
 *
 * Set the encoding string for this Document. The document stores its own
 * copy of the string; the previously stored encoding, if any, is released.
 * Returns +encoding+.
 */
static VALUE set_encoding(VALUE self, VALUE encoding)
{
  xmlDocPtr doc;
  xmlChar * new_encoding;

  Data_Get_Struct(self, xmlDoc, doc);

  /* Convert and duplicate first, so that a failing conversion leaves the
   * document's current encoding untouched. */
  new_encoding = xmlStrdup((xmlChar *)StringValuePtr(encoding));

  /* Free the old string before replacing it; otherwise it would be leaked
   * every time the encoding is reassigned. */
  if(doc->encoding)
    xmlFree((void *)doc->encoding);

  doc->encoding = new_encoding;

  return encoding;
}

#fragment(tags = nil) ⇒ Object

Create a Nokogiri::XML::DocumentFragment from tags. Returns an empty fragment if tags is nil.



189
190
191
# File 'lib/nokogiri/xml/document.rb', line 189

# Build a DocumentFragment for this document from +tags+ (nil by default),
# passing the document's root as the third constructor argument.
def fragment(tags = nil)
  DocumentFragment.new(self, tags, root)
end

#nameObject

The name of this document. Always returns “document”



85
86
87
# File 'lib/nokogiri/xml/document.rb', line 85

# The name of this document; always the string "document".
def name; 'document'; end

#namespacesObject

Get the hash of namespaces on the root Nokogiri::XML::Node



182
183
184
# File 'lib/nokogiri/xml/document.rb', line 182

# The namespaces hash of the root node, or an empty hash when the document
# has no root.
def namespaces
  return {} unless root

  root.namespaces
end

#remove_namespaces!Object

Remove all namespaces from all nodes in the document.

This could be useful for developers who either don't understand namespaces or don't care about them.

The following example shows a use case, and you can decide for yourself whether this is a good thing or not:

doc = Nokogiri::XML <<-EOXML
   <root>
     <car xmlns:part="http://general-motors.com/">
       <part:tire>Michelin Model XGV</part:tire>
     </car>
     <bicycle xmlns:part="http://schwinn.com/">
       <part:tire>I'm a bicycle tire!</part:tire>
     </bicycle>
   </root>
   EOXML

doc.xpath("//tire").to_s # => ""
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => "<part:tire>Michelin Model XGV</part:tire>"
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => "<part:tire>I'm a bicycle tire!</part:tire>"

doc.remove_namespaces!

doc.xpath("//tire").to_s # => "<tire>Michelin Model XGV</tire><tire>I'm a bicycle tire!</tire>"
doc.xpath("//part:tire", "part" => "http://general-motors.com/").to_s # => ""
doc.xpath("//part:tire", "part" => "http://schwinn.com/").to_s # => ""

For more information on why this probably is not a good thing in general, please direct your browser to tenderlovemaking.com/2009/04/23/namespaces-in-xml/



366
367
368
369
370
371
372
373
# File 'ext/nokogiri/xml_document.c', line 366

/*
 * call-seq:
 *  remove_namespaces!
 *
 * Remove all namespaces from all nodes in the document by running
 * recursively_remove_namespaces_from_node starting at the document itself.
 * Returns self.
 */
VALUE remove_namespaces_bang(VALUE self)
{
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  /* The document is handed over as the starting node of the recursion. */
  recursively_remove_namespaces_from_node((xmlNodePtr)doc);

  return self;
}

#rootObject

Get the root node for this document.



131
132
133
134
135
136
137
138
139
140
141
142
# File 'ext/nokogiri/xml_document.c', line 131

/*
 * call-seq:
 *  root
 *
 * Get the root node for this document, or nil when the document has no
 * root element.
 */
static VALUE root(VALUE self)
{
  xmlDocPtr doc;
  xmlNodePtr root_node;

  Data_Get_Struct(self, xmlDoc, doc);

  root_node = xmlDocGetRootElement(doc);
  if(root_node)
    return Nokogiri_wrap_xml_node(Qnil, root_node);

  return Qnil;
}

#root=Object

Set the root element on this document



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'ext/nokogiri/xml_document.c', line 87

/*
 * call-seq:
 *  root=
 *
 * Set the root element on this document.
 *
 * Passing nil unlinks the current root element (if there is one) and leaves
 * the document without a root. Passing a node that belongs to a different
 * document installs a copy of that node made for this document. Returns the
 * +root+ argument as given.
 *
 * Raises a RuntimeError when the node cannot be copied into this document.
 */
static VALUE set_root(VALUE self, VALUE root)
{
  xmlDocPtr doc;
  xmlNodePtr new_root;
  xmlNodePtr old_root;

  Data_Get_Struct(self, xmlDoc, doc);

  old_root = NULL;

  /* root = nil: detach the existing root element, if any, and stop. */
  if(NIL_P(root)) {
    old_root = xmlDocGetRootElement(doc);

    if(old_root) {
      xmlUnlinkNode(old_root);
      /* NOTE(review): NOKOGIRI_ROOT_NODE is defined elsewhere; presumably it
       * registers the detached node so it is still tracked - confirm. */
      NOKOGIRI_ROOT_NODE(old_root);
    }

    return root;
  }

  Data_Get_Struct(root, xmlNode, new_root);


  /* If the new root's document is not the same as the current document,
   * then we need to dup the node in to this document. */
  if(new_root->doc != doc) {
    /* Remember the current root before it is replaced below. */
    old_root = xmlDocGetRootElement(doc);
    if (!(new_root = xmlDocCopyNode(new_root, doc, 1))) {
      rb_raise(rb_eRuntimeError, "Could not reparent node (xmlDocCopyNode)");
    }
  }

  xmlDocSetRootElement(doc, new_root);
  /* old_root is only non-NULL here when the node came from another document. */
  if(old_root) NOKOGIRI_ROOT_NODE(old_root);
  return root;
}

#slop!Object

Explore a document with shortcut methods. See Nokogiri::Slop for details.

Note that any nodes that have been instantiated before #slop! is called will not be decorated with sloppy behavior. So, if you're in irb, the preferred idiom is:

irb> doc = Nokogiri::Slop my_markup

and not

irb> doc = Nokogiri::HTML my_markup
... followed by irb's implicit inspect (and therefore instantiation of every node) ...
irb> doc.slop!
... which does absolutely nothing.


159
160
161
162
163
164
165
166
# File 'lib/nokogiri/xml/document.rb', line 159

# Register Nokogiri::Decorators::Slop as a decorator for XML::Node (unless it
# is already registered) and then run decorate!. Returns self.
def slop!
  node_decorators = decorators(XML::Node)

  unless node_decorators.include?(Nokogiri::Decorators::Slop)
    node_decorators << Nokogiri::Decorators::Slop
    decorate!
  end

  self
end

#to_javaObject

JRuby only: returns Java's org.w3c.dom.Document for this Document.



219
220
221
222
# File 'lib/nokogiri/xml/document.rb', line 219

# Return the result of toJavaDocument for this document.
#
# Raises a RuntimeError ("JRuby only method") unless Nokogiri.jruby? is true.
def to_java
  raise "JRuby only method" unless Nokogiri.jruby?

  toJavaDocument()
end

#urlObject

Get the url name for this document.



71
72
73
74
75
76
77
78
79
# File 'ext/nokogiri/xml_document.c', line 71

/*
 * call-seq:
 *  url
 *
 * Get the url name for this document, or nil when the underlying document
 * has no URL set.
 */
static VALUE url(VALUE self)
{
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(!doc->URL)
    return Qnil;

  return NOKOGIRI_STR_NEW2(doc->URL);
}

#validateObject

Validate this Document against its DTD. Returns a list of errors on the document or nil when there is no DTD.



138
139
140
141
# File 'lib/nokogiri/xml/document.rb', line 138

# Validate this document with its internal subset. Returns whatever the
# subset's +validate+ returns, or nil when the document has no internal
# subset.
def validate
  dtd = internal_subset
  dtd ? dtd.validate(self) : nil
end

#versionObject

Get the XML version for this Document



181
182
183
184
185
186
187
188
# File 'ext/nokogiri/xml_document.c', line 181

/*
 * call-seq:
 *  version
 *
 * Get the XML version for this Document, or nil when the underlying
 * document has no version set.
 */
static VALUE version(VALUE self)
{
  xmlDocPtr doc;

  Data_Get_Struct(self, xmlDoc, doc);

  if(doc->version)
    return NOKOGIRI_STR_NEW2(doc->version);

  return Qnil;
}