Class: Nokogiri::HTML::Document

Inherits:
XML::Document show all
Defined in:
lib/nokogiri/html/document.rb,
ext/nokogiri/html_document.c

Defined Under Namespace

Classes: EncodingFound, EncodingReader

Constant Summary

Constants inherited from XML::Document

XML::Document::NCNAME_CHAR, XML::Document::NCNAME_RE, XML::Document::NCNAME_START_CHAR

Constants inherited from XML::Node

XML::Node::ATTRIBUTE_DECL, XML::Node::ATTRIBUTE_NODE, XML::Node::CDATA_SECTION_NODE, XML::Node::COMMENT_NODE, XML::Node::DOCB_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE, XML::Node::DOCUMENT_NODE, XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE, XML::Node::ELEMENT_DECL, XML::Node::ELEMENT_NODE, XML::Node::ENTITY_DECL, XML::Node::ENTITY_NODE, XML::Node::ENTITY_REF_NODE, XML::Node::HTML_DOCUMENT_NODE, XML::Node::NAMESPACE_DECL, XML::Node::NOTATION_NODE, XML::Node::PI_NODE, XML::Node::TEXT_NODE, XML::Node::XINCLUDE_END, XML::Node::XINCLUDE_START

Constants included from XML::Searchable

XML::Searchable::LOOKS_LIKE_XPATH

Instance Attribute Summary

Attributes inherited from XML::Document

#errors

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from XML::Document

#add_child, #canonicalize, #collect_namespaces, #create_cdata, #create_comment, #create_element, #create_entity, #create_text_node, #decorate, #decorators, #document, #dup, #encoding, #encoding=, #initialize, #name, #namespaces, #remove_namespaces!, #root, #root=, #slop!, #to_java, #url, #validate, #version, wrap

Methods inherited from XML::Node

#<<, #<=>, #==, #>, #[], #[]=, #accept, #add_child, #add_class, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #append_class, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #canonicalize, #cdata?, #child, #children, #children=, #classes, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css_path, #decorate!, #default_namespace=, #description, #do_xinclude, #document, #document?, #dup, #each, #element?, #element_children, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #html?, #initialize, #inner_html, #inner_html=, #internal_subset, #key?, #keys, #lang, #lang=, #last_element_child, #line, #matches?, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #namespaces, #native_content=, #next_element, #next_sibling, #node_name, #node_name=, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #prepend_child, #previous_element, #previous_sibling, #processing_instruction?, #read_only?, #remove_attribute, #remove_class, #replace, #swap, #text?, #to_html, #to_s, #to_xhtml, #to_xml, #traverse, #unlink, #values, #wrap, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?

Methods included from XML::Searchable

#at, #at_css, #at_xpath, #css, #search, #xpath

Methods included from XML::PP::Node

#inspect, #pretty_print

Constructor Details

This class inherits a constructor from Nokogiri::XML::Document

Class Method Details

.newObject

Create a new document


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'ext/nokogiri/html_document.c', line 11

static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE uri, external_id, rest, rb_doc;
  htmlDocPtr doc;

  rb_scan_args(argc, argv, "0*", &rest);
  uri         = rb_ary_entry(rest, (long)0);
  external_id = rb_ary_entry(rest, (long)1);

  doc = htmlNewDoc(
      RTEST(uri) ? (const xmlChar *)StringValueCStr(uri) : NULL,
      RTEST(external_id) ? (const xmlChar *)StringValueCStr(external_id) : NULL
  );
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);
  rb_obj_call_init(rb_doc, argc, argv);
  return rb_doc ;
}

.parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) {|options| ... } ⇒ Object

Parse HTML. string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO. url is resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

Yields:

  • (options)

162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# File 'lib/nokogiri/html/document.rb', line 162

def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML

  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:encoding)
    unless string_or_io.encoding.name == "ASCII-8BIT"
      encoding ||= string_or_io.encoding.name
    end
  end

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    unless encoding
      # Libxml2's parser has poor support for encoding
      # detection.  First, it does not recognize the HTML5
      # style meta charset declaration.  Secondly, even if it
      # successfully detects an encoding hint, it does not
      # re-decode or re-parse the preceding part which may be
      # garbled.
      #
      # EncodingReader aims to perform advanced encoding
      # detection beyond what Libxml2 does, and to emulate
      # rewinding of a stream and make Libxml2 redo parsing
      # from the start when an encoding hint is found.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFound => e
        encoding = e.found_encoding
      end
    end
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  if string_or_io.nil? or string_or_io.empty?
    return encoding ? new.tap { |i| i.encoding = encoding } : new
  end

  encoding ||= EncodingReader.detect_encoding(string_or_io)

  read_memory(string_or_io, url, encoding, options.to_i)
end

.read_io(io, url, encoding, options) ⇒ Object

Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse


36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'ext/nokogiri/html_document.c', line 36

static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
      io_read_callback,
      io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      (int)NUM2INT(options)
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  /*
   * If EncodingFound has occurred in EncodingReader, make sure to do
   * a cleanup and propagate the error.
   */
  if (rb_respond_to(io, id_encoding_found)) {
    VALUE encoding_found = rb_funcall(io, id_encoding_found, 0);
    if (!NIL_P(encoding_found)) {
      xmlFreeDoc(doc);
      rb_exc_raise(encoding_found);
    }
  }

  if(doc == NULL) {
    xmlErrorPtr error;

    xmlFreeDoc(doc);

    error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

.read_memory(string, url, encoding, options) ⇒ Object

Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse


99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'ext/nokogiri/html_document.c', line 99

static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValueCStr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValueCStr(encoding);
  int len               = (int)RSTRING_LEN(string);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    xmlErrorPtr error;

    xmlFreeDoc(doc);

    error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

Instance Method Details

#fragment(tags = nil) ⇒ Object

Create a Nokogiri::XML::DocumentFragment from tags


149
150
151
# File 'lib/nokogiri/html/document.rb', line 149

def fragment tags = nil
  DocumentFragment.new(self, tags, self.root)
end

#meta_encodingObject

Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.


7
8
9
10
11
12
13
14
# File 'lib/nokogiri/html/document.rb', line 7

def meta_encoding
  case
  when meta = at('//meta[@charset]')
    meta[:charset]
  when meta = meta_content_type
    meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
  end
end

#meta_encoding=(encoding) ⇒ Object

Set the meta tag encoding for this document.

If an meta encoding tag is already present, its content is replaced with the given text.

Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, and before any text node or content element (typically <body>) if any.

The result when trying to set an encoding that is different from the document encoding is undefined.

Beware in CRuby, that libxml2 automatically inserts a meta tag into a head element.


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/nokogiri/html/document.rb', line 32

def meta_encoding= encoding
  case
  when meta = meta_content_type
    meta['content'] = 'text/html; charset=%s' % encoding
    encoding
  when meta = at('//meta[@charset]')
    meta['charset'] = encoding
  else
    meta = XML::Node.new('meta', self)
    if dtd = internal_subset and dtd.html5_dtd?
      meta['charset'] = encoding
    else
      meta['http-equiv'] = 'Content-Type'
      meta['content'] = 'text/html; charset=%s' % encoding
    end

    case
    when head = at('//head')
      head.prepend_child(meta)
    else
      (meta)
    end
    encoding
  end
end

#serialize(options = {}) ⇒ Object

Serialize Node using options. Save options can also be set using a block. See SaveOptions.

These two statements are equivalent:

node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)

or

node.serialize(:encoding => 'UTF-8') do |config|
  config.format.as_xml
end

142
143
144
145
# File 'lib/nokogiri/html/document.rb', line 142

def serialize options = {}
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  super
end

#titleObject

Get the title string of this document. Return nil if there is no title tag.


68
69
70
# File 'lib/nokogiri/html/document.rb', line 68

def title
  title = at('//title') and title.inner_text
end

#title=(text) ⇒ Object

Set the title string of this document.

If a title element is already present, its content is replaced with the given text.

Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, right after a meta encoding/charset tag if any, and before any text node or content element (typically <body>) if any.


83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/nokogiri/html/document.rb', line 83

def title=(text)
  tnode = XML::Text.new(text, self)
  if title = at('//title')
    title.children = tnode
    return text
  end

  title = XML::Node.new('title', self) << tnode
  case
  when head = at('//head')
    head << title
  when meta = at('//meta[@charset]') || meta_content_type
    # better put after charset declaration
    meta.add_next_sibling(title)
  else
    (title)
  end
  text
end

#typeObject

The type for this document


144
145
146
147
148
149
# File 'ext/nokogiri/html_document.c', line 144

static VALUE type(VALUE self)
{
  htmlDocPtr doc;
  Data_Get_Struct(self, xmlDoc, doc);
  return INT2NUM((long)doc->type);
}