Class: Nokogiri::HTML::Document

Inherits:
XML::Document show all
Defined in:
lib/nokogiri/html/document.rb,
ext/nokogiri/html_document.c

Defined Under Namespace

Classes: EncodingFoundException, EncodingReader

Constant Summary

Constants inherited from XML::Node

XML::Node::ATTRIBUTE_DECL, XML::Node::ATTRIBUTE_NODE, XML::Node::CDATA_SECTION_NODE, XML::Node::COMMENT_NODE, XML::Node::DOCB_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE, XML::Node::DOCUMENT_NODE, XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE, XML::Node::ELEMENT_DECL, XML::Node::ELEMENT_NODE, XML::Node::ENTITY_DECL, XML::Node::ENTITY_NODE, XML::Node::ENTITY_REF_NODE, XML::Node::HTML_DOCUMENT_NODE, XML::Node::NAMESPACE_DECL, XML::Node::NOTATION_NODE, XML::Node::PI_NODE, XML::Node::TEXT_NODE, XML::Node::XINCLUDE_END, XML::Node::XINCLUDE_START

Instance Attribute Summary

Attributes inherited from XML::Document

#errors

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from XML::Document

#add_child, #collect_namespaces, #create_cdata, #create_element, #create_entity, #create_text_node, #decorate, #decorators, #document, #dup, #encoding, #encoding=, #initialize, #name, #namespaces, #remove_namespaces!, #root, #root=, #slop!, #to_java, #url, #validate, #version, wrap

Methods inherited from XML::Node

#<<, #<=>, #==, #>, #[], #[]=, #accept, #add_child, #add_namespace_definition, #add_next_sibling, #add_previous_sibling, #after, #ancestors, #at, #at_css, #at_xpath, #attribute, #attribute_nodes, #attribute_with_ns, #attributes, #before, #blank?, #cdata?, #child, #children, #children=, #comment?, #content, #content=, #create_external_subset, #create_internal_subset, #css, #css_path, #decorate!, #default_namespace=, #description, #document, #dup, #each, #element?, #element_children, #encode_special_chars, #external_subset, #first_element_child, #fragment?, #html?, #initialize, #inner_html, #inner_html=, #internal_subset, #key?, #keys, #last_element_child, #line, #matches?, #namespace, #namespace=, #namespace_definitions, #namespace_scopes, #namespaced_key?, #namespaces, #next_element, #next_sibling, #node_name, #node_name=, #node_type, #parent, #parent=, #parse, #path, #pointer_id, #previous_element, #previous_sibling, #read_only?, #remove_attribute, #replace, #search, #swap, #text?, #to_html, #to_s, #to_xhtml, #to_xml, #traverse, #unlink, #values, #write_html_to, #write_to, #write_xhtml_to, #write_xml_to, #xml?, #xpath

Methods included from XML::PP::Node

#inspect, #pretty_print

Constructor Details

This class inherits a constructor from Nokogiri::XML::Document

Class Method Details

.newObject

Create a new document



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'ext/nokogiri/html_document.c', line 9

static VALUE new(int argc, VALUE *argv, VALUE klass)
{
  VALUE uri, external_id, rest, rb_doc;
  htmlDocPtr doc;

  rb_scan_args(argc, argv, "0*", &rest);
  uri         = rb_ary_entry(rest, (long)0);
  external_id = rb_ary_entry(rest, (long)1);

  doc = htmlNewDoc(
      RTEST(uri) ? (const xmlChar *)StringValuePtr(uri) : NULL,
      RTEST(external_id) ? (const xmlChar *)StringValuePtr(external_id) : NULL
  );
  rb_doc = Nokogiri_wrap_xml_document(klass, doc);
  rb_obj_call_init(rb_doc, argc, argv);
  return rb_doc ;
}

.parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML) {|options| ... } ⇒ Object

Parse HTML. string_or_io may be a String, or any object that responds to read and close such as an IO, or StringIO. url is resource where this document is located. encoding is the encoding that should be used when processing the document. options is a number that sets options in the parser, such as Nokogiri::XML::ParseOptions::RECOVER. See the constants in Nokogiri::XML::ParseOptions.

Yields:

  • (options)


80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/nokogiri/html/document.rb', line 80

def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML

  options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
  # Give the options to the user
  yield options if block_given?

  if string_or_io.respond_to?(:encoding)
    unless string_or_io.encoding.name == "ASCII-8BIT"
      encoding ||= string_or_io.encoding.name
    end
  end

  if string_or_io.respond_to?(:read)
    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
    if !encoding
      # Perform advanced encoding detection that libxml2 does
      # not do.
      string_or_io = EncodingReader.new(string_or_io)
      begin
        return read_io(string_or_io, url, encoding, options.to_i)
      rescue EncodingFoundException => e
        # A retry is required because libxml2 has a problem in
        # that it cannot switch encoding well in the middle of
        # parsing, especially if it has already seen a
        # non-ASCII character when it finds an encoding hint.
        encoding = e.encoding
      end
    end
    return read_io(string_or_io, url, encoding, options.to_i)
  end

  # read_memory pukes on empty docs
  return new if string_or_io.nil? or string_or_io.empty?

  if !encoding
    encoding = EncodingReader.detect_encoding(string_or_io)
  end

  read_memory(string_or_io, url, encoding, options.to_i)
end

.read_io(io, url, encoding, options) ⇒ Object

Read the HTML document from io with given url, encoding, and options. See Nokogiri::HTML.parse



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'ext/nokogiri/html_document.c', line 34

static VALUE read_io( VALUE klass,
                      VALUE io,
                      VALUE url,
                      VALUE encoding,
                      VALUE options )
{
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadIO(
      io_read_callback,
      io_close_callback,
      (void *)io,
      c_url,
      c_enc,
      (int)NUM2INT(options)
  );
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    xmlErrorPtr error;

    xmlFreeDoc(doc);

    error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

.read_memory(string, url, encoding, options) ⇒ Object

Read the HTML document contained in string with given url, encoding, and options. See Nokogiri::HTML.parse



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'ext/nokogiri/html_document.c', line 85

static VALUE read_memory( VALUE klass,
                          VALUE string,
                          VALUE url,
                          VALUE encoding,
                          VALUE options )
{
  const char * c_buffer = StringValuePtr(string);
  const char * c_url    = NIL_P(url)      ? NULL : StringValuePtr(url);
  const char * c_enc    = NIL_P(encoding) ? NULL : StringValuePtr(encoding);
  int len               = (int)RSTRING_LEN(string);
  VALUE error_list      = rb_ary_new();
  VALUE document;
  htmlDocPtr doc;

  xmlResetLastError();
  xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher);

  doc = htmlReadMemory(c_buffer, len, c_url, c_enc, (int)NUM2INT(options));
  xmlSetStructuredErrorFunc(NULL, NULL);

  if(doc == NULL) {
    xmlErrorPtr error;

    xmlFreeDoc(doc);

    error = xmlGetLastError();
    if(error)
      rb_exc_raise(Nokogiri_wrap_xml_syntax_error((VALUE)NULL, error));
    else
      rb_raise(rb_eRuntimeError, "Could not parse document");

    return Qnil;
  }

  document = Nokogiri_wrap_xml_document(klass, doc);
  rb_iv_set(document, "@errors", error_list);
  return document;
}

Instance Method Details

#fragment(tags = nil) ⇒ Object

Create a Nokogiri::XML::DocumentFragment from tags



67
68
69
# File 'lib/nokogiri/html/document.rb', line 67

def fragment tags = nil
  DocumentFragment.new(self, tags, self.root)
end

#meta_encodingObject

Get the meta tag encoding for this document. If there is no meta tag, then nil is returned.



7
8
9
10
# File 'lib/nokogiri/html/document.rb', line 7

def meta_encoding
  meta = meta_content_type and
    /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
end

#meta_encoding=(encoding) ⇒ Object

Set the meta tag encoding for this document. If there is no meta content tag, the encoding is not set.



15
16
17
18
# File 'lib/nokogiri/html/document.rb', line 15

def meta_encoding= encoding
  meta = meta_content_type and
    meta['content'] = "text/html; charset=%s" % encoding
end

#serialize(options = {}) ⇒ Object

Serialize Node using options. Save options can also be set using a block. See SaveOptions.

These two statements are equivalent:

node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)

or

node.serialize(:encoding => 'UTF-8') do |config|
  config.format.as_xml
end


60
61
62
63
# File 'lib/nokogiri/html/document.rb', line 60

def serialize options = {}
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  super
end

#titleObject

Get the title string of this document. Return nil if there is no title tag.



30
31
32
# File 'lib/nokogiri/html/document.rb', line 30

def title
  title = at('title') and title.inner_text
end

#title=(text) ⇒ Object

Set the title string of this document. If there is no head element, the title is not set.



37
38
39
40
41
42
43
44
# File 'lib/nokogiri/html/document.rb', line 37

def title=(text)
  unless title = at('title')
    head = at('head') or return nil
    title = Nokogiri::XML::Node.new('title', self)
    head << title
  end
  title.children = XML::Text.new(text, self)
end

#typeObject

The type for this document



130
131
132
133
134
135
# File 'ext/nokogiri/html_document.c', line 130

static VALUE type(VALUE self)
{
  htmlDocPtr doc;
  Data_Get_Struct(self, xmlDoc, doc);
  return INT2NUM((long)doc->type);
}