Class: Nokolexbor::Document

Inherits:
Node
  • Object
show all
Defined in:
lib/nokolexbor/document.rb,
ext/nokolexbor/nl_document.c

Constant Summary

Constants inherited from Node

Node::ATTRIBUTE_NODE, Node::CDATA_SECTION_NODE, Node::COMMENT_NODE, Node::DOCUMENT_FRAG_NODE, Node::DOCUMENT_NODE, Node::DOCUMENT_TYPE_NODE, Node::ELEMENT_NODE, Node::ENTITY_NODE, Node::ENTITY_REF_NODE, Node::LOOKS_LIKE_XPATH, Node::NOTATION_NODE, Node::PI_NODE, Node::TEXT_NODE

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Node

#<<, #==, #[], #[]=, #add_child, #add_class, #add_next_sibling, #add_previous_sibling, #add_sibling, #after, #ancestors, #append_class, #at, #at_css, #at_css_impl, #at_xpath, #attribute, #attribute_nodes, #attributes, #attrs, #before, #cdata?, #child, #children, #children=, #classes, #clone, #comment?, #content, #content=, #css, #css_impl, #css_path, #destroy, #document?, #each, #element?, #element_children, #first_element_child, #fragment, #fragment?, #inner_html, #inspect, #key?, #keys, #kwattr_add, #kwattr_append, #kwattr_remove, #kwattr_values, #last_element_child, #matches?, #name, #next, #next_element, #node_type, #nokogiri_at_css, #nokogiri_css, #outer_html, #parent, #parent=, #parse, #path, #prepend_child, #previous, #previous_element, #processing_instruction?, #remove, #remove_attr, #remove_class, #replace, #search, #source_location, #swap, #text?, #traverse, #value?, #values, #wrap, #write_to, #xpath

Class Method Details

.new ⇒ Document

Create a new document.

Returns:



84
85
86
87
88
# File 'ext/nokolexbor/nl_document.c', line 84

/*
 * Create a new document.
 *
 * Hands a zero-length Ruby string to nl_document_parse and returns whatever
 * that call returns.
 */
static VALUE
nl_document_new(VALUE self)
{
  VALUE rb_empty_html = rb_str_new("", 0);

  return nl_document_parse(self, rb_empty_html);
}

.parse(string_or_io) ⇒ Document

Parse HTML into a Nokolexbor::Document.

Returns:

Parameters:

  • string_or_io (String, #read)

    The HTML to be parsed. It may be a String, or any object that responds to #read such as an IO, or StringIO.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'ext/nokolexbor/nl_document.c', line 40

/*
 * Parse HTML into a new Nokolexbor::Document.
 *
 * rb_string_or_io is either a String or any object that responds to #read.
 * In the latter case #read is called with no arguments and its result is used
 * as the HTML source. StringValuePtr raises TypeError when the resulting value
 * cannot be converted to a String.
 *
 * With HAVE_PTHREAD_H the parser is looked up through the p_key_parser
 * thread-specific key and stored back there after creation, so it is reused by
 * later calls. Without it, a parser is created for this call only and is
 * destroyed as soon as parsing has finished.
 *
 * Raises through nl_raise_lexbor_error when the parser cannot be initialised,
 * and RuntimeError when lxb_html_parse returns NULL.
 */
static VALUE
nl_document_parse(VALUE self, VALUE rb_string_or_io)
{
  /* rb_intern yields an ID (not a VALUE); rb_respond_to/rb_funcall take an ID. */
  ID id_read = rb_intern("read");
  VALUE rb_html;
  if (rb_respond_to(rb_string_or_io, id_read)) {
    rb_html = rb_funcall(rb_string_or_io, id_read, 0);
  } else {
    rb_html = rb_string_or_io;
  }
  const char *html_c = StringValuePtr(rb_html);
  size_t html_len = RSTRING_LEN(rb_html);

#ifdef HAVE_PTHREAD_H
  lxb_html_parser_t *g_parser = (lxb_html_parser_t *)pthread_getspecific(p_key_parser);
#else
  lxb_html_parser_t *g_parser = NULL;
#endif
  if (g_parser == NULL) {
    g_parser = lxb_html_parser_create();
    lxb_status_t status = lxb_html_parser_init(g_parser);
    if (status != LXB_STATUS_OK) {
      /*
       * Release the partially built parser before raising so it is not leaked.
       * NOTE(review): nl_raise_lexbor_error is expected never to return.
       */
      lxb_html_parser_destroy(g_parser);
      nl_raise_lexbor_error(status);
    }
    g_parser->tree->scripting = true;
#ifdef HAVE_PTHREAD_H
    pthread_setspecific(p_key_parser, g_parser);
#endif
  }

  lxb_html_document_t *document = lxb_html_parse(g_parser, (const lxb_char_t *)html_c, html_len);

#ifndef HAVE_PTHREAD_H
  /*
   * Nothing caches the parser in this configuration, so it must be released
   * here; the parsed document is a separate object and stays valid.
   */
  lxb_html_parser_destroy(g_parser);
#endif

  if (document == NULL) {
    rb_raise(rb_eRuntimeError, "Error parsing document");
  }

  return TypedData_Wrap_Struct(cNokolexborDocument, &nl_document_type, document);
}

Instance Method Details

#create_cdata(string, &block) ⇒ CDATA

Create a CDATA containing string.

Returns:



57
58
59
# File 'lib/nokolexbor/document.rb', line 57

# Build a CDATA node for this document.
#
# @param string [#to_s] converted with #to_s before being handed to CDATA.new
# @param block forwarded unchanged to Nokolexbor::CDATA.new
# @return [CDATA]
def create_cdata(string, &block)
  text = string.to_s
  Nokolexbor::CDATA.new(text, self, &block)
end

#create_comment(string, &block) ⇒ Comment

Create a Comment containing string.

Returns:



64
65
66
# File 'lib/nokolexbor/document.rb', line 64

# Build a Comment node for this document.
#
# @param string [#to_s] converted with #to_s before being handed to Comment.new
# @param block forwarded unchanged to Nokolexbor::Comment.new
# @return [Comment]
def create_comment(string, &block)
  text = string.to_s
  Nokolexbor::Comment.new(text, self, &block)
end

#create_element(name, *contents_or_attrs, &block) ⇒ Element

Create an Element with name belonging to this document, optionally setting contents or attributes.

Examples:

An empty element without attributes

doc.create_element("div")
# => <div></div>

An element with contents

doc.create_element("div", "contents")
# => <div>contents</div>

An element with attributes

doc.create_element("div", {"class" => "container"})
# => <div class='container'></div>

An element with contents and attributes

doc.create_element("div", "contents", {"class" => "container"})
# => <div class='container'>contents</div>

Passing a block to mutate the element

doc.create_element("div") { |node| node["class"] = "blue" }
# => <div class='blue'></div>

Parameters:

  • name (String)
  • contents_or_attrs (#to_s, Hash)

Returns:



32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/nokolexbor/document.rb', line 32

# Build an Element named +name+ for this document, then apply each extra
# argument in order: a Hash sets attributes (keys and values are both
# stringified), anything else replaces the element's content with its #to_s.
#
# @param name [String] tag name passed to Nokolexbor::Element.new
# @param contents_or_attrs [Array<#to_s, Hash>] contents and/or attribute hashes
# @param block forwarded unchanged to Nokolexbor::Element.new
# @return [Element] the newly built element
def create_element(name, *contents_or_attrs, &block)
  element = Nokolexbor::Element.new(name, self, &block)

  contents_or_attrs.each do |item|
    if item.is_a?(Hash)
      item.each { |key, value| element[key.to_s] = value.to_s }
    else
      element.content = item.to_s
    end
  end

  element
end

#create_text_node(string, &block) ⇒ Text

Create a Text with string.

Returns:



50
51
52
# File 'lib/nokolexbor/document.rb', line 50

# Build a Text node for this document.
#
# @param string [#to_s] converted with #to_s before being handed to Text.new
# @param block forwarded unchanged to Nokolexbor::Text.new
# @return [Text]
def create_text_node(string, &block)
  text = string.to_s
  Nokolexbor::Text.new(text, self, &block)
end

#document ⇒ Document

A reference to self.

Returns:



71
72
73
# File 'lib/nokolexbor/document.rb', line 71

# Return the receiver itself.
#
# @return [Document] self
def document
  self
end

#meta_encoding ⇒ String

Get the meta tag encoding for this document. If there is no meta tag, nil is returned.

Returns:

  • (String, nil)


78
79
80
81
82
83
84
# File 'lib/nokolexbor/document.rb', line 78

# Read the encoding declared by the document's meta tags.
#
# A <meta charset> element wins when one is found; otherwise the charset is
# extracted from the "content" attribute of the element returned by
# #meta_content_type. Returns nil when neither lookup finds an element.
#
# @return [String, nil]
def meta_encoding
  charset_tag = at_css("meta[charset]")
  return charset_tag[:charset] if charset_tag

  content_type_tag = meta_content_type
  return unless content_type_tag

  content_type_tag["content"][/charset\s*=\s*([\w-]+)/i, 1]
end

#meta_encoding=(encoding) ⇒ Object

Set the meta tag encoding for this document.

If a meta encoding tag is already present, its content is replaced with the given encoding.

Otherwise, this method tries to create one at an appropriate place supplying head and/or html elements as necessary, which is inside a head element if any, and before any text node or content element (typically <body>) if any.



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/nokolexbor/document.rb', line 95

# Set the meta tag encoding for this document.
#
# Lookup order:
# 1. If #meta_content_type returns an element, its "content" attribute is
#    rewritten to "text/html; charset=<encoding>".
# 2. Otherwise, if a <meta charset> element exists, its "charset" attribute
#    is overwritten.
# 3. Otherwise a new <meta charset> node is built and placed as the first
#    child of <head>, or handed to #set_metadata_element when the document
#    has no <head>.
#
# @param encoding [String] the encoding name to declare
# @return [Object] the given encoding
def meta_encoding=(encoding)
  if (meta = meta_content_type)
    meta["content"] = format("text/html; charset=%s", encoding)
    encoding
  elsif (meta = at_css("meta[charset]"))
    meta["charset"] = encoding
  else
    meta = Nokolexbor::Node.new("meta", self)
    meta["charset"] = encoding

    if (head = at_css("head"))
      head.prepend_child(meta)
    else
      # Without a <head> the new node still has to be attached somewhere;
      # #set_metadata_element picks the insertion point.
      set_metadata_element(meta)
    end
    encoding
  end
end

#root ⇒ Node

Get the root node for this document.

Returns:



138
139
140
141
142
143
# File 'ext/nokolexbor/nl_document.c', line 138

/*
 * Get the root node for this document.
 *
 * Unwraps the lexbor document behind self, asks lexbor for its root node and
 * wraps that node with nl_rb_node_create, passing self as the owning document.
 */
static VALUE
nl_document_root(VALUE self)
{
  lxb_dom_document_t *dom_document = nl_rb_document_unwrap(self);

  return nl_rb_node_create(lxb_dom_document_root(dom_document), self);
}

#set_metadata_element(element) ⇒ Object



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/nokolexbor/document.rb', line 121

# Attach a metadata element (such as a <meta> node) to the document,
# creating the containers it needs when they are missing.
#
# Placement rules, tried in order:
# 1. <head> exists: append the element to it.
# 2. Only <html> exists: create a <head> as its first child and put the
#    element inside.
# 3. Neither exists but the document has a child node: insert the element
#    just before the first such node.
# 4. The document is empty: create <html> and <head>, then put the element
#    inside <head>.
#
# @param element [Node] the node to insert
# @return [Object] whatever the insertion call of the chosen branch returns
def set_metadata_element(element)
  if (head = at_css("head"))
    head << element
  elsif (html = at_css("html"))
    head = html.prepend_child(Nokolexbor::Node.new("head", self))
    head.prepend_child(element)
  elsif (first = children.find { |node| node.is_a?(Nokolexbor::Node) })
    # We reach here only if the underlying document model
    # allows <html>/<head> elements to be omitted and does not
    # automatically supply them.
    first.add_previous_sibling(element)
  else
    html = add_child(Nokolexbor::Node.new("html", self))
    head = html.add_child(Nokolexbor::Node.new("head", self))
    head.prepend_child(element)
  end
end

#title ⇒ String

Get the title of this document.

Returns:

  • (String)


103
104
105
106
107
108
109
# File 'ext/nokolexbor/nl_document.c', line 103

/*
 * Get the title of this document.
 *
 * Returns the text reported by lxb_html_document_title as a UTF-8 String, or
 * an empty UTF-8 String when lexbor returns NULL.
 */
static VALUE
nl_document_get_title(VALUE self)
{
  size_t len = 0;
  /* lexbor hands back a pointer to const data; keep the qualifier. */
  const lxb_char_t *str = lxb_html_document_title(nl_rb_document_unwrap(self), &len);

  if (str == NULL) {
    /* Use the same encoding as the non-empty case so callers always get UTF-8. */
    return rb_utf8_str_new("", 0);
  }

  /* lxb_char_t is an unsigned character type; the Ruby API wants const char *. */
  return rb_utf8_str_new((const char *)str, len);
}

#title=(text) ⇒ String

Set the title of this document.

If a title element is already present, its content is replaced with the given text.

Otherwise, this method tries to create one inside <head>.

Returns:

  • (String)

Returns:

  • (String)


124
125
126
127
128
129
130
131
# File 'ext/nokolexbor/nl_document.c', line 124

/*
 * Set the title of this document.
 *
 * Passes the bytes of rb_title to lxb_html_document_title_set. StringValuePtr
 * raises TypeError when rb_title cannot be converted to a String, and a
 * non-OK lexbor status is raised through nl_raise_lexbor_error instead of
 * being ignored.
 *
 * Returns rb_title.
 */
static VALUE
nl_document_set_title(VALUE self, VALUE rb_title)
{
  const char *c_title = StringValuePtr(rb_title);
  size_t len = RSTRING_LEN(rb_title);

  /* The setter reports success or failure as a status code, not a pointer. */
  lxb_status_t status = lxb_html_document_title_set(nl_rb_document_unwrap(self), (const lxb_char_t *)c_title, len);
  if (status != LXB_STATUS_OK) {
    nl_raise_lexbor_error(status);
  }

  return rb_title;
}