Class: Wgit::Document
- Inherits:
-
Object
- Object
- Wgit::Document
- Includes:
- Assertable
- Defined in:
- lib/wgit/document.rb
Overview
Class modeling a HTML web document. Also doubles as a search result.
Constant Summary collapse
- TEXT_ELEMENTS =
[:dd, :div, :dl, :dt, :figcaption, :figure, :hr, :li, :main, :ol, :p, :pre, :span, :ul, :h1, :h2, :h3, :h4, :h5]
Constants included from Assertable
Assertable::DEFAULT_DUCK_FAIL_MSG, Assertable::DEFAULT_TYPE_FAIL_MSG, Assertable::WRONG_METHOD_MSG
Instance Attribute Summary collapse
-
#author ⇒ Object
readonly
Returns the value of attribute author.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#keywords ⇒ Object
readonly
Returns the value of attribute keywords.
-
#links ⇒ Object
readonly
Returns the value of attribute links.
-
#score ⇒ Object
readonly
Returns the value of attribute score.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#title ⇒ Object
readonly
Returns the value of attribute title.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
-
#==(other_doc) ⇒ Object
Override of the default == method, is equal if url and html both match.
-
#[](range) ⇒ Object
Shortcut for calling Document#html.
- #empty? ⇒ Boolean
- #external_links ⇒ Object (also: #external_urls)
-
#initialize(url_or_doc, html = nil) ⇒ Document
constructor
A new instance of Document.
- #internal_full_links ⇒ Object (also: #relative_full_links, #relative_full_urls)
- #internal_links ⇒ Object (also: #relative_links, #relative_urls)
-
#search(text, sentence_limit = 80) ⇒ Array
Searches against the Document#text for the given search text.
-
#search!(text) ⇒ Object
Performs a text search (see search for details) but assigns the results to the @text instance variable.
- #size ⇒ Object
- #stats ⇒ Object
- #to_h(include_html = false) ⇒ Object (also: #to_hash)
-
#xpath(xpath) ⇒ Object
Uses Nokogiri’s xpath method to search the doc’s html and return the results.
Methods included from Assertable
#assert_arr_types, #assert_respond_to, #assert_types
Constructor Details
#initialize(url_or_doc, html = nil) ⇒ Document
Returns a new instance of Document.
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/wgit/document.rb', line 18

# Builds a Document in one of two ways, selected by the type of the first
# argument.
#
# @param url_or_doc [String, #[]] when it is a String it is asserted to be a
#   Url and used as the page's URL; otherwise it is read as a stored record
#   through the symbol keys :url, :html, :title, :author, :keywords, :links,
#   :text and :score.
# @param html [String, nil] the page markup; only used in the String branch,
#   where nil is treated as an empty string.
def initialize(url_or_doc, html = nil)
  if url_or_doc.is_a?(String)
    assert_type(url_or_doc, Url)
    html ||= ""

    @url = url_or_doc
    @html = html
    @doc = Nokogiri::HTML(html) do |config|
      # TODO: Remove #'s below when crawling in production.
      #config.options = Nokogiri::XML::ParseOptions::STRICT |
      #                 Nokogiri::XML::ParseOptions::NONET
    end

    # The init_* helpers are defined elsewhere in the class; presumably they
    # populate @title, @keywords, @links and @text from @doc - verify there.
    init_title
    init_keywords
    init_links
    init_text
    @score = 0.0
  else
    # Init from a mongo collection document.
    # NOTE(review): @doc is not built in this branch, so #xpath has nothing to
    # query on a Document created this way.
    @url = Wgit::Url.new(url_or_doc[:url])
    @html = url_or_doc[:html] || ""
    @title = url_or_doc[:title]
    @author = url_or_doc[:author]
    @keywords = url_or_doc[:keywords] || []
    # Use map rather than map! so the caller's :links array is not rewritten.
    @links = (url_or_doc[:links] || []).map { |link| Wgit::Url.new(link) }
    @text = url_or_doc[:text] || []
    @score = url_or_doc[:score] || 0.0
  end
end
Instance Attribute Details
#author ⇒ Object (readonly)
Returns the value of attribute author.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @author instance variable.
def author
  @author
end
#html ⇒ Object (readonly)
Returns the value of attribute html.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @html instance variable.
def html
  @html
end
#keywords ⇒ Object (readonly)
Returns the value of attribute keywords.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @keywords instance variable.
def keywords
  @keywords
end
#links ⇒ Object (readonly)
Returns the value of attribute links.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @links instance variable.
def links
  @links
end
#score ⇒ Object (readonly)
Returns the value of attribute score.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @score instance variable.
def score
  @score
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @text instance variable.
def text
  @text
end
#title ⇒ Object (readonly)
Returns the value of attribute title.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @title instance variable.
def title
  @title
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
16 17 18 |
# File 'lib/wgit/document.rb', line 16

# Read-only accessor.
#
# @return [Object] the value of the @url instance variable.
def url
  @url
end
Instance Method Details
#==(other_doc) ⇒ Object
Override of the default == method, is equal if url and html both match. Use doc.object_id == other_doc.object_id for exact object comparison.
113 114 115 116 |
# File 'lib/wgit/document.rb', line 113

# Value equality: two documents are equal when both their url and their html
# match. Anything that is not a Wgit::Document is never equal.
#
# @param other_doc [Object] the object to compare against.
# @return [Boolean] true when other_doc is a Wgit::Document with the same url
#   and html.
def ==(other_doc)
  return false unless other_doc.is_a?(Wgit::Document)

  url == other_doc.url && html == other_doc.html
end
#[](range) ⇒ Object
Shortcut for calling Document#html[range].
119 120 121 |
# File 'lib/wgit/document.rb', line 119

# Indexes into the document's html, forwarding the argument unchanged to
# html's own #[].
#
# @param range [Object] whatever html's #[] accepts (e.g. a Range or Integer).
# @return [Object] the result of html[range].
def [](range)
  markup = html
  markup[range]
end
#empty? ⇒ Boolean
123 124 125 |
# File 'lib/wgit/document.rb', line 123

# Reports whether the document has any non-whitespace html.
#
# @return [Boolean] true when html is empty once surrounding whitespace has
#   been stripped.
def empty?
  trimmed = html.strip
  trimmed.empty?
end
#external_links ⇒ Object Also known as: external_urls
71 72 73 74 75 76 77 78 79 80 |
# File 'lib/wgit/document.rb', line 71

# Collects the links that are not relative.
#
# A link whose #relative_link? check raises is left out of the result rather
# than aborting the whole call.
#
# @return [Array] the entries of @links for which #relative_link? is falsy;
#   an empty Array when there are no links.
def external_links
  return [] if @links.empty?

  @links.select do |link|
    begin
      !link.relative_link?
    rescue StandardError
      # A link that cannot be classified is excluded.
      false
    end
  end
end
#internal_full_links ⇒ Object Also known as: relative_full_links, relative_full_urls
63 64 65 66 67 68 69 |
# File 'lib/wgit/document.rb', line 63

# Expands every internal (relative) link into an absolute Wgit::Url by
# appending it to the base of this document's url.
#
# The links stored on the document are left untouched: a leading "/" is added
# to a local copy of the path only, so repeated calls return the same result
# and #internal_links keeps reporting the links as they were found.
#
# @return [Array<Wgit::Url>] one absolute Url per internal link; an empty
#   Array when there are none.
def internal_full_links
  relative = internal_links
  return [] if relative.empty?

  relative.map do |link|
    path = link.start_with?("/") ? link : "/#{link}"
    Wgit::Url.new(@url.to_base + path)
  end
end
#internal_links ⇒ Object Also known as: relative_links, relative_urls
52 53 54 55 56 57 58 59 60 61 |
# File 'lib/wgit/document.rb', line 52

# Collects the relative links.
#
# A link whose #relative_link? check raises is left out of the result rather
# than aborting the whole call.
#
# @return [Array] the entries of @links for which #relative_link? is truthy;
#   an empty Array when there are no links.
def internal_links
  return [] if @links.empty?

  @links.select do |link|
    begin
      link.relative_link?
    rescue StandardError
      # A link that cannot be classified is excluded.
      false
    end
  end
end
#search(text, sentence_limit = 80) ⇒ Array
Searches against the Document#text for the given search text. The number of search hits for each sentence is recorded internally and used to rank/sort the search results before being returned. Where the Database#search method searches all documents for the most hits, this method searches each document's text for the most hits.
Each search result comprises a sentence of a given length. The length will be based on the sentence_limit parameter or the full length of the original sentence, whichever is less. The algorithm ensures that the search value is visible somewhere in the sentence.
sentence.
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/wgit/document.rb', line 143

# Searches the sentences in @text for the given text (case-insensitive) and
# returns the matching sentences, most hits first.
#
# Each matching sentence is copied, stripped of surrounding whitespace and
# passed to Wgit::Utils.format_sentence_length together with the index of the
# first hit and sentence_limit. The sentences held in @text are never
# modified, so repeated searches always run against the original text.
#
# @param text [String] the search text. NOTE(review): it is compiled with
#   Regexp.new, so regex metacharacters in it are interpreted as a pattern.
# @param sentence_limit [Integer] an even number forwarded to
#   Wgit::Utils.format_sentence_length.
# @return [Array<String>] the formatted matching sentences ordered by
#   descending hit count; an empty Array when nothing matches.
# @raise [RuntimeError] if text is empty or sentence_limit is odd.
def search(text, sentence_limit = 80)
  raise "A search value must be provided" if text.empty?
  raise "The sentence length value must be even" if sentence_limit.odd?

  regex = Regexp.new(text, Regexp::IGNORECASE)
  results = {}

  @text.each do |sentence|
    hits = sentence.scan(regex).count
    next unless hits > 0

    # Work on a copy: String#strip returns a new String, whereas strip! and
    # the formatter below would otherwise rewrite the document's own text.
    snippet = sentence.strip
    index = snippet.index(regex)
    # The return value is unused, so the formatter is expected to shorten the
    # snippet in place - confirm against Wgit::Utils.
    Wgit::Utils.format_sentence_length(snippet, index, sentence_limit)
    results[snippet] = hits
  end

  # Sort ascending by hits, then reverse, so the best match comes first.
  results.sort_by { |_snippet, hits| hits }.map(&:first).reverse
end
#search!(text) ⇒ Object
Performs a text search (see search for details) but assigns the results to the @text instance variable. This can be used for sub search functionality. Note that there is no way of getting the original text back however.
169 170 171 |
# File 'lib/wgit/document.rb', line 169

# Runs #search with the given text and replaces @text with the results, so
# the previous contents of @text are discarded.
#
# @param text [String] the search text, passed straight to #search.
# @return [Array] the search results now stored in @text.
def search!(text)
  matches = search(text)
  @text = matches
end
#size ⇒ Object
101 102 103 |
# File 'lib/wgit/document.rb', line 101

# Convenience lookup of the :html entry of #stats.
#
# @return [Object, nil] the value stored under :html in the stats hash, or
#   nil when that key is absent.
def size
  summary = stats
  summary[:html]
end
#stats ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/wgit/document.rb', line 82

# Builds a hash of length statistics, one entry per instance variable that
# responds to #length.
#
# @text is reported as two entries: :text_length (number of entries) and
# :text_bytes (sum of each entry's #length). Every other variable is keyed by
# its name without the leading "@" and mapped to its #length. Variables that
# do not respond to #length are skipped.
#
# @return [Hash{Symbol => Integer}] the collected statistics.
def stats
  instance_variables.each_with_object({}) do |ivar, summary|
    if ivar == :@text
      summary[:text_length] = @text.length
      summary[:text_bytes] = @text.sum(&:length)
    else
      value = instance_variable_get(ivar)
      next unless value.respond_to?(:length)

      summary[ivar.to_s[1..-1].to_sym] = value.length
    end
  end
end
#to_h(include_html = false) ⇒ Object Also known as: to_hash
105 106 107 108 109 |
# File 'lib/wgit/document.rb', line 105

# Converts the document via Wgit::Utils.to_h, passing it the list of instance
# variables to leave out.
#
# @param include_html [Boolean] when falsy, :@html is added to the ignore
#   list; :@doc is always on it.
# @return [Object] whatever Wgit::Utils.to_h returns for this document.
def to_h(include_html = false)
  excluded = [:@doc] # Always ignore :@doc
  excluded.unshift(:@html) unless include_html
  Wgit::Utils.to_h(self, excluded)
end
#xpath(xpath) ⇒ Object
Uses Nokogiri’s xpath method to search the doc’s html and return the results.
175 176 177 |
# File 'lib/wgit/document.rb', line 175

# Forwards the given expression to @doc's #xpath and returns its result.
#
# @param xpath [String] the XPath expression to evaluate.
# @return [Object] whatever @doc.xpath returns for the expression.
def xpath(xpath)
  parsed = @doc
  parsed.xpath(xpath)
end