Method: ContentFocus::HTML#static_fragments
- Defined in:
- lib/content_focus/html.rb
#static_fragments(options = {}) ⇒ Object
Get all relevant div/span/td/body/p blocks from the HTML page, ranked by how well they match the keywords of the page <title>. This is used to extract atomic/permanent content.
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/content_focus/html.rb', line 73

# Collects candidate content blocks from @doc and ranks them by how many of
# the <title>'s keywords they contain.
#
# Two passes are made over the document:
#   1. "Small" blocks: only the text nodes that are direct children of the
#      element are considered. A positive identifier doubles the score.
#   2. "Big" blocks: the element's full inner text is considered, but only
#      for elements that have a positive identifier. The score is tripled.
# Blocks whose identifier was seen more than once during pass 2 are then
# de-valued (score / 3), the list is sorted by descending score, and blocks
# whose score ended up at 0 are dropped.
#
# @param options [Hash] currently unused by this method
# @return [Array<Hash>] hashes with the keys :score, :element, :inner_text,
#   :parent and :identifier, best match first. When the document has no
#   <title>, the result of +html+ is returned instead.
def static_fragments(options = {})
  title_elements = (@doc/"title")
  return html if !title_elements || title_elements.empty?

  title_inner_text = title_elements.first.inner_text
  keywords = Linguistics::Tagger.keywords_for_caption(title_inner_text)

  # Number of keywords that appear as whole whitespace-separated words in
  # +text+. The text is tokenised once per block, not once per keyword.
  count_keyword_hits = lambda do |text|
    words = text.split(/\s+/)
    keywords.count { |keyword| words.include?(keyword) }
  end

  blocks = []

  # First, find the smallest blocks, but bigger than the title
  (@doc/"div|span|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    # Only the element's own text nodes count here, not text of nested
    # elements. Array() guards against a nil children list.
    text_nodes = Array(element.children).select { |child| child.is_a?(Hpricot::Text) }
    inner_text = text_nodes.map { |child| child.to_s }.join.downcase
    next if inner_text.size <= title_inner_text.size

    # Check the occurrence of keywords in the block, skip if none
    num_matches = count_keyword_hits.call(inner_text)
    next if num_matches == 0

    # Score is the keyword matches, doubled for a positive id/class name
    identifier = element_with_positive_identifier(element)
    score = identifier ? num_matches * 2 : num_matches

    # NOTE(review): :parent holds the parent's object_id here, while the
    # second pass stores the parent element itself. Kept as-is because
    # callers may rely on it -- confirm which form is intended.
    blocks << {:score => score,
               :element => element,
               :inner_text => inner_text,
               :parent => element.parent ? element.parent.object_id : nil,
               :identifier => identifier}
  end

  big_block_identifiers = {}

  # Finding big blocks with both matches and positive identifiers
  (@doc/"div|span|table|td|body|p|dd|ul").each do |element|
    next if element_with_negative_identifier(element)

    identifier = element_with_positive_identifier(element)
    next unless identifier

    # Identifier statistics are logged before the size/keyword filters, so
    # every positively-identified element is counted.
    big_block_identifiers[identifier] ||= 0
    big_block_identifiers[identifier] += 1

    # Non-mutating downcase: the string handed back by the element is left
    # untouched.
    inner_text = element.inner_text.downcase
    next if inner_text.size <= title_inner_text.size

    # Check the occurrence of keywords in the block, skip if none
    num_matches = count_keyword_hits.call(inner_text)
    next if num_matches == 0

    # An identifier is always present at this point, so the score is
    # always tripled.
    blocks << {:score => num_matches * 3,
               :element => element,
               :inner_text => inner_text,
               :parent => element.parent ? element.parent : nil,
               :identifier => identifier}
  end

  # De-value the identifiers that are repeated. This is integer division
  # and applies to blocks from both passes, so a small first-pass score
  # (e.g. 2) becomes 0 and is removed below.
  blocks.each do |block|
    if block[:identifier] && big_block_identifiers[block[:identifier]].to_i > 1
      block[:score] = block[:score] / 3
    end
  end

  # Order those blocks by top matches (descending score)
  blocks.sort! { |left, right| right[:score] <=> left[:score] }
  blocks.reject! { |block| block[:score] == 0 }

  blocks
end