Class: Pismo::Reader::Tree

Inherits:
Base
  • Object
show all
Defined in:
lib/pismo/reader/tree.rb

Constant Summary

Constants inherited from Base

Base::BAD_WORDS, Base::BLOCK_OUTPUT_ELEMENTS, Base::COULD_CONTAIN_FULL_CONTENT, Base::FATAL_WORDS, Base::GOOD_WORDS, Base::INLINE_OUTPUT_ELEMENTS, Base::META_WORDS, Base::NON_HEADER_ELEMENTS, Base::OK_ATTRIBUTES, Base::OK_CLEAN_ATTRIBUTES, Base::OK_ELEMENTS, Base::OUTPUT_ELEMENTS, Base::WONT_CONTAIN_FULL_CONTENT

Instance Attribute Summary

Attributes inherited from Base

#content_candidates, #doc, #options, #raw_content

Instance Method Summary collapse

Methods inherited from Base

#build_doc, #content, #images, #initialize, #sentences, #strip

Constructor Details

This class inherits a constructor from Pismo::Reader::Base

Instance Method Details

#analyze ⇒ Object

Analyze the structure of the HTML document and score branches for likelihood of containing useful content.



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# File 'lib/pismo/reader/tree.rb', line 6

# Analyze the structure of the HTML document and score branches for
# likelihood of containing useful content.
#
# Every element matched by COULD_CONTAIN_FULL_CONTENT that holds at least 80
# characters of text and at least 10 words is turned into a "branch" hash
# (ids, word counts, child counts, score, ...) and stored in @tree under the
# element's path. The five best-scoring branches end up in
# @content_candidates as [path, branch] pairs, best first.
#
# @return [Array<Array>] the value assigned to @content_candidates
def analyze
  @tree = {}

  # Per-node measurements keyed by node path. Candidate elements can nest, so
  # the same descendant may be traversed several times; caching avoids
  # re-tokenizing its text and attributes.
  subels = {}

  # Lowercase alphabetic words found in a node's id and class attributes.
  # Downcasing happens BEFORE the letters are extracted so that mixed-case
  # names such as "SideBar" still match the lowercase word lists.
  id_words = lambda do |node|
    "#{node['id']} #{node['class']}".downcase.scan(/[a-z]+/)
  end

  # Text sitting directly inside these elements adds to the raw word count
  # only, never to the "good" word count.
  uncounted_parents = %w[a h1 h2 h3 h4 h5 h6]

  @doc.css(COULD_CONTAIN_FULL_CONTENT.join(", ")).each do |el|
    # Assume that no content we'll want comes in a total package of fewer than 80 characters!
    next unless el.text.to_s.strip.length >= 80

    # The first two path segments are dropped, so depth counts the levels below them.
    depth = (el.path.scan(/[a-z]+/)[2..-1] || []).length

    # Collect id/class words from the element itself and from its ancestors.
    local_ids = id_words.call(el)
    ids = local_ids
    ancestor = el.parent
    (depth - 1).times do
      ids += id_words.call(ancestor)
      ancestor = ancestor.parent
    end

    score = (GOOD_WORDS & ids).size * 2 - (BAD_WORDS & ids).size
    next if score < -5

    # Elements that have an ID or class are more likely to be our winners
    score += 2 unless local_ids.empty?

    branch = {
      ids: ids,
      local_ids: local_ids,
      score: score,
      name: el.name,
      depth: depth,
      path: el.path,
      raw_word_count: 0,
      word_count: 0,
      child_count: 0,
      bad_child_count: 0,
      score_steps: []
    }

    el.traverse do |subel|
      info = (subels[subel.path] ||= {})
      info[:is_text] ||= subel.text?

      if info[:is_text]
        info[:text] ||= subel.text.downcase.scan(/[a-z]+/)
        next if info[:text].empty?

        info[:raw_word_count] ||= info[:text].size
        # Only words longer than three letters count as "good" words.
        info[:word_count] ||=
          if uncounted_parents.include?(subel.parent.name)
            0
          else
            info[:text].count { |word| word.length > 3 }
          end
        info[:meta_matches] ||= (info[:text] & META_WORDS).size

        branch[:raw_word_count] += info[:raw_word_count]
        branch[:word_count] += info[:word_count] - info[:meta_matches]
      end

      info[:ids] ||= id_words.call(subel)
      info[:bad_child_count_inc] = (BAD_WORDS & info[:ids]).size - (GOOD_WORDS & info[:ids]).size
      info[:child_count_inc] = info[:ids].empty? ? 0 : 1

      branch[:bad_child_count] += info[:bad_child_count_inc]
      branch[:child_count] += info[:child_count_inc]
    end

    # Branches with fewer than 10 words are never candidates; skip the scoring work.
    next if branch[:raw_word_count] < 10

    comma_count = el.text.scan(/,\s/).size

    branch[:score] += 2 if branch[:name] == "div"
    branch[:score] += 4 if comma_count > 10
    branch[:score_steps] << "lots of commas!" if comma_count > 5
    branch[:score] *= 3

    branch[:score] *= 0.7 if el.children && el.children.size < 3
    branch[:score] *= 1.25 if branch[:raw_word_count] > 10
    branch[:score] += [branch[:word_count], 1].max ** 0.5

    # Good words per id/class-bearing descendant (denominator is at least 1).
    word_child_count_ratio = branch[:word_count].to_f / [branch[:child_count], 1].max
    branch[:word_child_count_ratio] = word_child_count_ratio

    if branch[:raw_word_count] > 100
      good_word_ratio = branch[:word_count].to_f / branch[:raw_word_count]
      branch[:score] += good_word_ratio * 12

      if word_child_count_ratio > 50
        branch[:score] *= 1.5
      elsif word_child_count_ratio > 30
        branch[:score] *= 1.2
      elsif word_child_count_ratio > 15
        branch[:score] *= 1.1
      elsif word_child_count_ratio < 4
        branch[:score] *= 0.9
      end
    end

    branch[:score_steps] << "s1: #{branch[:score]}"

    # Penalties are cumulative: a ratio above 0.7 triggers every one of them.
    bad_child_ratio = branch[:bad_child_count].to_f / [branch[:child_count], 1].max
    branch[:bad_child_ratio] = bad_child_ratio
    branch[:score] += 3 if bad_child_ratio < 0.0
    branch[:score] -= 3 if bad_child_ratio > 0.15
    branch[:score] -= 2 if bad_child_ratio > 0.25
    branch[:score] -= 2 if bad_child_ratio > 0.4
    branch[:score] -= 4 if bad_child_ratio > 0.5
    branch[:score] -= 5 if bad_child_ratio > 0.7
    branch[:score] -= 5 if branch[:bad_child_count] > 20

    branch[:score] += depth
    branch[:score] *= 0.8 if ids.length > 10

    @tree[el.path] = branch
  end

  # Sort the branches by their score in reverse order and keep the top five
  @content_candidates = @tree.sort_by { |_path, branch| branch[:score] }.reverse.first(5)
end

#content_at(index) ⇒ Object



148
149
150
# File 'lib/pismo/reader/tree.rb', line 148

# Fetch the document node belonging to one of the ranked content candidates.
#
# Each entry of @content_candidates is a [path, branch] pair; the path is
# handed to @doc.at to look the node up again.
#
# @param index [Integer] position within @content_candidates
# @return [Object] whatever @doc.at returns for the candidate's path
# @raise [NoMethodError] if there is no candidate at +index+
def content_at(index)
  candidate_path = @content_candidates[index].first
  @doc.at(candidate_path)
end