Class: Pismo::Reader::Cluster

Inherits:
Base
  • Object
show all
Defined in:
lib/pismo/reader/cluster.rb

Constant Summary collapse

DEFAULTS =

Default option parameters

{
  :threshold => 100,                                       # score a block must exceed to be kept (checked for both the continued and the standalone score)
  :min_length => 80,                                       # blocks whose cleaned text is shorter than this are skipped
  :decay_factor => 0.73,                                   # multiplier applied to the position factor after every scored block, so later blocks score lower
  :continuous_factor => 1.62,                              # continuity is reset to this after an accepted block and divided by it per following text block ( the larger, the harder to continue )
  :no_body_factor => 0.72,                                 # raised to the number of waste-expression matches and multiplied into the block score
  :punctuation_weight => 10,                               # score added for every punctuation match in the cleaned text
  :punctuations => /(\.[^A-Za-z0-9]|,[^0-9]|!|\?)/,        # regexp whose matches count as punctuation
  :waste_expressions => /Copyright|All Rights Reserved/i,  # regexp for keywords typical of footers; matched against the raw block
  :debug => false,                                         # if true, print one score line per evaluated block to stdout
}

Constants inherited from Base

Base::BAD_WORDS, Base::BLOCK_OUTPUT_ELEMENTS, Base::COULD_CONTAIN_FULL_CONTENT, Base::FATAL_WORDS, Base::GOOD_WORDS, Base::INLINE_OUTPUT_ELEMENTS, Base::META_WORDS, Base::NON_HEADER_ELEMENTS, Base::OK_ATTRIBUTES, Base::OK_CLEAN_ATTRIBUTES, Base::OK_ELEMENTS, Base::OUTPUT_ELEMENTS, Base::WONT_CONTAIN_FULL_CONTENT

Instance Attribute Summary

Attributes inherited from Base

#content_candidates, #doc, #options, #raw_content

Instance Method Summary collapse

Methods inherited from Base

#build_doc, #content, #images, #initialize, #sentences, #strip

Constructor Details

This class inherits a constructor from Pismo::Reader::Base

Instance Method Details

#analyze ⇒ Object

Analyze the structure of the HTML document and score content blocks for likelihood of containing useful content



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/pismo/reader/cluster.rb', line 45

# Analyze the structure of the HTML document and score content blocks for
# likelihood of containing useful content.
#
# @raw_content is split on opening/closing <div> tags. Every block that has
# text and is long enough gets a score; neighbouring blocks that score well
# are joined into one section. The sections are stored in @sections and, from
# highest to lowest score, wrapped with Nokogiri::HTML into
# @content_candidates.
#
# Options from @options override DEFAULTS.
#
# @return [Array] the value assigned to @content_candidates
def analyze
  settings = DEFAULTS.merge(@options)
  threshold = settings[:threshold]
  continuity_reset = settings[:continuous_factor]

  @sections = []
  position_factor = 1.0
  continuity = 1.0
  cluster_body = ''
  cluster_score = 0

  # The content is split into blocks at every <div> / </div> tag
  @raw_content.split(/<\/?(?:div)[^>]*>/).each do |block|
    next unless block
    block.gsub!(/\n/, '')

    # Blocks without any text do not take part in scoring at all
    next if has_only_tags?(block)

    # Once a cluster has been started, every further text block makes it
    # less likely that the next one still belongs to that cluster
    continuity /= continuity_reset unless cluster_body.empty?

    # Scoring works on the cleaned-up text of the block
    text = clean_block(block)
    next if text.length < settings[:min_length]

    # Base score: text length plus a bonus per punctuation match, scaled by
    # the current position factor
    punctuation_hits = text.scan(settings[:punctuations]).length
    block_score = (text.length + punctuation_hits * settings[:punctuation_weight]) * position_factor

    # Each scored block lowers the factor applied to the blocks after it
    position_factor *= settings[:decay_factor]

    # Waste expressions (counted on the raw block) reduce the score
    waste_hits = block.scan(settings[:waste_expressions]).length
    block_score *= settings[:no_body_factor] ** waste_hits if waste_hits > 0

    # Score of this block when treated as a continuation of the cluster
    continued_score = block_score * continuity

    puts "----- #{block_score}*#{continuity}=#{continued_score} #{text.length} \n\n" if settings[:debug]

    if continued_score > threshold
      # The block extends the current cluster
      cluster_body += block + "\n"
      cluster_score += continued_score
    elsif block_score > threshold
      # The block is good on its own: close the cluster and start a new one
      @sections << { :body => cluster_body, :score => cluster_score }
      cluster_body = block + "\n"
      cluster_score = block_score
    else
      # Blocks that reach neither score are dropped
      next
    end

    # An accepted block resets the continuity for the blocks that follow
    continuity = continuity_reset
  end

  # Store the cluster that was still open. cluster_body is always a String,
  # so this section is appended even when no block was accepted.
  @sections << { :body => cluster_body, :score => cluster_score } if cluster_body

  # Highest score first, converted to Nokogiri documents for compatibility
  # with the content method
  ranked = @sections.sort_by { |section| section[:score] }.reverse
  @content_candidates = ranked.map { |section| Nokogiri::HTML(section[:body], nil, 'utf-8') }
end

#content_at(index) ⇒ Object



117
118
119
# File 'lib/pismo/reader/cluster.rb', line 117

# Look up a single entry of @content_candidates.
#
# analyze fills @content_candidates ordered from highest to lowest section
# score, so index 0 is the best-scoring candidate.
#
# @param index [Integer] position within @content_candidates
# @return [Object, nil] the entry at that position, nil if there is none
def content_at(index)
  candidates = @content_candidates
  candidates[index]
end