Class: ForumPost::Document
- Inherits: Object
- Inheritance hierarchy:
  - Object
    - ForumPost::Document
- Defined in:
- lib/forum_post.rb
Constant Summary collapse
- DEFAULT_OPTIONS =
# Default thresholds: :min_length is the smallest group size #likely_posts keeps,
# :min_text_length is the text size below which #score_elem deducts points.
{ :min_length => 5, :min_text_length => 15 }.freeze
- REGEXES =
# Patterns matched against an element's concatenated class+id string or against raw markup:
#   :unlikelyCandidatesRe - class/id hints that make #remove_unlikely_candidates! drop an element
#   :maybe_post           - class/id hints that an element may hold post content
#   :not_post             - class/id hints penalised by #score_elem
#   :divToPElementsRe     - opening tags whose presence keeps a div from being renamed to p
#   :replaceBrsRe         - runs of two or more <br> tags (rewritten to '</p><p>' by the constructor)
#   :replaceFontsRe       - opening/closing <font> tags (rewritten to <span> by the constructor)
{ :unlikelyCandidatesRe => /combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i, :maybe_post => /article|body|column|main|content|post|topic|text|info|message|item|bord|forum/i, :not_post => /author|head|avatar|profile|rank|user|uid/i, :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i, :replaceFontsRe => /<(\/?)font[^>]*>/i }
Instance Attribute Summary collapse
-
#html ⇒ Object
Returns the value of attribute html.
-
#options ⇒ Object
Returns the value of attribute options.
Instance Method Summary collapse
- #collect_likely_elem(h, flag, elem) ⇒ Object
- #content ⇒ Object
- #debug(str) ⇒ Object
- #elem_size(elem, type = 'inner_text') ⇒ Object
-
#initialize(input, options = {}) ⇒ Document
constructor
A new instance of Document.
- #is_contain(p, q) ⇒ Object
- #likely_posts ⇒ Object
- #make_html ⇒ Object
- #most_likely_posts(better_post) ⇒ Object
- #remove_script_and_style! ⇒ Object
- #remove_unlikely_candidates! ⇒ Object
- #score_elem(bests) ⇒ Object
- #select_best(candidates) ⇒ Object
- #transform_misused_divs_into_p! ⇒ Object
Constructor Details
#initialize(input, options = {}) ⇒ Document
Returns a new instance of Document.
21 22 23 24 25 |
# File 'lib/forum_post.rb', line 21
# Builds a document from raw HTML.
#
# Runs of two or more <br> tags are rewritten to paragraph breaks and <font>
# tags are rewritten to <span> before the markup is parsed by #make_html.
#
# @param input [String] raw HTML markup
# @param options [Hash] overrides merged on top of DEFAULT_OPTIONS
#   (#debug reads the :debug key)
def initialize(input, options = {})
  @input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
  # The parameter name had been lost, so merge() was called with no argument
  # and caller-supplied options were silently dropped.
  @options = DEFAULT_OPTIONS.merge(options)
  make_html
end
Instance Attribute Details
#html ⇒ Object
Returns the value of attribute html.
19 20 21 |
# File 'lib/forum_post.rb', line 19
# Reader for the parsed document stored in @html.
#
# @return [Object] the current value of @html
def html
  return @html
end
#options ⇒ Object
Returns the value of attribute options.
19 20 21 |
# File 'lib/forum_post.rb', line 19
# Reader for the options hash stored in @options.
#
# @return [Object] the current value of @options
def options
  @options
end
Instance Method Details
#collect_likely_elem(h, flag, elem) ⇒ Object
132 133 134 135 136 137 138 139 |
# File 'lib/forum_post.rb', line 132
# Registers +elem+ under +flag+ and appends it to every existing group whose
# key contains one of the space-separated tokens of +flag+.
#
# Tokens are compared as literal substrings. Building a Regexp from the raw
# token would raise or mis-match on class names containing regex
# metacharacters such as "(" or "[".
#
# NOTE(review): the final assignment replaces any list already stored under
# +flag+ with a one-element list; this is kept from the original - confirm it
# is intended.
#
# @param h [Hash{String => Array}] groups keyed by flag string; mutated in place
# @param flag [String] group key (callers pass "name,class")
# @param elem [Object] element to record
# @return [Array] the new one-element list stored under +flag+
def collect_likely_elem(h, flag, elem)
  flag.split(/ /).each do |item|
    h.each do |key, members|
      members << elem if key.include?(item)
    end
  end
  h[flag] = [elem]
end
#content ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/forum_post.rb', line 31
# Extracts the text of the element judged most likely to be the post body.
#
# The document is cleaned first, then narrowed to the most likely post
# container when #likely_posts finds any. Every div/tr/td inside that
# container is scored and the text of the best one is returned with each run
# of two or more whitespace characters removed.
#
# @return [String] the extracted text
def content
  remove_script_and_style!          # drop <script> and <style> elements
  remove_unlikely_candidates!       # drop elements whose class/id cannot be post content
  transform_misused_divs_into_p!    # rename noise divs to <p>

  better_post = likely_posts
  if better_post.size > 0
    debug("post>4")
    # most_likely_posts ends in Array#first and so may return nil; fall back
    # to the whole document rather than calling #css on nil.
    handle_html = most_likely_posts(better_post) || @html
  else
    handle_html = @html
    debug("post<4")
  end

  bests = handle_html.css("div,tr,td")
  if bests.size == 0
    # The original interpolated bests.name / bests[:class] here. bests is an
    # empty node set, not a node, so the interpolation raised before the text
    # could be returned. Log the container instead.
    debug("best_one:#{handle_html.name}")
    return handle_html.text.gsub(/\s(\s+)/, "")
  end

  bests.each { |best| debug("bests:#{best.name}.#{best[:class]} #{best[:id]}") }
  best_elem = select_best(score_elem(bests))
  best_elem.text.gsub(/\s(\s+)/, "")
end
#debug(str) ⇒ Object
173 174 175 |
# File 'lib/forum_post.rb', line 173
# Prints +str+ with puts when the :debug option is set.
#
# The condition had degraded to the array literal [:debug], which is always
# truthy, so every message was printed regardless of the option.
#
# @param str [String] message to print
# @return [nil]
def debug(str)
  puts str if @options[:debug]
end
#elem_size(elem, type = 'inner_text') ⇒ Object
111 112 113 114 |
# File 'lib/forum_post.rb', line 111
# Measures an element after deleting every run of two or more whitespace
# characters.
#
# @param elem [#text, #inner_html] element to measure
# @param type [String] 'inner_text' measures elem.text, 'inner_html' measures
#   elem.inner_html
# @return [Integer, nil] the size, or nil when +type+ is neither value
def elem_size(elem, type = 'inner_text')
  compact_size = lambda { |raw| raw.gsub(/\s(\s+)/, "").size }

  if type == 'inner_text'
    compact_size.call(elem.text)
  elsif type == 'inner_html'
    compact_size.call(elem.inner_html)
  end
end
#is_contain(p, q) ⇒ Object
166 167 168 169 170 171 |
# File 'lib/forum_post.rb', line 166
# Tells whether +p+ has a div/tr/td descendant with the same tag name and
# class attribute as +q+.
#
# @param p [#css] element whose descendants are searched
# @param q [#name, #[]] element supplying the name and class to look for
# @return [Boolean]
def is_contain(p, q)
  wanted_name = q.name
  wanted_class = q[:class]
  p.css("div,tr,td").any? do |descendant|
    descendant.name == wanted_name && descendant[:class] == wanted_class
  end
end
#likely_posts ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/forum_post.rb', line 116
# Finds one representative element for each group of similar post-like
# elements.
#
# Every div/tr/td whose class+id string matches REGEXES[:maybe_post] is handed
# to #collect_likely_elem under the key "name,class". Groups smaller than the
# :min_length option are discarded and the first member of each remaining
# group is returned.
#
# The threshold is read from @options rather than DEFAULT_OPTIONS, so a
# :min_length passed to the constructor takes effect. With no override the
# value is unchanged.
#
# @return [Array] first element of each surviving group
def likely_posts
  groups = {}
  @html.css("div,tr,td").each do |elem|
    next unless "#{elem[:class]}#{elem[:id]}" =~ REGEXES[:maybe_post]

    collect_likely_elem(groups, "#{elem.name},#{elem[:class]}", elem)
  end

  groups.delete_if { |_flag, members| members.size < @options[:min_length] }
  representatives = groups.values.map(&:first)
  representatives.each { |lp| debug("likely_posts:#{lp.name}.#{lp[:class]} #{lp[:id]}") }
  representatives
end
#make_html ⇒ Object
27 28 29 |
# File 'lib/forum_post.rb', line 27
# Parses @input with Nokogiri::HTML, passing 'UTF-8' as the encoding argument,
# and stores the result in @html.
#
# @return [Object] the value assigned to @html
def make_html
  parsed = Nokogiri::HTML(@input, nil, 'UTF-8')
  @html = parsed
end
#most_likely_posts(better_post) ⇒ Object
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/forum_post.rb', line 94
# Picks the largest candidate that no other candidate contains.
#
# A candidate is kept only when #is_contain is false for every other
# candidate. The survivors are ordered by descending #elem_size and the first
# one is returned.
#
# @param better_post [Array] candidate elements from #likely_posts
# @return [Object, nil] the chosen element, or nil when nothing survives
def most_likely_posts(better_post)
  outermost = better_post.select do |q|
    standalone = better_post.none? { |p| p != q && is_contain(p, q) }
    debug("most_likelys:#{q.name}.#{q[:class]}.#{q[:id]}") if standalone
    standalone
  end

  outermost.sort { |m, n| elem_size(n) <=> elem_size(m) }.first
end
#remove_script_and_style! ⇒ Object
141 142 143 |
# File 'lib/forum_post.rb', line 141
# Calls #remove on every element of @html matched by the selector
# "script, style".
#
# @return [Object] the collection returned by @html.css
def remove_script_and_style!
  noise = @html.css("script, style")
  noise.each(&:remove)
end
#remove_unlikely_candidates! ⇒ Object
145 146 147 148 149 150 151 152 153 |
# File 'lib/forum_post.rb', line 145
# Removes elements whose class+id string looks like page chrome.
#
# An element is removed when that string matches
# REGEXES[:unlikelyCandidatesRe], does not match REGEXES[:maybe_post], and the
# element is not the body tag.
#
# @return [Object] the collection returned by @html.css("*")
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    str = "#{elem[:class]}#{elem[:id]}"
    next unless str =~ REGEXES[:unlikelyCandidatesRe]
    next if str =~ REGEXES[:maybe_post]
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end
#score_elem(bests) ⇒ Object
55 56 57 58 59 60 61 62 63 64 65 66 |
# File 'lib/forum_post.rb', line 55
# Scores each candidate and stores the score, as a string, in its "score"
# attribute.
#
# Every element starts at 100, gains 10 when its class+id string matches
# REGEXES[:maybe_post], loses 20 when it matches REGEXES[:not_post], and loses
# 8 when its text size is below the :min_text_length option.
#
# The threshold is read from @options rather than DEFAULT_OPTIONS, so a
# :min_text_length passed to the constructor takes effect. With no override
# the value is unchanged.
#
# @param bests [Enumerable] candidate elements; each is mutated in place
# @return [Enumerable] the same collection that was passed in
def score_elem(bests)
  min_text_length = @options[:min_text_length]

  bests.each do |elem|
    signature = "#{elem[:class]}#{elem[:id]}"
    score = 100
    score += 10 if signature =~ REGEXES[:maybe_post]
    score -= 20 if signature =~ REGEXES[:not_post]
    score -= 8 if elem_size(elem) < min_text_length
    elem["score"] = score.to_s
  end

  bests.each { |best| debug("#{best.name}.#{best[:class]} #{best[:id]}---score:#{best['score']}") }
  bests
end
#select_best(candidates) ⇒ Object
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/forum_post.rb', line 68
# Chooses the winning element among scored candidates.
#
# Only candidates sharing the highest "score" are considered. If several tie,
# those for which #is_contain is true against another tied candidate are
# dropped. If several still remain, each gets a "text_rate" attribute
# (text size divided by inner-HTML size, stored as a string) and the one with
# the highest rate wins.
#
# @param candidates [Enumerable] elements already carrying a "score" attribute
# @return [Object, nil] the selected element; nil when every tied candidate
#   was dropped
def select_best(candidates)
  ranked = candidates.sort { |a, b| b["score"].to_i <=> a["score"].to_i }
  top_score = ranked.first["score"]
  finalists = ranked.select { |c| c["score"] == top_score }
  return finalists.first if finalists.size == 1

  innermost = finalists.select do |p|
    finalists.none? { |q| p != q && is_contain(p, q) }
  end

  if innermost.size == 1
    winner = innermost.first
    debug("best_one:#{winner.name}.#{winner[:class]} #{winner[:id]}")
    return winner
  end

  innermost.each do |lc|
    ratio = elem_size(lc) / elem_size(lc, 'inner_html').to_f
    lc["text_rate"] = ratio.to_s
  end
  innermost.each { |lc| debug("best_one:#{lc.name}.#{lc[:class]} #{lc[:id]}---text_rate:#{lc['text_rate']}") }
  innermost.sort { |a, b| b["text_rate"].to_f <=> a["text_rate"].to_f }.first
end
#transform_misused_divs_into_p! ⇒ Object
155 156 157 158 159 160 161 162 163 164 |
# File 'lib/forum_post.rb', line 155
# Renames to "p" every div whose inner HTML contains none of the opening tags
# matched by REGEXES[:divToPElementsRe].
#
# @return [Object] the collection returned by @html.css("*")
def transform_misused_divs_into_p!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    elem.name = "p"
  end
end