Class: MechanizeContent::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/mechanize_content/page.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url) ⇒ Page

Returns a new instance of Page.



5
6
7
# File 'lib/mechanize_content/page.rb', line 5

# Builds a page wrapper around the given URL. Only stores the URL; nothing
# else happens in the constructor.
#
# @param url [Object] the address to remember in @url
#   (presumably a String URL; confirm against callers)
def initialize(url)
  @url = url
end

Instance Attribute Details

#url ⇒ Object

Returns the value of attribute url.



3
4
5
# File 'lib/mechanize_content/page.rb', line 3

# Reads back the value held in @url.
#
# @return [Object] the current value of @url
def url
  @url
end

Instance Method Details

#agent ⇒ Object



121
122
123
# File 'lib/mechanize_content/page.rb', line 121

# Returns the Mechanize agent used for fetching, creating it on first use.
# The agent is built with its user agent alias set to 'Mac Safari' and is
# cached in @agent for subsequent calls.
#
# @return [Mechanize] the cached agent
def agent
  return @agent if @agent

  @agent = Mechanize.new do |browser|
    browser.user_agent_alias = 'Mac Safari'
  end
end

#apple_touch_icon ⇒ Object



34
35
36
37
38
39
# File 'lib/mechanize_content/page.rb', line 34

# Looks up the page's <link rel="apple-touch-icon"> and returns its href as
# an absolute URL string.
#
# A relative href is resolved against #base_url; an absolute href is returned
# unchanged.
#
# @return [String, nil] the icon URL, or nil when the page could not be
#   fetched, declares no apple-touch-icon, or the href is not a parsable URI
def apple_touch_icon
  # content is nil when the fetch failed; mirror the guard used by #title.
  return nil unless content

  icon = content.parser.xpath("//link[@rel='apple-touch-icon']/@href").first
  return nil unless icon

  href = icon.value
  return href unless URI.parse(href).relative?

  (URI.parse(base_url.to_s) + href).to_s
rescue URI::InvalidURIError
  # A malformed href is treated as "no icon" so that callers such as
  # #image_iphone can fall back to another image instead of crashing.
  nil
end

#base_url ⇒ Object



29
30
31
32
# File 'lib/mechanize_content/page.rb', line 29

# Determines the URL that relative links on the page should be resolved
# against: the href of the first <base> element when one exists, otherwise
# the URI of the fetched page itself.
#
# @return [Object, nil] the <base> href value, or content.uri when there is
#   no <base>; nil when the page could not be fetched
def base_url
  # content is nil when the fetch failed; avoid calling #parser on nil.
  return nil unless content

  base = content.parser.xpath("//base/@href").first
  base ? base.value : content.uri
end

#best_content ⇒ Object



41
42
43
# File 'lib/mechanize_content/page.rb', line 41

# Returns the result of #find_content, caching it in @best_content.
# A nil (or false) result is not cached, so #find_content runs again on the
# next call in that case.
#
# @return [Object, nil] whatever #find_content returned
def best_content
  return @best_content if @best_content

  @best_content = find_content
end

#content ⇒ Object



98
99
100
# File 'lib/mechanize_content/page.rb', line 98

# Returns the result of #fetch_content, caching it in @page_content.
# A nil (or false) result is not cached, so the fetch is attempted again on
# the next call in that case.
#
# @return [Object, nil] whatever #fetch_content returned
def content
  return @page_content if @page_content

  @page_content = fetch_content
end

#fetch_content ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/mechanize_content/page.rb', line 102

# Fetches @url through #agent and returns the response when it is an HTML
# page.
#
# The listed network/HTTP errors are reported on standard output and turned
# into a nil result instead of propagating; any other error still raises.
#
# @return [Mechanize::Page, nil] the fetched page; nil when the response is
#   not a Mechanize::Page or when one of the rescued errors occurred
def fetch_content
  page_content = agent.get(@url)
  page_content if page_content.is_a?(Mechanize::Page)
rescue Timeout::Error
  # Interpolation (rather than String#+) keeps the handler itself from raising
  # TypeError when @url is not a String (e.g. nil or a URI object).
  puts "Timeout - #{@url}"
  nil
rescue Errno::ECONNRESET
  puts "Connection reset by peer - #{@url}"
  nil
rescue Mechanize::ResponseCodeError
  puts "Invalid url"
  nil
rescue Mechanize::UnsupportedSchemeError
  puts "Unsupported Scheme"
  nil
rescue SocketError => e
  puts e
  nil
# rescue
#   puts "There was a problem connecting - "+@url
end

#find_content ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/mechanize_content/page.rb', line 45

# Picks the element most likely to hold the page's main text.
#
# Every <p> adds to a score kept for its parent element:
# * -50 / +25 when the parent's class matches the negative / positive pattern
# * -50 / +25 likewise for the parent's id
# * +1 when the paragraph text is longer than 10 characters
# * +1 per comma in the paragraph text
# The highest-scoring parent is stripped of scripts, iframes, h1/h2 headings
# and a few date/social elements, then returned. Note that this removal
# mutates the parsed document.
#
# @return [Object, nil] the winning parent node; nil when there is no
#   content, no paragraph was scored, or a "get Flash player" link is found
def find_content
  page = content
  return nil unless page

  negative_hint = '(comment|meta|footer|footnote)'
  positive_hint = '((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-_]?(content|text|body)?)(\\s|$))'
  scores = {}

  page.parser.css('p').each do |para|
    container = para.parent
    scores[container] = 0 if scores[container].nil?

    # The same penalty/bonus rule is applied to the class first, then the id.
    [container['class'] || "", container['id'] || ""].each do |label|
      if label.match(negative_hint)
        scores[container] -= 50
      elsif label.match(positive_hint)
        scores[container] += 25
      end
    end

    body = para.inner_text
    scores[container] += 1 if body.length > 10

    # NOTE(review): this ends the whole scan, not just this paragraph, as soon
    # as a parent's first attribute value mentions "comment"; later paragraphs
    # are never scored. Confirm that `break` (rather than `next`) is intended.
    leading_attr = container.attributes.values.first
    break if leading_attr && leading_attr.value.include?("comment")

    scores[container] += body.count(',')
  end

  best_pair = scores.sort_by { |_node, score| -score }.first
  return nil if best_pair.nil?

  winner = best_pair.first

  # NOTE(review): these XPath expressions start with "//", which selects from
  # the document root rather than from within `winner`; confirm whether
  # ".//a[...]" (descendants of the winner only) was intended.
  flash_links = [
    "//a[@href='http://get.adobe.com/flashplayer/']",
    "//a[@href='http://www.adobe.com/go/getflashplayer']"
  ]
  return nil if flash_links.any? { |query| !winner.xpath(query).empty? }

  clutter = [
    'script', 'iframe', 'h1', 'h2',
    "div#date-byline", "p.date", "div#facebook-like-button"
  ]
  clutter.each { |selector| winner.css(selector).unlink }
  winner
end

#image ⇒ Object



17
18
19
# File 'lib/mechanize_content/page.rb', line 17

# Returns the best image for the page, caching it in @image.
# When #best_content is present the choice is delegated to
# Image.best_image(images, base_url); otherwise the result is nil.
#
# @return [Object, nil] the value returned by Image.best_image, or nil when
#   there is no best content
def image
  return @image if @image

  @image = if best_content
             Image.best_image(images, base_url)
           else
             nil
           end
end

#image_iphone ⇒ Object



21
22
23
# File 'lib/mechanize_content/page.rb', line 21

# Returns the image to use on iPhone-style displays, caching it in
# @image_iphone: the #apple_touch_icon when there is one, otherwise #image.
#
# @return [Object, nil] the touch icon or the fallback image; nil when
#   neither is available
def image_iphone
  return @image_iphone if @image_iphone

  @image_iphone = apple_touch_icon || image
end

#images ⇒ Object



25
26
27
# File 'lib/mechanize_content/page.rb', line 25

# Collects the <img> elements found inside #best_content.
#
# @return [Object, Array] the result of best_content.css('img'); an empty
#   Array when there is no best content (previously this case raised
#   NoMethodError on nil)
def images
  node = best_content
  return [] unless node

  node.css('img')
end

#text ⇒ Object



13
14
15
# File 'lib/mechanize_content/page.rb', line 13

# Returns the text of #best_content, passed through Util.force_utf8.
# Text whose size is 50 or less is treated as absent.
#
# @return [Object, nil] the result of Util.force_utf8 on the content text;
#   nil when there is no best content or its text size is not above 50
def text
  node = best_content
  return nil unless node

  raw = node.text
  return nil unless raw.size > 50

  Util.force_utf8(raw)
end

#title ⇒ Object



9
10
11
# File 'lib/mechanize_content/page.rb', line 9

# Returns the title reported by the fetched page.
#
# @return [Object, nil] content.title, or nil when there is no content
def title
  page = content
  return nil unless page

  page.title
end