Module: Pismo::InternalAttributes

Included in:
Document
Defined in:
lib/pismo/internal_attributes.rb

Overview

Internal attributes are different pieces of data we can extract from a document’s content

Instance Method Summary collapse

Instance Method Details

#author(all = false) ⇒ Object

Returns the author of the page/content



119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/pismo/internal_attributes.rb', line 119

# Returns the author of the page/content.
#
# A prioritized list of selectors (plus a few meta tags) is handed to
# @doc.match and whatever comes back is tidied up.
#
# all - forwarded unchanged to @doc.match (presumably asks for every match
#       rather than only the first - confirm against the match implementation).
#
# Returns nil when nothing matched. A String result has any leading
# "by" / "posted by" removed, is cut at the first run of two or more
# non-name characters and has its whitespace squeezed. An Array result only has
# the "by" prefix removed from each entry and duplicates dropped. Any other
# kind of result is returned untouched.
def author(all = false)
  author = @doc.match(
    [
      '.post-author .fn',
      '.wire_author',
      '.cnnByline b',
      '.editorlink',
      '.authors p',
      ['meta[@name="author"]', lambda { |el| el.attr('content') }],     # Traditional meta tag style
      ['meta[@name="Author"]', lambda { |el| el.attr('content') }],     # CNN style
      ['meta[@name="AUTHOR"]', lambda { |el| el.attr('content') }],     # CNN style
      '.byline a',                                                      # Ruby Inside style
      '.byline',
      '.post_subheader_left a',                                         # TechCrunch style
      '.byl',                                                           # BBC News style
      '.meta a',
      '.articledata .author a',
      '#owners a',                                                      # Google Code style
      '.author a',
      '.author',
      '.auth a',
      '.auth',
      '.cT-storyDetails h5',                                            # smh.com.au - worth dropping maybe..
      ['meta[@name="byl"]', lambda { |el| el.attr('content') }],
      '.timestamp a',
      '.fn a',
      '.fn',
      '.byline-author',
      '.ArticleAuthor a',
      '.blog_meta a',
      'cite a',
      'cite',
      '.contributor_details h4 a'
    ],
    all
  )

  return unless author

  # Matches a leading "By ", "Post by " or "Posted by " (any case)
  by_prefix = /^(post(ed)?\s)?by\W+/i

  if String === author
    # Work on copies so the object handed back by @doc.match is never modified
    author = author.sub(by_prefix, '')
    # Everything that is not a letter, mark, digit, space or apostrophe becomes
    # a separator. Unicode classes keep accented and non-Latin names intact.
    author = author.gsub(/[^\p{L}\p{M}\p{N} ']/, '|')
    # Two or more separators in a row end the name; keep only what precedes them
    author = author.split(/\|{2,}/).first.to_s
    author = author.gsub(/\s+/, ' ').gsub(/\|/, '').strip
  elsif Array === author
    author = author.map { |a| a.sub(by_prefix, '') }.uniq
  end

  author
end

#authors ⇒ Object



170
171
172
# File 'lib/pismo/internal_attributes.rb', line 170

# Plural form of #author: calls #author with its `all` flag set to true.
def authors
  want_all = true
  author(want_all)
end

#body ⇒ Object

Returns body text as determined by Reader algorithm



269
270
271
# File 'lib/pismo/internal_attributes.rb', line 269

# Returns body text as determined by the Reader algorithm.
#
# The value is reader_doc.content(true) with surrounding whitespace stripped;
# it is cached in @body after the first call.
def body
  return @body if @body

  @body = reader_doc.content(true).strip
end

#datetime ⇒ Object

Returns an estimate of when the page/content was created. As clients of this library should be doing HTTP retrieval themselves, they can fall back to the Last-Modified HTTP header if they so wish. This method is just rough and based on content only.



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/pismo/internal_attributes.rb', line 75

# Returns an estimate of when the page/content was created, based purely on
# date-looking text found in the document markup.
#
# The patterns below are tried in order against @doc.to_html and the first hit
# wins. The hit is cleaned up (weekday names, "on ", commas and ordinal
# suffixes removed) and handed to Chronic.parse.
#
# Returns nil when no pattern matched or the match is four characters or
# shorter; otherwise the result of Chronic.parse, or the cleaned-up String
# when Chronic.parse returns nothing.
def datetime
  # TODO: Clean all this mess up

  mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)}i

  regexen = [
    /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
    /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
    /(on[^\d+]{1,10})\d+(th|st|nd|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\b\d{4}\-\d{2}\-\d{2}\b/i,
    /\d+(th|st|nd|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
    /on\s+#{mo}\s+\d+/i,
    /#{mo}\s+\d+/i,
    /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
    /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
  ]

  # Serialize the document once; every pattern is tried against the same markup
  html = @doc.to_html

  datetime = nil
  regexen.each do |pattern|
    datetime = html[pattern]
    break if datetime
  end

  return unless datetime && datetime.length > 4

  # Clean up the string for use by Chronic
  datetime = datetime.strip
  datetime = datetime.gsub(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
  datetime = datetime.gsub(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)[^\w]*/i, '')
  datetime = datetime.sub(/on\s+/, '')
  datetime = datetime.gsub(/\,/, '')
  # Drop the ordinal suffix from the day number ("22nd" -> "22")
  datetime = datetime.sub(/(\d+)(th|st|nd|rd)/, '\1')

  Chronic.parse(datetime) || datetime
end

#description ⇒ Object

Returns the “description” of the page, usually comes from a meta tag



176
177
178
179
180
181
182
183
184
# File 'lib/pismo/internal_attributes.rb', line 176

# Returns the "description" of the page.
#
# Asks @doc.match for the content attribute of a description meta tag (three
# capitalisations are tried), then an RDF description node, then any
# .description element, and returns whatever @doc.match gives back.
def description
  content_attr = lambda { |el| el.attr('content') }

  candidates = %w[description Description DESCRIPTION].map do |meta_name|
    ["meta[@name=\"#{meta_name}\"]", content_attr]
  end
  candidates << 'rdf:Description[@name="dc:description"]'
  candidates << '.description'

  @doc.match(candidates)
end

#favicon ⇒ Object

Returns URL to the site’s favicon



279
280
281
282
283
284
285
286
287
288
# File 'lib/pismo/internal_attributes.rb', line 279

# Returns URL to the site's favicon.
#
# Looks for a fluid-icon, shortcut icon or icon <link> (in that order) and
# takes its href. When the href does not start with "http" and @url is set,
# it is resolved against @url with URI.join; otherwise it is returned as found.
def favicon
  href_attr = lambda { |el| el.attr('href') }

  icon = @doc.match(
    [
      ['link[@rel="fluid-icon"]', href_attr],      # Get a Fluid icon if possible..
      ['link[@rel="shortcut icon"]', href_attr],
      ['link[@rel="icon"]', href_attr]
    ]
  )

  needs_base = icon && icon !~ /^http/ && @url
  return icon unless needs_base

  URI.join(@url, icon).to_s
end

#feed(all = false) ⇒ Object

Returns URL(s) of Web feed(s)



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# File 'lib/pismo/internal_attributes.rb', line 291

# Returns URL(s) of Web feed(s).
#
# all - forwarded unchanged to @doc.match.
#
# The href of RSS and Atom <link> elements is requested from @doc.match. A
# String result not starting with "http" is resolved against @url (when @url
# is set). An Array result gets the same treatment per entry, in place, and is
# then de-duplicated in place. Anything else is returned as is.
def feed(all = false)
  href_attr = lambda { |el| el.attr('href') }

  found = @doc.match(
    [
      ['link[@type="application/rss+xml"]', href_attr],
      ['link[@type="application/atom+xml"]', href_attr]
    ],
    all
  )

  case found
  when String
    found = URI.join(@url, found).to_s if found !~ /^http/ && @url
  when Array
    found.map! do |link|
      (link !~ /^http/ && @url) ? URI.join(@url, link).to_s : link
    end
    found.uniq!
  end

  found
end

#feeds ⇒ Object



312
313
314
# File 'lib/pismo/internal_attributes.rb', line 312

# Plural form of #feed: calls #feed with its `all` flag set to true.
def feeds
  want_all = true
  feed(want_all)
end

#html_body ⇒ Object

Returns body text as determined by Reader algorithm WITH basic HTML formatting intact



274
275
276
# File 'lib/pismo/internal_attributes.rb', line 274

# Returns body text as determined by the Reader algorithm WITH basic HTML
# formatting intact.
#
# The value is reader_doc.content with surrounding whitespace stripped; it is
# cached in @html_body after the first call.
def html_body
  return @html_body if @html_body

  @html_body = reader_doc.content.strip
end

#html_title ⇒ Object

HTML title



66
67
68
69
70
# File 'lib/pismo/internal_attributes.rb', line 66

# HTML title: whatever @doc.match('title') returns, or nil when that is
# nil/false.
def html_title
  @doc.match('title') || nil
end

#images(limit = 3) ⇒ Object

Returns any images with absolute URLs in the document



233
234
235
# File 'lib/pismo/internal_attributes.rb', line 233

# Returns the images reported by reader_doc, or nil when there is no
# reader_doc or it reports no images.
#
# limit - passed to reader_doc.images (default 3).
def images(limit = 3)
  return nil unless reader_doc
  return nil if reader_doc.images.empty?

  reader_doc.images(limit)
end

#keywords(options = {}) ⇒ Object

Returns the “keywords” in the document (not the meta keywords - they’re next to useless now)



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/pismo/internal_attributes.rb', line 238

# Returns the "keywords" in the document (not the meta keywords - they're next
# to useless now).
#
# Words are taken from the lowercased body plus description. Each occurrence
# scores 1, or 5 when the word also appears somewhere in the lowercased title.
# Stop words are dropped, and words scoring below 2 are dropped for as long as
# more than 80 distinct words remain.
#
# options - Hash of overrides:
#           :word_length_limit - words longer than this are ignored (default 15)
#           :limit             - maximum number of keywords returned (default 20)
#           :stem_at           - stop words longer than this are stemmed before
#                                comparison (default 20)
#
# Returns an Array of [word, score] pairs, highest score first.
def keywords(options = {})
  options = { :stem_at => 20, :word_length_limit => 15, :limit => 20 }.merge(options)

  words = {}

  # The page may have no title at all, so coerce to a String; lowercase it once
  # here instead of once per word
  title_text = title.to_s.downcase
  content_to_use = body.to_s.downcase + " " + description.to_s.downcase

  # Scrub out most HTML tags, sentence-ending dots and entities, then keep track of words
  # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
  scrubbed = content_to_use.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '')
  scrubbed.scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\/\\]*)(\b|\s|\Z)/i).each do |_, word, _|
    next if word.length > options[:word_length_limit]
    # Drop possessive / contraction endings ("ruby's" -> "ruby")
    word = word.gsub(/\'\w+/, '')
    words[word] ||= 0
    words[word] += (title_text.include?(word) ? 5 : 1)
  end

  # Stem the stop words if necessary (the collected words themselves are not stemmed)
  stopwords = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }

  words.delete_if { |k1, v1| stopwords.include?(k1) || (v1 < 2 && words.size > 80) }.sort_by { |_, score| score }.reverse.first(options[:limit])
end

#lede(all = false) ⇒ Object

Returns the “lede(s)” or first paragraph(s) of the story/page



187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/pismo/internal_attributes.rb', line 187

# Returns the "lede(s)" or first paragraph(s) of the story/page.
#
# all - forwarded unchanged to @doc.match.
#
# A String match is trimmed to its first two sentences (or returned whole,
# stripped, when it has fewer than two). An Array match gets the same
# treatment per entry and is de-duplicated. When @doc.match finds nothing the
# first three sentences from reader_doc are joined with a space; nil is
# returned if there are none.
def lede(all = false)
  # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
  # don't use <p> tags.. It grabs the HTML from the first few characters of the
  # element's text up to the first <br. The text is escaped so that characters
  # such as "(" or "+" are matched literally instead of being read as regex syntax.
  up_to_first_br = lambda do |el|
    el.inner_html[/(#{Regexp.escape(el.inner_text[0..4].strip)}.*?)\<br/, 1]
  end

  lede = @doc.match(
    [
      '.post-text p',
      '#blogpost p',
      '.story-teaser',
      '//div[@class="entrytext"]//p[string-length()>10]',                      # Ruby Inside / Kubrick style
      'section p',
      '.entry .text p',
      '.entry-content p',
      '#wikicontent p',                                                        # Google Code style
      '.wikistyle p',                                                          # GitHub style
      '//td[@class="storybody"]/p[string-length()>10]',                        # BBC News style
      '//div[@class="entry"]//p[string-length()>100]',
      ['.entry-content', up_to_first_br],
      ['.entry', up_to_first_br],
      '.entry',
      '#content p',
      '#article p',
      '.post-body',
      '.entry-content',
      '.document_description_short p',    # Scribd
      '.single-post p'
    ],
    all
  )

  # TODO: Improve sentence extraction - this is dire even if it "works for now"
  first_two_sentences = /^(.*?[\.\!\?]\s){2}/m

  case lede
  when String
    (lede[first_two_sentences] || lede).to_s.strip
  when Array
    lede.map do |l|
      text = l.to_s
      # Fall back to the whole paragraph when it has fewer than two sentences
      (text[first_two_sentences] || text).strip
    end.uniq
  else
    sentences = reader_doc && reader_doc.sentences(3)
    sentences && !sentences.empty? ? sentences.join(' ') : nil
  end
end

#ledes ⇒ Object



223
224
225
# File 'lib/pismo/internal_attributes.rb', line 223

# Plural form of #lede: calls #lede with its `all` flag set to true.
def ledes
  want_all = true
  lede(want_all)
end

#reader_doc ⇒ Object



264
265
266
# File 'lib/pismo/internal_attributes.rb', line 264

# Returns the Reader::Document built from @doc.to_s, creating it on first use
# and caching it in @reader_doc afterwards.
def reader_doc
  return @reader_doc if @reader_doc

  @reader_doc = Reader::Document.new(@doc.to_s)
end

#sentences(limit = 3) ⇒ Object

Returns a string containing the first [limit] sentences as determined by the Reader algorithm



228
229
230
# File 'lib/pismo/internal_attributes.rb', line 228

# Returns a string containing the first [limit] sentences reported by
# reader_doc, joined with single spaces, or nil when there is no reader_doc or
# it reports no sentences.
def sentences(limit = 3)
  return nil unless reader_doc
  return nil if reader_doc.sentences.empty?

  reader_doc.sentences(limit).join(' ')
end

#title(all = false) ⇒ Object

Returns the title of the page/content - attempts to strip site name, etc, if possible



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/pismo/internal_attributes.rb', line 5

# Returns the title of the page/content - attempts to strip site name, etc, if
# possible, by preferring in-page heading selectors over the HTML <title>.
#
# all - forwarded unchanged to @doc.match; also selects the return shape.
#
# With all false: the @doc.match result, or html_title when nothing matched.
# With all true: an Array of the matched title(s) followed by html_title, with
# missing (nil) entries and duplicates removed - empty when nothing was found.
def title(all = false)
  # TODO: Memoizations
  title = @doc.match(
    [
      '#pname a',                                                       # Google Code style
      '.entryheader h1',                                                # Ruby Inside/Kubrick
      '.entry-title a',                                                 # Common Blogger/Blogspot rules
      '.post-title a',
      '.post_title a',
      '.posttitle a',
      '.post-header h1',
      '.entry-title',
      '.post-title',
      '.post h3 a',
      'a.datitle',                                                      # Slashdot style
      '.posttitle',
      '.post_title',
      '.pageTitle',
      '#main h1.title',
      '.title h1',
      '.post h2',
      'h2.title',
      '.entry h2 a',
      '.entry h2',                                                      # Common style
      '.boite_titre a',
      ['meta[@name="title"]', lambda { |el| el.attr('content') }],
      'h1.headermain',
      'h1.title',
      '.mxb h1',                                                        # BBC News
      '#content h1',
      '#content h2',
      '#content h3',
      'a[@rel="bookmark"]',
      '.products h2',
      '.caption h3',
      '#main h2',
      '#body h1',
      '#wrapper h1',
      '#page h1',
      '.asset-header h1',
      '#body_content h2'
    ],
    all
  )

  # If all else fails, go to the HTML title
  return title || html_title unless all

  found = title ? [*title] : []
  # compact keeps a missing HTML title from showing up as a nil entry
  (found + [html_title]).compact.uniq
end

#titles ⇒ Object



60
61
62
# File 'lib/pismo/internal_attributes.rb', line 60

# Plural form of #title: calls #title with its `all` flag set to true.
def titles
  want_all = true
  title(want_all)
end