Module: Pismo::InternalAttributes

Included in:
Document
Defined in:
lib/pismo/internal_attributes.rb

Overview

Internal attributes are different pieces of data we can extract from a document’s content

Instance Method Summary collapse

Instance Method Details

#author(all = false) ⇒ Object

Returns the author of the page/content



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/pismo/internal_attributes.rb', line 116

# Returns the author of the page/content.
#
# Hands a prioritised list of byline selectors and meta tags to @doc.match
# and tidies up whatever comes back.
#
# all - forwarded unchanged as the second argument to @doc.match
#       (presumably makes it return every hit as an Array - confirm against
#       the document class).
#
# Returns nil when nothing matched. A String result has any leading
# "By ..."/"Posted by ..." prefix removed, is cut at the first run of two or
# more non-name characters and has its whitespace squeezed. An Array result
# only has the prefix removed from each entry and duplicates dropped. Any
# other kind of result is returned untouched.
def author(all = false)
  meta_content = lambda { |el| el.attr('content') }

  author = @doc.match([
                      '.post-author .fn',
                      '.wire_author',
                      '.cnnByline b',
                      '.editorlink',
                      '.authors p',
                      ['meta[@name="author"]', meta_content],     # Traditional meta tag style
                      ['meta[@name="Author"]', meta_content],     # CNN style
                      ['meta[@name="AUTHOR"]', meta_content],     # CNN style
                      '.byline a',                                # Ruby Inside style
                      '.byline',
                      '.post_subheader_left a',                   # TechCrunch style
                      '.byl',                                     # BBC News style
                      '.articledata .author a',
                      '#owners a',                                # Google Code style
                      '.author a',
                      '.author',
                      '.auth a',
                      '.auth',
                      '.cT-storyDetails h5',                      # smh.com.au - worth dropping maybe..
                      ['meta[@name="byl"]', meta_content],
                      '.timestamp a',
                      '.fn a',
                      '.fn',
                      '.byline-author',
                      '.ArticleAuthor a',
                      '.blog_meta a',
                      'cite a',
                      'cite',
                      '.contributor_details h4 a',
                      '.meta a'
                      ], all)

  return unless author

  # Strip off any "By [whoever]" section. [^\p{Word}] is the Unicode-aware
  # spelling of \W: plain \W is ASCII-based and would swallow a leading
  # accented letter ("by Émile" -> "mile").
  by_prefix = /^(post(ed)?\s)?by[^\p{Word}]+/i

  case author
  when String
    cleaned = author.sub(by_prefix, '')
    # Mark everything that cannot be part of a name with '|'. Letters, marks
    # and digits of any script are kept so non-ASCII names survive intact.
    cleaned = cleaned.gsub(/[^\p{L}\p{M}\p{N} ']/, '|')
    # Two or more markers in a row mean the name is over; keep what precedes.
    cleaned = cleaned.split(/\|{2,}/).first.to_s
    author = cleaned.gsub(/\s+/, ' ').delete('|').strip
  when Array
    author = author.map { |a| a.sub(by_prefix, '') }.uniq
  end

  author
end

#authors ⇒ Object



167
168
169
# File 'lib/pismo/internal_attributes.rb', line 167

# Convenience wrapper: calls #author with all = true and returns its result.
def authors
  author(true)
end

#body ⇒ Object

Returns body text as determined by Reader algorithm



300
301
302
# File 'lib/pismo/internal_attributes.rb', line 300

# Returns the stripped result of reader_doc.content(true), computed once and
# then served from @body.
def body
  return @body if @body

  @body = reader_doc.content(true).strip
end

#datetime ⇒ Object

Returns an estimate of when the page/content was created. As clients of this library should be doing HTTP retrieval themselves, they can fall back to the Last-Modified HTTP header if they so wish. This method is just rough and based on content only.



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/pismo/internal_attributes.rb', line 76

# Returns a rough estimate of when the page/content was created, based only
# on date-looking text in the document's HTML.
#
# The patterns below are tried in order against @doc.to_html and the first
# hit wins. The hit is normalised (weekday names, "on", commas and ordinal
# suffixes removed) and handed to Chronic.parse.
#
# Returns whatever Chronic.parse returns, the cleaned-up String when Chronic
# returns nil/false, or nil when no pattern matched or the match is four
# characters or shorter.
def datetime
  # TODO: Clean all this mess up

  mo = %r{(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\.?}i

  regexen = [
    /#{mo}\b\s+\d+\D{1,10}\d{4}/i,
    /(on\s+)?\d+\s+#{mo}\s+\D{1,10}\d+/i,
    /(on[^\d+]{1,10})\d+(th|st|nd|rd)?.{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\b\d{4}\-\d{2}\-\d{2}\b/i,
    /\d+(th|st|nd|rd).{1,10}#{mo}\b[^\d]{1,10}\d+/i,
    /\d+\s+#{mo}\b[^\d]{1,10}\d+/i,
    /on\s+#{mo}\s+\d+/i,
    /#{mo}\s+\d+,? \d{4}+/i,
    /#{mo}\s+\d+/i,
    /\d{4}[\.\/\-]\d{2}[\.\/\-]\d{2}/,
    /\d{2}[\.\/\-]\d{2}[\.\/\-]\d{4}/
  ]

  # Render the document once instead of once per pattern.
  html = @doc.to_html

  datetime = nil
  regexen.each do |pattern|
    datetime = html[pattern]
    break if datetime
  end

  return unless datetime && datetime.length > 4

  # Clean up the string for use by Chronic
  datetime.strip!
  datetime.gsub!(/(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)[^\w]*/i, '')
  datetime.gsub!(/(mon|tues|tue|weds|wed|thurs|thur|thu|fri|sat|sun)\.?[^\w]*/i, '')
  datetime.gsub!(/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\./i, '\1')
  # The patterns above match "on" case-insensitively, so strip it the same way.
  datetime.sub!(/\bon\s+/i, '')
  datetime.gsub!(/\,/, '')
  # Drop the ordinal suffix, including "nd" (2nd, 22nd).
  datetime.sub!(/(\d+)(th|st|nd|rd)/i, '\1')

  Chronic.parse(datetime) || datetime
end

#description ⇒ Object

Returns the “description” of the page, usually comes from a meta tag



173
174
175
176
177
178
179
180
181
# File 'lib/pismo/internal_attributes.rb', line 173

# Returns whatever @doc.match yields for the description candidates: the
# "content" attribute of a description meta tag (three capitalisations),
# then an RDF dc:description element, then anything with class "description".
def description
  content_attr = lambda { |el| el.attr('content') }

  candidates = %w[description Description DESCRIPTION].map do |name|
    ["meta[@name=\"#{name}\"]", content_attr]
  end
  candidates << 'rdf:Description[@name="dc:description"]'
  candidates << '.description'

  @doc.match(candidates)
end

#favicon ⇒ Object

Returns URL to the site’s favicon



310
311
312
313
314
315
316
317
318
319
# File 'lib/pismo/internal_attributes.rb', line 310

# Returns the URL of the site's favicon, or whatever falsy value @doc.match
# produced when no icon link was found.
#
# An href that does not start with "http" is resolved against @url when @url
# is set; otherwise the href is returned as found.
def favicon
  href = lambda { |el| el.attr('href') }

  icon = @doc.match([
    ['link[@rel="fluid-icon"]', href],      # Get a Fluid icon if possible..
    ['link[@rel="shortcut icon"]', href],
    ['link[@rel="icon"]', href]
  ])

  return icon unless icon
  return icon if icon =~ /^http/
  return icon unless @url

  URI.join(@url, icon).to_s
end

#feed(all = false) ⇒ Object

Returns URL(s) of Web feed(s)



322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# File 'lib/pismo/internal_attributes.rb', line 322

# Returns URL(s) of Web feed(s) advertised through RSS/Atom <link> elements.
#
# all - forwarded as the second argument to @doc.match.
#
# A String result is made absolute against @url when it does not start with
# "http" and @url is set. An Array result gets the same treatment per entry
# (in place) and is de-duplicated. Anything else is returned as is.
def feed(all = false)
  href = lambda { |el| el.attr('href') }
  found = @doc.match(
    [
      ['link[@type="application/rss+xml"]', href],
      ['link[@type="application/atom+xml"]', href]
    ],
    all
  )

  absolutize = lambda do |link|
    if link !~ /^http/ && @url
      URI.join(@url, link).to_s
    else
      link
    end
  end

  case found
  when String
    found = absolutize.call(found)
  when Array
    found.map! { |link| absolutize.call(link) }
    found.uniq!
  end

  found
end

#feeds ⇒ Object



343
344
345
# File 'lib/pismo/internal_attributes.rb', line 343

# Convenience wrapper: calls #feed with all = true and returns its result.
def feeds
  feed(true)
end

#html_body ⇒ Object

Returns body text as determined by Reader algorithm WITH basic HTML formatting intact



305
306
307
# File 'lib/pismo/internal_attributes.rb', line 305

# Returns the stripped result of reader_doc.content (called without
# arguments), computed once and then served from @html_body.
def html_body
  return @html_body if @html_body

  @html_body = reader_doc.content.strip
end

#html_title ⇒ Object

HTML title



67
68
69
70
71
# File 'lib/pismo/internal_attributes.rb', line 67

# Returns what @doc.match finds for the <title> element, or nil when the
# lookup comes back nil or false.
def html_title
  @doc.match('title') || nil
end

#images(limit = 3) ⇒ Object

Returns any images with absolute URLs in the document



232
233
234
# File 'lib/pismo/internal_attributes.rb', line 232

# Returns reader_doc.images(limit), or nil when there is no reader document
# or it reports no images at all.
#
# limit - passed straight through to reader_doc.images (default 3).
def images(limit = 3)
  reader = reader_doc
  return nil unless reader
  return nil if reader.images.empty?

  reader.images(limit)
end

#keywords(options = {}) ⇒ Object

Returns the “keywords” in the document (not the meta keywords - they’re next to useless now)



267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# File 'lib/pismo/internal_attributes.rb', line 267

# Returns the highest-scoring words of the document as an Array of
# [word, score] pairs, best first.
#
# Words are taken from body plus description (lower-cased, with tags,
# entities and sentence-ending dots removed). Each occurrence scores 1, or 5
# when the word also appears as a whole word in the title.
#
# options - Hash merged over these defaults:
#           :stem_at           - stopwords longer than this are stemmed (20)
#           :word_length_limit - longer words are ignored (15)
#           :limit             - maximum number of pairs returned (20)
#           :remove_stopwords  - drop words found in Pismo.stopwords (true)
#           :minimum_score     - drop words scoring less than this (2)
#           :hints             - optional list; when given, only words it
#                                includes are counted
def keywords(options = {})
  options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)

  words = Hash.new(0)

  # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
  title_text = title.to_s.downcase
  content_to_use = body.to_s.downcase + " " + description.to_s.downcase

  # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
  tokens = content_to_use.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\\]*)(\b|\s|\Z)/i).map{ |ta1| ta1[1] }.compact

  tokens.each do |word|
    next if word.length > options[:word_length_limit]
    word.gsub!(/^[\']/, '')
    word.gsub!(/[\.\-\']$/, '')
    next if options[:hints] && !options[:hints].include?(word)

    # The word may contain regex metacharacters ("1.5", "c++"), so escape it.
    # (?!\w) rather than a trailing \b lets words that end in a symbol still
    # match when followed by a space.
    in_title = title_text =~ /\b#{Regexp.escape(word)}(?!\w)/
    words[word] += (in_title ? 5 : 1)
  end

  # Stem the stop words if necessary
  stopwords = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }

  words.delete_if { |_k1, v1| v1 < options[:minimum_score] }
  words.delete_if { |k1, _v1| stopwords.include?(k1) } if options[:remove_stopwords]
  words.sort_by { |_k2, v2| v2 }.reverse.first(options[:limit])
end

#lede(all = false) ⇒ Object

Returns the “lede(s)” or first paragraph(s) of the story/page



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# File 'lib/pismo/internal_attributes.rb', line 184

# Returns the "lede(s)" or first paragraph(s) of the story/page.
#
# all - forwarded as the second argument to @doc.match.
#
# A String match is trimmed to its first one to three sentences (or returned
# whole when no sentence terminator is found). An Array match gets the same
# treatment per entry and is de-duplicated. When @doc.match yields anything
# else, the first four reader_doc sentences are joined with spaces, or nil
# is returned when there are none.
def lede(all = false)
  # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
  # don't use <p> tags: take everything from the first few characters of the text up to the first <br.
  # The prefix is escaped because it is arbitrary page text, not a pattern.
  up_to_first_br = lambda do |el|
    el.inner_html[/(#{Regexp.escape(el.inner_text[0..4].strip)}.*?)\<br/, 1]
  end

  lede = @doc.match([
              '.post-text p',
              '#blogpost p',
              '.story-teaser',
              '.article .body p',
              '//div[@class="entrytext"]//p[string-length()>40]',                      # Ruby Inside / Kubrick style
              'section p',
              '.entry .text p',
              '.hentry .content p',
              '.entry-content p',
              '#wikicontent p',                                                        # Google Code style
              '.wikistyle p',                                                          # GitHub style
              '//td[@class="storybody"]/p[string-length()>40]',                        # BBC News style
              '//div[@class="entry"]//p[string-length()>100]',
              ['.entry-content', up_to_first_br],
              ['.entry', up_to_first_br],
              '.entry',
              '#content p',
              '#article p',
              '.post-body',
              '.entry-content',
              '.document_description_short p',    # Scribd
              '.single-post p'
              ], all)

  # TODO: Improve sentence extraction - this is dire even if it "works for now"
  leading_sentences = /^(.*?[\.\!\?]\s){1,3}/m

  case lede
  when String
    (lede[leading_sentences] || lede).to_s.strip
  when Array
    # Fall back to the whole entry when it has no sentence terminator,
    # instead of calling strip on nil.
    lede.map { |l| (l.to_s[leading_sentences] || l).to_s.strip }.uniq
  else
    reader_doc && !reader_doc.sentences(4).empty? ? reader_doc.sentences(4).join(' ') : nil
  end
end

#ledes ⇒ Object



222
223
224
# File 'lib/pismo/internal_attributes.rb', line 222

# Returns lede(true), or an empty Array when that call raises a
# StandardError.
def ledes
  lede(true)
rescue StandardError
  []
end

#reader_doc ⇒ Object



295
296
297
# File 'lib/pismo/internal_attributes.rb', line 295

# Returns the Reader::Document built from @doc.to_s and @options, creating
# it on first use and caching it in @reader_doc.
def reader_doc
  return @reader_doc if @reader_doc

  @reader_doc = Reader::Document.create(@doc.to_s, @options)
end

#sentences(limit = 3) ⇒ Object

Returns a string containing the first [limit] sentences as determined by the Reader algorithm



227
228
229
# File 'lib/pismo/internal_attributes.rb', line 227

# Returns reader_doc.sentences(limit) joined with single spaces, or nil when
# there is no reader document or it reports no sentences at all.
#
# limit - passed straight through to reader_doc.sentences (default 3).
def sentences(limit = 3)
  reader = reader_doc
  return nil unless reader
  return nil if reader.sentences.empty?

  reader.sentences(limit).join(' ')
end

#tags ⇒ Object

Returns the tags or categories of the page/content



237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
# File 'lib/pismo/internal_attributes.rb', line 237

# Returns the tags or categories of the page/content as an Array of Strings.
#
# The selectors are queried through @doc.css in order and querying stops at
# the first one that produces results. Each result is reduced to its
# inner_text with a leading "#" removed.
def tags
  selectors = [
    '.watch-info-tag-list a',  # YouTube
    '.entry .tags a',          # Livejournal
    'a[rel~=tag]',             # Wordpress and many others
    'a.tag',                   # Tumblr
    '.tags a',
    '.labels a',
    '.categories a',
    '.topics a'
  ]

  # grab the first one we get results from
  nodes = []
  selectors.find { |selector| (nodes += @doc.css(selector)).any? }

  # element -> text, minus the "#" of hashtag-like tags
  nodes.map { |node| node.inner_text.gsub(/^#/, '') }
end

#title(all = false) ⇒ Object

Returns the title of the page/content - attempts to strip site name, etc, if possible



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/pismo/internal_attributes.rb', line 5

# Returns the title of the page/content, preferring in-page headings over
# the HTML <title>.
#
# all - forwarded as the second argument to @doc.match.
#
# With all false: the match, or html_title when nothing matched.
# With all true: an Array of the match(es) followed by html_title with
# duplicates removed, or [html_title] when nothing matched.
def title(all = false)
  # TODO: Memoizations
  meta_content = lambda { |el| el.attr('content') }

  selectors = [
    '#pname a',                          # Google Code style
    '.entryheader h1',                   # Ruby Inside/Kubrick
    '.entry-title a',                    # Common Blogger/Blogspot rules
    '.post-title a',
    '.post_title a',
    '.posttitle a',
    '.post-header h1',
    '.entry-title',
    '.post-title',
    '.post h1',
    '.post h3 a',
    'a.datitle',                         # Slashdot style
    '.posttitle',
    '.post_title',
    '.pageTitle',
    '#main h1.title',
    '.title h1',
    '.post h2',
    'h2.title',
    '.entry h2 a',
    '.entry h2',                         # Common style
    '.boite_titre a',
    ['meta[@name="title"]', meta_content],
    'h1.headermain',
    'h1.title',
    '.mxb h1',                           # BBC News
    '#content h1',
    '#content h2',
    '#content h3',
    'a[@rel="bookmark"]',
    '.products h2',
    '.caption h3',
    '#main h2',
    '#body h1',
    '#wrapper h1',
    '#page h1',
    '.asset-header h1',
    '#body_content h2'
  ]

  found = @doc.match(selectors, all)

  # If all else fails, go to the HTML title
  return found || html_title unless all

  found ? ([*found] + [html_title]).uniq : [html_title]
end

#titles ⇒ Object



61
62
63
# File 'lib/pismo/internal_attributes.rb', line 61

# Convenience wrapper: calls #title with all = true and returns its result.
def titles
  title(true)
end