Module: CSteamer::InternalAttributes

Included in:
Document
Defined in:
lib/csteamer/internal_attributes.rb

Overview

Internal attributes are the different pieces of data we can extract from a document’s content.

Instance Method Summary collapse

Instance Method Details

#author ⇒ Object

Returns the author of the page/content



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/csteamer/internal_attributes.rb', line 16

# Returns the author of the page/content, or nil when @doc.match yields
# nothing for any of the candidates.
#
# The candidates are passed to @doc.match in the order listed. An entry of
# the form [selector, lambda] pairs a selector with a lambda that reads the
# "content" attribute of whatever it is given (presumably the matched
# element - confirm against Document#match).
#
# The value handed back by @doc.match is never modified in place: the
# "By " prefix is removed with a non-destructive sub, so a frozen string
# does not raise and the document's own data is left untouched.
def author
  byline = @doc.match('.wire_author',
                      ['meta[@name="author"]', lambda { |el| el.attr('content') }],     # Traditional meta tag style
                      '.byline a',                                                      # Ruby Inside style
                      '.post_subheader_left a',                                         # TechCrunch style
                      '.byl'                                                            # BBC News style
                      )

  return unless byline

  # Strip off a leading "By " (any letter case) and return the new string
  byline.sub(/^by\s+/i, '')
end

#description ⇒ Object

Returns the “description” of the page, which usually comes from a meta tag



33
34
35
36
37
# File 'lib/csteamer/internal_attributes.rb', line 33

# Returns the "description" of the page as reported by @doc.match.
#
# Two meta-tag selectors are tried, differing only in the letter case of
# the name attribute ("description" vs "Description"). Each is paired with
# a lambda that reads the "content" attribute of the object it receives.
def description
  read_content = lambda { |el| el.attr('content') }

  lowercase_meta = ['meta[@name="description"]', read_content]
  capitalised_meta = ['meta[@name="Description"]', read_content]

  @doc.match(lowercase_meta, capitalised_meta)
end

#lede ⇒ Object

Returns the “lede” or first paragraph of the story/page



40
41
42
43
44
45
# File 'lib/csteamer/internal_attributes.rb', line 40

# Returns the "lede" (first paragraph of the story/page) as reported by
# @doc.match for the candidate selectors below, passed in the order listed.
def lede
  candidates = [
    '//div[@class="entrytext"]//p[string-length()>10]',
    'section p',
    '//td[@class="storybody"]/p[string-length()>10]' # BBC News style
  ]

  @doc.match(*candidates)
end

#title ⇒ Object

Returns the title of the page/content - attempts to strip site name, etc, if possible



5
6
7
8
9
10
11
12
13
# File 'lib/csteamer/internal_attributes.rb', line 5

# Returns the title of the page/content, or nil when @doc.match('title')
# yields nothing.
#
# A leading site name is removed on a best-effort basis: up to 20
# characters followed by " - " (whitespace, hyphen, whitespace) at the
# start of a line are dropped. Only the first such occurrence is removed.
#
# The value handed back by @doc.match is never modified in place: a
# non-destructive sub is used, so a frozen string does not raise and the
# document's own data is left untouched.
def title
  page_title = @doc.match('title')
  return unless page_title

  # Strip off any leading site names - a scrappy way to try it out..
  page_title.sub(/^.{0,20}\s\-\s/, '')
end