Class: Biblionet::Extractors::AuthorDataExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/bookshark/extractors/author_extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ AuthorDataExtractor

Returns a new instance of AuthorDataExtractor.



86
87
88
89
90
91
92
93
94
95
# File 'lib/bookshark/extractors/author_extractor.rb', line 86

# Builds an extractor over the content section of a page.
#
# Only the span between the `<!-- CONTENT START -->` and `<!-- CONTENT END -->`
# markers (inclusive, greedy, spanning newlines) is handed to Nokogiri. When
# the markers are not found, the document is printed to stdout and Nokogiri
# is given nil.
#
# @param document [String] raw HTML of the page
def initialize(document)
  # No need to operate on whole page. Just on part containing the content.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Scan the page once and reuse the MatchData instead of re-running the
  # regex for every check.
  match = content_re.match(document)

  # NOTE(review): dumping the whole unmatched page to stdout looks like
  # leftover debugging output; kept so existing behaviour is unchanged.
  puts document if match.nil?

  content = match && match[0]
  @nodeset = Nokogiri::HTML(content)
end

Instance Attribute Details

#nodeset ⇒ Object (readonly)

Returns the value of attribute nodeset.



84
85
86
# File 'lib/bookshark/extractors/author_extractor.rb', line 84

# Reader for the @nodeset instance variable, which the constructor assigns
# from Nokogiri::HTML.
#
# @return [Object] the current value of @nodeset
def nodeset
  @nodeset
end

Instance Method Details

#awards ⇒ Object



111
112
113
114
115
116
117
118
119
# File 'lib/bookshark/extractors/author_extractor.rb', line 111

# Collects the awards linked from the page.
#
# Every `a` element with class `booklink` whose href contains
# `page=showaward` yields one entry: the link text as the name, and the
# digits found in the node immediately following the link as the year.
#
# @return [Array<Hash>] hashes with :name and :year; :year is a String of
#   digits and is empty when no following node or no digits exist
def awards
  award_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]")

  award_links.map do |link|
    # A link that is the last child of its parent has no next sibling;
    # report an empty year instead of raising NoMethodError on nil.
    trailing = link.next_sibling
    year = trailing ? trailing.text.gsub(/\D/, '') : ''

    { name: link.text, year: year }
  end
end

#bio ⇒ Object



101
102
103
# File 'lib/bookshark/extractors/author_extractor.rb', line 101

# Text of the biography section.
#
# Runs the selector `//p[align="justify"]` through `css` on the parsed
# content and returns the text of whatever it matches.
#
# @return [Object] the result of calling `text` on the matched nodes
def bio
  justified_paragraphs = @nodeset.css('//p[align="justify"]')
  justified_paragraphs.text
end

#fullname ⇒ Object



97
98
99
# File 'lib/bookshark/extractors/author_extractor.rb', line 97

# Text of the page title heading.
#
# Looks up `h1.page_title` in the parsed content and returns its text.
#
# @return [Object] the result of calling `text` on the matched nodes
def fullname
  title_nodes = @nodeset.css('h1.page_title')
  title_nodes.text
end

#image ⇒ Object



105
106
107
108
109
# File 'lib/bookshark/extractors/author_extractor.rb', line 105

# URL of the image found in the parsed content.
#
# Searches for `img` elements whose `src` contains `/persons/` and prefixes
# the `src` of the first match with BASE_URL.
#
# @return [String, nil] BASE_URL followed by the image `src`, or nil when
#   the query returns nil or nothing matches
def image
  matches = @nodeset.xpath("//img[@src[contains(.,'/persons/')]][1]")
  return nil if matches.nil? || matches.empty?

  BASE_URL + matches.first['src']
end