Class: Biblionet::Extractors::BookDataExtractor

Inherits:

Object

Object
Biblionet::Extractors::BookDataExtractor

show all

Defined in: lib/bookshark/extractors/book_extractor.rb

Instance Attribute Summary collapse

#nodeset ⇒ Object readonly

Returns the value of attribute nodeset.

Instance Method Summary collapse

#awards ⇒ Object
#collective_work? ⇒ Boolean
#contributors ⇒ Object
#ddcs ⇒ Object
#description ⇒ Object
#details ⇒ Object
#has_contributors_but_no_authors? ⇒ Boolean

Special case in which there is no author but there are contributors.
#image ⇒ Object
#initialize(document) ⇒ BookDataExtractor constructor

A new instance of BookDataExtractor.
#publisher ⇒ Object
#subtitle ⇒ Object
#title ⇒ Object

Constructor Details

#initialize(document) ⇒ `BookDataExtractor`

Returns a new instance of BookDataExtractor.

# File 'lib/bookshark/extractors/book_extractor.rb', line 251

# Builds the extractor from a raw HTML page.
#
# Only the fragment between the "CONTENT START" and "CONTENT END" HTML
# comments is parsed, since that is the part containing the book.
#
# @param document [String] raw HTML of the page
# @return [BookDataExtractor] a new instance; @nodeset is nil when the
#   content markers cannot be found in the document
def initialize(document)
  # No need to operate on whole page. Just on part containing the book.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  content_match = content_re.match(document) # match once instead of three times

  if content_match.nil?
    # Something is wrong with the html. Report it briefly on stderr instead
    # of dumping the whole page to stdout, and leave the nodeset unset.
    warn 'BookDataExtractor: content markers not found in document'
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content_match[0])
  end
end

Instance Attribute Details

#nodeset ⇒ `Object` (readonly)

Returns the value of attribute nodeset.



249
250
251

# File 'lib/bookshark/extractors/book_extractor.rb', line 249

# Returns the value of attribute nodeset.
#
# @return [Object, nil] whatever the constructor stored in @nodeset; the
#   constructor stores nil when the page's content section was not found
def nodeset
  @nodeset
end

Instance Method Details

#awards ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 364

# Collects the awards linked from the book section.
#
# Every "booklink" anchor whose href contains "page=showaward" yields one
# entry. The year is made of the digits found in the node right after the
# anchor.
#
# @return [Array<Hash>] hashes with :name (anchor text) and :year (String of
#   digits, empty when no digits follow the anchor)
def awards
  award_links = "//a[@class='booklink' and @href[contains(.,'page=showaward') ]]"

  @nodeset.xpath(award_links).map do |link|
    trailing = link.next_sibling
    # The anchor may be the last node of its parent; treat that as "no year"
    # instead of failing on nil.
    year = trailing.nil? ? '' : trailing.text.strip.gsub(/\D/, '')
    { name: link.text, year: year }
  end
end

#collective_work? ⇒ `Boolean`

Returns:

(Boolean)



347
348
349

# File 'lib/bookshark/extractors/book_extractor.rb', line 347

# Tells whether the text of the title heading's parent node contains the
# marker 'Συλλογικό έργο'.
#
# @return [Boolean]
def collective_work?
  heading = @nodeset.at_css('h1.book_title')
  heading.parent.text.include?('Συλλογικό έργο')
end

#contributors ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 302

# Lists the contributors linked from the book section, in document order.
#
# Every "booklink" anchor whose href contains "/author/" yields a hash. When
# the node right before an anchor holds a label ending in ':', that label is
# pushed into the list just before the contributor it introduces.
#
# @return [Array<String, Hash>] label strings interleaved with hashes holding
#   :name (anchor text) and :b_id (third "/"-separated segment of the href)
def contributors
  author_links = "//a[@class='booklink' and @href[contains(.,'/author/') ]]"

  @nodeset.xpath(author_links).each_with_object([]) do |link, list|
    preceding = link.previous
    # The anchor may be the first node of its parent; no label in that case.
    label = preceding.nil? ? '' : preceding.text.strip
    # Separators such as ',' never end with ':', so this single check also
    # filters them out.
    list << label if label.end_with?(':')
    list << { name: link.text, b_id: link[:href].split('/')[2] }
  end
end

#ddcs ⇒ `Object`



343
344
345

# File 'lib/bookshark/extractors/book_extractor.rb', line 343

# Looks up the "subjectlink" anchors whose href contains "/index/".
#
# @return [Object] the result of the XPath query on @nodeset, unmodified
def ddcs
  subject_links = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_links)
end

#description ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 332

# Extracts the book description from the last <p> element.
#
# The paragraph's inner HTML is passed through Sanitize.clean with 'br' as
# the only allowed element.
#
# @return [String, nil] the sanitized description, or nil when there is no
#   paragraph or the text has no run of at least three word characters
def description
  last_paragraph = @nodeset.css('p').last
  # No <p> at all: nothing to describe (previously failed on nil).
  return nil if last_paragraph.nil?

  desc = Sanitize.clean(last_paragraph.inner_html, elements: ['br'])

  # Treat leftovers without any real word as "no description".
  desc =~ /\p{Word}{3,}/ ? desc : nil
end

#details ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 323

# Splits the book details into a list of trimmed, non-empty entries.
#
# The first '.book_details' element is used; when it yields no entries the
# second one is tried instead.
#
# @return [Array<String>, nil] the entries, or nil when neither element
#   exists or yields any entry
def details
  # NOTE(review): the comma alternatives also consume the character next to
  # the comma when replaced - kept as in the original, confirm it is intended.
  separator = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/
  blocks = @nodeset.css('.book_details')

  [blocks[0], blocks[1]].each do |block|
    next if block.nil?

    # `reject` (not `reject!`): the bang version returns nil when nothing is
    # removed, which made a perfectly good first block look like a failure.
    entries = block.inner_html.gsub(separator, '<br>').split('<br>').map(&:strip).reject(&:empty?)
    return entries unless entries.empty?
  end

  nil
end

#has_contributors_but_no_authors? ⇒ `Boolean`

Special case in which there is no author but there are contributors.

Returns:

(Boolean)

# File 'lib/bookshark/extractors/book_extractor.rb', line 352

# Special case in which there is no author but there are contributors.
#
# Takes the text nodes located after the title heading and before the first
# author link, and checks whether their combined, stripped text ends with ':'.
#
# @return [Boolean]
def has_contributors_but_no_authors?
  after_title = @nodeset.xpath("//h1[@class='book_title']/following::text()")
  before_author = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()")
  label = (after_title & before_author).text.strip

  !label.empty? && label.end_with?(':')
end

#image ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 267

# Builds the URL of the book cover.
#
# Uses the first <img> whose src contains "/covers/" and prefixes its src
# with BASE_URL (a constant defined outside this method).
#
# @return [String, nil] the cover URL, or nil when no such image exists
def image
  # The query is absolute, so one lookup is enough; running it once per
  # <img> on the page (as before) always produced the same result.
  cover = @nodeset.xpath("//img[@src[contains(.,'/covers/')]][1]").first

  cover.nil? ? nil : BASE_URL + cover[:src]
end

#publisher ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 293

# Reads the publisher from the "booklink" anchors whose href contains "/com/".
#
# When several anchors match, values from later ones overwrite earlier ones.
#
# @return [Hash] :name (anchor text) and :b_id (third "/"-separated segment
#   of the href); an empty hash when no anchor matches
def publisher
  company_links = "//a[@class='booklink' and @href[contains(.,'/com/') ]]"

  @nodeset.xpath(company_links).each_with_object({}) do |link, info|
    info[:name] = link.text
    info[:b_id] = link[:href].split("/")[2]
  end
end

#subtitle ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 283

# Extracts the subtitle that follows the title heading.
#
# A subtitle is recognised when the element after the 'book_title' <h1> is a
# <br> and the node after that <br> is not another <br>. When several
# headings qualify, the last one wins.
#
# @return [String, nil] the stripped subtitle text, or nil when none is found
def subtitle
  found = nil

  @nodeset.xpath("//h1[@class='book_title']").each do |heading|
    line_break = heading.next_element
    # The heading may have no following element at all.
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    # The <br> may be the last node, or be followed by another <br>.
    next if candidate.nil? || candidate.name == 'br'

    found = candidate.text.strip
  end

  found
end

#title ⇒ `Object`



279
280
281

# File 'lib/bookshark/extractors/book_extractor.rb', line 279

# Reads the book title.
#
# @return [String] the text of the nodes matching 'h1.book_title'
def title
  heading = @nodeset.css('h1.book_title')
  heading.text
end

Class: Biblionet::Extractors::BookDataExtractor

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ BookDataExtractor

Instance Attribute Details

#nodeset ⇒ Object (readonly)

Instance Method Details

#awards ⇒ Object

#collective_work? ⇒ Boolean

#contributors ⇒ Object

#ddcs ⇒ Object

#description ⇒ Object

#details ⇒ Object

#has_contributors_but_no_authors? ⇒ Boolean

#image ⇒ Object

#publisher ⇒ Object

#subtitle ⇒ Object

#title ⇒ Object

#initialize(document) ⇒ `BookDataExtractor`

#nodeset ⇒ `Object` (readonly)

#awards ⇒ `Object`

#collective_work? ⇒ `Boolean`

#contributors ⇒ `Object`

#ddcs ⇒ `Object`

#description ⇒ `Object`

#details ⇒ `Object`

#has_contributors_but_no_authors? ⇒ `Boolean`

#image ⇒ `Object`

#publisher ⇒ `Object`

#subtitle ⇒ `Object`

#title ⇒ `Object`