Class: Biblionet::Extractors::BookExtractor

Inherits:

Base

Object
Base
Biblionet::Extractors::BookExtractor

show all

Defined in: lib/bookshark/extractors/book_extractor.rb

Direct Known Subclasses

Instance Attribute Summary collapse

#book ⇒ Object readonly

Returns the value of attribute book.

Attributes inherited from Base

#biblionet_id, #filepath, #page, #url

Instance Method Summary collapse

#extract_book(biblionet_id = @biblionet_id, book_page = @page) ⇒ Object
#initialize(uri = nil) ⇒ BookExtractor constructor

A new instance of BookExtractor.
#load_and_extract_book(uri = nil) ⇒ Object
#proccess_contributors(raw_contributors) ⇒ Object

Converts the parsed contributors string to hash.
#proccess_ddc(ddc, extract_parents = false) ⇒ Object
#proccess_details(details) ⇒ Object

Methods inherited from Base

#decode_text, decode_text, #load_page, #load_page_from_file, #load_page_from_url, #present?, #save_page

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ `BookExtractor`

# File 'lib/bookshark/extractors/book_extractor.rb', line 14

# Creates a new BookExtractor.
#
# The uri is handed to the superclass constructor first. Extraction is then
# triggered only when a uri was supplied and @page is set afterwards.
#
# @param uri the location passed on to Base#initialize (may be nil)
def initialize(uri=nil)
  super(uri)
  return if uri.nil? || @page.nil?

  extract_book
end

Instance Attribute Details

#book ⇒ `Object` (readonly)

Returns the value of attribute book.



12
13
14

# File 'lib/bookshark/extractors/book_extractor.rb', line 12

# Reader for the book attribute.
#
# Returns whatever is currently stored in @book; extract_book assigns the
# extracted book hash to it.
def book
  @book
end

Instance Method Details

#extract_book(biblionet_id = @biblionet_id, book_page = @page) ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 117

# Extracts the data of a single book page into a hash and stores it in @book.
#
# The hash combines what BookDataExtractor reads from the page (title,
# subtitle, image, contributors, details, awards, description, DDC
# categories) with the values returned by BibliographicalBookExtractor
# (publisher, publication, format, price, availability, series, ...).
#
# Missing data is logged and tolerated instead of aborting the extraction:
# * a nil +page.details+ is treated as an empty list of details;
# * a nil result from the bibliographical extractor leaves the
#   bibliographical fields nil and keeps the publisher read from the page.
#
# @param biblionet_id the biblionet id of the book (defaults to @biblionet_id)
# @param book_page    the page handed to BookDataExtractor (defaults to @page)
# @return [Hash, nil] the book hash, or nil when BookDataExtractor has no nodeset
def extract_book(biblionet_id=@biblionet_id, book_page=@page)
  # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
  log = Logger.new(STDOUT)

  page = BookDataExtractor.new(book_page)

  # End extraction if BookDataExtractor couldn't create a nodeset.
  return nil if page.nodeset.nil?

  book_hash = {}

  # A missing image is only reported; extraction continues with a nil image.
  begin
    img = page.image
    raise NoImageError.new(biblionet_id) if img.nil?
  rescue NoImageError => e
    pp e
    log.warn(e.message)
  rescue StandardError => e
    pp err_msg = "Error #{e} at book: #{biblionet_id}"
    log.error(err_msg)
  end

  book_hash[:title]    = page.title
  book_hash[:subtitle] = page.subtitle
  book_hash[:image]    = img

  contributors = proccess_contributors(page.contributors)

  # The author is stored under its own key, separately from the other contributors.
  author = contributors[:author]
  contributors.delete(:author)

  # If author is empty, maybe it is a collective work.
  if author.nil? || author.empty?
    if page.collective_work?
      author = ['Συλλογικό έργο']
    else
      pp err_msg = "No author has been found at book: #{biblionet_id}"
      log.warn(err_msg)
      author = []
    end
  end

  book_hash[:author]       = author
  book_hash[:contributors] = contributors
  book_hash[:publisher]    = page.publisher

  details = page.details
  if details.nil?
    pp err_msg = "No details at book: #{biblionet_id}"
    log.error(err_msg)
    # Continue with no details rather than iterating over nil.
    details = []
  end

  details_hash = proccess_details(details)

  book_hash[:isbn] = details_hash[:isbn]

  # When no explicit ISBN-13 was found, reuse the plain ISBN if it has
  # 13 characters once dashes and surrounding whitespace are removed.
  isbn_13 = details_hash[:isbn_13]
  if isbn_13.nil? && present?(details_hash[:isbn]) && details_hash[:isbn].strip.gsub('-','').length == 13
    isbn_13 = book_hash[:isbn]
  end
  book_hash[:isbn_13] = isbn_13

  book_hash[:award]       = page.awards
  book_hash[:description] = page.description

  ddcs = page.ddcs.map do |ddc_node|
    # Extract from href the ddc id used by biblionet. --- DDC url http://biblionet.gr/index/id ---
    ddc_biblionet_id = ddc_node[:href].split(/\//).last
    # Extract the DDC id and DDC text, then attach the biblionet id.
    proccess_ddc(ddc_node.text).merge!(b_id: ddc_biblionet_id)
  end

  book_hash[:category] = ddcs
  book_hash[:b_id]     = biblionet_id

  # Location of the bibliographical record: remote when the page came from a
  # url, next to the loaded file when it came from disk, otherwise unknown.
  uri = nil
  if @url
    uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
  elsif @filepath
    uri = File.dirname(@filepath) + "/" + "bg_record_#{biblionet_id}.html"
  end

  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
  bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)

  if bibliographical_details.nil?
    # NOTE(review): load_and_extract_book can presumably return nil here (the
    # method of that name in this file does so for a nil uri or an unloaded
    # page). Keep the publisher read from the page and leave the rest nil.
    pp err_msg = "No bibliographical details at book: #{biblionet_id}"
    log.warn(err_msg)
    bibliographical_details = {}
  else
    book_hash[:publisher] = bibliographical_details[:publisher]
  end

  book_hash[:publication]       = bibliographical_details[:publication]

  book_hash[:format]            = bibliographical_details[:format]

  book_hash[:original_language] = bibliographical_details[:original_language]
  book_hash[:original_title]    = bibliographical_details[:original_title]

  book_hash[:price]             = bibliographical_details[:price]
  book_hash[:availability]      = bibliographical_details[:availability]
  book_hash[:last_update]       = bibliographical_details[:last_update]

  book_hash[:series]            = bibliographical_details[:series]

  book_hash[:physical_description] = {
    pages:      details_hash[:pages],
    size:       bibliographical_details[:physical_size],
    cover_type: bibliographical_details[:cover_type]
  }

  @book = book_hash
end

#load_and_extract_book(uri = nil) ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 19

# Loads the page found at uri and extracts the book from it.
#
# @param uri the location handed to load_page (may be nil)
# @return the result of extract_book, or nil when no uri was given or
#   @page is still nil after load_page ran
def load_and_extract_book(uri=nil)
  load_page(uri)
  return nil if uri.nil? || @page.nil?

  extract_book
end

#proccess_contributors(raw_contributors) ⇒ `Object`

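Converts the parsed contributors into a hash. The input must already have been processed into a sequence of the form: job1:, contributor1, contributor2, job2:, contributor3 (each job label ends with a colon). Contributors that appear before any job label are stored under the :author key. The returned hash has the form: { job1 => ["contributor1", "contributor2"], job2 => ["contributor3"] }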

# File 'lib/bookshark/extractors/book_extractor.rb', line 28

# Groups a flat list of parsed contributors by job.
#
# A String entry ending with ":" starts a new job (the colon is dropped and
# the rest is used as the key); every other entry is a contributor of the
# current job. Entries seen before the first job label go under :author.
# If a job label occurs again later, its earlier contributors are replaced
# as soon as a new contributor follows it.
#
# @param raw_contributors the parsed entries; nil or empty gives an empty hash
# @return [Hash] job => array of contributors
def proccess_contributors(raw_contributors)
  contributors = {}
  return contributors if raw_contributors.nil? || raw_contributors.empty?

  job      = :author
  partners = []

  raw_contributors.each do |entry|
    if entry.is_a?(String) && entry.end_with?(":")
      # New job label: start collecting into a fresh list.
      job      = entry[0...-1]
      partners = []
    else
      partners << entry
      contributors[job] = partners
    end
  end

  contributors
end

#proccess_ddc(ddc, extract_parents = false) ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 96

# Splits a DDC description into its code and its name.
#
# The code is taken from "[DDC: digits]" markers, e.g.
# "[DDC: 889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)"
# gives "889.09300". Without such a marker the code is an empty string.
# The name is the text left after removing bracketed [...] and
# parenthesised (...) parts and surrounding whitespace.
#
# @param ddc [String] the DDC text to process
# @param extract_parents currently unused
# @return [Hash] { ddc: code, name: text }
def proccess_ddc(ddc, extract_parents = false)
  # Matches a whole "[DDC: 889.09300]" marker.
  id_re = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/

  # Matches [..] and (..) together with the whitespace around them. The
  # inner .* is greedy, so a match runs up to the last closing bracket or
  # parenthesis available.
  non_text_re = /\s*(\[.*\]|\(.*\))\s*/

  # Keep only what is left of the markers after dropping the brackets,
  # the "DDC:" label and spaces.
  markers = ddc.scan(id_re).join

  # Extracts the parent tree of current ddc.
  # ddcparser.parse(ddc_id)

  {
    ddc:  markers.gsub(/[\[\]DDC: ]/, ''),
    name: ddc.gsub(non_text_re, '').strip
  }
end

#proccess_details(details) ⇒ `Object`

# File 'lib/bookshark/extractors/book_extractor.rb', line 45

# Converts the list of raw detail strings of a book into a hash.
#
# Each detail is passed through decode_text and then classified by its shape:
# * a four digit string                -> :publication_year
# * ends with "σελ."                   -> :pages (digits only)
# * starts with "ISBN-13"              -> :isbn_13
# * starts with "ISBN"                 -> :isbn
# * "[...]" matching the status format -> :status (brackets removed)
# * starts with "Τιμή"                 -> :price (digits and commas only)
# * starts with the award image tag    -> appended to :awards
# * starts with "ISMN"                 -> :isbn (special typo case)
# Anything else is reported through NoIdeaWhatThisIsError and skipped.
#
# @param details the raw details; nil is treated as no details
# @return [Hash] the recognised details (empty when there are none)
def proccess_details(details)
  details_hash = {}

  # The caller may have no details at all; there is nothing to classify then.
  return details_hash if details.nil?

  # Loop-invariant patterns, built once instead of once per detail.
  date_regex   = /(^\d{4}$)/
  status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
  award_prefix = '<img src="/images/award.jpg" border="0" title="Βραβείο">'

  details.each do |raw_detail|
    detail = decode_text(raw_detail)

    begin
      if detail =~ date_regex
        details_hash[:publication_year] = detail
      elsif detail.end_with? "σελ."
        details_hash[:pages] = detail.gsub(/[^\d]/, '')
      elsif detail.start_with? "ISBN-13"
        # Must be tested before the plain "ISBN" prefix, which it also matches.
        details_hash[:isbn_13] = detail.gsub(/ISBN-13 /, "")
      elsif detail.start_with? "ISBN"
        details_hash[:isbn] = detail.gsub(/ISBN /, "")
      elsif detail =~ status_regex
        details_hash[:status] = detail.gsub(/\[|\]/, '')
      elsif detail.start_with? "Τιμή"
        details_hash[:price] = detail.gsub(/[^\d,\d]/, '')
      elsif detail.start_with? award_prefix
        award = Sanitize.clean(detail).strip
        details_hash[:awards] = [] if details_hash[:awards].nil?
        details_hash[:awards] << award
      elsif detail.start_with? "ISMN" # Special typo case
        details_hash[:isbn] = detail.gsub(/ISMN /, "")
      else
        raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
      end
    rescue NoIdeaWhatThisIsError => e
      # Unrecognised details are only reported; the remaining ones are still processed.
      pp e
    end
  end

  details_hash
end