Class: Biblionet::Extractors::BookDataExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/bookshark/extractors/book_extractor.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ BookDataExtractor

Returns a new instance of BookDataExtractor.



251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
# File 'lib/bookshark/extractors/book_extractor.rb', line 251

# Builds an extractor for a single book page.
#
# Only the fragment between the `<!-- CONTENT START -->` and
# `<!-- CONTENT END -->` markers is parsed. The match is greedy and spans
# newlines, so it runs from the first start marker to the last end marker.
#
# When the markers are missing (or +document+ is nil) a short diagnostic is
# written to stderr and the nodeset is left as nil.
#
# @param document [String, nil] raw HTML of the page
def initialize(document)
  # No need to operate on the whole page, just on the part containing the book.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  # Match once; the page can be large and the result is all we need.
  content_match = content_re.match(document)

  if content_match.nil?
    # Something is wrong with the HTML: report it briefly and keep no nodeset.
    warn 'BookDataExtractor: content markers not found, nodeset set to nil'
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content_match[0])
  end
end

Instance Attribute Details

#nodeset ⇒ Object (readonly)

Returns the value of attribute nodeset.



249
250
251
# File 'lib/bookshark/extractors/book_extractor.rb', line 249

# Reader for the value the constructor stored in @nodeset: the parsed content
# fragment, or nil when the content markers were not found in the page.
def nodeset
  @nodeset
end

Instance Method Details

#awards ⇒ Object



364
365
366
367
368
369
370
371
372
# File 'lib/bookshark/extractors/book_extractor.rb', line 364

# Collects the awards linked from the book page.
#
# An award is any `a.booklink` whose href contains `page=showaward`. The year
# is made of the digits found in the node that directly follows the link.
#
# @return [Array<Hash>] one `{name:, year:}` hash per award link; `year` is an
#   empty string when the link has no following sibling or it holds no digits
def awards
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").map do |link|
    trailing = link.next_sibling
    # A link that is the last node of its parent has no sibling to read from.
    year = trailing.nil? ? '' : trailing.text.gsub(/\D/, '')
    { name: link.text, year: year }
  end
end

#collective_work? ⇒ Boolean

Returns:

  • (Boolean)


347
348
349
# File 'lib/bookshark/extractors/book_extractor.rb', line 347

# Tells whether the page marks the book as a collective work.
#
# Looks for the Greek label 'Συλλογικό έργο' in the text of the element that
# contains the `h1.book_title` heading.
#
# @return [Boolean]
def collective_work?
  title_heading = @nodeset.at_css('h1.book_title')
  surrounding_text = title_heading.parent.text
  surrounding_text.include?('Συλλογικό έργο')
end

#contributors ⇒ Object



302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
# File 'lib/bookshark/extractors/book_extractor.rb', line 302

# Lists the people linked from the book page, in document order.
#
# Every `a.booklink` whose href contains `/author/` becomes a
# `{name:, b_id:}` hash, where `b_id` is the third `/`-separated segment of the
# href. When the node right before a link holds text ending in ':', that text
# is pushed as a plain String just before the hash.
#
# @return [Array<String, Hash>] label strings interleaved with person hashes
def contributors
  contributors = []
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |link|
    preceding = link.previous
    # A link that is the first node of its parent has nothing before it.
    label = preceding.nil? ? '' : preceding.text.strip
    # Separators such as ',' never end with ':', so one check covers both cases.
    contributors << label if label.end_with?(':')
    contributors << { name: link.text, b_id: link[:href].split('/')[2] }
  end
  # Alternative way based on intersecting sets
  # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
  # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"

  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
  #           text = other.inner_text.strip
  #           other = text == "," ? nil : text
  #         end.compact
  contributors
end

#ddcs ⇒ Object



343
344
345
# File 'lib/bookshark/extractors/book_extractor.rb', line 343

# Finds the subject links of the book page.
#
# NOTE(review): the name suggests these are DDC classification entries;
# confirm against the caller.
#
# @return the result of the XPath query selecting every `a.subjectlink`
#   element whose href contains `/index/`
def ddcs
  subject_links_query = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_links_query)
end

#description ⇒ Object



332
333
334
335
336
337
338
339
340
341
# File 'lib/bookshark/extractors/book_extractor.rb', line 332

# Extracts the book description from the last `<p>` of the content.
#
# The paragraph markup is passed through Sanitize.clean with only `br`
# elements allowed.
#
# @return [String, nil] the cleaned description, or nil when the content has no
#   paragraph or the cleaned text holds no run of three or more word characters
def description
  paragraph = @nodeset.css('p').last
  # No paragraph at all means there is no description to extract.
  return nil if paragraph.nil?

  desc = Sanitize.clean(paragraph.inner_html, elements: ['br'])
  # Leftover punctuation or whitespace does not count as a description.
  return nil unless desc =~ /\p{Word}{3,}/

  desc
end

#details ⇒ Object



323
324
325
326
327
328
329
330
# File 'lib/bookshark/extractors/book_extractor.rb', line 323

# Splits the markup of a `.book_details` element into trimmed pieces.
#
# The markup is cut on `<br>` and on the comma patterns of the separator regex
# (the lookahead skips commas that sit inside square brackets). Each piece is
# stripped and empty pieces are removed. The first `.book_details` element is
# tried first, the second one only when the first attempt gave nil.
#
# NOTE(review): `reject!` returns nil when it removes nothing, so the fallback
# (and a nil return value) is triggered by "no empty pieces", not by a missing
# block. The comma patterns also consume the character next to the comma.
# Confirm both are intended before changing them.
#
# @return [Array<String>, nil]
def details
  separator = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/
  pieces_of = lambda do |index|
    markup = @nodeset.css('.book_details')[index].inner_html
    pieces = markup.gsub(separator, "<br>").split("<br>").map(&:strip)
    pieces.reject!(&:empty?)
  end

  parsed = pieces_of.call(0)
  parsed = pieces_of.call(1) if parsed.nil?
  parsed
end

#has_contributors_but_no_authors? ⇒ Boolean

Special case in which there is no author but there are contributors

Returns:

  • (Boolean)


352
353
354
355
356
357
358
359
360
361
362
# File 'lib/bookshark/extractors/book_extractor.rb', line 352

# Special case in which there is no author but there are contributors.
#
# Takes the text nodes that come after the title heading and before the first
# `/author/` link, joins and strips them, and checks whether the result ends
# with ':'.
#
# @return [Boolean]
def has_contributors_but_no_authors?
  after_title = @nodeset.xpath("//h1[@class='book_title']/following::text()")
  before_first_person = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()")
  gap_text = (after_title & before_first_person).text.strip

  # Ending with ':' already implies the text is not empty.
  gap_text.end_with?(':')
end

#image ⇒ Object



267
268
269
270
271
272
273
274
275
276
277
# File 'lib/bookshark/extractors/book_extractor.rb', line 267

# Builds the URL of the book cover.
#
# The cover is the first `<img>` in the document whose src contains
# `/covers/`. The query is absolute, so it is run once rather than once per
# image on the page.
#
# @return [String, nil] BASE_URL followed by the image src, or nil when no
#   cover image is present
def image
  cover = @nodeset.xpath("//img[@src[contains(.,'/covers/')]][1]").first
  cover.nil? ? nil : BASE_URL + cover[:src]
end

#publisher ⇒ Object



293
294
295
296
297
298
299
300
# File 'lib/bookshark/extractors/book_extractor.rb', line 293

# Reads the publisher from the `a.booklink` elements whose href contains
# `/com/`.
#
# Every matching link overwrites the same two keys, so the last link in
# document order wins. `b_id` is the third `/`-separated segment of the href.
#
# @return [Hash] `{name:, b_id:}`, or an empty hash when no link matches
def publisher
  company_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]")
  company_links.each_with_object({}) do |link, info|
    info[:name] = link.text
    info[:b_id] = link[:href].split('/')[2]
  end
end

#subtitle ⇒ Object



283
284
285
286
287
288
289
290
291
# File 'lib/bookshark/extractors/book_extractor.rb', line 283

# Extracts the subtitle that follows the title heading.
#
# A subtitle is recognised when the element right after `h1.book_title` is a
# `<br>` and the node after that `<br>` is not another `<br>`. The stripped
# text of that node is the subtitle. When several headings match, the last one
# that has a subtitle wins.
#
# @return [String, nil] the subtitle, or nil when the heading is not followed
#   by that pattern (including when the following nodes are missing)
def subtitle
  subtitle = nil
  @nodeset.xpath("//h1[@class='book_title']").each do |heading|
    line_break = heading.next_element
    # The heading may be the last element of its parent.
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    # Nothing after the <br>, or a second <br>, means there is no subtitle.
    next if candidate.nil? || candidate.name == 'br'

    subtitle = candidate.text.strip
  end
  subtitle
end

#title ⇒ Object



279
280
281
# File 'lib/bookshark/extractors/book_extractor.rb', line 279

# Reads the book title.
#
# @return the text of the nodes matched by the `h1.book_title` CSS selector
def title
  title_nodes = @nodeset.css('h1.book_title')
  title_nodes.text
end