Class: Biblionet::Extractors::BookDataExtractor
- Inherits:
-
Object
- Object
- Biblionet::Extractors::BookDataExtractor
- Defined in:
- lib/bookshark/extractors/book_extractor.rb
Instance Attribute Summary
-
#nodeset ⇒ Object
readonly
Returns the value of attribute nodeset.
Instance Method Summary
- #awards ⇒ Object
- #collective_work? ⇒ Boolean
- #contributors ⇒ Object
- #ddcs ⇒ Object
- #description ⇒ Object
- #details ⇒ Object
-
#has_contributors_but_no_authors? ⇒ Boolean
Special case in which there is no author but there are contributors.
- #image ⇒ Object
-
#initialize(document) ⇒ BookDataExtractor
constructor
A new instance of BookDataExtractor.
- #publisher ⇒ Object
- #subtitle ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(document) ⇒ BookDataExtractor
Returns a new instance of BookDataExtractor.
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 251

# Builds the extractor from the raw HTML of a page.
#
# Only the span between the `<!-- CONTENT START -->` and `<!-- CONTENT END -->`
# markers is parsed, so later queries do not operate on the whole page.
#
# @param document [String] HTML source of the page
# @return [BookDataExtractor] a new instance; @nodeset is nil when the markers
#   are not found in the document
def initialize(document)
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Run the (multi-line) match once and reuse the result.
  content_match = content_re.match(document)

  if content_match.nil?
    # Missing markers mean something is wrong with the HTML: the document is
    # printed to stdout as a diagnostic and no node set is built.
    puts document
    @nodeset = nil
  else
    @nodeset = Nokogiri::HTML(content_match[0])
  end
end
Instance Attribute Details
#nodeset ⇒ Object (readonly)
Returns the value of attribute nodeset.
249 250 251 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 249

# Read-only accessor for the node set assigned by the constructor.
#
# @return [Object] whatever is currently stored in @nodeset (may be nil)
def nodeset
  @nodeset
end
Instance Method Details
#awards ⇒ Object
364 365 366 367 368 369 370 371 372 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 364

# Collects the awards linked from the page.
#
# Every `booklink` anchor whose href contains `page=showaward` yields one
# entry. The year is taken from the text of the node that follows the anchor,
# with every non-digit character removed.
#
# @return [Array<Hash>] hashes with :name and :year (String, possibly empty)
def awards
  award_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]")

  award_links.map do |link|
    trailing = link.next_sibling
    # An anchor that is the last child has no sibling; treat that as "no year".
    year_text = trailing.nil? ? '' : trailing.text
    { name: link.text, year: year_text.strip.gsub(/[^\d]/, '') }
  end
end
#collective_work? ⇒ Boolean
347 348 349 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 347

# Tells whether the book is marked as a collective work.
#
# The check looks for the Greek label 'Συλλογικό έργο' ("collective work") in
# the text of the element that contains the `h1.book_title` heading.
#
# @return [Boolean] true when the label is present; false otherwise, including
#   when the page has no title heading
def collective_work?
  title_node = @nodeset.at_css('h1.book_title')
  return false if title_node.nil?

  title_node.parent.text.include?('Συλλογικό έργο')
end
#contributors ⇒ Object
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 302

# Lists the contributors linked from the page, interleaved with role labels.
#
# Every `booklink` anchor whose href contains `/author/` yields a hash with
# :name and :b_id (the third "/"-separated segment of the href). When the text
# immediately before an anchor ends with ':' that text is added, as a plain
# String, right before the contributor it introduces.
#
# @return [Array<String, Hash>] labels and contributor hashes in page order
def contributors
  author_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]")

  contributors = author_links.each_with_object([]) do |link, entries|
    preceding = link.previous
    # An anchor with nothing before it simply has no label.
    label = preceding.nil? ? '' : preceding.text.strip
    # Separators such as "," never end with ':' so they are skipped as well.
    entries << label if label.end_with?(':')

    entries << { name: link.text, b_id: link[:href].split("/")[2] }
  end

  # Alternative way based on intersecting sets
  # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
  # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
  #   text = other.inner_text.strip
  #   other = text == "," ? nil : text
  # end.compact

  contributors
end
#ddcs ⇒ Object
343 344 345 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 343

# Selects the subject links of the page: `subjectlink` anchors whose href
# contains `/index/`.
#
# @return [Object] the raw result of the XPath query on @nodeset
def ddcs
  subject_query = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_query)
end
#description ⇒ Object
332 333 334 335 336 337 338 339 340 341 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 332

# Extracts the book description from the last <p> element of the page.
#
# The paragraph's inner HTML is passed through Sanitize with only <br>
# elements allowed. A result without at least three consecutive word
# characters is treated as "no description".
#
# @return [String, nil] the sanitized description, or nil when the page has no
#   paragraph or the text is too short to be meaningful
def description
  last_paragraph = @nodeset.css('p').last
  return nil if last_paragraph.nil?

  cleaned = Sanitize.clean(last_paragraph.inner_html, elements: ['br'])
  (cleaned =~ /\p{Word}{3,}/).nil? ? nil : cleaned
end
#details ⇒ Object
323 324 325 326 327 328 329 330 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 323

# Splits the book-details block into a list of trimmed, non-empty entries.
#
# The first `.book_details` element is used; when it yields no entries the
# second one is tried. Entries are separated by <br> and by the comma pattern
# below, which is first rewritten to <br>.
#
# @return [Array<String>, nil] the entries, or nil when neither element
#   produces any
def details
  # NOTE(review): the `\D,` alternative also consumes the non-digit character
  # in front of the comma -- confirm that this is intended.
  separator = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/
  blocks = @nodeset.css('.book_details')

  [blocks[0], blocks[1]].each do |block|
    next if block.nil?

    # Non-bang reject: reject! returns nil when nothing was removed, which
    # would make a perfectly good list look like a missing one.
    entries = block.inner_html.gsub(separator, "<br>").split("<br>").map(&:strip).reject(&:empty?)
    return entries unless entries.empty?
  end

  nil
end
#has_contributors_but_no_authors? ⇒ Boolean
Special case in which there is no author but there are contributors.
352 353 354 355 356 357 358 359 360 361 362 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 352

# Special case in which there is no author but there are contributors.
#
# Takes the text nodes located after the `h1.book_title` heading and before
# the first `/author/` link. If that text ends with ':' it is a role label,
# so the first linked person is a contributor rather than an author.
#
# @return [Boolean] true when the text between title and first author link
#   ends with ':'
def has_contributors_but_no_authors?
  node_start = "//h1[@class='book_title']/following::text()"
  node_end = "//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()"

  # Intersection of "everything after the title" and "everything before the
  # first author link" leaves only the text in between.
  between = (@nodeset.xpath(node_start) & @nodeset.xpath(node_end)).text.strip

  # A string that ends with ':' is necessarily non-empty.
  between.end_with?(':')
end
#image ⇒ Object
267 268 269 270 271 272 273 274 275 276 277 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 267

# Builds the absolute URL of the book cover image.
#
# The cover is the first <img> whose src contains `/covers/`. The query
# starts with `//`, i.e. it is evaluated from the document root, so it is run
# once instead of once per image on the page.
#
# @return [String, nil] BASE_URL followed by the image's src, or nil when the
#   page has no cover image
def image
  cover_nodes = @nodeset.xpath("//img[@src[contains(.,'/covers/')]][1]")
  return nil if cover_nodes.nil? || cover_nodes.empty?

  BASE_URL + cover_nodes.first[:src]
end
#publisher ⇒ Object
293 294 295 296 297 298 299 300 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 293

# Describes the publisher linked from the page.
#
# Looks at every `booklink` anchor whose href contains `/com/`; each match
# overwrites the previous one, so the last match determines the result.
#
# @return [Hash] :name and :b_id (third "/"-separated segment of the href),
#   or an empty hash when no such link exists
def publisher
  company_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]")

  company_links.each_with_object({}) do |link, info|
    info[:name] = link.text
    info[:b_id] = link[:href].split("/")[2]
  end
end
#subtitle ⇒ Object
283 284 285 286 287 288 289 290 291 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 283

# Extracts the subtitle that follows the title heading.
#
# A subtitle is recognised when the element right after `h1.book_title` is a
# <br> and the node after that <br> is not another <br>; the text of that node
# is the subtitle. If several headings match, the last one wins.
#
# @return [String, nil] the stripped subtitle, or nil when there is none
def subtitle
  found = nil

  @nodeset.xpath("//h1[@class='book_title']").each do |heading|
    line_break = heading.next_element
    # A heading that is the last element has nothing after it.
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    next if candidate.nil? || candidate.name == 'br'

    found = candidate.text.strip
  end

  found
end
#title ⇒ Object
279 280 281 |
# File 'lib/bookshark/extractors/book_extractor.rb', line 279

# Reads the book title from the `h1.book_title` heading.
#
# @return [String] the text of the matched heading node(s)
def title
  heading = @nodeset.css('h1.book_title')
  heading.text
end