Class: Biblionet::Extractors::CategoryExtractor

Inherits:

Base

Object
Base
Biblionet::Extractors::CategoryExtractor

show all

Defined in: lib/bookshark/extractors/category_extractor.rb

Instance Attribute Summary collapse

#categories ⇒ Object readonly

Returns the value of attribute categories.

Attributes inherited from Base

#biblionet_id, #filepath, #page, #url

Instance Method Summary collapse

#extract_categories(category_page = @page) ⇒ Object
#extract_categories_from(uri = nil) ⇒ Object
#initialize(uri = nil) ⇒ CategoryExtractor constructor

A new instance of CategoryExtractor.

Methods inherited from Base

#decode_text, decode_text, #load_page, #load_page_from_file, #load_page_from_url, #present?, #save_page

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ `CategoryExtractor`

Returns a new instance of CategoryExtractor.

# File 'lib/bookshark/extractors/category_extractor.rb', line 12

# Builds the extractor and, when a page could be loaded for +uri+,
# immediately extracts its categories.
#
# @param uri [Object, nil] location handed to the superclass constructor;
#   extraction is skipped when it is nil
def initialize(uri=nil)
  super(uri)
  # Nothing to parse without a uri or a loaded page.
  return if uri.nil? || @page.nil?

  extract_categories
end

Instance Attribute Details

#categories ⇒ `Object` (readonly)

Returns the value of attribute categories.



10
11
12

# File 'lib/bookshark/extractors/category_extractor.rb', line 10

# Reader for the categories stored by the last call to #extract_categories.
#
# @return [Object] the current value of @categories; nil if it has not been
#   assigned yet
def categories
  @categories
end

Instance Method Details

#extract_categories(category_page = @page) ⇒ `Object`

# File 'lib/bookshark/extractors/category_extractor.rb', line 17

# Parses a category page and builds a hash describing every category link
# found on it, including a parent reference derived from indentation.
#
# The result is stored in @categories and also returned. Each entry is keyed
# by the biblionet id taken from the link href (a String) and holds whatever
# proccess_category returns for the link text, merged with a :parent key.
# When an entry matching @biblionet_id exists, a copy of it is also stored
# under :current with an extra :b_id key.
#
# @param category_page [Object, nil] page content to parse (defaults to @page)
# @return [Hash, nil] the categories hash, or nil when there is no page to
#   parse or present? reports the extracted result as absent
def extract_categories(category_page=@page)
  # Guard on the page actually being parsed, and drop any stale result.
  if category_page.nil?
    @categories = nil
    return nil
  end

  page = Nokogiri::HTML(category_page)
  parent = nil
  previous_indent = nil
  previous_id = nil

  links = page.xpath("//a[@class='menu' and @href[contains(.,'/index/') ]]")

  @categories = links.each_with_object({}) do |link, categories|
    # Extract from href the id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
    biblionet_id = link[:href].split(/\//).last

    # The text before <a> is expected to be a run of space characters; its
    # length is the indent. A link with no preceding node counts as indent 0.
    # TODO: make sure text is only spaces
    sibling = link.previous_sibling
    indent = sibling.nil? ? 0 : sibling.text.size

    # Determine parent-child-sibling relationships based on indent.
    # Indent size seems to be inconsistent, so it is better to compare sizes
    # than to use them directly. Only a deeper indent than any seen so far
    # changes the parent; equal or smaller indents keep the current parent.
    # NOTE(review): a later, shallower entry therefore inherits the deepest
    # parent seen so far -- confirm pages never list such entries.
    if previous_indent.nil?
      previous_indent = indent
    elsif indent > previous_indent
      parent = previous_id
      previous_indent = indent
    end

    previous_id = biblionet_id

    # Extract DdC id and DdC text, then attach the parent reference.
    categories[biblionet_id] = proccess_category(link.text).merge(parent: parent)
  end

  return nil unless present?(@categories)

  # The page may not contain a link for this extractor's own id; only add
  # :current when the entry exists instead of failing on nil.
  current = @categories[@biblionet_id.to_s]
  unless current.nil?
    @categories[:current] = current.clone
    @categories[:current][:b_id] = @biblionet_id
  end

  @categories
end

#extract_categories_from(uri = nil) ⇒ `Object`

# File 'lib/bookshark/extractors/category_extractor.rb', line 58

# Loads the page at +uri+ via load_page and extracts its categories.
#
# @param uri [Object, nil] location passed to load_page
# @return [Object, nil] the result of #extract_categories, or nil when +uri+
#   is nil or no page is available after loading
def extract_categories_from(uri=nil)
  load_page(uri)
  # Skip extraction when there is nothing to parse.
  return if uri.nil? || @page.nil?

  extract_categories
end

Class: Biblionet::Extractors::CategoryExtractor

Instance Attribute Summary collapse

Attributes inherited from Base

Instance Method Summary collapse

Methods inherited from Base

Methods included from FileManager

Constructor Details

#initialize(uri = nil) ⇒ CategoryExtractor

Instance Attribute Details

#categories ⇒ Object (readonly)

Instance Method Details

#extract_categories(category_page = @page) ⇒ Object

#extract_categories_from(uri = nil) ⇒ Object

#initialize(uri = nil) ⇒ `CategoryExtractor`

#categories ⇒ `Object` (readonly)

#extract_categories(category_page = @page) ⇒ `Object`

#extract_categories_from(uri = nil) ⇒ `Object`