Class: Biblionet::Extractors::CategoryExtractor

Inherits:
Base
  • Object
show all
Defined in:
lib/bookshark/extractors/category_extractor.rb

Instance Attribute Summary collapse

Attributes inherited from Base

#biblionet_id, #filepath, #page, #url

Instance Method Summary collapse

Methods inherited from Base

#decode_text, decode_text, #load_page, #load_page_from_file, #load_page_from_url, #present?, #save_page

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ CategoryExtractor

Returns a new instance of CategoryExtractor.



12
13
14
15
# File 'lib/bookshark/extractors/category_extractor.rb', line 12

# Builds the extractor through Base#initialize and, when a uri was supplied
# and a page is available in @page afterwards, extracts the categories
# right away.
#
# @param uri [Object, nil] location handed to the Base constructor; when nil
#   no extraction is attempted.
def initialize(uri=nil)
  super(uri)
  # Nothing to parse without a source uri or without a loaded page.
  return if uri.nil? || @page.nil?

  extract_categories
end

Instance Attribute Details

#categories ⇒ Object (readonly)

Returns the value of attribute categories.



10
11
12
# File 'lib/bookshark/extractors/category_extractor.rb', line 10

# Read-only accessor for the extracted categories.
#
# @return [Object, nil] the current value of @categories; nil when nothing
#   has assigned it yet.
def categories
  @categories
end

Instance Method Details

#extract_categories(category_page = @page) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/bookshark/extractors/category_extractor.rb', line 17

# Extracts the category links found in a biblionet category page and stores
# them in @categories.
#
# Every <a class="menu"> whose href contains '/index/' becomes one entry,
# keyed by the last path segment of its href (the id used by biblionet).
# The entry is whatever proccess_category returns for the link text
# (defined elsewhere; presumably a Hash with the DDC id and text), merged
# with a :parent key holding the biblionet id of the parent category.
#
# When @biblionet_id has an entry, a copy of it is also stored under
# :current together with :b_id => @biblionet_id.
#
# @param category_page [Object, nil] markup handed to Nokogiri::HTML;
#   defaults to @page. When nil, nothing is parsed and @categories keeps
#   its previous value.
# @return [Hash, nil] @categories when present?(@categories) holds,
#   otherwise nil.
def extract_categories(category_page=@page)
  # Guard on the page that is actually parsed, so an explicitly passed page
  # is honoured even when @page itself is nil.
  unless category_page.nil?
    page = Nokogiri::HTML(category_page)
    parent = nil
    previous_indent = nil
    previous_id = nil

    links = page.xpath("//a[@class='menu' and @href[contains(.,'/index/') ]]")

    @categories = links.each_with_object({}) do |link, categories|
      # Extract from href the id used by biblionet. --- DDC url http://biblionet.gr/index/id ---
      biblionet_id = link[:href].split(/\//).last

      # The text right before <a> is expected to be a run of space characters
      # whose length is the indent. A link without a preceding node counts
      # as indent 0.
      # TODO: make sure the text is only spaces
      sibling = link.previous_sibling
      indent = sibling.nil? ? 0 : sibling.text.size

      # Determine parent-child-sibling relationships based on indent.
      # Indent sizes seem to be inconsistent, so it is better to compare
      # them than to rely on their actual values.
      # NOTE(review): a smaller indent never resets parent or
      # previous_indent; presumably the page lists levels in non-decreasing
      # indent order. Confirm against real pages.
      if previous_indent.nil?
        previous_indent = indent
      elsif indent > previous_indent
        parent = previous_id
        previous_indent = indent
      end

      previous_id = biblionet_id

      # Extract DDC id and DDC text, then attach the parent id.
      categories[biblionet_id] = proccess_category(link.text).merge(parent: parent)
    end
  end

  return nil unless present?(@categories)

  # Only add :current when the page really lists the current id; otherwise
  # there is nothing to copy and indexing into nil would raise.
  current = @categories[@biblionet_id.to_s]
  @categories[:current] = current.merge(b_id: @biblionet_id) unless current.nil?
  @categories
end

#extract_categories_from(uri = nil) ⇒ Object



58
59
60
61
# File 'lib/bookshark/extractors/category_extractor.rb', line 58

# Loads the page for +uri+ through load_page and extracts its categories.
#
# @param uri [Object, nil] location passed to load_page; when nil no
#   extraction is attempted.
# @return [Object, nil] the result of extract_categories, or nil when +uri+
#   is nil or no page is available in @page after loading.
def extract_categories_from(uri=nil)
  load_page(uri)
  # Without a source uri or a loaded page there is nothing to extract.
  return nil if uri.nil? || @page.nil?

  extract_categories
end