Class: Biblionet::Extractors::Base

Inherits:

Object

Object
Biblionet::Extractors::Base

show all

Includes: FileManager

Defined in: lib/bookshark/extractors/base.rb

Direct Known Subclasses

AuthorExtractor, BibliographicalBookExtractor, BookExtractor, CategoryExtractor, PublisherExtractor

Instance Attribute Summary collapse

#biblionet_id ⇒ Object readonly

Returns the value of attribute biblionet_id.
#filepath ⇒ Object

Returns the value of attribute filepath.
#page ⇒ Object readonly

Returns the value of attribute page.
#url ⇒ Object

Returns the value of attribute url.

Class Method Summary collapse

.decode_text(encoded_text) ⇒ Object

Instance Method Summary collapse

#decode_text(encoded_text) ⇒ Object

Decodes text with escaped html entities and returns the decoded text.
#initialize(uri = nil) ⇒ Base constructor

Initializes the Base class.
#load_page(uri = nil) ⇒ Object

Loads a page from the web or from local file storage depending on passed argument.
#load_page_from_file(filepath) ⇒ Object

Reads a page from the local file system.
#load_page_from_url(url) ⇒ Object

Downloads a page from the web.
#present?(value) ⇒ Boolean
#save_page(path) ⇒ Object

Saves page to file.

Methods included from FileManager

#list_directories, #list_files, #save_to

Constructor Details

#initialize(uri = nil) ⇒ `Base`

Initializes the Base class. Without arguments nothing happens. Otherwise loads a page by url or file.

Attributes

uri - It can be a url or a path/to/file.ext on local storage.



31
32
33

# File 'lib/bookshark/extractors/base.rb', line 31

# Builds the extractor and hands +uri+ straight to #load_page.
#
# uri - Optional URL or path/to/file.ext on local storage (defaults to nil).
def initialize(uri = nil)
  load_page(uri)
end

Instance Attribute Details

#biblionet_id ⇒ `Object` (readonly)

Returns the value of attribute biblionet_id.



23
24
25

# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for the @biblionet_id instance variable.
# Returns nil if the variable has never been assigned.
def biblionet_id
  @biblionet_id
end

#filepath ⇒ `Object`

Returns the value of attribute filepath.



23
24
25

# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for the @filepath instance variable.
# Returns nil if the variable has never been assigned.
def filepath
  @filepath
end

#page ⇒ `Object` (readonly)

Returns the value of attribute page.



23
24
25

# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for the @page instance variable.
# Returns nil if the variable has never been assigned.
def page
  @page
end

#url ⇒ `Object`

Returns the value of attribute url.



23
24
25

# File 'lib/bookshark/extractors/base.rb', line 23

# Reader for the @url instance variable.
# Returns nil if the variable has never been assigned.
def url
  @url
end

Class Method Details

.decode_text(encoded_text) ⇒ `Object`

# File 'lib/bookshark/extractors/base.rb', line 147

# Decodes text with escaped html entities by handing it to a fresh
# HTMLEntities instance.
#
# encoded_text - The text which contains encoded entities.
#
# Returns whatever HTMLEntities#decode returns for +encoded_text+.
def self.decode_text(encoded_text)
  HTMLEntities.new.decode(encoded_text)
end

Instance Method Details

#decode_text(encoded_text) ⇒ `Object`

Decodes text with escaped html entities and returns the decoded text.

Params:

encoded_text: the text which contains encoded entities



143
144
145

# File 'lib/bookshark/extractors/base.rb', line 143

# Instance-level convenience wrapper: forwards +encoded_text+ to the
# class-level decode_text of the receiver's class and returns its result.
#
# encoded_text - The text which contains encoded entities.
def decode_text(encoded_text)
  owner = self.class
  owner.decode_text(encoded_text)
end

#load_page(uri = nil) ⇒ `Object`

Loads a page from the web or from local file storage depending on passed argument.

Attributes

uri - It can be a URL (starting with http/https) or a path/to/file.ext on local storage.

# File 'lib/bookshark/extractors/base.rb', line 41

# Loads a page from the web or from local file storage, depending on +uri+.
#
# uri - A URL or a path/to/file.ext on local storage. Only a string that is,
#       in its entirety, an http/https URL is treated as a URL; anything else
#       is treated as a file path. When nil, nothing is loaded.
#
# Returns the result of the selected loader, or nil when +uri+ is nil.
def load_page(uri = nil)
  return if uri.nil?

  # URI::DEFAULT_PARSER.make_regexp replaces the obsolete URI::regexp (which
  # warns in verbose mode). \A and \z force the whole string to match.
  http_url = /\A#{URI::DEFAULT_PARSER.make_regexp(%w[http https])}\z/

  if uri.match(http_url)
    load_page_from_url(uri)
  else
    load_page_from_file(uri)
  end
end

#load_page_from_file(filepath) ⇒ `Object`

Reads a page from the local file system.

Attributes

filepath - The path to target file which will be read.

# File 'lib/bookshark/extractors/base.rb', line 96

# Reads a page from the local file system.
#
# filepath - The path to the target file which will be read.
#
# Stores +filepath+ in @filepath, the last number found in the path in
# @biblionet_id, and the file contents in @page.
#
# Returns the file contents, or nil when anything goes wrong (the error is
# printed to standard output and swallowed).
def load_page_from_file(filepath)
  @filepath = filepath
  @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil?
  # Block-form File.open closes the handle as soon as the read finishes and,
  # unlike Kernel#open, never interprets the argument as a command or URL.
  @page = File.open(filepath) { |file| file.read }
rescue StandardError => e
  puts e
end

#load_page_from_url(url) ⇒ `Object`

Downloads a page from the web.

Attributes

url - The url of webpage to download.

# File 'lib/bookshark/extractors/base.rb', line 55

# Downloads a page from the web.
#
# url         - The url of the webpage to download.
# max_retries - How many times an unexpected error is retried before giving
#               up (default: 5). Each retry is preceded by a 2 minute sleep.
#
# Stores +url+ in @url, the last number found in the url in @biblionet_id and
# the downloaded body, with every whitespace run collapsed to a single space,
# in @page. @page is reset to nil when the page is reported as empty.
#
# All errors are printed and swallowed; nothing is raised to the caller.
# Returns nil once the retries for an unexpected error are exhausted.
def load_page_from_url(url, max_retries = 5)
  failures = 0

  begin
    @url = url
    @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.

    pp "Downloading page: #{url}"

    # Rejects responses whose announced length is 1024 or less; an unknown
    # (nil) length is let through.
    reject_tiny_page = lambda do |content_length|
      raise EmptyPageError.new(url, content_length) unless content_length.nil? || content_length > 1024
    end

    # NOTE(review): opening URLs through Kernel#open relies on open-uri and is
    # not available on Ruby 3.0+; confirm the minimum supported Ruby before
    # switching to URI.open.
    open(url, :content_length_proc => reject_tiny_page) do |f|
      @page = f.read.gsub(/\s+/, " ")
    end
  rescue Errno::ENOENT => e
    pp "Page: #{url} NOT FOUND."
    pp e
  rescue EmptyPageError => e
    pp "Page: #{url} is EMPTY."
    pp e
    @page = nil
  rescue OpenURI::HTTPError => e
    pp e
    pp e.io.status
  rescue StandardError => e
    # Bounded retry: a permanent failure must not loop forever.
    failures += 1
    if failures > max_retries
      pp "Generic error #{e.class}. Giving up after #{max_retries} retries."
      pp e
      return nil
    end

    pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
    pp e
    sleep(120)
    retry
  end
end

#present?(value) ⇒ `Boolean`

Returns:

(Boolean)



153
154
155

# File 'lib/bookshark/extractors/base.rb', line 153

# Tells whether +value+ carries any content.
#
# value - Any object.
#
# Returns false for nil, for false and for anything whose #empty? is true
# (e.g. "" or []); returns true otherwise. Objects that do not respond to
# #empty? are judged by their truthiness instead of raising NoMethodError.
def present?(value)
  return !value.empty? if value.respond_to?(:empty?)

  !!value
end

#save_page(path) ⇒ `Object`

Saves page to file.

Attributes

path - The path to the file (including the filename) where the content will be saved.

# File 'lib/bookshark/extractors/base.rb', line 132

# Saves the currently loaded page to a file.
#
# path - The path to the file (including filename) that receives the content.
#
# Hands @page to save_to first and only then prints the confirmation line.
def save_page(path)
  contents = @page
  save_to(path, contents)
  pp("Saving page: #{path}")
end

Class: Biblionet::Extractors::Base

Direct Known Subclasses

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from FileManager

Constructor Details

#initialize(uri = nil) ⇒ Base

Attributes

Instance Attribute Details

#biblionet_id ⇒ Object (readonly)

#filepath ⇒ Object

#page ⇒ Object (readonly)

#url ⇒ Object

Class Method Details

.decode_text(encoded_text) ⇒ Object

Instance Method Details

#decode_text(encoded_text) ⇒ Object

Params:

#load_page(uri = nil) ⇒ Object

Attributes

#load_page_from_file(filepath) ⇒ Object

Attributes

#load_page_from_url(url) ⇒ Object

Attributes

#present?(value) ⇒ Boolean

#save_page(path) ⇒ Object

Attributes

#initialize(uri = nil) ⇒ `Base`

#biblionet_id ⇒ `Object` (readonly)

#filepath ⇒ `Object`

#page ⇒ `Object` (readonly)

#url ⇒ `Object`

.decode_text(encoded_text) ⇒ `Object`

#decode_text(encoded_text) ⇒ `Object`

#load_page(uri = nil) ⇒ `Object`

#load_page_from_file(filepath) ⇒ `Object`

#load_page_from_url(url) ⇒ `Object`

#present?(value) ⇒ `Boolean`

#save_page(path) ⇒ `Object`