Class: TaliaUtil::ImportJobHelper

Inherits:
Object
  • Object
show all
Includes:
IoHelper
Defined in:
lib/talia_util/import_job_helper.rb

Overview

Helper methods that will be used during import job runs. The import jobs may use the following environment parameters:

base_url

The base URL or directory. This will be prefixed to all urls, or if it is a local directory, it will be made the current directory during the import

index

If given, the importer will try to read this document. While this will still support the old-style “hyper” format with sigla, it should usually contain a main element called “index” followed by “url” entries.

xml

URL of an XML file to import. This is incompatible with the “index” option. If neither “xml” nor “index” is given, the class will try to read the XML data from STDIN

importer

Name of the importer class to be used for the data. Uses the default class if not given

reset_store
  • If this is set, the data store will be cleared before the import

user

Username for HTTP authentication, if required

pass

Password for HTTP authentication, if required

callback

Name of a class. If given, an object of that class is created and the import will call its #before_import and #after_import methods (each only if the object responds to it); the methods are called without arguments. If the object has a progressor= writer, it is handed the progressor of the import so that it can report its own progress steps

extension

Only used with index files; file extension to use

duplicates

How to deal with elements that already exist in the datastore. This may be set to one of the following options (default: :skip):

  • :add - Database fields will be updated and the system will add semantic properties as additional values, without removing any of the existing semantic relations. Example: If the data store already contains a title for an element, and the import file contains another for that element, the element will have two titles after the import. The system will not check for duplicates. Files will always be imported in addition to the existing ones.

  • :update - Database fields will be updated, and semantic properties will be overwritten with the new value(s). Semantic properties that are not included in the import data will be left untouched. In the example above, the element would only contain the new title. If the element also contained author information, and no author information was in the import file, the existing author information will be untouched. Existing files are replaced if the import contains new files

  • :overwrite - Database fields will be updated. All semantic data will be deleted before the import. Files are always removed.

  • :skip - If an element already exists, the import will be skipped.

trace

Enable tracing output for errors. (By default, this takes the rake task’s setting if possible)

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from IoHelper

#base_for, #file_url, #open_from_url, #open_generic

Constructor Details

#initialize(message_stream = STDOUT, progressor = TaliaCore::BackgroundJobs::Job) ⇒ ImportJobHelper

The message_stream will be used for printing progress messages



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# File 'lib/talia_util/import_job_helper.rb', line 54

# Configures an import run from the two arguments and the process
# environment, then loads the input data (see #init_data).
#
# message_stream:: object that receives the setup messages via #puts
# progressor::     stored as-is and passed on to the callback object when
#                  that object has a progressor= writer
#
# The order of the steps matters: the working directory is switched to
# base_url (when it names a directory) *before* the input data is read.
def initialize(message_stream = STDOUT, progressor = TaliaCore::BackgroundJobs::Job)
  @message_stream = message_stream
  @progressor = progressor

  # Tracing: the rake setting when Rake is loaded, otherwise the environment.
  rake_trace = defined?(Rake) ? Rake.application.options.trace : false
  @trace = rake_trace || ENV['trace']

  @importer = ENV['importer'] || 'TaliaCore::ActiveSourceParts::Xml::SourceReader'

  if ENV['duplicates']
    @duplicates = ENV['duplicates'].to_sym
  end

  # Credentials are only built when a user name was supplied.
  unless ENV['user'].blank?
    @credentials = { :http_basic_authentication => [ENV['user'], ENV['pass']] }
  end

  both_inputs_given = ENV['xml'] && ENV['index']
  assit(!both_inputs_given, 'Not both xml and index parameters allowed')

  if ENV['reset_store']
    @reset = %w[yes true].include?(ENV['reset_store'].downcase)
  end

  configured_base = ENV['base_url']
  @base_url = configured_base.blank? ? '' : configured_base
  if base_url && File.directory?(base_url)
    message_stream.puts "Setting directory to #{base_url}"
    FileUtils.cd(base_url)
  end

  init_data

  callback_name = ENV['callback']
  @callback = callback_name.classify.constantize.new unless callback_name.blank?

  if callback
    message_stream.puts "Registered callback (#{callback.class.name}) - (#{callback.respond_to?(:before_import)}|#{callback.respond_to?(:after_import)})"
    callback.progressor = progressor if callback.respond_to?(:'progressor=')
  end
end

Instance Attribute Details

#base_url ⇒ Object (readonly)

Returns the value of attribute base_url.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @base_url. The constructor sets it to ENV['base_url'], or to
# the empty string when that variable is blank.
def base_url
  @base_url
end

#callback ⇒ Object (readonly)

Returns the value of attribute callback.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @callback: the object the constructor instantiates from the
# class named in ENV['callback']. Stays nil when that variable is blank.
def callback
  @callback
end

#credentials ⇒ Object (readonly)

Returns the value of attribute credentials.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @credentials: a hash with the single key
# :http_basic_authentication holding [ENV['user'], ENV['pass']].
# Stays nil when ENV['user'] is blank.
def credentials
  @credentials
end

#duplicates ⇒ Object (readonly)

Returns the value of attribute duplicates.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @duplicates: ENV['duplicates'] converted to a symbol by the
# constructor. Stays nil when the variable is not set.
def duplicates
  @duplicates
end

#importer ⇒ Object (readonly)

Returns the value of attribute importer.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @importer: the importer class *name* (a string) taken from
# ENV['importer'], defaulting to
# 'TaliaCore::ActiveSourceParts::Xml::SourceReader'.
def importer
  @importer
end

#index_data ⇒ Object (readonly)

Returns the value of attribute index_data.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @index_data: the content read from the index file by
# #init_data. Only assigned when ENV['index'] is not blank.
def index_data
  @index_data
end

#message_stream ⇒ Object (readonly)

Returns the value of attribute message_stream.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @message_stream: the stream object passed to the constructor
# (STDOUT by default).
def message_stream
  @message_stream
end

#progressor ⇒ Object (readonly)

Returns the value of attribute progressor.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @progressor: the progressor passed to the constructor
# (TaliaCore::BackgroundJobs::Job by default).
def progressor
  @progressor
end

#reset ⇒ Object (readonly)

Returns the value of attribute reset.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @reset: true when ENV['reset_store'] is 'yes' or 'true'
# (compared after downcasing), false for any other value, and nil when the
# variable is not set.
def reset
  @reset
end

#trace ⇒ Object (readonly)

Returns the value of attribute trace.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @trace: the rake trace option when Rake is loaded and the
# option is set, otherwise the value of ENV['trace'].
def trace
  @trace
end

#xml_data ⇒ Object (readonly)

Returns the value of attribute xml_data.



51
52
53
# File 'lib/talia_util/import_job_helper.rb', line 51

# Reader for @xml_data: the XML text read by #init_data, either from the
# location given in ENV['xml'] or from STDIN. Only assigned when
# ENV['index'] is blank.
def xml_data
  @xml_data
end

Instance Method Details

#do_import ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/talia_util/import_job_helper.rb', line 95

# Runs the whole import: optional data store reset, the :before_import
# callback, the import itself (from the index when index data was loaded,
# otherwise from the single XML document), a report of the collected
# errors, and finally the :after_import callback.
#
# Returns whatever the :after_import callback run returns.
def do_import
  if reset
    TaliaUtil::Util.full_reset
    puts "Data Store has been completely reset"
  end

  # Filled in by the import calls below.
  errors = []
  run_callback(:before_import)

  if index_data
    import_from_index(errors)
  else
    puts "Importing from single data file."
    TaliaCore::ActiveSource.create_from_xml(
      xml_data,
      :progressor => progressor,
      :reader => importer,
      :base_file_uri => @true_root,
      :errors => errors,
      :duplicates => duplicates
    )
  end

  unless errors.empty?
    puts "WARNING: #{errors.size} errors during import:"
    errors.each { |problem| print_error(problem) }
  end

  run_callback(:after_import)
end

#import_from_index(errors) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/talia_util/import_job_helper.rb', line 120

# Imports every document listed in the index data.
#
# The index is parsed with Hpricot. When its root element is named 'sigla'
# (the old "hyper" format) the 'siglum' elements are used, otherwise the
# 'url' elements. The text of each element, followed by ENV['extension'],
# is turned into a URL with #make_url_from and read with the importer
# class. All attributes collected that way are then written in one go.
#
# errors:: collection handed to ActiveSource.create_multi_from via the
#          :errors option
#
# A document that fails to load is reported on the message stream and
# skipped; the remaining documents are still processed.
def import_from_index(errors)
  doc = Hpricot.XML(index_data)
  hyper_format = (doc.root.name == 'sigla')
  elements = hyper_format ? (doc/:siglum) : (doc/:url)
  puts "Import from Index file, #{elements.size} elements"
  # Read the Attributes from the urls
  source_attributes = []
  my_importer = importer.classify.constantize
  # The same for every element, so look it up only once.
  extension = ENV['extension']
  progressor.run_with_progress('Reading w/ index', elements.size) do |prog|
    elements.each do |element|
      url = make_url_from("#{element.inner_text}#{extension}")
      begin
        # Append in place instead of building a new array per document.
        source_attributes.concat(my_importer.sources_from_url(url, credentials))
      rescue StandardError => e
        # Only StandardError: Interrupt (Ctrl-C) and SystemExit must still be
        # able to abort the import instead of being logged and skipped.
        message_stream.puts "Problem importing #{url} (#{e.message})"
        message_stream.puts e.backtrace
      end
      prog.inc
    end
  end
  # Write the data
  TaliaCore::ActiveSource.progressor = progressor
  TaliaCore::ActiveSource.create_multi_from(source_attributes, :errors => errors, :duplicates => duplicates)
end

#init_data ⇒ Object



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/talia_util/import_job_helper.rb', line 79

# Loads the input for the import, depending on the environment:
#
# * ENV['index'] set: the index file is read into @index_data.
# * otherwise, ENV['xml'] set: that file is read into @xml_data, and
#   @true_root is set to base_for(<the resolved location>).
# * neither set: @xml_data is read from STDIN.
#
# Locations are resolved with #make_url_from in both branches, i.e. an
# existing local path is used as it is and anything else is prefixed with
# base_url.
def init_data
  if ENV['index'].blank?
    @xml_data = if ENV['xml'].blank?
      STDIN.read
    else
      # make_url_from relies on File.exist?; the File.exists? alias used
      # here before is deprecated and no longer available in Ruby 3.2+.
      xml_url = make_url_from(ENV['xml'])
      @true_root = base_for(xml_url)
      open_generic(xml_url, credentials) { |io| io.read }
    end
  else
    index = make_url_from(ENV['index'])
    @index_data = open_generic(index, credentials) { |io| io.read }
  end
end

#make_url_from(url) ⇒ Object



146
147
148
149
# File 'lib/talia_util/import_job_helper.rb', line 146

# Resolves a location given in the import parameters: a path that exists on
# the local file system is returned untouched, anything else is returned
# with base_url put in front of it.
def make_url_from(url)
  if File.exist?(url)
    url
  else
    "#{base_url}#{url}"
  end
end


115
116
117
118
# File 'lib/talia_util/import_job_helper.rb', line 115

# Prints the message of the given error and, only when tracing is enabled,
# its backtrace as well.
def print_error(e)
  puts e.message
  return unless trace

  puts e.backtrace
end

#run_callback(name) ⇒ Object



151
152
153
154
155
156
# File 'lib/talia_util/import_job_helper.rb', line 151

# Invokes the method +name+ on the callback object, announcing it first.
# Does nothing (and returns nil) when there is no callback object or when
# it does not respond to +name+; otherwise returns the callback's result.
def run_callback(name)
  return unless callback && callback.respond_to?(name)

  puts "Running callback #{name}"
  callback.send(name)
end