Class: OAI::Harvester::Harvest

Inherits:
Object
  • Object
show all
Defined in:
lib/oai/harvester/harvest.rb,
lib/oai/harvester/logging.rb

Constant Summary collapse

DIRECTORY_LAYOUT =
"%Y/%m".freeze

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Harvest

Returns a new instance of Harvest.



9
10
11
12
13
14
15
16
17
# File 'lib/oai/harvester/harvest.rb', line 9

# Sets up a harvester.
#
# config    - configuration object; when nil, Config.load supplies one.
# directory - storage directory; when nil, @config.storage is used.
# date      - lower date bound, kept as @from. Frozen in place, so the
#             caller's object becomes immutable as well.
# to        - upper date bound, kept as @until. Frozen in place likewise.
def initialize(config = nil, directory = nil, date = nil, to = nil)
  @config = config || Config.load
  @directory = directory || @config.storage

  # freeze returns its receiver, so assignment and freezing fit on one line.
  @from = date.freeze
  @until = to.freeze

  # Use the libxml parser only when XML::Document has already been loaded.
  @parser = if defined?(XML::Document)
    'libxml'
  else
    'rexml'
  end
end

Instance Method Details

#orig_call ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/oai/harvester/logging.rb', line 8

# Downloads every record the provider at +url+ returns for +opts+ and
# writes them, wrapped in a single <records> element, to a gzip-compressed
# temporary file.
#
# url  - base URL handed to OAI::Client.new.
# opts - options for the first list_records request. A copy is passed to
#        the client, so the caller's hash is left alone.
#
# Returns [file, records]: the closed Tempfile and the number of records
# written to it.
#
# Any exception raised while fetching or writing propagates to the caller.
# In that case the partially written temp file is deleted, because no
# reference to it is returned.
def call(url, opts)
  # Preserve original options
  options = opts.dup

  records = 0
  client = OAI::Client.new(url, :parser => @parser)
  # The reply is not used, but the request is still issued before any
  # temp file exists, so a failure here leaves nothing behind.
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  finished = false
  begin
    gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
    gz << "<records>"

    # Get a full response by iterating with the resumption tokens.
    # Not very Ruby like.  Should fix OAI::Client to handle resumption
    # tokens internally.
    response = client.list_records(options)
    loop do
      response.each do |rec|
        gz << rec._source
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      token = response.resumption_token
      break if !token || token.empty?

      puts "\nresumption token received, continuing" if @interactive
      response = client.list_records(:resumption_token => token)
    end

    gz << "</records>"
    finished = true
  ensure
    gz.close
    file.close
    # Only a complete download is handed back; drop anything partial.
    file.unlink unless finished
  end

  [file, records]
end

#orig_harvest ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/oai/harvester/logging.rb', line 7

# Harvests a single configured site and files the result under the storage
# directory.
#
# site - key into @config.sites (and, when present, @config.layouts).
#
# The request runs up to @until (or the current time when @until is not
# set) and starts at @from, falling back to earliest(url) when no from
# date was given. On success the site's 'last' entry is set to the harvest
# time.
#
# OAI::NoMatchException is treated as "nothing new". Any other
# OAI::Exception is re-raised unless running interactively, in which case
# only its message is printed.
def harvest(site)
  opts = build_options_hash(@config.sites[site])

  # getutc returns a new UTC Time. The mutating Time#utc would raise
  # FrozenError here, because initialize froze @until, and would otherwise
  # alter the caller's object.
  harvest_time = @until ? @until.to_time.getutc : Time.now.utc

  # LOW granularity gets day-only dates; anything else gets full
  # xmlschema timestamps.
  day_only = OAI::Const::Granularity::LOW == granularity(opts[:url])
  format_date = lambda do |time|
    day_only ? time.strftime("%Y-%m-%d") : time.xmlschema
  end
  opts[:until] = format_date.call(harvest_time)
  opts[:from] = format_date.call(@from) if @from

  # Allow a from date to be passed in
  opts[:from] ||= earliest(opts[:url])
  opts.delete(:set) if 'all' == opts[:set]

  begin
    # Connect, and download
    url = opts.delete(:url)
    file, _records = call(url, opts)

    # Move document to storage directory if configured
    if @directory
      layout = @config.layouts ? @config.layouts[site] : nil
      dir = File.join(@directory, date_based_directory(harvest_time, layout))
      FileUtils.mkdir_p(dir)
      basename = "#{site}-#{filename(Time.parse(opts[:from]), harvest_time)}.xml.gz"
      FileUtils.mv(file.path, File.join(dir, basename))
    else
      puts "no configured destination for temp file" if @interactive
    end
    @config.sites[site]['last'] = harvest_time
  rescue OAI::NoMatchException
    puts "No new records available" if @interactive
  rescue OAI::Exception => ex
    raise ex unless @interactive
    puts ex.message
  end
end

#orig_init ⇒ Harvest

Returns a new instance of Harvest.

Returns:

  • (Harvest)

    a new instance of Harvest



9
10
11
12
13
14
15
16
17
# File 'lib/oai/harvester/logging.rb', line 9

# Sets up a harvester.
#
# config    - configuration object; when nil, Config.load supplies one.
# directory - storage directory; when nil, @config.storage is used.
# date      - lower date bound, kept as @from. Frozen in place, so the
#             caller's object becomes immutable as well.
# to        - upper date bound, kept as @until. Frozen in place likewise.
def initialize(config = nil, directory = nil, date = nil, to = nil)
  @config = config || Config.load
  @directory = directory || @config.storage

  # freeze returns its receiver, so assignment and freezing fit on one line.
  @from = date.freeze
  @until = to.freeze

  # Use the libxml parser only when XML::Document has already been loaded.
  @parser = if defined?(XML::Document)
    'libxml'
  else
    'rexml'
  end
end

#orig_start ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/oai/harvester/logging.rb', line 6

# Harvests each of the given sites in turn, then saves the configuration.
#
# sites       - collection of site keys; when not given, the keys of
#               @config.sites are used. If reading those keys raises a
#               StandardError, an empty Hash is used and nothing is harvested.
# interactive - stored in @interactive for the other methods to consult.
#
# @config.save runs whether or not a harvest raises.
def start(sites = nil, interactive = false)
  @interactive = interactive

  sites ||= begin
    @config.sites.keys
  rescue StandardError
    {}
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end

#start(sites = nil, interactive = false) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
# File 'lib/oai/harvester/harvest.rb', line 19

# Harvests each of the given sites in turn, then saves the configuration.
#
# sites       - collection of site keys; when not given, the keys of
#               @config.sites are used. If reading those keys raises a
#               StandardError, an empty Hash is used and nothing is harvested.
# interactive - stored in @interactive for the other methods to consult.
#
# @config.save runs whether or not a harvest raises.
def start(sites = nil, interactive = false)
  @interactive = interactive

  sites ||= begin
    @config.sites.keys
  rescue StandardError
    {}
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end