Class: OAI::Harvester::Harvest

Inherits:
Object
  • Object
show all
Defined in:
lib/oai/harvester/harvest.rb,
lib/oai/harvester/logging.rb

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ Harvest

Returns a new instance of Harvest.



9
10
11
12
13
14
15
# File 'lib/oai/harvester/harvest.rb', line 9

# Sets up a harvester.
#
# @param config [Object, nil] harvester configuration; when omitted the
#   result of Config.load is used
# @param directory [String, nil] storage directory; falls back to
#   config.storage
# @param date [Object, nil] optional "from" date for the harvest; the
#   object passed in is frozen
def initialize(config = nil, directory = nil, date = nil)
  @config = config
  @config ||= Config.load
  @directory = directory
  @directory ||= @config.storage
  @from = date.freeze
  # Use libxml when XML::Document has been loaded, REXML otherwise.
  @parser = if defined?(XML::Document)
              'libxml'
            else
              'rexml'
            end
end

Instance Method Details

#orig_call ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/oai/harvester/logging.rb', line 8

# Downloads every record the provider at +url+ returns for +opts+ and
# writes them, gzip-compressed, to a temporary file inside a single
# <records> root element.
#
# @param url [String] provider base URL handed to OAI::Client
# @param opts [Hash] ListRecords options; duplicated, so the caller's
#   hash is left untouched
# @return [Array] two elements: the (closed) Tempfile holding the gzipped
#   XML, and the number of records written
# @raise whatever the client raises; in that case the temporary file is
#   removed, since the caller never receives it
def call(url, opts)
  # Preserve original options
  request = opts.dup

  records = 0
  client = OAI::Client.new(url, :parser => @parser)
  # The response is not used; the request itself is kept from the original
  # code (NOTE(review): presumably an early connectivity check - confirm).
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  complete = false
  begin
    # The declaration must start with exactly "<?xml"; "<? xml" is not
    # well-formed and makes XML parsers reject the whole document.
    gz << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
    gz << "<records>"

    # Fetch the first page with the caller's options, then keep following
    # resumption tokens until the provider stops handing one out.
    loop do
      response = client.list_records(request)
      get_records(response.doc).each do |rec|
        gz << rec
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      token = response.resumption_token
      break if !token || token.empty?

      puts "\nresumption token recieved, continuing" if @interactive
      request = { :resumption_token => token }
    end

    gz << "</records>"
    complete = true
  ensure
    gz.close
    # Keep the file on disk only when it is going to be returned.
    complete ? file.close : file.close!
  end

  [file, records]
end

#orig_harvest ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/oai/harvester/logging.rb', line 7

# Harvests one configured site: builds the request options, downloads the
# records via #call, moves the resulting file into the storage directory
# and stamps the site's 'last' harvest time in the config.
#
# @param site [Object] key into @config.sites; also used as the file name
#   prefix
# @raise any error from the download or the file move, except that in
#   interactive mode an error whose +code+ is "noRecordsMatch" is reported
#   on stdout instead of being raised
def harvest(site)
  opts = build_options_hash(@config.sites[site])
  harvest_time = Time.now.utc

  # Format the date bounds to match the granularity reported for the URL.
  if "YYYY-MM-DD" == granularity(opts[:url])
    opts[:until] = harvest_time.strftime("%Y-%m-%d")
    opts[:from] = @from.strftime("%Y-%m-%d") if @from
  else
    opts[:until] = harvest_time.xmlschema
    opts[:from] = @from.xmlschema if @from
  end

  # No from date available so far: fall back to earliest(url).
  opts[:from] ||= earliest(opts[:url])
  opts.delete(:set) if 'all' == opts[:set]

  begin
    # Connect, and download
    file, _records = call(opts.delete(:url), opts)

    # Move document to storage directory
    dir = File.join(@directory, date_based_directory(harvest_time))
    FileUtils.mkdir_p dir
    name = "#{site}-#{filename(Time.parse(opts[:from]), harvest_time)}.xml.gz"
    FileUtils.mv(file.path, File.join(dir, name))
    @config.sites[site]['last'] = harvest_time
  rescue => e
    # Errors without a code are passed straight through.
    raise unless e.respond_to?(:code)
    # Only an interactive run may swallow an error, and only the
    # "noRecordsMatch" one. The previous `not a || b` parsed as
    # `not (a || b)` and silently swallowed every other coded error.
    raise if !@interactive || "noRecordsMatch" != e.code
    puts "No new records available"
  end
end

#orig_init ⇒ Harvest

Returns a new instance of Harvest.

Returns:

  • (Harvest)

    a new instance of Harvest



9
10
11
12
13
14
15
# File 'lib/oai/harvester/logging.rb', line 9

# Sets up a harvester.
#
# @param config [Object, nil] harvester configuration; when omitted the
#   result of Config.load is used
# @param directory [String, nil] storage directory; falls back to
#   config.storage
# @param date [Object, nil] optional "from" date for the harvest; the
#   object passed in is frozen
def initialize(config = nil, directory = nil, date = nil)
  @config = config
  @config ||= Config.load
  @directory = directory
  @directory ||= @config.storage
  @from = date.freeze
  # Use libxml when XML::Document has been loaded, REXML otherwise.
  @parser = if defined?(XML::Document)
              'libxml'
            else
              'rexml'
            end
end

#orig_start ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/oai/harvester/logging.rb', line 6

# Harvests each of the given sites in turn and saves the configuration
# afterwards, even when one of the harvests raised.
#
# @param sites [Enumerable, nil] site keys to harvest; defaults to every
#   key in @config.sites (or nothing if those cannot be read)
# @param interactive [Boolean] stored in @interactive for the other methods
def start(sites = nil, interactive = false)
  @interactive = interactive

  sites ||= begin
    @config.sites.keys
  rescue StandardError
    {}
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end

#start(sites = nil, interactive = false) ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
# File 'lib/oai/harvester/harvest.rb', line 17

# Harvests each of the given sites in turn and saves the configuration
# afterwards, even when one of the harvests raised.
#
# @param sites [Enumerable, nil] site keys to harvest; defaults to every
#   key in @config.sites (or nothing if those cannot be read)
# @param interactive [Boolean] stored in @interactive for the other methods
def start(sites = nil, interactive = false)
  @interactive = interactive

  sites ||= begin
    @config.sites.keys
  rescue StandardError
    {}
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end