Class: DirCrawl

Inherits:
Object
  • Object
show all
Defined in:
lib/dircrawl.rb

Instance Method Summary collapse

Constructor Details

#initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args) ⇒ DirCrawl

Returns a new instance of DirCrawl.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/dircrawl.rb', line 8

# Build a crawler and immediately crawl +path+.
#
# @param path [String] root directory to crawl; also the prefix that is
#   swapped for +output_dir+ when output locations are computed
# @param output_dir [String] root directory for the generated .json files
# @param ignore_includes [String] substring checked against each file name
#   to decide whether the file is skipped
# @param save [Object] stored in @save; not read by any of the methods
#   visible alongside this one
# @param process_block [#call] invoked with each file path followed by +args+
# @param include_block [#call, Object] invoked once, with no arguments,
#   before crawling; anything that does not respond to +call+ (nil, "") is
#   ignored rather than raising NoMethodError
# @param extras_block [#call, String] optional callable used during crawling
# @param failure_mode [String] "debug" or "log"; anything else only reports
#   the error status
# @param cm_hash [Hash, nil] optional crawler-manager settings, read through
#   the :crawler_manager_url and :selector_id keys
# @param args [Array] extra arguments forwarded to +process_block+
def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
  @path = path
  @output_dir = output_dir
  @ignore_includes = ignore_includes

  # Callers use "" as the "no block" placeholder for extras_block, so a
  # non-callable include_block is treated as "nothing to run" as well.
  include_block.call if include_block.respond_to?(:call)

  @process_block = process_block
  @extras_block = extras_block
  @failure_mode = failure_mode
  @output = []
  @save = save

  # Crawler manager info is optional; without it results are batched locally.
  if cm_hash
    @cm_url = cm_hash[:crawler_manager_url]
    @selector_id = cm_hash[:selector_id]
  end

  # Crawling starts as part of construction.
  crawl_dir(path, *args)
end

Instance Method Details

#crawl_dir(dir, *args) ⇒ Object

Crawl dir and call block for each file



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/dircrawl.rb', line 45

# Recursively walk +dir+, process every file that is not ignored, then report
# and persist each result.
#
# For each regular entry the output location comes from get_write_dir. If
# that file already exists its contents are reused; otherwise the optional
# extras block runs and @process_block is called with the source path and
# +args+. The result is parsed as JSON, handed to report_results, and written
# to the output location.
#
# A file whose processing raises is reported through report_status, handled
# according to @failure_mode ("debug" opens a pry session, "log" appends the
# file name to <output_dir>/error_log.txt), and then skipped so the crawl
# continues with the next entry.
#
# @param dir [String] directory to walk
# @param args [Array] extra arguments forwarded to @process_block
def crawl_dir(dir, *args)
  Dir.foreach(dir) do |file|
    # Skip the current/parent directory entries.
    next if file == '.' || file == '..'

    source = dir + "/" + file

    # Recurse into directories.
    if File.directory?(source)
      report_status("Going to next directory: " + source)
      crawl_dir(source, *args)
      next
    end

    # A nil or empty filter ignores nothing. Without this check an empty
    # filter would skip every file, because any string includes "".
    filter = @ignore_includes
    filtering = !(filter.nil? || filter == "")
    next if filtering && file.include?(filter)

    # Create the directory that will actually hold the output file.
    target = get_write_dir(dir, file)
    create_write_dirs(File.dirname(target))

    begin
      if File.exist?(target)
        # Already processed on an earlier run: reuse the stored result.
        puts "Processed file exists, skipping"
        puts " " + source
        processed = File.read(target)
      else
        # The extras block is optional; "" or nil means there is none.
        @extras_block.call(@output_dir + "/") if @extras_block.respond_to?(:call)
        processed = @process_block.call(source, *args)
      end
    rescue SignalException, SystemExit
      # Let Ctrl-C and explicit exits stop the crawl.
      raise
    rescue Exception => e # deliberately broad: one bad file must not stop the crawl
      report_status("Error on file " + file + ": " + e.to_s)
      if @failure_mode == "debug"
        binding.pry
      elsif @failure_mode == "log"
        IO.write(@output_dir + "/error_log.txt", file + "\n", mode: 'a')
      end
      # There is no result to report or write for a failed file.
      next
    end

    report_results([JSON.parse(processed)], source)

    # Persist the result so later runs can skip this file.
    File.write(target, processed)
  end
end

#create_write_dirs(dir) ⇒ Object

Create each directory along the given path if it doesn’t already exist



34
35
36
37
38
39
40
41
42
# File 'lib/dircrawl.rb', line 34

# Create every missing directory along +dir+, one path segment at a time.
#
# A path starting with "/" is built from the filesystem root; any other path
# is built from the current working directory. Empty segments (leading,
# trailing, or doubled slashes) are ignored.
#
# @param dir [String] slash-separated directory path
# @return [Array<String>] the non-empty path segments that were walked
def create_write_dirs(dir)
  segments = dir.split("/").reject(&:empty?)

  # Anchor relative paths at "." so they are not silently re-rooted at "/".
  built = dir.start_with?("/") ? "" : "."

  segments.each do |segment|
    built = built + "/" + segment
    Dir.mkdir(built) unless File.directory?(built)
  end
end

#get_output ⇒ Object

Get the output array



134
135
136
# File 'lib/dircrawl.rb', line 134

# Serialize everything collected in @output.
#
# @return [String] @output rendered by JSON.pretty_generate
def get_output
  JSON.pretty_generate(@output)
end

#get_write_dir(dir, file) ⇒ Object

Figure out where to write it



28
29
30
31
# File 'lib/dircrawl.rb', line 28

# Work out the output file path for a source file.
#
# The source directory is mapped into the output tree by replacing @path with
# @output_dir, and ".json" is appended to the file name.
#
# NOTE(review): gsub replaces every occurrence of @path inside +dir+, not
# only a leading one — confirm that input paths never repeat the root.
#
# @param dir [String] directory containing the source file
# @param file [String] source file name
# @return [String] path of the .json file for this source file
def get_write_dir(dir, file)
  mirrored_dir = dir.gsub(@path, @output_dir)
  mirrored_dir + "/" + file + ".json"
end

#report_batch(results) ⇒ Object

Report all results in one JSON



108
109
110
111
112
# File 'lib/dircrawl.rb', line 108

# Collect results locally by appending each one to @output.
#
# @param results [#each] results to append, in iteration order
# @return [Object] the +results+ object that was passed in
def report_batch(results)
  results.each { |entry| @output << entry }
end

#report_incremental(results, path) ⇒ Object

Report results back to Harvester incrementally



125
126
127
128
129
130
131
# File 'lib/dircrawl.rb', line 125

# Send one file's results to the crawler manager.
#
# Issues an HTTP POST to <@cm_url>/relay_results carrying three fields:
# selector_id (@selector_id), status_message ("Processed <path>"), and
# results (the pretty-printed JSON of +results+).
#
# @param results [Object] data serialized with JSON.pretty_generate
# @param path [String] path of the file the results came from
# @return [Object] whatever Curl::Easy.http_post returns
def report_incremental(results, path)
  endpoint = @cm_url + "/relay_results"
  fields = [
    Curl::PostField.content('selector_id', @selector_id),
    Curl::PostField.content('status_message', "Processed " + path),
    Curl::PostField.content('results', JSON.pretty_generate(results))
  ]
  Curl::Easy.http_post(endpoint, *fields)
end

#report_results(results, path) ⇒ Object

Figure out how to report results



99
100
101
102
103
104
105
# File 'lib/dircrawl.rb', line 99

# Route results to the right reporter: posted incrementally when a crawler
# manager URL is set, otherwise collected locally.
#
# @param results [Array] results for a single file
# @param path [String] path of the file the results came from
# @return [Object] the return value of the reporter that was used
def report_results(results, path)
  return report_incremental(results, path) if @cm_url

  report_batch(results)
end

#report_status(status_msg) ⇒ Object

Report Harvester status message



115
116
117
118
119
120
121
122
# File 'lib/dircrawl.rb', line 115

# Post a status message to the crawler manager, if one is configured.
#
# When @cm_url is set, issues an HTTP POST to <@cm_url>/update_status with
# the fields selector_id (@selector_id) and status_message. Does nothing
# when @cm_url is unset.
#
# @param status_msg [String] message to send
# @return [Object, nil] the result of Curl::Easy.http_post, or nil when no
#   crawler manager URL is set
def report_status(status_msg)
  return unless @cm_url

  endpoint = @cm_url + "/update_status"
  fields = [
    Curl::PostField.content('selector_id', @selector_id),
    Curl::PostField.content('status_message', status_msg)
  ]
  Curl::Easy.http_post(endpoint, *fields)
end