Class: DirCrawl
- Inherits:
-
Object
- Object
- DirCrawl
- Defined in:
- lib/dircrawl.rb
Instance Method Summary collapse
-
#crawl_dir(dir, *args) ⇒ Object
Crawl dir and call block for each file.
-
#create_write_dirs(dir) ⇒ Object
Create the directories along the given path if they don’t exist.
-
#get_output ⇒ Object
Get the output array.
-
#get_write_dir(dir, file) ⇒ Object
Figure out where to write the processed output for a file.
-
#initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args) ⇒ DirCrawl
constructor
A new instance of DirCrawl.
-
#report_batch(results) ⇒ Object
Report all results in one JSON.
-
#report_incremental(results, path) ⇒ Object
Report results back to Harvester incrementally.
-
#report_results(results, path) ⇒ Object
Figure out how to report results.
-
#report_status(status_msg) ⇒ Object
Report Harvester status message.
Constructor Details
#initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args) ⇒ DirCrawl
Returns a new instance of DirCrawl.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/dircrawl.rb', line 8
# Stores the crawl configuration and immediately starts crawling +path+.
#
# @param path [String] root directory to crawl
# @param output_dir [String] directory substituted for +path+ when building output locations
# @param ignore_includes [String] files whose name contains this substring are not processed
# @param save [Object] stored in @save
# @param process_block [#call] called with each file path followed by +args+
# @param include_block [#call, nil] hook run once before crawling; skipped when it is not callable
# @param extras_block [#call, String] hook run before a file is processed; "" means no hook
# @param failure_mode [String] "debug" or "log" select extra handling when processing a file fails
# @param cm_hash [Hash, nil] optional crawler manager settings (:crawler_manager_url, :selector_id)
# @param args extra arguments forwarded to crawl_dir
def initialize(path, output_dir, ignore_includes, save, process_block, include_block, extras_block, failure_mode, cm_hash, *args)
  @path = path
  @output_dir = output_dir
  @ignore_includes = ignore_includes

  # The hook is optional, like cm_hash and extras_block: only invoke it when
  # something callable was actually supplied.
  include_block.call if include_block.respond_to?(:call)

  @process_block = process_block
  @extras_block = extras_block
  @failure_mode = failure_mode
  @output = []
  @save = save

  # Handle crawler manager info
  if cm_hash
    @cm_url = cm_hash[:crawler_manager_url]
    @selector_id = cm_hash[:selector_id]
  end

  # Crawl
  crawl_dir(path, *args)
end
Instance Method Details
#crawl_dir(dir, *args) ⇒ Object
Crawl dir and call block for each file
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# File 'lib/dircrawl.rb', line 45
# Recursively walks +dir+ and processes every file that is not ignored.
#
# For each file the output location is taken from get_write_dir. If that file
# already exists its content is reused; otherwise the extras hook (when
# callable) and then @process_block are invoked. The resulting JSON text is
# parsed, passed to report_results and written to the output location.
#
# A failure while producing or parsing a file's output is reported through
# report_status, handled according to @failure_mode ("debug" opens pry, "log"
# appends the file name to error_log.txt in @output_dir), and the crawl moves
# on to the next file.
#
# @param dir [String] directory to walk
# @param args extra arguments forwarded to @process_block
def crawl_dir(dir, *args)
  Dir.foreach(dir) do |file|
    # Skip . or .. files
    next if file == '.' || file == '..'

    input_path = dir + "/" + file

    # Recurse into directories
    if File.directory?(input_path)
      report_status("Going to next directory: " + input_path)
      crawl_dir(input_path, *args)
      next
    end

    # A nil or empty marker means "ignore nothing"; String#include?("") is
    # true for every name and would otherwise skip all files.
    next if @ignore_includes && !@ignore_includes.empty? && file.include?(@ignore_includes)

    # Create Output Directory (derived from the same path the file is written to)
    output_file = get_write_dir(dir, file)
    create_write_dirs(File.dirname(output_file))

    begin
      if File.exist?(output_file)
        # Processed file exists, reuse it instead of processing again
        puts "Processed file exists, skipping"
        puts " " + input_path
        processed = File.read(output_file)
      else
        # Process Extras ("" or any non-callable value means no extras hook)
        @extras_block.call(@output_dir + "/") if @extras_block.respond_to?(:call)

        # Now Process Main
        processed = @process_block.call(input_path, *args)
      end

      # Parse inside the guarded section so missing or malformed output is
      # treated as a failure of this file rather than aborting the crawl.
      parsed = JSON.parse(processed)
    rescue SystemExit, SignalException
      # Let Ctrl-C, kill signals and exit requests stop the crawl.
      raise
    rescue Exception => e # really catch any failures
      report_status("Error on file " + file + ": " + e.to_s)
      if @failure_mode == "debug"
        binding.pry
      elsif @failure_mode == "log"
        IO.write(@output_dir + "/error_log.txt", file + "\n", mode: 'a')
      end
      # Nothing usable was produced for this file; move on to the next one.
      next
    end

    # NOTE(review): @save is set by the constructor but not consulted here;
    # results are always reported — confirm whether that is intended.
    report_results([parsed], input_path)

    # Write Output to file
    File.write(output_file, processed)
  end
end
#create_write_dirs(dir) ⇒ Object
Create the directories along the given path if they don’t exist
34 35 36 37 38 39 40 41 42 |
# File 'lib/dircrawl.rb', line 34
# Creates every directory along +dir+ that does not exist yet.
#
# Absolute paths (leading "/") are built from the filesystem root; relative
# paths are built relative to the current working directory. Empty segments
# produced by doubled or trailing slashes are ignored.
#
# @param dir [String] slash-separated directory path
def create_write_dirs(dir)
  # Start from "/" only when the caller gave an absolute path, so a relative
  # path is not silently re-rooted at the top of the filesystem.
  built = dir.start_with?("/") ? "/" : ""

  dir.split("/").reject(&:empty?).each do |segment|
    built = built.empty? ? segment : File.join(built, segment)
    Dir.mkdir(built) unless File.directory?(built)
  end
end
#get_output ⇒ Object
Get the output array
134 135 136 |
# File 'lib/dircrawl.rb', line 134
# Serializes the accumulated @output array as pretty-printed JSON.
#
# @return [String] JSON text for @output
def get_output
  JSON.pretty_generate(@output)
end
#get_write_dir(dir, file) ⇒ Object
Figure out where to write the processed output for a file
28 29 30 31 |
# File 'lib/dircrawl.rb', line 28
# Builds the output location for +file+ found in +dir+.
#
# Every occurrence of @path inside +dir+ is replaced with @output_dir
# (String#gsub, so not only a leading prefix), then "/<file>.json" is appended.
#
# @param dir [String] directory containing the input file
# @param file [String] input file name
# @return [String] path of the JSON file for this input
def get_write_dir(dir, file)
  mirrored_dir = dir.gsub(@path, @output_dir)
  "#{mirrored_dir}/#{file}.json"
end
#report_batch(results) ⇒ Object
Report all results in one JSON
108 109 110 111 112 |
# File 'lib/dircrawl.rb', line 108
# Appends each entry of +results+ to the @output array, in order.
#
# @param results [Array] entries to add
def report_batch(results)
  results.each { |entry| @output << entry }
end
#report_incremental(results, path) ⇒ Object
Report results back to Harvester incrementally
125 126 127 128 129 130 131 |
# File 'lib/dircrawl.rb', line 125
# POSTs +results+ to "<@cm_url>/relay_results".
#
# The form carries the selector id, a "Processed <path>" status message and
# the results rendered as pretty-printed JSON.
#
# @param results [Array] entries to send
# @param path [String] path of the file the results came from
def report_incremental(results, path)
  endpoint = @cm_url+"/relay_results"
  form_fields = [
    Curl::PostField.content('selector_id', @selector_id),
    Curl::PostField.content('status_message', "Processed " + path),
    Curl::PostField.content('results', JSON.pretty_generate(results))
  ]
  Curl::Easy.http_post(endpoint, *form_fields)
end
#report_results(results, path) ⇒ Object
Figure out how to report results
99 100 101 102 103 104 105 |
# File 'lib/dircrawl.rb', line 99
# Routes +results+ to the right reporter: report_incremental when @cm_url is
# set, report_batch otherwise.
#
# @param results [Array] entries to report
# @param path [String] path of the file the results came from
def report_results(results, path)
  return report_incremental(results, path) if @cm_url

  report_batch(results)
end
#report_status(status_msg) ⇒ Object
Report Harvester status message
115 116 117 118 119 120 121 122 |
# File 'lib/dircrawl.rb', line 115
# POSTs +status_msg+ together with the selector id to "<@cm_url>/update_status".
# Does nothing and returns nil when @cm_url is not set.
#
# @param status_msg [String] message to send
def report_status(status_msg)
  return unless @cm_url

  endpoint = @cm_url+"/update_status"
  Curl::Easy.http_post(
    endpoint,
    Curl::PostField.content('selector_id', @selector_id),
    Curl::PostField.content('status_message', status_msg)
  )
end