Class: PiplCollector
- Inherits:
-
Object
- Object
- PiplCollector
- Defined in:
- lib/piplcollector.rb
Instance Method Summary collapse
-
#create_write_dirs(dir) ⇒ Object
Creates each directory along the given path if it doesn’t already exist.
-
#gen_filename_from_id(data_item) ⇒ Object
Generates a file-safe name from the id field.
-
#get_already_collected_person(data_item) ⇒ Object
Gets content for already collected person.
-
#get_person(data_item) ⇒ Object
Get info on person from pipl.
-
#get_write_dir(dir, file) ⇒ Object
Figure out where to write it.
-
#initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping, geocoder_api_key) ⇒ PiplCollector
constructor
A new instance of PiplCollector.
-
#load_output_files ⇒ Object
Load the output files into already_collected.
-
#process(file) ⇒ Object
Process file.
-
#run(dir) ⇒ Object
Run on files.
-
#save_output_file(output_item, data_item) ⇒ Object
Save output file.
-
#was_collected?(data_item) ⇒ Boolean
Checks if it is already collected.
Constructor Details
#initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping, geocoder_api_key) ⇒ PiplCollector
Returns a new instance of PiplCollector.
6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/piplcollector.rb', line 6

# Builds a collector and indexes the output files that are already present.
#
# @param input_dir [String] prefix replaced by +output_append_dir+ when
#   computing where an enriched file is written
# @param output_dir [String] directory holding one saved response per id
# @param output_append_dir [String] replacement for +input_dir+ in write paths
# @param id_field [String] key under which each data item stores its id
# @param ignore_files [String] file names containing this substring are skipped
# @param api_key [Object] passed through to PiplRequest
# @param field_mapping [Object] passed through to PiplRequest
# @param geocoder_api_key [Object] passed through to PiplRequest
def initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping, geocoder_api_key)
  @input_dir, @output_dir, @output_append_dir = input_dir, output_dir, output_append_dir
  @id_field, @ignore_files = id_field, ignore_files
  @api_key, @geocoder_api_key = api_key, geocoder_api_key
  @field_mapping = field_mapping

  # Must come after @output_dir is set: the listing reads that directory.
  @already_collected = load_output_files
end
Instance Method Details
#create_write_dirs(dir) ⇒ Object
Creates each directory along the given path if it doesn’t already exist
90 91 92 93 94 95 96 97 98 |
# File 'lib/piplcollector.rb', line 90

# Creates every missing directory along +dir+, one path segment at a time.
#
# Absolute paths are built from "/"; relative paths are built relative to the
# current working directory instead of being silently re-rooted at "/".
# Empty segments (leading, trailing or doubled slashes) are ignored.
#
# @param dir [String] slash-separated directory path
# @return [Array<String>] the non-empty path segments of +dir+
def create_write_dirs(dir)
  # nil means "no prefix yet", so the first segment of a relative path is
  # used as-is rather than being joined onto "/".
  current = dir.start_with?("/") ? "/" : nil

  dir.split("/").reject(&:empty?).each do |segment|
    current = current ? File.join(current, segment) : segment
    Dir.mkdir(current) unless File.directory?(current)
  end
end
#gen_filename_from_id(data_item) ⇒ Object
Generates a file-safe name from the id field
39 40 41 |
# File 'lib/piplcollector.rb', line 39

# Generates a file-safe name from the item's id field by removing every
# ":", "/" and "." character.
#
# The id is converted with +to_s+ first, so non-String ids (for example
# Integers parsed from JSON) no longer raise NoMethodError. A missing id
# yields an empty string; the callers visible here check for the id's
# presence before calling.
#
# @param data_item [Hash] item containing the configured id field
# @return [String] the id with ":", "/" and "." stripped
def gen_filename_from_id(data_item)
  data_item[@id_field].to_s.delete(":/.")
end
#get_already_collected_person(data_item) ⇒ Object
Gets content for already collected person
66 67 68 69 |
# File 'lib/piplcollector.rb', line 66

# Reads the saved response for an item from the output directory and parses it.
#
# @param data_item [Hash] item whose id field names the saved file
# @return [Object] the parsed JSON content of "<output_dir>/<safe id>.json"
def get_already_collected_person(data_item)
  JSON.parse(File.read("#{@output_dir}/#{gen_filename_from_id(data_item)}.json"))
end
#get_person(data_item) ⇒ Object
Get info on person from pipl
53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/piplcollector.rb', line 53

# Requests data for one item from Pipl, saves the raw response and returns
# it parsed.
#
# @param data_item [Hash] item to look up
# @return [Object, nil] parsed JSON response, or nil when the request
#   produced no output
def get_person(data_item)
  # One-second pause before every request (presumably to respect an API
  # rate limit - confirm).
  sleep(1)

  # Get data from Pipl
  request = PiplRequest.new(@api_key, @field_mapping, @geocoder_api_key)
  output = request.get_data(data_item)
  return unless output

  # Handle output: persist the raw response first, then hand back the parsed form
  save_output_file(output, data_item)
  JSON.parse(output)
end
#get_write_dir(dir, file) ⇒ Object
Figure out where to write it
101 102 103 104 |
# File 'lib/piplcollector.rb', line 101

# Computes the path an enriched file is written to: every occurrence of the
# input directory inside +dir+ is replaced by the output-append directory,
# and +file+ is appended after a "/".
#
# @param dir [String] directory the input file was found in
# @param file [String] file name
# @return [String] destination path for the enriched file
def get_write_dir(dir, file)
  "#{dir.gsub(@input_dir, @output_append_dir)}/#{file}"
end
#load_output_files ⇒ Object
Load the output files into already_collected
19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/piplcollector.rb', line 19

# Lists the entries of the output directory, with ".json" removed from each
# name, so they can be compared against generated file names.
#
# @return [Array<String>] names of everything in the output directory
#   except "." and ".."
def load_output_files
  # Make a list of all saved files
  Dir.entries(@output_dir)
     .reject { |entry| entry == '.' || entry == '..' }
     .map { |entry| entry.gsub(".json", "") }
end
#process(file) ⇒ Object
Process file
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/piplcollector.rb', line 72

# Enriches every item of a JSON file with Pipl data.
#
# Items that have an id get a :pipl entry: the saved response when the id
# was already collected, otherwise a fresh lookup. Items without an id are
# passed through unchanged.
#
# @param file [String] path of the JSON file to read
# @return [String] pretty-printed JSON of all items
def process(file)
  records = JSON.parse(File.read(file))

  # Go through each item in file
  enriched = records.map do |record|
    collected = was_collected?(record)
    if record[@id_field]
      record[:pipl] = collected ? get_already_collected_person(record) : get_person(record)
    end
    record
  end

  JSON.pretty_generate(enriched)
end
#run(dir) ⇒ Object
Run on files
107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/piplcollector.rb', line 107

# Walks +dir+ recursively and writes an enriched copy of each matching file.
#
# A file is handled when its name contains ".json", does not contain the
# ignore substring, and its destination does not exist yet.
#
# @param dir [String] directory to scan
# @return [nil]
def run(dir)
  Dir.foreach(dir) do |entry|
    next if entry == '.' || entry == '..'

    path = dir+"/"+entry
    if File.directory?(path)
      run(path)
      next
    end

    next unless entry.include?(".json") && !entry.include?(@ignore_files)

    # Files whose output already exists are left alone.
    target = get_write_dir(dir, entry)
    next if File.exist?(target)

    with_pipl = process(path)
    create_write_dirs(dir.gsub(@input_dir, @output_append_dir))
    File.write(target, with_pipl)
  end
end
#save_output_file(output_item, data_item) ⇒ Object
Save output file
32 33 34 35 36 |
# File 'lib/piplcollector.rb', line 32

# Writes a response to "<output_dir>/<safe id>.json" and records the id as
# collected.
#
# @param output_item [String] content to write
# @param data_item [Hash] item the response belongs to
# @return [Array<String>] the updated list of collected names
def save_output_file(output_item, data_item)
  name = gen_filename_from_id(data_item)
  File.write("#{@output_dir}/#{name}.json", output_item)
  @already_collected << name
end
#was_collected?(data_item) ⇒ Boolean
Checks if it is already collected
44 45 46 47 48 49 50 |
# File 'lib/piplcollector.rb', line 44

# Checks whether an item's response is already in the collected list.
#
# Items without an id are reported as collected.
#
# @param data_item [Hash] item to check
# @return [Boolean]
def was_collected?(data_item)
  return true unless data_item[@id_field]

  @already_collected.include?(gen_filename_from_id(data_item))
end