Class: PiplCollector

Inherits:
Object
  • Object
show all
Defined in:
lib/piplcollector.rb

Instance Method Summary collapse

Constructor Details

#initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping, geocoder_api_key) ⇒ PiplCollector

Returns a new instance of PiplCollector.



6
7
8
9
10
11
12
13
14
15
16
# File 'lib/piplcollector.rb', line 6

# Stores the collector configuration and indexes the results already on disk.
#
# @param input_dir [String] path fragment that is swapped for +output_append_dir+
#   when an output location is derived from an input location
# @param output_dir [String] directory holding one saved "<id>.json" file per person
# @param output_append_dir [String] replacement for +input_dir+ in output paths
# @param id_field [String] key of each data item that holds its identifier
# @param ignore_files [String] file names containing this text are skipped by #run
# @param api_key [String] handed to PiplRequest
# @param field_mapping [Object] handed to PiplRequest
# @param geocoder_api_key [String] handed to PiplRequest
def initialize(input_dir, output_dir, output_append_dir, id_field, ignore_files, api_key, field_mapping, geocoder_api_key)
  # Where to read from and write to
  @input_dir, @output_dir, @output_append_dir = input_dir, output_dir, output_append_dir

  # How input files and items are selected
  @id_field = id_field
  @ignore_files = ignore_files

  # Settings forwarded to PiplRequest
  @api_key = api_key
  @field_mapping = field_mapping
  @geocoder_api_key = geocoder_api_key

  # Must come last: the listing reads @output_dir
  @already_collected = load_output_files
end

Instance Method Details

#create_write_dirs(dir) ⇒ Object

Create each directory along the given path if it doesn’t already exist



90
91
92
93
94
95
96
97
98
# File 'lib/piplcollector.rb', line 90

# Creates +dir+ together with any parent directories that are missing.
#
# An absolute path (leading "/") is built up from the filesystem root. A
# relative path is built up relative to the current working directory, so the
# directories end up where a later File.write on the same relative path will
# look for them. Empty segments ("a//b", trailing "/") are ignored.
#
# @param dir [String] slash-separated directory path
# @return [Array<String>] the non-empty path segments (not meaningful to callers)
def create_write_dirs(dir)
  # Start at the root only when the caller actually passed an absolute path.
  path = dir.start_with?("/") ? "/" : ""

  dir.split("/").reject(&:empty?).each do |segment|
    # File.join("", segment) would yield "/segment", so the first relative
    # segment is taken as-is.
    path = path.empty? ? segment : File.join(path, segment)
    Dir.mkdir(path) unless File.directory?(path)
  end
end

#gen_filename_from_id(data_item) ⇒ Object

Generates a file-safe name from the id field



39
40
41
# File 'lib/piplcollector.rb', line 39

# Builds a file-safe name from the item's id by dropping every ":", "/" and ".".
#
# The id is converted with #to_s first, so ids that JSON parsed as numbers are
# handled the same way as string ids.
#
# @param data_item [Hash] item that carries its identifier under @id_field
# @return [String] the id with ":", "/" and "." removed
# @raise [ArgumentError] if the item has no value under @id_field
def gen_filename_from_id(data_item)
  id = data_item[@id_field]
  raise ArgumentError, "data item has no #{@id_field.inspect} field" if id.nil?

  # One pass over the string instead of three chained substitutions.
  id.to_s.delete(":/.")
end

#get_already_collected_person(data_item) ⇒ Object

Gets content for already collected person



66
67
68
69
# File 'lib/piplcollector.rb', line 66

# Reads back the saved result for an item that was collected earlier.
#
# The file is looked up as "<file-safe id>.json" inside @output_dir.
#
# @param data_item [Hash] item whose id selects the saved file
# @return [Object] the parsed JSON content of the saved file
# @raise [Errno::ENOENT] if no saved file exists for the item
# @raise [JSON::ParserError] if the saved file is not valid JSON
def get_already_collected_person(data_item)
  saved_path = File.join(@output_dir, "#{gen_filename_from_id(data_item)}.json")
  JSON.parse(File.read(saved_path))
end

#get_person(data_item) ⇒ Object

Get info on person from pipl



53
54
55
56
57
58
59
60
61
62
63
# File 'lib/piplcollector.rb', line 53

# Looks up one item through PiplRequest, saves the raw response and returns
# it parsed.
#
# @param data_item [Hash] item to look up
# @return [Object, nil] the parsed JSON response, or nil when the request
#   produced no output
def get_person(data_item)
  # One-second pause before every request (presumably to throttle API
  # calls; confirm against the Pipl rate limits).
  sleep(1)

  request = PiplRequest.new(@api_key, @field_mapping, @geocoder_api_key)
  response = request.get_data(data_item)
  return unless response

  # Persist the raw response before parsing it.
  save_output_file(response, data_item)
  JSON.parse(response)
end

#get_write_dir(dir, file) ⇒ Object

Figure out where to write it



101
102
103
104
# File 'lib/piplcollector.rb', line 101

# Derives the output path for +file+ found in input directory +dir+.
#
# Every occurrence of @input_dir inside +dir+ is replaced with
# @output_append_dir, then "/" and +file+ are appended.
# NOTE(review): the replacement is not anchored to the start of +dir+, so a
# path that repeats the input directory name is rewritten in each place.
#
# @param dir [String] directory the input file lives in
# @param file [String] name of the input file
# @return [String] path the processed file is written to
def get_write_dir(dir, file)
  dir.gsub(@input_dir, @output_append_dir) + "/" + file
end

#load_output_files ⇒ Object

Load the output files into already_collected



19
20
21
22
23
24
25
26
27
28
29
# File 'lib/piplcollector.rb', line 19

# Lists the ids that already have a saved result in @output_dir.
#
# Only regular files whose name ends in ".json" are counted, and only that
# trailing extension is stripped. Other entries (subdirectories, stray files
# such as ".DS_Store" or "notes.txt") are ignored: they have no matching
# "<id>.json" file, so treating them as collected would make a later read of
# that file fail.
#
# @return [Array<String>] file-safe ids with a saved result
# @raise [Errno::ENOENT] if @output_dir does not exist
def load_output_files
  saved = Dir.entries(@output_dir).select do |entry|
    entry.end_with?(".json") && File.file?(File.join(@output_dir, entry))
  end

  saved.map { |entry| entry.chomp(".json") }
end

#process(file) ⇒ Object

Process file



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/piplcollector.rb', line 72

# Adds Pipl data to every item of one input file.
#
# Items that carry an id get a :pipl entry: taken from the saved result when
# the id was collected before, otherwise fetched with #get_person. Items
# without an id pass through unchanged.
#
# @param file [String] path of a JSON file holding a list of items
# @return [String] pretty-printed JSON of all items, in input order
def process(file)
  records = JSON.parse(File.read(file))

  enriched = records.map do |record|
    collected = was_collected?(record)
    next record unless record[@id_field]

    record[:pipl] = collected ? get_already_collected_person(record) : get_person(record)
    record
  end

  JSON.pretty_generate(enriched)
end

#run(dir) ⇒ Object

Run on files



107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/piplcollector.rb', line 107

# Walks +dir+ recursively and writes a processed copy of every eligible file.
#
# A file is eligible when its name contains ".json" and does not contain
# @ignore_files. Files whose output path already exists are skipped, so an
# interrupted run can be restarted.
#
# @param dir [String] directory to scan
# @return [nil]
def run(dir)
  Dir.foreach(dir) do |entry|
    next if entry == '.' || entry == '..'

    source = dir+"/"+entry
    if File.directory?(source)
      # Descend into subdirectories
      run(source)
      next
    end

    next unless entry.include?(".json") && !entry.include?(@ignore_files)
    next if File.exist?(get_write_dir(dir, entry))

    # Process first, then make sure the target directory exists, then write
    with_pipl = process(source)
    create_write_dirs(dir.gsub(@input_dir, @output_append_dir))
    File.write(get_write_dir(dir, entry), with_pipl)
  end
end

#save_output_file(output_item, data_item) ⇒ Object

Save output file



32
33
34
35
36
# File 'lib/piplcollector.rb', line 32

# Writes one result to "<file-safe id>.json" in @output_dir and records the
# id as collected.
#
# @param output_item [String] content to store
# @param data_item [Hash] item the content belongs to; its id names the file
# @return [Array<String>] the updated list of collected ids
def save_output_file(output_item, data_item)
  name = gen_filename_from_id(data_item)
  target = @output_dir+"/"+name+".json"

  File.write(target, output_item)
  @already_collected << name
end

#was_collected?(data_item) ⇒ Boolean

Checks if it is already collected

Returns:

  • (Boolean)


44
45
46
47
48
49
50
# File 'lib/piplcollector.rb', line 44

# Tells whether an item needs no lookup.
#
# An item without an id is reported as collected, so it is never looked up.
#
# @param data_item [Hash] item to check
# @return [Boolean] true when the item has no id or its file-safe id is
#   already in the collected list
def was_collected?(data_item)
  return true unless data_item[@id_field]

  @already_collected.include?(gen_filename_from_id(data_item))
end