Class: LinScraper

Inherits:
Object
  • Object
show all
Includes:
CSVHandlers, Parsers
Defined in:
lib/linsc/lin.rb

Instance Method Summary

Methods included from Parsers

#scrape_contact, #scrape_education, #scrape_employment

Methods included from CSVHandlers

#append_to_csv, #create_file, #create_file_with_headers, #create_row, #get_headers

Constructor Details

#initialize(working_dir, input_file, options) ⇒ LinScraper

Returns a new instance of LinScraper.



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/linsc/lin.rb', line 50

# Prepares a scraper run: derives the output file paths, merges the output
# column headers into the input headers, works out how many rows an earlier
# run already wrote (so `start` can skip them), and creates the output files.
#
# @param working_dir [String, Pathname] prefix the output file names are
#   appended to with `+` (presumably a directory path ending in a separator,
#   or a Pathname -- confirm against the caller)
# @param input_file [String] path of the input CSV; read for its headers and
#   for its row count
# @param options [Hash] reads :update, :insert and :noproxy
def initialize(working_dir, input_file, options)
  @working_dir, @input_file, @options = working_dir, input_file, options
  if @options[:update]
    @output_update = @working_dir + "contact_update.csv"
    @output_employment_update = @working_dir + "contact_employment_update.csv"
    @output_education_update = @working_dir + "contact_education_update.csv"
  end
  if @options[:insert]
    @output_insert = @working_dir + "contact_insert.csv"
    @output_employment_insert = @working_dir + "contact_employment_insert.csv"
    @output_education_insert = @working_dir + "contact_education_insert.csv"
  end

  @cooldown = 20
  @noproxy = options[:noproxy]
  @proxies = ProxyHandler.new(@cooldown) unless @options[:noproxy]
  @headers = get_headers(@input_file)
  @new_headers = ["Contact ID", "LIN ID", "CV TR", "Account Name", "Account ID", "Linkedin Import Status", "First Name", "Last Name", "Email", "LinkedIn Profile", "Candidate ID",
          "LIN 1st Degree", "Title", "Contact Country", "Contact LIN Sector", "Resume Last Updated", "LIN Import Date", "CV Uploaded",
          "Employer 1 Title", "Employer Organization Name 1", "Employer 1 Start Date",
          "Employer 1 End Date", "Employer 1 Location", "Employer 1 Description",
          "Employer 2 Title", "Employer Organization Name 2", "Employer 2 Start Date",
          "Employer 2 End Date", "Employer 2 Location", "Employer 2 Description",
          "Employer 3 Title", "Employer Organization Name 3", "Employer 3 Start Date",
          "Employer 3 End Date", "Employer 3 Location", "Employer 3 Description",
          "License or Certification Name 1", "License or Certification Name 2",
          "License or Certification Credential Type", "Education School 1",
          "Education Degree Name 1", "Education Degree Date 1",
          "Education School 2", "Education Degree Name 2",
          "Education Degree Date 2", "Text Resume"]
  # Add every output column the input file does not already have.
  @new_headers.each do |header|
    @headers << header unless @headers.include?(header)
  end
  # 'Urls' is an input-only column; `start` deletes it from each row as well.
  @headers.delete('Urls')
  @employment_headers = ["Contact ID", "Employer Name", "Job Title", "Start Date", "End Date", "Location", "LIN ID"]
  @education_headers = ["Contact ID", "School Name", "Major", "Graduation Year", "LIN ID"]
  # Number of data rows: total lines minus the header line.
  @input_length = CSV.read(@input_file).length - 1
  I18n.available_locales = [:en]

  # Resume support: @start is the number of rows already written by an earlier
  # run. Only files that actually exist are read -- previously, when both
  # outputs were enabled but only one file was on disk, CSV.read was called on
  # the missing one and raised Errno::ENOENT. @start stays nil on a fresh run.
  existing_outputs = [@output_update, @output_insert].select { |file| file && File.exist?(file) }
  unless existing_outputs.empty?
    @start = existing_outputs.map { |file| CSV.read(file, headers: true).length }.inject(0, :+)
  end

  # Contact outputs are created only if missing, so a resumed run keeps its rows.
  [@output_insert, @output_update].each do |file|
    next unless file
    create_file(file) unless File.exist?(file)
  end
  [@output_employment_update, @output_employment_insert].each do |file|
    create_file_with_headers(file, @employment_headers) if file
  end
  [@output_education_update, @output_education_insert].each do |file|
    create_file_with_headers(file, @education_headers) if file
  end
end

Instance Method Details

#format_date(input_date) ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# File 'lib/linsc/lin.rb', line 128

# Converts a "YYYY" or "Month YYYY" string into an ISO "YYYY-MM-DD" string.
#
# A single token is parsed as a year, two tokens as month name plus year.
# When a two-token value cannot be parsed, the second token alone is tried
# as a year. Anything else yields nil.
#
# @param input_date [String, nil] the raw date text
# @return [String, nil] the formatted date, or nil when it cannot be parsed
def format_date(input_date)
  return nil if input_date.nil?

  begin
    tokens = input_date.split(" ")
    pattern = case tokens.length
              when 1 then "%Y"
              when 2 then "%B %Y"
              end
    return nil if pattern.nil?

    parsed = Date.strptime(input_date, pattern)
    parsed.strftime("%Y-%m-%d")
  rescue
    # Fall back to the trailing token (expected to be the year) for
    # two-part values whose first part is not a month name.
    tokens.length == 2 ? format_date(tokens[1]) : nil
  end
end

#name_check(lin_name, csv_name) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
# File 'lib/linsc/lin.rb', line 114

# Reports whether every word of +csv_name+ also appears in +lin_name+.
#
# Both names are lower-cased, passed through String#alnum (a String
# extension defined outside this method) and split on spaces before the
# comparison. Word order and extra words in +lin_name+ are ignored.
#
# @param lin_name [String] name text taken from the scraped profile
# @param csv_name [String] name text taken from the input CSV
# @return [Boolean] true when no word of +csv_name+ is missing from +lin_name+
def name_check(lin_name, csv_name)
  wanted_words = csv_name.downcase.alnum.split(" ")
  profile_words = lin_name.downcase.alnum.split(" ")
  wanted_words.all? { |word| profile_words.include?(word) }
end

#start ⇒ Object



209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/linsc/lin.rb', line 209

# Runs the scrape over every row of the input CSV.
#
# For each row the candidate URLs in the 'Urls' column are checked with
# `validate`. Rows with a non-blank 'Contact ID' are written to the update
# outputs (when @options[:update] is set); all other rows go to the insert
# outputs (when @options[:insert] is set). Rows for which no profile is
# confirmed are still written to the contact output, without scraped data.
def start
  count = 0
  CSV.foreach(@input_file, headers: true) do |input_row|
    count += 1
    # Resume support: skip rows up to the @start count computed in initialize
    # from the rows already present in the contact output files.
    next if @start && @start >= count
    # Only consumed by the disabled retry block at the bottom of this loop.
    tries = @proxies.length unless @noproxy
    puts "lin #{count}/#{@input_length}"
    # begin
      urls = input_row['Urls']
      if urls && urls.include?('http')
        # 'Urls' holds a ", "-separated list of candidate profile URLs.
        urls = urls.split(', ')
        correct_url, correct_page = nil
        # Keep the first URL for which validate returns both a url and a page.
        urls.each do |url|
          correct_url, correct_page = validate(url, input_row)
          break if correct_url && correct_page
        end
        if correct_url
          puts "correct page"
          # NOTE(review): initialize registers this header as "LinkedIn Profile"
          # (capital I); confirm the helpers map "Linkedin Profile" onto it.
          input_row << ["Linkedin Profile", correct_url]
          input_row["Linkedin Import Status"] = 'Profile imported'
          input_row.delete('Urls')
          # A non-blank Contact ID routes the row to the update outputs.
          if input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            if @options[:update]
              new_row = scrape_contact(input_row, correct_page, 'update')
              append_to_csv(@output_update, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_update, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_update, ed_row)
              end
            end
          else
            if @options[:insert]
              new_row = scrape_contact(input_row, correct_page, 'insert')
              append_to_csv(@output_insert, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_insert, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_insert, ed_row)
              end
            end
          end
        else
          # URLs were present but none validated: record 'Profile not found'.
          if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["Linkedin Import Status"] = 'Profile not found'
            # An existing '1' flag is kept for update rows; anything else becomes '0'.
            input_row["CV TR"] = '0' unless input_row["CV TR"] == '1'
            input_row["CV Uploaded"] = '0' unless input_row["CV Uploaded"] == '1'
            output_row = create_row(input_row, @headers)
            puts input_row["Linkedin Import Status"]
            append_to_csv(@output_update, output_row)
          elsif @options [:insert]
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["CV TR"] = '0'
            input_row["CV Uploaded"] = '0'
            input_row["Linkedin Import Status"] = 'Profile not found'
            puts input_row["Linkedin Import Status"]
            output_row = create_row(input_row, @headers)
            append_to_csv(@output_insert, output_row)
          end
        end
      else
        # The row has no usable URL at all.
        # NOTE(review): unlike the branch above, "Linkedin Import Status" is not
        # assigned here, so the row keeps (and prints) whatever the input held --
        # confirm this is intended.
        if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          input_row["CV TR"] = '0' unless input_row["CV TR"] == '1'
          input_row["CV Uploaded"] = '0' unless input_row["CV Uploaded"] == '1'
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_update, output_row)
        elsif @options [:insert]
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          input_row["CV TR"] = '0'
          input_row["CV Uploaded"] = '0'
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_insert, output_row)
        end
      end
    # Disabled per-row retry handling, kept for reference. With it commented
    # out, any exception raised while processing a row propagates out of start.
    # rescue Exception => msg
    #   tries -= 1
    #   if tries > 0
    #     puts "\n\n"
    #     puts msg
    #     puts 'RETRYING'
    #     puts "\n\n"
    #     if msg.to_s.start_with?("999")
    #       proxy.dead
    #     else
    #       proxy.used
    #     end
    #     retry
    #   else
    #     #append_ddg_row(input_row, msg, nil)
    #     puts msg
    #   end
    # end
  end

end

#validate(url, row) ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# File 'lib/linsc/lin.rb', line 152

# Fetches a candidate profile URL and decides whether it belongs to the
# contact described by +row+.
#
# The page is accepted when its "#name" element passes name_check against the
# row's first and last name, and at least one listed position passes
# name_check against both 'Employer 1 Title' and
# 'Employer Organization Name 1' from the row.
#
# Failed requests are retried, but only a bounded number of times. Previously
# both retry paths were unbounded, so a deterministic failure (for example a
# missing 'Employer 1 Title' raising inside the position loop, or a persistent
# 999 response without proxies) re-fetched the same page forever.
#
# @param url [String] profile URL to fetch
# @param row [CSV::Row] input row; reads 'First Name', 'Last Name',
#   'Employer 1 Title' and 'Employer Organization Name 1'
# @return [Array, false] [url, page] when the page matches the row; false when
#   it does not match, the server answers 403/404, or all attempts failed
def validate(url, row)
  puts "checking url: #{url}"
  # With proxies, allow roughly one attempt per proxy (the same budget the
  # disabled retry code in `start` used); otherwise a small fixed number.
  attempts_left = @noproxy ? 3 : [@proxies.length, 3].max
  begin
    # Reset per attempt so a failure before get_proxy never re-marks the
    # proxy used by the previous attempt.
    proxy = nil
    cert_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/cacert.pem'
    cert_store = OpenSSL::X509::Store.new
    cert_store.add_file(cert_file.to_s)
    agent = Mechanize.new
    agent.cert_store = cert_store

    unless @noproxy
      proxy = @proxies.get_proxy
      agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
      agent.user_agent = proxy.user_agent
      puts "proxy: #{proxy.ip}"
    end
    # Without proxies every request goes out from the same address, so pause
    # between requests instead.
    sleep(@cooldown) if @noproxy
    page = agent.get(url)
    puts 'ACCESS GRANTED'
    proxy.good if proxy

    name_node = page.at_css("#name")
    return false unless name_node
    return false unless name_check(name_node.text, "#{row['First Name']} #{row['Last Name']}")

    # An empty position list simply yields no match below.
    positions = page.css("#experience .positions .position")
    match = positions.any? do |position|
      title_node = position.at_css("header .item-title a")
      employer_node = position.at_css("header .item-subtitle")
      next false unless title_node && employer_node

      profile_title = I18n.transliterate(title_node.text).alnum
      profile_employer = I18n.transliterate(employer_node.text).alnum
      title = I18n.transliterate(row['Employer 1 Title']).alnum
      employer = I18n.transliterate(row['Employer Organization Name 1']).alnum
      name_check(profile_title, title) && name_check(profile_employer, employer)
    end
    return match ? [url, page] : false
  rescue StandardError => e
    puts e
    message = e.to_s
    # The error text is expected to start with the HTTP status code.
    if message.start_with?('404') || message.start_with?('403')
      # The proxy worked; the page itself is unavailable.
      proxy.good if proxy
      return false
    end

    if message.start_with?('999')
      # 999: the request was rejected; take this proxy out of rotation.
      proxy.dead if proxy
    else
      puts e.backtrace
      proxy.used if proxy
    end

    attempts_left -= 1
    retry if attempts_left > 0
    puts "giving up on url: #{url}"
    false
  end
end