Module: Parsers

Included in:
LinScraper
Defined in:
lib/linsc/parsers.rb

Instance Method Summary

Instance Method Details

#scrape_contact(input_row, page, mode) ⇒ Object



2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/linsc/parsers.rb', line 2

# Builds the contact-level CSV row for one scraped profile page.
#
# Identifier columns are copied from +input_row+; profile fields are read
# from +page+ with CSS selectors. Anything the profile does not contain
# (fewer than three positions, an open-ended job without an end date, no
# certifications, a missing headline, ...) is written as nil instead of
# raising NoMethodError.
#
# Note: description nodes that end up in the text resume get their <br>
# elements replaced by newlines in +page+ itself.
#
# @param input_row [#[]] source row, read by header name ("Contact ID", ...)
# @param page [#at_css, #css] parsed profile document (presumably a
#   Nokogiri document -- confirm against the caller)
# @param mode [Object] accepted for interface compatibility; not read here
# @return [CSV::Row] row built on @headers
def scrape_contact(input_row, page, mode)
  row = CSV::Row.new(@headers, [])
  name_node = page.at_css("#name")
  name = name_node ? name_node.text.split : []
  email = input_row["Email"]
  lin_profile = input_row["Linkedin Profile"]
  title = contact_node_text(page, ".headline.title")
  country = contact_node_text(page, "#demographics .locality")
  sector = contact_node_text(page, "#demographics .descriptor:not(.adr)")
  positions = page.css("#experience .positions .position")
  certs = page.css(".certifications .certification")
  schools = page.css("#education .schools .school")

  # Columns are assigned in a fixed order: CSV::Row appends headers it does
  # not know yet, so the order decides the column layout in that case.
  row["Contact ID"] = input_row["Contact ID"]
  row["LIN ID"] = input_row["LIN ID"]
  row["Account ID"] = input_row["Account ID"]
  row["CV TR"] = "1"
  row["Account Name"] = input_row["Account Name"]
  row["Linkedin Import Status"] = input_row["Linkedin Import Status"]
  row["First Name"] = name[0] && name[0].slice(0, 39)
  row["Last Name"] = (name[1..-1] || []).join(" ").slice(0, 79)
  row["Email"] = email
  row["Candidate ID"] = input_row["Candidate ID"]
  row["LIN 1st Degree"] = input_row["LIN 1st Degree"]
  row["Title"] = title && title.slice(0, 127)
  row["Contact Country"] = country
  row["Contact LIN Sector"] = sector && sector.slice(0, 99)

  3.times do |index|
    position = positions[index]
    number = index + 1
    dates = position ? position.css(".date-range time") : []
    row["Employer #{number} Title"] = contact_node_text(position, ".item-title", 31_999)
    row["Employer Organization Name #{number}"] = contact_node_text(position, ".item-subtitle", 254)
    row["Employer #{number} Start Date"] = contact_date(dates[0])
    row["Employer #{number} End Date"] = contact_date(dates[1])
    row["Employer #{number} Location"] = contact_node_text(position, ".location", 254)
    row["Employer #{number} Description"] = contact_node_text(position, ".description", 31_999)
  end

  row["License or Certification Name 1"] = contact_node_text(certs[0], ".item-title", 254)
  row["License or Certification Name 2"] = contact_node_text(certs[1], ".item-title", 254)
  row["License or Certification Credential Type"] = contact_node_text(certs[0], ".item-subtitle", 254)

  2.times do |index|
    school = schools[index]
    number = index + 1
    dates = school ? school.css(".date-range time") : []
    row["Education School #{number}"] = contact_node_text(school, ".item-title", 124)
    row["Education Degree Name #{number}"] = contact_node_text(school, ".item-subtitle", 254)
    row["Education Degree Date #{number}"] = contact_date(dates[1])
  end

  header = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n#{name.join(" ")}\n#{email}"
  header += "\nTitle: #{title}" if title
  header += "\nLocation: #{country}" if country
  header += "\nSector: #{sector}" if sector

  # The resume is built only after the employer columns above were read, so
  # those columns keep the description text without the <br> substitution.
  resume = contact_text_resume(page, header, positions, schools, certs)
  row["Text Resume"] = resume.slice(0, 31_999)
  row["LinkedIn Profile"] = lin_profile && lin_profile.slice(0, 254)
  now = Time.now
  row["Resume Last Updated"] = now.strftime('%Y-%m-%d %H:%M:%S')
  row["LIN Import Date"] = now.strftime('%Y-%m-%d')
  row["CV Uploaded"] = "1"

  row
end

# Text of the first node under +parent+ matching +selector+, optionally cut to
# +limit+ characters; nil when +parent+ is nil or nothing matches.
def contact_node_text(parent, selector, limit = nil)
  found = parent && parent.at_css(selector)
  return nil unless found
  limit ? found.text.slice(0, limit) : found.text
end

# Passes a date node's text through format_date; nil when the node is absent.
# NOTE(review): format_date is defined outside this block and is only called
# with a String here -- confirm that skipping it for missing dates is wanted.
def contact_date(node)
  node && format_date(node.text)
end

# Assembles the plain-text resume: +header+, the summary, then one section per
# profile area in a fixed order.
def contact_text_resume(page, header, positions, schools, certs)
  title = [".item-title", "\n%s\n"]
  subtitle = [".item-subtitle", "%s\n"]
  dashed_subtitle = [".item-subtitle", " - %s\n", :skip_blank]
  dates = [".date-range", "%s\n"]
  description = [".description", "%s\n", :with_breaks]
  contributors = [".contributors", "%s\n"]

  parts = [header]
  summary = page.at_css("#summary .description")
  if summary
    summary.css('br').each { |br| br.replace "\n" }
    parts << "\n\nSUMMARY\n#{summary.text}"
  end
  parts << contact_resume_entries(positions, "EXPERIENCE",
                                  [title, dashed_subtitle, dates, [".location", "%s\n"], description])
  parts << contact_resume_entries(schools, "EDUCATION",
                                  [title, dashed_subtitle, dates, description])
  parts << contact_resume_entries(certs, "CERTIFICATIONS", [title, subtitle, dates])
  # The interests heading has a single leading newline, unlike every other one.
  parts << contact_pill_section(page.css("#interests .pills .interest"), "\nINTERESTS\n")
  parts << contact_pill_section(page.css("#skills .pills .skill"), "\n\nSKILLS\n")
  parts << contact_language_section(page.css("#languages .language"))
  # Project contributors are followed by "\n " (newline plus space); kept as is
  # so existing resume output does not change.
  parts << contact_resume_entries(page.css("#projects .project"), "PROJECTS",
                                  [title, dates, description, [".contributors", "%s\n "]])
  parts << contact_resume_entries(page.css("#publications .publication"), "PUBLICATIONS",
                                  [title, subtitle, dates, description, contributors])
  parts << contact_resume_entries(page.css("#volunteering .position"), "VOLUNTEERING",
                                  [title, subtitle, dates, [".cause", "Cause: %s\n"], description])
  parts << contact_resume_entries(page.css("#organizations li"), "ORGANIZATIONS",
                                  [title, subtitle, dates, description])
  parts << contact_resume_entries(page.css("#patents .patent"), "PATENTS",
                                  [title, subtitle, dates, description, contributors])
  parts << contact_resume_entries(page.css("#awards .award"), "AWARDS",
                                  [title, subtitle, dates, description])
  parts << contact_resume_entries(page.css("#courses li"), "COURSES",
                                  [title, [".courses-list", "%s\n"]])
  parts.join
end

# Renders one resume section. Each field is [selector, template, option]:
# :with_breaks turns <br> into newlines first, :skip_blank drops empty text.
# Returns "" when +nodes+ is empty.
def contact_resume_entries(nodes, heading, fields)
  return "" if nodes.empty?
  out = ["\n\n#{heading}\n"]
  nodes.each do |node|
    fields.each do |selector, template, option|
      found = node.at_css(selector)
      next unless found
      found.css('br').each { |br| br.replace "\n" } if option == :with_breaks
      text = found.text
      next if option == :skip_blank && text.empty?
      out << format(template, text)
    end
  end
  out.join
end

# Renders a comma-separated pill list (interests, skills), dropping the
# "See less" / "See N+" toggle pills. The trailing blank line is always emitted.
def contact_pill_section(pills, heading)
  labels = pills.map { |pill| contact_node_text(pill, ".wrap") }.compact
  labels = labels.reject { |label| label == "See less" || label =~ /See \d+\+/ }
  "#{heading unless pills.empty?}#{labels.join(", ")}\n\n"
end

# Renders the languages list as "Name (proficiency), ..."; the trailing blank
# line is always emitted.
def contact_language_section(languages)
  labels = languages.map do |language|
    label = contact_node_text(language, ".name")
    next unless label
    proficiency = contact_node_text(language, ".proficiency")
    proficiency && !proficiency.empty? ? "#{label} (#{proficiency})" : label
  end.compact
  "#{"\n\nLANGUAGES\n" unless languages.empty?}#{labels.join(", ")}\n\n"
end

private :contact_node_text, :contact_date, :contact_text_resume,
        :contact_resume_entries, :contact_pill_section, :contact_language_section

#scrape_education(input_row, page) ⇒ Object



280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# File 'lib/linsc/parsers.rb', line 280

# Builds one CSV row per school listed in the profile's education section.
#
# Missing elements no longer raise: a school without a title, subtitle or any
# date gets nil in the corresponding column.
#
# @param input_row [#[]] source row; "Contact ID" and "LIN ID" are copied over
# @param page [#css] parsed profile document
# @return [Array<CSV::Row>] rows built on @education_headers, in page order
def scrape_education(input_row, page)
  clipped = lambda { |node, limit| node && node.text.slice(0, limit) }

  page.css("#education .schools .school").map do |school|
    row = CSV::Row.new(@education_headers, [])
    row["Contact ID"] = input_row["Contact ID"]
    row["LIN ID"] = input_row["LIN ID"]
    row["School Name"] = clipped.call(school.at_css(".item-title"), 149)
    row["Major"] = clipped.call(school.at_css(".item-subtitle"), 254)
    dates = school.css(".date-range time")
    # Prefer the end date; fall back to the start date when only one is given.
    year_node = dates[1] || dates[0]
    row["Graduation Year"] = year_node && year_node.text.gsub(/\D/, '').slice(0, 74)
    row
  end
end

#scrape_employment(input_row, page) ⇒ Object



302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
# File 'lib/linsc/parsers.rb', line 302

# Builds one CSV row per position listed in the profile's experience section.
#
# Missing elements no longer raise: a position without an end date (or without
# a start date, location, title or employer) gets nil in that column.
#
# @param input_row [#[]] source row; "Contact ID" and "LIN ID" are copied over
# @param page [#css] parsed profile document
# @return [Array<CSV::Row>] rows built on @employment_headers, in page order
def scrape_employment(input_row, page)
  clipped = lambda { |node, limit| node && node.text.slice(0, limit) }
  # NOTE(review): format_date is defined outside this block and is only called
  # with a String here -- confirm that skipping it for missing dates is wanted.
  dated = lambda { |node| node && format_date(node.text) }

  page.css("#experience .positions .position").map do |position|
    row = CSV::Row.new(@employment_headers, [])
    row["Contact ID"] = input_row["Contact ID"]
    row["LIN ID"] = input_row["LIN ID"]
    row["Job Title"] = clipped.call(position.at_css(".item-title"), 74)
    row["Employer Name"] = clipped.call(position.at_css(".item-subtitle"), 149)
    dates = position.css(".date-range time")
    row["Start Date"] = dated.call(dates[0])
    row["End Date"] = dated.call(dates[1])
    row["Location"] = clipped.call(position.at_css(".location"), 254)
    row
  end
end