Class: Rubyscholar::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/rubyscholar-main.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, crossRefEmail = "") ⇒ Parser

Returns a new instance of Parser.



21
22
23
24
25
# File 'lib/rubyscholar-main.rb', line 21

# Builds a parser and immediately scrapes the given page.
#
# @param url [String] address handed to #parse
# @param crossRefEmail [String, nil] e-mail address kept for DOI lookups;
#   a nil value makes #getDoi skip the lookup
def initialize(url, crossRefEmail = "")
  # Stored before parsing because #parse reads it for every paper
  @crossRefEmail = crossRefEmail
  # Filled by #parse, one entry per scraped paper
  @parsedPapers  = []
  parse(url)
end

Instance Attribute Details

#crossRefEmail ⇒ Object

Returns the value of attribute crossRefEmail.



19
20
21
# File 'lib/rubyscholar-main.rb', line 19

# Reader for the value stored in @crossRefEmail.
#
# @return [Object] the current value of @crossRefEmail
def crossRefEmail
  @crossRefEmail
end

#parsedPapers ⇒ Object

Returns the value of attribute parsedPapers.



19
20
21
# File 'lib/rubyscholar-main.rb', line 19

# Reader for the value stored in @parsedPapers.
#
# @return [Object] the current value of @parsedPapers
def parsedPapers
  @parsedPapers
end

Instance Method Details

#getDoi(lastNameFirstAuthor, title, crossRefEmail) ⇒ Object

Scholar doesn’t provide DOIs. But if you are registered at CrossRef (it’s free), the DOI can be retrieved.



64
65
66
67
68
69
70
71
72
73
74
# File 'lib/rubyscholar-main.rb', line 64

# Looks up a paper's DOI through the CrossRef OpenURL endpoint.
#
# The first author's last name and the title are sent as query parameters,
# together with the e-mail address as the `pid` parameter.
#
# Errors raised while fetching the URL are not rescued here.
#
# @param lastNameFirstAuthor [String] last name of the paper's first author
# @param title [String] title of the paper
# @param crossRefEmail [String, nil] e-mail address sent as `pid`
# @return [String] content of the first child of the `doi` element(s) in the
#   response, or '' when no e-mail is available or no such node exists
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
  # Skip the lookup when either the instance or the caller has no e-mail;
  # building the URL from nil would raise and an empty pid is a wasted request.
  return '' if @crossRefEmail.nil? || crossRefEmail.to_s.empty?

  sleep(1) # pause before each request (presumably to avoid hammering CrossRef)
  STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"

  # URI.escape was removed in Ruby 3.0 and never escaped '&', '=' or '+'.
  # Form-encode each value instead, keeping spaces as %20 as before.
  encode = lambda { |value| URI.encode_www_form_component(value.to_s).gsub('+', '%20') }
  url = 'http://www.crossref.org/openurl?redirect=false' +
    '&pid='    + encode.call(crossRefEmail) +
    '&aulast=' + encode.call(lastNameFirstAuthor) +
    '&atitle=' + encode.call(title)

  # URI.open (open-uri) replaces Kernel#open for URLs; the block form closes
  # the response handle once the document has been parsed.
  crossRefXML = URI.open(url) { |io| Nokogiri::XML(io) }

  doiNode = crossRefXML.search('doi').children.first
  doiNode ? doiNode.content : ''
end

#parse(url) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/rubyscholar-main.rb', line 27

# Scrapes a Google Scholar page and appends one Paper per result row
# (CSS class `gsc_a_tr`) to @parsedPapers.
#
# Field extraction is best-effort: any error while reading a field from the
# row falls back to an empty string, so a malformed row still yields a Paper.
# Errors raised while fetching the page are not rescued here.
#
# NOTE(review): String#clean is not defined in this block; presumably a
# whitespace-normalising helper defined elsewhere in the file — confirm.
#
# @param url [String] address of the page to scrape
# @return [IO] STDERR (the result of the final log statement)
def parse(url)
  STDERR << "Will check #{url}.\n"
  # URI.open (open-uri) replaces Kernel#open for URLs; the block form closes
  # the response handle once the document has been parsed.
  page = URI.open(url,
                  'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.121 Safari/535.2') do |io|
    Nokogiri::HTML(io, nil, 'utf-8')
  end
  papers = page.css(".gsc_a_tr")
  STDERR << "Found #{papers.length} papers.\n"
  papers.each do |paper|
    title          = paper.css(".gsc_a_at").text rescue ''
    title.gsub!(/\.$/, '') # drop a trailing full stop

    # The first cell of the row holds, in order: title link, authors, journal line
    googleUrl      = paper.children[0].children[0].attribute('href').text rescue ''
    authors        = paper.children[0].children[1].text.clean rescue ''
    authors.gsub!("...", "et al")

    # Journal line: name up to the first comma or digit, then details, then ", <year>"
    journal        = paper.children[0].children[2].text rescue ''
    journalName    = journal.split(/,|\d/).first.clean  rescue ''
    journalDetails = journal.gsub(journalName, '').clean
    year           = journalDetails.match(/, \d+$/)[0]  rescue ''
    journalDetails = journalDetails.gsub(year, '').clean
    year           = year.clean

    #citations
    citeInfo      = paper.css('.gsc_a_ac')
    citationCount = citeInfo.text
    citationUrl   = citationCount.empty?  ? nil : citeInfo.attribute('href').to_s

    # get DOI: needs last name of first author, no funny chars.
    # to_s guards against an empty author list, where split(...).first is nil.
    firstAuthor         = authors.split(',').first.to_s
    lastNameFirstAuthor = firstAuthor.split(' ').last.to_s.gsub(/[^A-Za-z\-]/, '')
    doi                 = getDoi( lastNameFirstAuthor, title, @crossRefEmail)

    @parsedPapers.push(Paper.new( title, googleUrl, authors, journalName, journalDetails, year, citationCount, citationUrl, doi))
  end
  STDERR << "Scraped #{parsedPapers.length} from Google Scholar.\n"
end