Class: Rubyscholar::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/rubyscholar.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, crossRefEmail = "") ⇒ Parser

Returns a new instance of Parser.



21
22
23
24
25
# File 'lib/rubyscholar.rb', line 21

# Builds a parser and immediately scrapes the page found at +url+.
#
# url           - address handed straight to #parse.
# crossRefEmail - e-mail address kept for the DOI lookups; getDoi returns
#                 no DOI when this is nil.
def initialize(url, crossRefEmail = "")
  @crossRefEmail = crossRefEmail
  @parsedPapers  = []
  parse(url)
end

Instance Attribute Details

#crossRefEmail ⇒ Object

Returns the value of attribute crossRefEmail.



19
20
21
# File 'lib/rubyscholar.rb', line 19

# Reader for the CrossRef e-mail address stored by the constructor.
def crossRefEmail
  @crossRefEmail
end

#parsedPapers ⇒ Object

Returns the value of attribute parsedPapers.



19
20
21
# File 'lib/rubyscholar.rb', line 19

# Reader for the array of papers that #parse appends to.
def parsedPapers
  @parsedPapers
end

Instance Method Details

#getDoi(lastNameFirstAuthor, title, crossRefEmail) ⇒ Object

Scholar doesn’t provide DOIs. But if you are registered at CrossRef (it’s free), the DOI can be retrieved.



59
60
61
62
63
64
65
66
67
68
69
# File 'lib/rubyscholar.rb', line 59

# Looks up the DOI of a paper through the CrossRef OpenURL service
# (Scholar itself does not provide DOIs).
#
# lastNameFirstAuthor - last name of the first author, sent as +aulast+.
# title               - title of the paper, sent as +atitle+.
# crossRefEmail       - e-mail address sent as +pid+; when it is nil or
#                       empty no request is made at all.
#
# Returns the DOI as a String, or '' when the lookup is skipped or the
# response carries no <doi> content. Errors raised while fetching the URL
# are not rescued here and propagate to the caller.
def getDoi(lastNameFirstAuthor, title, crossRefEmail)
  # Decide on the address actually passed in (not on the instance variable),
  # so a nil argument can never reach the query building below.
  return '' if crossRefEmail.to_s.empty?

  sleep(1) # pause between requests to reduce risk (presumably of being throttled)
  STDERR << "Getting DOI for paper by #{lastNameFirstAuthor}: #{title}.\n"

  # encode_www_form escapes every value, including '&', '=' and '#' that
  # may occur inside a title and would otherwise corrupt the query string.
  query = URI.encode_www_form([
    ['redirect', 'false'],
    ['pid',      crossRefEmail],
    ['aulast',   lastNameFirstAuthor],
    ['atitle',   title]
  ])
  url = "http://www.crossref.org/openurl?#{query}"

  # Open through the parsed URI (open-uri) instead of Kernel#open; the block
  # form closes the response once the XML has been read.
  crossRefXML = URI.parse(url).open { |response| Nokogiri::XML(response) }

  # An explicit nil check replaces the blanket inline rescue.
  doiNode = crossRefXML.search("doi").children.first
  doiNode ? doiNode.content : ''
end

#parse(url) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/rubyscholar.rb', line 27

# Scrapes the publication table at +url+ and appends one Paper per table
# row to @parsedPapers.
#
# url - String URL of the page to scrape. It is opened through open-uri via
#       the parsed URI, so it has to be a URL open-uri can fetch (a plain
#       local file path is not accepted).
#
# Progress is reported on STDOUT. Every row also triggers a getDoi lookup.
def parse(url)
  # Kernel#open would treat a string starting with "|" as a command to run
  # and no longer fetches URLs on Ruby >= 3.0; going through the parsed URI
  # avoids both. The block form closes the response after parsing.
  document = URI.parse(url).open { |page| Nokogiri::HTML(page) }
  papers   = document.css(".cit-table .item")
  STDOUT << "Found #{papers.length} papers.\n"

  papers.each do |paper|
    # Child nodes of the title cell. The indices below assume the layout
    # link(title) / separator / authors / separator / journal.
    cells     = paper.css("#col-title")[0].children
    title     = cells[0].content.clean
    googleUrl = cells[0].attribute('href')
    authors   = cells[2].content.clean
    authors.gsub!("...", "et al")

    # The journal name is the text before the first comma or digit; whatever
    # is left after removing it is kept as the journal details.
    journal        = cells[4].content
    journalName    = journal.split(/,|\d/).first.clean
    journalDetails = journal.gsub(journalName, '').clean

    year = paper.css("#col-year").text # is the last thing we get

    # citations: no link text means there is no citation URL either
    citeInfo      = paper.css(".cit-dark-link")
    citationCount = citeInfo.text
    citationUrl   = citationCount.empty? ? nil : citeInfo.attribute('href').to_s

    # get DOI: needs last name of first author, no funny chars
    firstAuthor         = authors.split(',').first
    lastNameFirstAuthor = firstAuthor.split(' ').last.gsub(/[^A-Za-z\-]/, '')
    doi                 = getDoi(lastNameFirstAuthor, title, @crossRefEmail)

    @parsedPapers.push(Paper.new(title, googleUrl, authors, journalName, journalDetails,
                                 year, citationCount, citationUrl, doi))
  end
  STDOUT << "Scraped #{parsedPapers.length} from Google Scholar.\n"
end