Class: PubMed::Article

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/sources/pubmed.rb

Overview

Processes the XML of an article as served by MEDLINE and extracts the abstract, title and journal information.

Constant Summary collapse

# Pairs of [attribute name, path] used by parse_xml: each path is searched
# under the record's <Article> node and the matching node contents become
# the value of the attribute with that name.
XML_KEYS = [
  [:title,    "ArticleTitle"],
  [:journal,  "Journal/Title"],
  [:issue,    "Journal/JournalIssue/Issue"],
  [:volume,   "Journal/JournalIssue/Volume"],
  [:issn,     "Journal/ISSN"],
  [:year,     "Journal/JournalIssue/PubDate/Year"],
  [:month,    "Journal/JournalIssue/PubDate/Month"],
  [:pages,    "Pagination/MedlinePgn"],
  [:author,   "AuthorList/Author"],
  [:abstract, "Abstract/AbstractText"],
]

# URL template for an article's PDF on PubMed Central; parse_xml replaces the
# literal "PMCID" segment with the record's PMC identifier.
PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(xml) ⇒ Article

Returns a new instance of Article.



108
109
110
111
112
113
114
115
# File 'lib/rbbt/sources/pubmed.rb', line 108

# Builds an article from a PubMed XML record.
#
# A nil (or false) or empty +xml+ leaves every attribute unset. Otherwise the
# record is parsed with PubMed::Article.parse_xml and each key of the
# resulting hash is assigned through the writer method of the same name.
#
# @param xml [String, nil] XML of a single article record
def initialize(xml)
  return unless xml && !xml.empty?

  PubMed::Article.parse_xml(xml).each do |field, value|
    send("#{ field }=", value)
  end
end

Instance Attribute Details

#abstractObject

Returns the value of attribute abstract.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Abstract text. parse_xml fills it from the "Abstract/AbstractText" nodes,
# joining the content of multiple nodes with a blank line.
def abstract
  @abstract
end

#authorObject

Returns the value of attribute author.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Author list as a single string. parse_xml builds it from the
# "AuthorList/Author" nodes as "LastName, ForeName" entries joined by " and ".
def author
  @author
end

#bibentryObject

Returns the value of attribute bibentry.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Citation key for the article. parse_xml derives it with make_bibentry from
# an author's last name, the year and the title, then lowercases it; #bibtex
# uses it as the key of the @article entry.
def bibentry
  @bibentry
end

#gscholar_pdfObject

Returns the value of attribute gscholar_pdf.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Plain reader for @gscholar_pdf. Nothing in the code shown here assigns it.
# NOTE(review): presumably a PDF link located through Google Scholar --
# confirm against the callers that set it.
def gscholar_pdf
  @gscholar_pdf
end

#journalObject

Returns the value of attribute journal.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Journal name. parse_xml fills it from the "Journal/Title" node.
def journal
  @journal
end

#pdf_urlObject

Returns the value of attribute pdf_url.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# URL of the article's PDF. #full_text downloads from it and #bibtex emits it
# as the "fulltext" field when it is set. Nothing in the code shown here
# assigns it, so it is nil unless a caller sets it.
def pdf_url
  @pdf_url
end

#pmc_pdfObject

Returns the value of attribute pmc_pdf.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# PubMed Central PDF URL. parse_xml builds it from PMC_PDF_URL when the record
# carries an ArticleId whose IdType is "pmc"; otherwise it is nil.
def pmc_pdf
  @pmc_pdf
end

#pmidObject

Returns the value of attribute pmid.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# PubMed identifier. parse_xml reads it from the first "PMID" node found under
# the record's MedlineCitation node.
def pmid
  @pmid
end

#titleObject

Returns the value of attribute title.



105
106
107
# File 'lib/rbbt/sources/pubmed.rb', line 105

# Article title. parse_xml fills it from the "ArticleTitle" node.
def title
  @title
end

Class Method Details

.escape_title(title) ⇒ Object



41
42
43
# File 'lib/rbbt/sources/pubmed.rb', line 41

# Wraps in braces every word of +title+ that contains at least two
# consecutive capital letters (e.g. "DNA", "mRNA"), leaving the rest of the
# title untouched.
#
# @param title [String] the title to escape
# @return [String] a new string with the matching words wrapped in { }
def self.escape_title(title)
  capitalized_word = /(\w*[A-Z][A-Z]+\w*)/
  braced           = '{\1}'

  title.gsub(capitalized_word, braced)
end

.make_bibentry(lastname, year, title) ⇒ Object



45
46
47
48
49
50
51
52
53
# File 'lib/rbbt/sources/pubmed.rb', line 45

# Builds a citation key by concatenating the last name, the year and a short
# abbreviation of the title.
#
# The abbreviation is the first word of the lowercased title when that word
# has more than three characters; otherwise it is made of the initials of the
# first three words (fewer if the title is shorter). A nil title, or one with
# no word characters, contributes an empty abbreviation instead of raising.
#
# @param lastname [String] author last name; whitespace becomes "_"
# @param year [String, Integer, nil] publication year; "NOYEAR" when nil
# @param title [String, nil] article title
# @return [String] the citation key (case of +lastname+ is preserved)
def self.make_bibentry(lastname, year, title)
  words = title.to_s.downcase.scan(/\w+/)

  abrev = if words.empty?
            # No usable title: the key is just the last name and the year.
            ""
          elsif words.first.length > 3
            words.first
          else
            words[0..2].collect { |w| w.chars.first } * ""
          end

  [lastname.gsub(/\s/, '_'), year || "NOYEAR", abrev] * ""
end

.parse_xml(xml) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/rbbt/sources/pubmed.rb', line 54

# Parses the XML of one article record and returns its fields as a Hash.
#
# The hash holds :pmid, one entry per XML_KEYS pair whose path matched under
# the <Article> node (several matches are joined by a blank line), :author
# (entries joined by " and "), :bibentry (only when a key could be built) and
# :pmc_pdf (nil unless the record has an ArticleId of type "pmc").
#
# NOTE(review): a record without MedlineCitation, Article or PMID nodes is not
# handled here; the corresponding `.first` is expected to be nil and the next
# call on it to raise NoMethodError -- confirm whether callers rely on that.
#
# @param xml [String] XML of a single article record
# @return [Hash{Symbol => String, nil}]
def self.parse_xml(xml)
  require 'nokogiri'

  parser  = Nokogiri.XML(xml)
  medline = parser.search("MedlineCitation").first
  article = medline.search("Article").first

  info = {}

  info[:pmid] = medline.search("PMID").first.content

  XML_KEYS.each do |name, key|
    nodes = article.search(key)

    next if nodes.nil? || nodes.empty?

    info[name] = nodes.collect { |n| n.content } * "\n\n"
  end

  # make_bibentry needs at least one word character in the title; without it
  # no citation key is produced.
  title_usable = info[:title].to_s =~ /\w/

  bibentry = nil
  authors = article.search("AuthorList/Author").collect do |author|
    lastname_node = author.search("LastName").first
    forename_node = author.search("ForeName").first

    if lastname_node
      lastname = lastname_node.content
      # Single-letter forename parts are initials and get a trailing period.
      forename = forename_node && forename_node.content.split(/\s/).collect { |word| word.length == 1 ? word + '.' : word } * " "

      # The key comes from the first author that has a last name.
      bibentry ||= make_bibentry(lastname, info[:year], info[:title]) if title_usable

      # Drop missing parts so no dangling ", " is emitted.
      [lastname, forename].reject { |part| part.nil? || part.empty? } * ", "
    else
      # Authors without a LastName (group authors are expected to use
      # CollectiveName) used to produce a bare ", " entry.
      collective = author.search("CollectiveName").first
      collective && collective.content
    end
  end

  # Overrides the raw node text stored for :author by the XML_KEYS loop.
  info[:author] = authors.reject { |entry| entry.nil? || entry.empty? } * " and "

  info[:bibentry] = bibentry.downcase if bibentry

  pmc_id = parser.search("PubmedData/ArticleIdList/ArticleId").find { |id| id[:IdType] == "pmc" }
  info[:pmc_pdf] = pmc_id && PMC_PDF_URL.sub(/PMCID/) { pmc_id.content }

  info
end

Instance Method Details

#bibtexObject



157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/rbbt/sources/pubmed.rb', line 157

# Renders the article as a BibTeX @article entry keyed by #bibentry.
#
# One field is emitted per non-nil attribute named in XML_KEYS, with :author
# first. The title goes through PubMed::Article.escape_title and :issue is
# emitted as "number". A "fulltext" field is added when pdf_url is set, and
# the entry always ends with the "pmid" field.
#
# @return [String] the BibTeX entry
def bibtex
  # XML_KEYS already lists :author; uniq keeps it from being emitted twice
  # while still placing it first.
  keys = ([:author] + XML_KEYS.collect { |pair| pair.first }).uniq - [:bibentry]

  lines = ["@article{#{bibentry},\n"]

  keys.each do |key|
    value = send(key)
    next if value.nil?

    lines << case key
             when :title
               "  title = { #{ PubMed::Article.escape_title value } },\n"
             when :issue
               "  number = { #{ value } },\n"
             else
               "  #{ key } = { #{ value } },\n"
             end
  end

  lines << "  fulltext = { #{ pdf_url } },\n" if pdf_url
  lines << "  pmid = { #{ pmid } }\n}"

  lines.join
end

#full_textObject



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/rbbt/sources/pubmed.rb', line 136

# Returns the full text of the article after passing it through
# Misc.fixutf8.
#
# When pdf_url is set, the PDF is downloaded with wget into a temporary file
# and converted with pdftotext; the result is the converted text, or nil if
# no text file was produced. Without a pdf_url the result of #pmc_full_xml is
# used instead. Misc.fixutf8 receives nil when neither source yields text.
def full_text
  require 'shellwords'

  text = if pdf_url
           extracted = nil
           TmpFile.with_file do |pdf|
             # Change user-agent, oh well...
             # Arguments are shell-escaped: pdf_url is externally supplied.
             `wget --user-agent=firefox #{ Shellwords.escape(pdf_url.to_s) } -O #{ Shellwords.escape(pdf.to_s) } -t 3`
             TmpFile.with_file do |txt|
               `pdftotext #{ Shellwords.escape(pdf.to_s) } #{ Shellwords.escape(txt.to_s) }`
               # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
               extracted = Open.read(txt) if File.exist? txt
             end
           end
           extracted
         else
           # Fetch once; the original called pmc_full_xml twice.
           pmc_full_xml || nil
         end

  Misc.fixutf8(text)
end

#pmc_full_xmlObject



117
118
119
120
121
122
123
# File 'lib/rbbt/sources/pubmed.rb', line 117

# Fetches the article from the NCBI eutils efetch service (db=pmc) with
# Open.read and returns what it read, or nil if any StandardError is raised
# along the way.
#
# NOTE(review): the request passes pmid as the `id` of a db=pmc query --
# confirm that the service accepts a PubMed id there.
def pmc_full_xml
  url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=#{pmid}"
  Open.read(url)
rescue
  nil
end

#textObject

Joins the title and the abstract into a single text.



186
187
188
189
190
# File 'lib/rbbt/sources/pubmed.rb', line 186

# Returns the title and the abstract separated by a newline, passed through
# Misc.fixutf8. A nil title or abstract contributes an empty string.
def text
  parts = [title, abstract]

  Misc.fixutf8(parts * "\n")
end