Method: Extcite.extract_from_metadata

Defined in:
lib/extcite.rb

.extract_from_metadata(path:) ⇒ Object

Try to extract DOIs from one or more PDF metadata sections

Return: DOI string

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format
path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get;
f = File.new(path, "wb");
f.write(res.body)
f.close()
# extract doi from the pdf
Extcite.extract_from_metadata(path: path)

Parameters:

  • path (String)

    Path to a pdf file, or a folder of PDF files



137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/extcite.rb', line 137

# Try to extract a DOI from a PDF's embedded XML metadata, falling back to a
# search over the PDF's extracted text when the metadata yields nothing.
#
# Lookup order inside the parsed metadata:
#   1. the dc:identifier attribute on rdf:Description (first "doi:"
#      occurrence stripped)
#   2. a single prism:doi node
#   3. a single pdf:WPS-ARTICLEDOI node
#   4. a single pdfx:WPS-ARTICLEDOI node
#
# NOTE: the method returns from inside the loop, so only the first path
# produced by +make_paths+ is examined. When +make_paths+ yields nothing,
# the result of calling +each+ on its return value is returned instead.
#
# @param path [String] path to a PDF file, or a folder of PDF files
# @return [Object] the DOI taken from the metadata, or whatever
#   +Extcite.get_ids+ returns for the file's extracted text
def self.extract_from_metadata(path:)
  make_paths(path).each do |file|
    ids = nil

    # try PDF metadata first
    # NOTE(review): the reader method name was missing from the listing;
    # PDF::Reader#metadata is presumed -- confirm against lib/extcite.rb.
    pdfmeta = PDF::Reader.new(file).metadata

    unless pdfmeta.nil?
      # Unparseable metadata is not fatal: fall through to the text search.
      # StandardError (not Exception) so signals and exits still propagate.
      xml = begin
        Oga.parse_xml(pdfmeta)
      rescue StandardError
        nil
      end

      unless xml.nil?
        begin
          identifier = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
          if identifier.nil?
            # Fallback nodes, tried in order; a tag only counts when it
            # occurs exactly once.
            %w[prism:doi pdf:WPS-ARTICLEDOI pdfx:WPS-ARTICLEDOI].each do |tag|
              nodes = xml.xpath("//rdf:Description//#{tag}")
              next unless nodes.length == 1

              ids = nodes.text
              break
            end
          else
            ids = identifier.text.sub(/doi:/, '')
          end
        rescue StandardError
          # Any failure while querying the metadata means "not found".
          ids = nil
        end
      end
    end

    # if not found, try regexing for DOI
    ids = Extcite.get_ids(txt: Extcite.extract_text_one(file)) if ids.nil?

    return ids
  end
end