Method: Extcite.extract_from_metadata
- Defined in:
- lib/extcite.rb
.extract_from_metadata(path:) ⇒ Object
Try to extract DOIs from one or more PDF metadata sections
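Returns: the DOI found for the first PDF given — a DOI string when it is read from the PDF metadata; otherwise whatever the text-based fallback search (Extcite.get_ids) returns.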
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
# File 'lib/extcite.rb', line 137

# Try to extract a DOI from the metadata of a PDF, falling back to a
# regex-based search of the PDF's text when the metadata has none.
#
# The metadata is parsed as XML and searched, in order, for:
#   1. a `dc:identifier` attribute on an `rdf:Description` element
#      (the first "doi:" occurrence is stripped from its value),
#   2. a single `prism:doi` node,
#   3. a single `pdf:WPS-ARTICLEDOI` node,
#   4. a single `pdfx:WPS-ARTICLEDOI` node.
#
# NOTE(review): the method returns from inside the loop, so only the first
# path yielded by `make_paths` is ever processed. That behavior is kept
# as-is here because callers may depend on the scalar return value.
#
# @param path [Object] passed to `make_paths` (defined elsewhere in this
#   module); the result is iterated with `each` and each element is handed
#   to `PDF::Reader.new`
# @return [Object] the DOI string taken from the metadata, or otherwise the
#   result of `Extcite.get_ids` on the extracted text; when `make_paths`
#   yields nothing, the (empty) collection itself is returned by `each`
def self.extract_from_metadata(path:)
  path = make_paths(path)

  # Node queries tried in order once the dc:identifier attribute is absent.
  doi_node_queries = [
    '//rdf:Description//prism:doi',
    '//rdf:Description//pdf:WPS-ARTICLEDOI',
    '//rdf:Description//pdfx:WPS-ARTICLEDOI'
  ]

  path.each do |x|
    # try PDF metadata first
    ids = nil
    # presumably the XMP packet as an XML string (PDF::Reader#metadata);
    # nil when the PDF carries no metadata stream -- TODO confirm
    raw_metadata = PDF::Reader.new(x).metadata

    unless raw_metadata.nil?
      xml =
        begin
          Oga.parse_xml(raw_metadata)
        rescue StandardError
          # Unparseable metadata is not fatal; fall through to the regex
          # search below. StandardError (not Exception) so that interrupts
          # and exit requests still propagate.
          nil
        end

      unless xml.nil?
        begin
          # try dc:identifier attribute
          identifier = xml.xpath('//rdf:Description').attr('dc:identifier')[0]
          if identifier.nil?
            # try the prism:doi / WPS-ARTICLEDOI nodes; a query only counts
            # when it matches exactly one node
            doi_node_queries.each do |query|
              nodes = xml.xpath(query)
              next unless nodes.length == 1

              ids = nodes.text
              break
            end
          else
            ids = identifier.text.sub(/doi:/, '')
          end
        rescue StandardError
          # Any failure while querying the metadata means "not found".
          ids = nil
        end
      end
    end

    # if not found, try regexing for DOI
    ids = Extcite.get_ids(txt: Extcite.extract_text_one(x)) if ids.nil?

    return ids
  end
end