Class: ScientificNameParser

Inherits:
Object
  • Object
show all
Defined in:
lib/biodiversity/parser.rb

Overview

We can use these expressions when we are ready to parse virus names:

class VirusParser

# Draft constructor quoted from the overview comment (not active code): it
# stores one regular expression per instance variable named after a taxonomic
# rank, and starts with no parse result.
#
# NOTE(review): in every pattern below `\+` is an escaped, literal plus sign,
# not the "one or more" quantifier, so e.g. @order only matches text shaped
# like "Ab+virales". Presumably a bare `+` was intended; confirm before
# enabling this code.
# NOTE(review): `|` has the lowest precedence, so in @family, @subfamily and
# @genus the second alternative (e.g. `viroidae`) is not anchored by `^\s*`
# and matches anywhere in the string.
def initialize
  @order     = /^\s*[A-Z][a-z]\+virales/i
  @family    = /^\s*[A-Z][a-z]\+viridae|viroidae/i
  @subfamily = /^\s*[A-Z][a-z]\+virinae|viroinae/i
  @genus     = /^\s*[A-Z][a-z]\+virus|viroid/i
  # NOTE(review): `u0391-u03C9` has no backslashes, so it is read as literal
  # characters plus the range `1-u`, not as a Unicode range; presumably
  # \u0391-\u03C9 was meant. `A-z` also covers the ASCII punctuation that sits
  # between "Z" and "a". The pattern is written in extended (/x) mode, which
  # is why it can span two lines.
  @species   = /^\s*[A-z0-9u0391-u03C9\[\] ]\+virus|phage|
                viroid|satellite|prion[A-z0-9u0391-u03C9\[\] ]\+/ix
  # No parse result yet.
  @parsed    = nil
end

end

Constant Summary

# Lambda that builds the fallback result for a name whose parsing raised an
# error: a hash under :scientificName carrying the UUID of the name, the
# verbatim string, parsed: false and a fixed error message.
FAILED_RESULT =
lambda do |name|
  failure = {
    id: GnUUID.uuid(name),
    parsed: false,
    verbatim: name,
    error: "Parser internal error"
  }
  { scientificName: failure }
end

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ ScientificNameParser

Returns a new instance of ScientificNameParser



172
173
174
175
176
177
178
179
180
# File 'lib/biodiversity/parser.rb', line 172

# Creates a parser with its three grammar parsers and empty parse state.
#
# @param opts [Hash] options; only :canonical_with_rank is read here, and it
#   is coerced to a strict true/false
def initialize(opts = {})
  @canonical_with_rank = opts[:canonical_with_rank] ? true : false

  # Grammar parsers, created in the same order as before.
  @clean = ScientificNameCleanParser.new
  @dirty = ScientificNameDirtyParser.new
  @canonical = ScientificNameCanonicalParser.new

  # Per-parse state; nothing has been parsed yet.
  @verbatim = ""
  @parsed = nil
  @tail = nil
end

Class Method Details

.add_rank_to_canonical(parsed) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/biodiversity/parser.rb', line 120

# Rewrites the canonical form of a parsed result so that every infraspecific
# epithet is preceded by its rank (unless the rank is missing or "n/a").
# Hybrids are returned untouched.
#
# @param parsed [Hash] result hash with a :scientificName entry; it is
#   modified in place
# @return [Hash] the same +parsed+ hash
def self.add_rank_to_canonical(parsed)
  sci_name = parsed[:scientificName]
  return parsed if sci_name[:hybrid]

  # Keep only the first two words of the existing canonical form.
  binomial = sci_name[:canonical].split(" ").first(2)
  ranked_epithets = sci_name[:details][0][:infraspecies].map do |entry|
    epithet = entry[:string]
    rank = entry[:rank]
    if rank && rank != "n/a"
      "#{rank} #{epithet}"
    else
      epithet
    end
  end
  sci_name[:canonical] = (binomial + ranked_epithets).join(" ")
  parsed
end

.all(opts = {}) ⇒ Object



231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/biodiversity/parser.rb', line 231

# Builds the full result hash for this parse outcome, wrapped under the
# :scientificName key.
#
# The receiver is either a parse node (anything that is not a Hash) or a plain
# Hash standing in for a name that was not parsed. For a node the hash is
# filled from its value, canonical, details, parser_run and pos methods; for a
# Hash the receiver's own entries are merged in.
#
# @param opts [Hash] options; :canonical_with_rank asks for ranks to be
#   inserted into the canonical form of names that have infraspecies
# @return [Hash] { scientificName: { id:, parsed:, parser_version:, ... } }
def @parsed.all(opts = {})
  canonical_with_rank = !!opts[:canonical_with_rank]
  # A Hash receiver means the name was not parsed.
  parsed = self.class != Hash
  res = { id: @id, parsed: parsed,
          parser_version: ScientificNameParser::version}

  if parsed
    # Not every node responds to #hybrid; treat a failure as "not a hybrid".
    hybrid = self.hybrid rescue false
    res.merge!({
      verbatim: @verbatim,
      normalized: self.value,
      canonical: self.canonical,
      hybrid: hybrid,
      details: self.details,
      parser_run: self.parser_run,
      positions: self.pos
      })
  else
    res.merge!(self)
  end
  res[:surrogate] = true if ScientificNameParser.surrogate?(res)
  res = {:scientificName => res}
  # Only a parsed node responds to #canonical and carries :details. Without
  # the `parsed` guard an unparsed (Hash) receiver raised NameError here
  # whenever canonical_with_rank was requested.
  if (canonical_with_rank && parsed &&
      canonical.count(" ") > 1 &&
      res[:scientificName][:details][0][:infraspecies])
    ScientificNameParser.add_rank_to_canonical(res)
  end
  res
end

.all_json ⇒ Object



265
266
267
# File 'lib/biodiversity/parser.rb', line 265

# Serialises the result of #all with #to_json.
#
# @return [String] the JSON text, or "" when building or serialising the
#   result raises a StandardError
def @parsed.all_json
  all.to_json
rescue StandardError
  ""
end

.fix_case(name_string) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/biodiversity/parser.rb', line 138

# Normalises the letter case of a name string.
#
# A single word has its round and curly brackets removed and is capitalised.
# For several words the first is capitalised, the second is lower-cased (or,
# when it starts with "(", gets a closing ")" and a capitalised content), and
# all remaining words are lower-cased. Case mapping is delegated to
# UnicodeUtils.
#
# @param name_string [String] the name to normalise
# @return [String, nil] the normalised name; nil when the input has no words
#   or is a single word shorter than two characters after bracket removal
def self.fix_case(name_string)
  # Strip leading whitespace first: String#split with a Regexp would otherwise
  # produce an empty first field and shift every word by one position, leaving
  # the real first word uncapitalised.
  words = name_string.sub(/\A\s+/, "").split(/\s+/)
  # Empty or whitespace-only input used to crash on words[0]; there is simply
  # nothing to fix.
  return nil if words.empty?

  if words.size == 1
    bare = words[0].gsub(/[\(\)\{\}]/, "")
    return nil unless bare.size > 1
    return UnicodeUtils.upcase(bare[0]) + UnicodeUtils.downcase(bare[1..-1])
  end

  first_word, second_word, *rest = words

  # A one-character first word is kept exactly as given.
  word1 =
    if first_word.size > 1
      UnicodeUtils.upcase(first_word[0]) +
        UnicodeUtils.downcase(first_word[1..-1])
    else
      first_word
    end

  word2 =
    if second_word.match(/^\(/)
      # Parenthesised second word: make sure it is closed, then capitalise
      # the text after the opening parenthesis.
      closed = second_word.gsub(/\)$/, "") + ")"
      closed[0] + UnicodeUtils.upcase(closed[1]) +
        UnicodeUtils.downcase(closed[2..-1])
    else
      UnicodeUtils.downcase(second_word)
    end

  lowered_rest = rest.map { |w| UnicodeUtils.downcase(w) }
  [word1, word2, *lowered_rest].join(" ").strip
end

.pos_json ⇒ Object



261
262
263
# File 'lib/biodiversity/parser.rb', line 261

# Serialises the receiver's #pos value with #to_json.
#
# @return [String] the JSON text, or "" when reading or serialising the
#   positions raises a StandardError
def @parsed.pos_json
  pos.to_json
rescue StandardError
  ""
end

.verbatim=(a_string) ⇒ Object



226
227
228
229
# File 'lib/biodiversity/parser.rb', line 226

# Stores the verbatim name string on the receiver together with the
# identifier GnUUID derives from that same string.
#
# @param a_string [String] the name exactly as it was supplied
def @parsed.verbatim=(a_string)
  @verbatim = a_string
  # a_string is the value just stored in @verbatim, so the id matches it.
  @id = GnUUID.uuid(a_string)
end

.version ⇒ Object



134
135
136
# File 'lib/biodiversity/parser.rb', line 134

# Returns the value of the Biodiversity::VERSION constant; this is what the
# parser reports as :parser_version in its results.
def self.version
  Biodiversity::VERSION
end

Instance Method Details

#noparse?(a_string) ⇒ Boolean

Returns:

  • (Boolean)


191
192
193
194
195
196
# File 'lib/biodiversity/parser.rb', line 191

# Tells whether a name string should be skipped by the parser: it mentions
# "incertae sedis" (or the abbreviation "inc. sed.", both case-insensitive) or
# contains "RNA" preceded by a character that is not an upper-case letter.
#
# @param a_string [String] the name string to inspect
# @return [MatchData, nil] the first successful match, checked in the order
#   listed above; nil when none matches (so the value is truthy/falsy rather
#   than strictly true/false)
def noparse?(a_string)
  skip_patterns = [
    /incertae\s+sedis/i,
    /inc\.\s*sed\./i,
    /[^A-Z]RNA[^A-Z]*/
  ]
  skip_patterns.each do |pattern|
    found = a_string.match(pattern)
    return found if found
  end
  nil
end

#parse(a_string) ⇒ Object



202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/biodiversity/parser.rb', line 202

# Parses a scientific name string and returns the result as a nested hash.
#
# The string is first passed through PreProcessor::clean, which returns the
# string to parse and a tail. Names recognised by #virus? or #noparse? are not
# parsed and are represented by a plain Hash. Otherwise the clean grammar is
# tried, then the dirty one, then the dirty grammar again on the part of the
# string before the failure index, and finally the canonical grammar. Any
# error raised while parsing yields FAILED_RESULT.
#
# The outcome is kept in @parsed and gets the singleton helpers verbatim=,
# all, pos_json and all_json defined on it.
#
# @param a_string [String] the name to parse
# @return [Hash] { scientificName: { ... } }; :tail is added when the
#   preprocessor returned a non-empty tail
def parse(a_string)
  @verbatim = a_string
  a_string, @tail = PreProcessor::clean(a_string)

  if virus?(a_string)
    @parsed = { verbatim: @verbatim, virus: true }
  elsif noparse?(a_string)
    @parsed = { verbatim: @verbatim }
  else
    begin
      @parsed = @clean.parse(a_string) || @dirty.parse(a_string)
      unless @parsed
        index = @dirty.index || @clean.index
        # Retry on everything before the last word of the consumed prefix.
        # A 0..-2 slice of an array is never nil, so no fallback is needed.
        salvage_string = a_string[0..index].split(/\s+/)[0..-2].join(" ")
        @parsed =  @dirty.parse(salvage_string) ||
                   @canonical.parse(a_string) ||
                   { verbatim: @verbatim }
      end
    rescue
      @parsed = FAILED_RESULT.(@verbatim)
    end
  end

  # Stores the verbatim string and its GnUUID identifier on the result.
  def @parsed.verbatim=(a_string)
    @verbatim = a_string
    @id = GnUUID.uuid(@verbatim)
  end

  # Builds the full result hash, wrapped under :scientificName.
  def @parsed.all(opts = {})
    canonical_with_rank = !!opts[:canonical_with_rank]
    # A Hash receiver means the name was not parsed.
    parsed = self.class != Hash
    res = { id: @id, parsed: parsed,
            parser_version: ScientificNameParser::version}

    if parsed
      # Not every node responds to #hybrid; treat a failure as "not a hybrid".
      hybrid = self.hybrid rescue false
      res.merge!({
        verbatim: @verbatim,
        normalized: self.value,
        canonical: self.canonical,
        hybrid: hybrid,
        details: self.details,
        parser_run: self.parser_run,
        positions: self.pos
        })
    else
      res.merge!(self)
    end
    res[:surrogate] = true if ScientificNameParser.surrogate?(res)
    res = {:scientificName => res}
    # Only a parsed node responds to #canonical and carries :details. Without
    # the `parsed` guard, virus / no-parse / failed names raised NameError
    # here whenever canonical_with_rank was enabled, and this call is outside
    # the rescue above.
    if (canonical_with_rank && parsed &&
        canonical.count(" ") > 1 &&
        res[:scientificName][:details][0][:infraspecies])
      ScientificNameParser.add_rank_to_canonical(res)
    end
    res
  end

  # JSON of the positions; "" if anything goes wrong.
  def @parsed.pos_json
    self.pos.to_json rescue ""
  end

  # JSON of the full result; "" if anything goes wrong.
  def @parsed.all_json
    self.all.to_json rescue ""
  end

  @parsed.verbatim = @verbatim
  res = @parsed.all(canonical_with_rank: @canonical_with_rank)
  res[:scientificName].merge!(tail: @tail) if @tail && @tail != ""
  res
end

#parsed ⇒ Object



198
199
200
# File 'lib/biodiversity/parser.rb', line 198

# Returns the current value of @parsed: nil after construction, otherwise
# whatever the most recent #parse call stored there (a parse node or a Hash).
def parsed
  @parsed
end

#virus?(a_string) ⇒ Boolean

Returns:

  • (Boolean)


182
183
184
185
186
187
188
189
# File 'lib/biodiversity/parser.rb', line 182

# Tells whether a name string looks like a virus-like name. It is true when
# any of these matches, tried in order:
#   * the string ends with " ICTV";
#   * it contains one of the listed whole words (virus, particle, phage,
#     viroid, virophage, prion, NPV and their plurals), case-insensitively;
#   * it contains a lower-case word ending in "virus";
#   * it contains a word ending in "satellite", "satellites" or "NPV".
#
# @param a_string [String] the name string to inspect
# @return [Boolean]
def virus?(a_string)
  virus_patterns = [
    /\sICTV\s*$/,
    /\b(virus|viruses|particle|particles|
                        phage|phages|viroid|viroids|virophage|
                        prion|prions|NPV)\b/ix,
    /[A-Z]?[a-z]+virus\b/,
    /\b[A-Za-z]*(satellite[s]?|NPV)\b/
  ]
  virus_patterns.any? { |pattern| a_string.match(pattern) }
end