Module: Taxonifi::Lumper

Defined in:
lib/taxonifi/lumper.rb,
lib/taxonifi/lumper/clump.rb,
lib/taxonifi/lumper/name_index.rb

Overview

A Clump is a “C”ollection of lump derivatives and the relationships between those derivatives. It is used to define relationships among derived objects, for example, between objects derived from single rows of data.

Defined Under Namespace

Modules: Lumps Classes: Clump, LumperError, NameIndex

Constant Summary collapse

QUAD =

Columns used for species epithets. !! Todo: map DwC URIs to these labels (at present they largely correlate with Tokens, perhaps map URIs to tokens!?)

['genus', 'subgenus', 'species', 'subspecies']
AUTHOR_YEAR =

Columns representing author and year

['author', 'year']
LUMPS =

A Hash of named column combinations

{
  # Named column combinations, keyed by lump name.
  quadrinomial: QUAD,
  quad_author_year: QUAD + AUTHOR_YEAR,
  names:  Taxonifi::RANKS + AUTHOR_YEAR,
  # All ranks except the quadrinomial and author/year columns. The flat list
  # must be subtracted: wrapping it in another Array ([QUAD + AUTHOR_YEAR])
  # makes Array#- look for a single nested-Array element, which never matches
  # a rank String, so nothing would be removed.
  higher: Taxonifi::RANKS - (QUAD + AUTHOR_YEAR),
  species: ['species', 'subspecies', 'variety'],
  genera: ['genus', 'subgenus'],
  citation_basic: %w{authors year title publication volume number pages pg_start pg_end},
  citation_small: %w{authors year title publication volume_number pages},
  basic_geog: %w{country state county}, # add 'continent'
  eol_basic: %w{identifier parent child rank synonyms}
}

Class Method Summary collapse

Class Method Details

.available_lumps(columns) ⇒ Object

Lumps for which all columns are represented. TODO: This is really an assessor method.


38
39
40
41
# File 'lib/taxonifi/lumper.rb', line 38

# Lists the lumps whose columns are all present in +columns+.
#
# @param columns [Array] column names to compare against each entry of LUMPS
# @return [Array] the LUMPS keys whose every column appears in +columns+
# @raise [Taxonifi::Lumper::LumperError] when +columns+ is not exactly an Array
def self.available_lumps(columns)
  unless columns.class == Array
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.available_lumps.'
  end

  LUMPS.each_with_object([]) do |(lump, required), found|
    # A lump is available when no required column is left over.
    found << lump if (required - columns).empty?
  end
end

.create_geog_collection(csv) ⇒ Object

Return a geog collection from a csv file.


309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# File 'lib/taxonifi/lumper.rb', line 309

# Builds a GeogCollection from the geographic columns of a CSV::Table.
#
# Unlike taxon names, a geographic name is identified by its column (level)
# and its text alone, regardless of what precedes it in the row, so the
# collection is built in a single row-first pass.
#
# @param csv [CSV::Table] the table to read
# @return [Taxonifi::Model::GeogCollection]
# @raise [Taxonifi::Lumper::LumperError] when +csv+ is not a CSV::Table
def self.create_geog_collection(csv)
  unless csv.class == CSV::Table
    raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_geog_collection.'
  end

  collection = Taxonifi::Model::GeogCollection.new

  # One vector of object ids per row, in the order the cells were read.
  ids_by_row = Taxonifi::Utils::Array.build_array_of_empty_arrays(csv.size)

  # level => { name => id } for every geographic header found in the table.
  levels = Taxonifi::Assessor::RowAssessor.geog_headers(csv.headers)
  ids_by_level = {}
  levels.each { |level| ids_by_level[level] = {} }

  csv.each_with_index do |row, i|
    lineage = ids_by_row[i]

    levels.each do |level|
      name = row[level]
      next if name.nil? || name.empty? # cell has no data

      geog_id = ids_by_level[level][name]
      unless geog_id
        # First time this name is seen at this level: create and register it.
        geog = Taxonifi::Model::Geog.new()
        geog_id = collection.add_object(geog).id
        geog.name = name
        geog.rank = level
        # Its parent is the previous id recorded for this row, if any.
        geog.parent = collection.object_by_id(lineage.last) if lineage.size > 0
      end

      ids_by_level[level][name] = geog_id
      lineage.push geog_id
    end
  end

  collection
end

.create_hierarchical_collection(csv, headers) ⇒ Object

Creates a generic Collection of GenericObject objects. Objects are assigned to parents (by rank) according to the order provided in headers. Objects are considered the same if they have the same name and the same parent closure, e.g.

a b c
a b d
e b f

Will return 7 objects named in order a,b,c,d,e,b,f

a,b b,c b,d e,b b,f are the unique parent/child relationships stored


254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
# File 'lib/taxonifi/lumper.rb', line 254

# Builds a generic Collection of GenericObjects from a CSV::Table.
#
# For every row the +headers+ are visited in the order given; each non-empty
# cell becomes (or is matched to) an object whose parent is the object
# recorded for the previous non-empty cell of the same row. An existing object
# is reused only when it has the same name under the same header AND the same
# vector of parent ids as the current row has accumulated so far.
#
# @param csv [CSV::Table] the table to read
# @param headers [Array] column headers, in the order parents precede children
# @return [Taxonifi::Model::Collection]
# @raise [Taxonifi::Lumper::LumperError] when +csv+ is not a CSV::Table or
#   +headers+ is empty
def self.create_hierarchical_collection(csv, headers)
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_hierarchical_collection.' if csv.class != CSV::Table
  raise Taxonifi::Lumper::LumperError, 'No headers provided to create_hierarchical_collection.' if headers.size == 0

  c = Taxonifi::Model::Collection.new

  # row_index[i] is the vector of object ids built so far for row i;
  # name_index[rank][name] lists every id instantiated for that name.
  row_index = Taxonifi::Utils::Array.build_array_of_empty_arrays(csv.size)
  name_index = Taxonifi::Utils::Hash.build_hash_of_hashes_with_keys(headers)

  csv.each_with_index do |row, i|
    headers.each do |rank|
      name = row[rank]
      next if name.nil? || name.empty? # cell has no data

      # Look for a previously created object with this name whose parent
      # vector equals what this row has accumulated.
      candidates = name_index[rank][name] || []
      name_id = candidates.find { |id| c.parent_id_vector(id) == row_index[i] }

      if name_id.nil?
        o = Taxonifi::Model::GenericObject.new()
        o.name = name
        o.rank = rank
        o.row_number = i
        # Its parent is the previous id in this row, if any.
        o.parent = c.object_by_id(row_index[i].last) if row_index[i].size > 0

        name_id = c.add_object(o).id
        name_index[rank][name] ||= []
        name_index[rank][name].push name_id
      end

      row_index[i].push name_id
    end
  end
  c
end

.create_name_collection(options = {}) ⇒ Object

return [Taxonifi::Model::NameCollection] from a csv file.


56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/taxonifi/lumper.rb', line 56

# Builds a NameCollection from a CSV::Table.
#
# Rank columns are processed one at a time, in the order returned by
# RowAssessor.rank_headers, and every row is visited for each rank. A Name is
# reused only when the same string was already added for that rank with the
# same vector of parent ids as the current row has accumulated.
#
# @param options [Hash]
#   :csv                     the CSV::Table to read (required)
#   :initial_id              passed to Taxonifi::Model::NameCollection.new (default 0)
#   :capture_related_fields  when true, columns not listed in LUMPS[:names]
#                            are stored on the Name via add_properties (default true)
# @return [Taxonifi::Model::NameCollection]
# @raise [Taxonifi::Lumper::LumperError] when :csv is not a CSV::Table
def self.create_name_collection(options = {})
  opts = {
    :csv => [],
    :initial_id => 0,
    :capture_related_fields => true   # Stores other column values in (column_header => value) pairs in Name#properties
  }.merge!(options)

  csv = opts[:csv]
  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_name_collection.' if csv.class != CSV::Table

  nc = Taxonifi::Model::NameCollection.new(:initial_id => opts[:initial_id])

  # The row index contains, per row, a vector of parent ids like
  # [0, 4, 29]
  # This implies that Name with #id 29 has Parent with #id 4.
  row_index = Taxonifi::Utils::Array.build_array_of_empty_arrays(csv.size)

  # The name_index keeps track of unique names per rank like
  # 'genus' => {'Foo' => [0,2]}
  # This says that "Foo" is instantiated two times in the
  # name collection, with id 0, and id 2.
  name_index = {}

  has_ref_fields = ([:citation_basic, :citation_small] & Taxonifi::Lumper.intersecting_lumps(csv.headers)).size > 0
  unused_fields = csv.headers - Taxonifi::Lumper::LUMPS[:names]

  Taxonifi::Assessor::RowAssessor.rank_headers(csv.headers).each do |rank|
    name_index[rank] = {}

    csv.each_with_index do |row, i|
      # Row-level metadata (author/year, extra columns) is attached only when
      # the row's own rank, as reported by lump_name_rank, is this rank.
      shares_rank = (rank == Taxonifi::Assessor::RowAssessor.lump_name_rank(row).to_s)
      name = row[rank]
      next if name.nil? # cell has no data

      # Compare vectors of parent ids to decide whether this exact name
      # (same string, same rank, same parents) was already added.
      name_id = (name_index[rank][name] || []).find { |id| nc.parent_id_vector(id) == row_index[i] }

      unused_data = row.to_hash.select { |field, _value| unused_fields.include?(field) }
      row_identifier = row['identifier'] || i

      if name_id.nil?
        # No match: create and populate a new Name.
        n = Taxonifi::Model::Name.new()
        n.rank = rank
        n.name = name
        n.parent = nc.object_by_id(row_index[i].last) if row_index[i].size > 0 # its parent is the previous id in this row
        n.row_number = i

        # Name/year needs to be standardized / cased out
        # headers are overlapping at times
        if shares_rank
          if row['author_year']
            builder = Taxonifi::Splitter::Builder.build_author_year(row['author_year'])
            n.authors = builder.people
            n.year    = builder.year
            n.parens  = builder.parens
          end

          n.add_property(:link_to_ref_from_row, i) if has_ref_fields # TODO: update this
          n.add_properties(unused_data) if opts[:capture_related_fields]
        end

        name_id = nc.add_object(n).id
        name_index[rank][name] ||= []
        name_index[rank][name].push name_id

        $DEBUG && $stderr.puts("added #{nc.collection.size - 1} | #{n.name} | #{n.rank} | #{n.parent ? n.parent.name : '-'} | #{n.parent ? n.parent.id : '-'}")
      else
        # Previously matched names are not modified; the repeated row is
        # recorded against the existing name instead.
        $DEBUG && $stderr.puts("already present #{rank} | #{name}")
        if shares_rank
          # The method name was missing here (`nc.(...)` is `nc.call(...)`).
          # Alternative considered upstream: pass row.to_hash instead of unused_data.
          nc.add_duplicate_entry_metadata(name_id, row_identifier, unused_data)
        end
      end

      # Build a by-row vector of parent/child relationships.
      row_index[i].push name_id
    end
  end
  nc
end

.create_ref_collection(options = {}) ⇒ Object

return [Taxonifi::Model::RefCollection] from a CSV file.


163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
# File 'lib/taxonifi/lumper.rb', line 163

# Builds a RefCollection from a CSV::Table.
#
# A Ref is built for every row that RowAssessor reports as having
# :citation_small data. Refs are de-duplicated on Ref#compact_string: the
# first Ref with a given string is added to the collection, and later rows
# with the same string are pointed at that stored Ref through rc.row_index.
#
# @param options [Hash]
#   :csv                     the CSV::Table to read (required)
#   :capture_related_fields  when true, columns outside the citation lumps
#                            are stored on the Ref via add_properties (default true)
#   The merged options Hash is passed as-is to RefCollection.new.
# @return [Taxonifi::Model::RefCollection]
# @raise [Taxonifi::Lumper::LumperError] when :csv is not a CSV::Table
def self.create_ref_collection(options = {})
  opts = {
    :csv => nil,
    # NOTE(review): this key is spelled :inital_id, while create_name_collection
    # uses :initial_id. If RefCollection reads :initial_id this default never
    # applies - confirm before renaming, since that could change default ids.
    :inital_id => 1,
    :capture_related_fields => true   # Stores other column values in (column_header => value) pairs in Ref#related
  }.merge!(options)
  csv = opts[:csv]

  raise Taxonifi::Lumper::LumperError, 'Something that is not a CSV::Table was passed to Lumper.create_ref_collection.' if csv.class != CSV::Table
  rc = Taxonifi::Model::RefCollection.new(opts)

  unused_fields = csv.headers - (Taxonifi::Lumper::LUMPS[:citation_basic] | Taxonifi::Lumper::LUMPS[:citation_small])

  # compact_string => id of the Ref already stored for that string.
  ref_index = {}

  csv.each_with_index do |row, i|
    next unless Taxonifi::Assessor::RowAssessor.intersecting_lumps_with_data(row, [:citation_small]).include?(:citation_small)

    r = Taxonifi::Model::Ref.new(
      :year => row['year'],
      :title => row['title'],
      :publication => row['publication']
    )

    # TODO: break out each of these lexes to a builder
    if row['authors'] && !row['authors'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['authors'])
      authors = lexer.pop(Taxonifi::Splitter::Tokens::Authors)
      authors.names.each do |a|
        n = Taxonifi::Model::Person.new()
        n.last_name = a[:last_name]
        n.initials = a[:initials]
        r.authors.push n
      end
    end

    if row['volume_number'] && !row['volume_number'].empty?
      lexer = Taxonifi::Splitter::Lexer.new(row['volume_number'], :volume_number)
      t = lexer.pop(Taxonifi::Splitter::Tokens::VolumeNumber)
      r.volume = t.volume
      r.number = t.number
    end

    if row['pages'] && !row['pages'].empty?
      # Best effort: if the pages token does not match (nil result or an
      # error raised while lexing), dump the raw field into pages.
      lexer = Taxonifi::Splitter::Lexer.new(row['pages'], :pages)
      begin
        if (t = lexer.pop(Taxonifi::Splitter::Tokens::Pages))
          r.pg_start = t.pg_start
          r.pg_end = t.pg_end
          r.pages = t.remainder
        else
          r.pages = row['pages']
        end
      rescue
        r.pages = row['pages']
      end
    end

    r.add_properties(row.to_hash.select { |field, _value| unused_fields.include?(field) }) if opts[:capture_related_fields]

    # Index by compact string; Hash#key? avoids building a keys Array per row.
    ref_str = r.compact_string
    if ref_index.key?(ref_str)
      rc.row_index[i] = rc.object_by_id(ref_index[ref_str])
    else
      ref_index[ref_str] = rc.add_object(r).id
      rc.row_index[i] = r
    end
  end
  rc
end

.intersecting_lumps(columns) ⇒ Object

Lumps for which any column is represented. TODO: This is really an assessor method.


45
46
47
48
49
50
51
52
# File 'lib/taxonifi/lumper.rb', line 45

# Lists the lumps that share at least one column with +columns+.
#
# @param columns [Array] column names to compare against each entry of LUMPS
# @return [Array] the LUMPS keys having any column in common with +columns+
# @raise [Taxonifi::Lumper::LumperError] when +columns+ is not exactly an Array
def self.intersecting_lumps(columns)
  unless columns.class == Array
    raise Taxonifi::Lumper::LumperError, 'Array not passed to Lumper.intersecting_lumps.'
  end

  # Drop every lump whose columns have nothing in common with the input.
  LUMPS.keys.reject { |lump| (LUMPS[lump] & columns).empty? }
end