Class: Taxonifi::Splitter::Tokens::Authors

Inherits:
Token
  • Object
show all
Defined in:
lib/taxonifi/splitter/tokens.rb

Overview

Complex breakdown of author strings. Handles a wide variety of formats.

See test_splitter_tokens.rb for scope. As with AuthorYear this will match just about anything when used alone. Add exceptions at will, just test using TestSplittTokens#test_authors. TODO: Unicode the [a-z] bits?

Instance Attribute Summary collapse

Attributes inherited from Token

#flag, #value

Instance Method Summary collapse

Constructor Details

#initialize(input) ⇒ Authors

Returns a new instance of Authors.



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/taxonifi/splitter/tokens.rb', line 76

# Breaks an author string into individual authors and stores them in @names.
#
# The string is processed in stages:
# 1. If an "and" / "&" is present, everything to its right is set aside as the
#    last individual.
# 2. Three special-case layouts are tried ("Foo, Bar and Smith",
#    "Foo A, Bar ZA, Smith-Blorf A", and identically formatted "Name I." lists).
# 3. Whatever remains is consumed one author at a time with the
#    "initials first" / "initials second" patterns.
# 4. Each isolated individual is split into initials, suffix and last name.
#
# Each entry appended to @names is a Hash with :last_name (String) and, when
# present, :initials (Array of Strings) and :suffix (String).
#
# @param input [String] the raw author string; it is copied, never modified
# @raise [RuntimeError] if the string is not consumed within 100 passes
def initialize(input)
  # String#strip returns a new string, so the caller's object (which may be
  # frozen) is left untouched by the destructive calls below.
  str = input.strip
  @names = []
  naked_and = false # set when an 'and' / '&' was present, e.g. 'Foo, Bar and Smith'
  individuals = []
  last_individual = nil

  # We can simplify if there is an "and" or &
  if str =~ /(\s+and\s+|\&)/i
    left, right = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2)
    last_individual = right
    str = left
    naked_and = true
  end

  # Exception case: no initials, "and" or "&" previously present, like:
  #   Foo, Bar and Smith
  if naked_and && str !~ /\./ && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
    individuals.unshift(str.split(/\s*\,\s*/)) # nested array, flattened further down
    str = nil
  end

  # Exception case: no periods and multiple commas, like:
  #   Foo A, Bar ZA, Smith-Blorf A
  if str && !naked_and && (str.split(",").size > 2) && str !~ /\./
    individuals = str.split(",")
    str = nil
  end

  # Exception case: last name, commas, no "and". The idea is to decompose and
  # have nothing left but commas; if that is possible then the match is good.
  if str && !naked_and && (str.split(",").size > 1) && (str =~ /[A-Z]\./)
    test_str = str.clone
    pseudo_individuals = test_str.split(",").collect { |part| part.strip }
    ok = true
    pseudo_individuals.each do |candidate|
      # All names must be identically formatted in this special case.
      ok = false unless candidate =~ /(([A-Z][a-z]+)\s*(([A-Z]\.\s*)+))/ && $1 == candidate
      test_str.gsub!(candidate, "") # a String pattern is matched literally
    end

    if ok
      test_str.gsub!(/\s*/, "")
      if test_str.split(//).uniq == [","]
        individuals = pseudo_individuals
        str = nil
      end
    end
  end

  prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
  pre_reg = prefix.collect { |p| "(#{Regexp.escape(p)})?" }.join

  postfix = ['de la', 'von', 'da', 'van', ', Jr.']
  post_reg = postfix.collect { |p| "(#{Regexp.escape(p)})?" }.join

  # Initials second
  m1 = /^\s*(#{pre_reg}             # legal prefix words, includes space if present
                        [A-Z][a-z]+            # a captialized Name 
                        (\-[A-Z][a-z]+)?       # optional dashed addition
                        \s*,\s*                # required comma
                        (\s*                   #  initials, optionally surrounded by whitescape
                         (\-)?                 # optional preceeding dash, hits second initials 
                         [A-Z]                 # required capital initial
                         (\-)?                 # optional initial dash   
                         (\-[A-Z])?            # optional dashed initial
                        \s*\.                  # required period
                        \s*)              
                        {1,}                   # repeat initials as necessary
                        #{post_reg})           # optional legal postfixes
                    \s*/x

  # Initials first
  m2 = /^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/ #  (R. Watson | R.F. Watson),

  # Pick off remaining authors one at a time
  if str
    passes = 0
    loop do
      matched = false
      [m2, m1].each do |regex|
        next unless str =~ regex
        individual = $1
        # Both patterns are anchored at the start of str, so the first textual
        # occurrence of the capture is the matched one.
        str.slice!(individual)
        str.strip!
        str.slice!(",")
        individuals.push(individual)
        matched = true # at least one match, keep going
      end

      # Nothing recognisable is left: keep the remainder as a single individual.
      individuals.push(str) if !matched && str.size != 0

      passes += 1
      raise "unable to split authors within 100 passes, remainder: '#{str}'" if passes > 100
      break if !matched || str.size == 0
    end
  end

  # Note to remember positive look behind (?<= ) for future hax
  # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

  individuals.push(last_individual) unless last_individual.nil?
  individuals.flatten!

  # At this point we have isolated individuals. Strategy is to slice out initials and remainder is last name.
  # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
  # TODO: Make a Token
  match_initials = /(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/

  # TODO: merge with pre/postfix list
  suffixes = [
    /\s(van)\s?/i,
    /\s(jr\.)/i,
    /\s(von)\s?/i,
    /\s(de la)\s?/i,
    /\s(da)\s?/i,
  ]

  individuals.each do |person|
    author = {}

    initials = nil
    if person =~ match_initials
      initials = $1
      # Remove the text at the matched position. Slicing by the captured string
      # would remove its first textual occurrence instead, e.g. the " A" inside
      # "Costa Alves A". Group 1 spans the whole pattern, so this drops exactly $1.
      person.sub!(match_initials, '')
      person.strip!
    end
    last_name = person

    suffix = []
    suffixes.each do |s|
      next unless last_name =~ s
      suffix.push($1)
      # Delete only the captured suffix at the matched position; slicing by
      # string would hit the "van" inside "Sullivan" first.
      last_name[s, 1] = ''
    end
    author[:suffix] = suffix.join(" ") if suffix.size > 0

    last_name.gsub!(/\.|\,/, '')

    author[:last_name] = last_name.strip
    if initials && initials.size > 0
      author[:initials] = initials.strip.split(/\s|\./).collect { |v| v.strip }.select { |x| x.size > 0 }
    end

    @names << author
  end
end

Instance Attribute Details

#namesObject (readonly)

Returns the value of attribute names.



73
74
75
# File 'lib/taxonifi/splitter/tokens.rb', line 73

# Reader for the authors collected by #initialize.
#
# @return [Array<Hash>] one Hash per author with :last_name and, when present,
#   :initials and :suffix
def names; @names; end