Class: Taxonifi::Splitter::Tokens::Authors

Inherits:

Object
Token
Taxonifi::Splitter::Tokens::Authors

Defined in: lib/taxonifi/splitter/tokens.rb

Overview

Complex breakdown of author strings. Handles a wide variety of formats.

See test_splitter_tokens.rb for scope. As with AuthorYear, this will match just about anything when used alone. Add exceptions at will; just test them using TestSplittTokens#test_authors. TODO: Unicode the [a-z] bits?

Instance Attribute Summary collapse

#names ⇒ Object readonly

Returns the value of attribute names.

Attributes inherited from Token

#flag, #value

Instance Method Summary collapse

#initialize(input) ⇒ Authors constructor

A new instance of Authors.

Constructor Details

#initialize(input) ⇒ `Authors`

Returns a new instance of Authors.

# File 'lib/taxonifi/splitter/tokens.rb', line 76

# Breaks an author string into individual authors and stores them in @names.
#
# Every element of @names is a Hash with a :last_name (String) and, when they
# can be detected, :initials (Array of Strings) and :suffix (String).
#
# Parsing works on a private copy of +input+, so the caller's string is never
# modified and frozen strings are accepted.
#
# @param input [String] raw author string, e.g. "Smith, J. and Jones, K."
# @raise [RuntimeError] if the remaining string is not consumed within 100 passes
def initialize(input)
  # String#strip returns a new String, so none of the destructive calls below
  # (strip!, slice!, gsub!) can reach the object the caller handed in.
  str = input.strip
  @names = []
  naked_and = false # set once an "and" or "&" has been seen in the input
  individuals = []
  last_individual = nil

  # We can simplify if there is an "and" or &: everything to the right of the
  # first conjunction is treated as the final author.
  if str =~ /(\s+and\s+|\&)/i
    left, right = str.split(/\s+\,?\s*and\s+|\s+\&\s+/i, 2)
    last_individual = right
    str = left
    naked_and = true
  end

  # Exception case: no initials, "and" or "&" previously present, like:
  #   Foo, Bar and Smith
  if naked_and && str !~ /\./ && str =~ /\s*([A-Z][a-z]{1,})\s*\,+\s*([A-Z][a-z]{1,})/
    individuals.concat(str.split(/\s*\,\s*/))
    str = nil
  end

  # Exception case: no periods and multiple commas, like:
  #   Foo A, Bar ZA, Smith-Blorf A
  if str && !naked_and && (str.split(",").size > 2) && str !~ /\./
    individuals = str.split(",")
    str = nil
  end

  # Exception case: last name followed by dotted initials, comma separated, no
  # "and". The idea is to decompose the string and have nothing but commas
  # left; if that is possible the match is good.
  if str && !naked_and && (str.split(",").size > 1) && (str =~ /[A-Z]\./)
    remainder = str.clone
    pseudo_individuals = str.split(",").collect { |piece| piece.strip }
    ok = true
    pseudo_individuals.each do |candidate|
      # All names must be identically formatted in this special case.
      ok = false unless candidate =~ /(([A-Z][a-z]+)\s*(([A-Z]\.\s*)+))/ && $1 == candidate
      remainder.gsub!(/#{Regexp.quote(candidate)}/, "")
    end

    if ok
      remainder.gsub!(/\s*/, "")
      if remainder.split(//).uniq == [","]
        individuals = pseudo_individuals
        str = nil
      end
    end
  end

  # Optional words/particles allowed in front of a last name (trailing space
  # included where the prefix is a separate word).
  prefix = ['van den ', 'Van ', "O'", "Mc", 'Campos ', 'Costa ']
  pre_reg = prefix.collect { |p| "(#{Regexp.escape(p)})?" }.join

  # Optional particles allowed after a name.
  postfix = ['de la', 'von', 'da', 'van', ', Jr.']
  post_reg = postfix.collect { |p| "(#{Regexp.escape(p)})?" }.join

  # Initials second
  m1 = /^\s*(#{pre_reg}             # legal prefix words, includes space if present
                        [A-Z][a-z]+            # a capitalized Name
                        (\-[A-Z][a-z]+)?       # optional dashed addition
                        \s*,\s*                # required comma
                        (\s*                   #  initials, optionally surrounded by whitespace
                         (\-)?                 # optional preceding dash, hits second initials
                         [A-Z]                 # required capital initial
                         (\-)?                 # optional initial dash
                         (\-[A-Z])?            # optional dashed initial
                        \s*\.                  # required period
                        \s*)
                        {1,}                   # repeat initials as necessary
                        #{post_reg})           # optional legal postfixes
                    \s*/x

  # Initials first
  m2 = /^\s*(([A-Z]\.\s*){1,}#{pre_reg}[A-Z][a-z]+#{post_reg}),/  #  (R. Watson | R.F. Watson),

  # Pick off remaining authors one at a time from the front of the string.
  if str
    passes = 0
    parsing = true
    while parsing
      matched = false
      [m2, m1].each do |regex|
        next unless str =~ regex
        individual = $1
        str.slice!(individual)
        str.strip!
        str.slice!(",")
        individuals.push(individual)
        matched = true # at least one match, keep going
      end

      # Nothing recognisable is left: whatever remains is taken as one author.
      if !matched && !str.empty?
        individuals.push(str)
        parsing = false
      end

      passes += 1
      raise "Authors: unable to split author string after 100 passes" if passes > 100
      parsing = false if str.empty?
    end
  end

  # Note to remember positive look behind (?<= ) for future hax
  # str.split(/(?<=[A-Z])\s*[;,]{1}\s*/, 2)

  individuals.push(last_individual) unless last_individual.nil?

  # At this point we have isolated individuals.  Strategy is to slice out initials and remainder is last name.
  # Initials regex matches any A-B. A. or " A ", "A-B" pattern (including repeats)
  # TODO: Make a Token
  match_initials = /(((\s((\-)?[A-Z](\-[A-Z])?\s?){1,})$)|(((\-)?[A-Z](\-[A-Z|a-z]\s*)?\.\s*){1,})|(\s((\-)?[A-Z](\-[A-Z])?\s){1,}))/

  # TODO: merge with pre/postfix list
  suffixes = [
    /\s(van)\s?/i,
    /\s(jr\.)/i,
    /\s(von)\s?/i,
    /\s(de la)\s?/i,
    /\s(da)\s?/i,
  ]

  individuals.each do |individual|
    author = {}

    # Cut the initials out; what is left of the string is the last name.
    initials = nil
    if individual =~ match_initials
      initials = $1
      individual.slice!(initials)
      individual.strip!
    end
    last_name = individual

    # Pull recognised particles out of the last name and record them.
    suffix = []
    suffixes.each do |suffix_regex|
      next unless last_name =~ suffix_regex
      particle = $1
      suffix.push(particle)
      last_name.slice!(particle)
    end
    author[:suffix] = suffix.join(" ") unless suffix.empty?

    last_name.gsub!(/\.|\,/, '')

    author[:last_name] = last_name.strip
    if initials && !initials.empty?
      author[:initials] = initials.strip.split(/\s|\./).collect { |v| v.strip }.reject { |v| v.empty? }
    end

    @names << author
  end
end

Instance Attribute Details

#names ⇒ `Object` (readonly)

Returns the value of attribute names.



73
74
75

# File 'lib/taxonifi/splitter/tokens.rb', line 73

# Returns the parsed authors collected by #initialize: an Array of Hashes,
# each holding :last_name and, when they were detected, :initials and :suffix.
def names
  @names
end