Module: MathMetadata

Defined in:
lib/math_metadata_lookup/lookup.rb,
lib/math_metadata_lookup/site.rb,
lib/math_metadata_lookup/tools.rb,
lib/math_metadata_lookup/author.rb,
lib/math_metadata_lookup/entity.rb,
lib/math_metadata_lookup/result.rb,
lib/math_metadata_lookup/article.rb,
lib/math_metadata_lookup/sites/mr.rb,
lib/math_metadata_lookup/reference.rb,
lib/math_metadata_lookup/sites/zbl.rb

Overview

vi: fenc=utf-8:expandtab:ts=2:sw=2:sts=2

Defined Under Namespace

Classes: Article, Author, Entity, Lookup, MR, Reference, Result, Site, ZBL

Constant Summary collapse

# Module-level list that starts out empty.
# NOTE(review): presumably the site classes (e.g. MR, ZBL) are appended here
# when they are loaded — confirm against site.rb / sites/*.rb.
SITES =
[]
# Lookup table used by latex_to_utf8: the key is the single character that
# follows the backslash in a LaTeX accent command (e.g. the "'" in \'e), the
# value is the Unicode combining mark that is appended to the base letter
# before the pair is normalized.
ACCENT_REPL =
{
  "`" => "\u0300", # grave accent
  "'" => "\u0301", # acute accent
  "^" => "\u0302", # circumflex
  '"' => "\u0308", # umlaut or dieresis
  "~" => "\u0303", # tilde
  "H" => "\u030b", # long Hungarian umlaut (double acute)
  "c" => "\u0327", # cedilla
  "=" => "\u0304", # macron accent
  "." => "\u0307", # dot over the letter
  "r" => "\u030a", # ring over the letter
  "u" => "\u0306", # breve over the letter
  "v" => "\u030c"  # caron/hacek ("v") over the letter
}

Class Method Summary collapse

Class Method Details

.latex_to_utf8(s) ⇒ Object



100
101
102
103
104
105
106
# File 'lib/math_metadata_lookup/tools.rb', line 100

# Replaces LaTeX accent commands in +s+ with the corresponding accented
# UTF-8 character and returns the result as a new string.
#
# The accent command is the single character after the backslash and is
# looked up in ACCENT_REPL. Three argument forms are recognised:
#   \'e      bare letter (symbol accents only, see below)
#   \v{c}    braced letter
#   \'{\i}   braced, backslash-prefixed letter (the letter itself is used)
#
# A letter-named accent (\c, \H, \r, \u, \v) directly followed by a bare
# letter is NOT an accent in LaTeX but the start of a longer control word
# (\cap, \rho, \varphi, ...), so such sequences are left untouched.
# Commands that are not in ACCENT_REPL are left untouched as well.
def latex_to_utf8( s )
  s.gsub( /\\(.)(?:([a-zA-Z])|\{([a-zA-Z])\}|\{\\([a-zA-Z])\})/ ) do |match|
    # Grab every capture first: the nested =~ below overwrites $1..$4.
    command, bare, braced, control = $1, $2, $3, $4
    accent = ACCENT_REPL[command]

    next match if accent.nil?
    # "\va" in "\varphi" is a control word prefix, not caron + "a".
    next match if bare && command =~ /[a-zA-Z]/

    Unicode.normalize_KC( ( bare || braced || control ) + accent )
  end
end

.levenshtein_distance(s1, s2) ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/math_metadata_lookup/tools.rb', line 10

# Returns a similarity score derived from the Levenshtein edit distance of
# +s1+ and +s2+: 1.0 for equal inputs, otherwise
# 1 - (edit distance / length of the longer string), counted in characters.
def levenshtein_distance( s1, s2 )
  return 1.0 if s1 == s2

  left = s1.split(//u)
  right = s2.split(//u)
  rows = left.size
  cols = right.size

  # grid[i][j] holds the edit distance between the first i characters of
  # +left+ and the first j characters of +right+.
  grid = (0..rows).map { |i| [i] + [0] * cols }
  grid[0] = (0..cols).to_a

  1.upto(rows) do |i|
    1.upto(cols) do |j|
      grid[i][j] =
        if left[i-1] == right[j-1]
          grid[i-1][j-1]
        else
          # cheapest of deletion, insertion and substitution
          [grid[i-1][j] + 1, grid[i][j-1] + 1, grid[i-1][j-1] + 1].min
        end
    end
  end

  1 - (grid[rows][cols].to_f / [rows, cols].max)
end

.normalize_mscs(mscs) ⇒ Object



45
46
47
# File 'lib/math_metadata_lookup/tools.rb', line 45

# Flattens a list of MSC strings into individual codes.
#
# Every element of +mscs+ is split on "," and ";"; from each fragment the
# first run of characters that are neither whitespace nor parentheses is
# kept (so "(11B99)" yields "11B99"). Fragments that contain no such run,
# e.g. the blank left behind by a trailing separator, are dropped instead
# of producing +nil+ entries.
def normalize_mscs( mscs )
  mscs.flat_map { |entry| entry.split( /,|;/ ) }
      .map { |fragment| fragment[/\s*\(?([^\s\)\(]+)\)?\s*/, 1] }
      .compact
end

.normalize_name(name) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/math_metadata_lookup/tools.rb', line 50

# Normalizes a personal name for comparison and returns the new string.
#
# Steps, in order:
# 1. LaTeX accents are converted (latex_to_utf8) and the text is
#    transliterated with I18n.transliterate.
# 2. The first "Jr." matched by the pattern below is replaced by a space.
# 3. The first single-letter abbreviation is replaced by a space
#    ("Rakosnik, J." => "Rakosnik, ").
# 4. A space is inserted after every dot that is directly followed by
#    another name part ("B.C." => "B. C.").
#
# +name+ may be anything responding to +to_s+ (nil yields "").
def normalize_name( name )
  # only latin chars
  trans = latex_to_utf8( name.to_s )
  trans = I18n.transliterate( trans )

  # remove Jr.
  trans = trans.sub( %r{\bjr\.(\b|$)}i, ' ' )

  # remove abbr.: Rakosnik, J. => Rakosnik,
  trans = trans.sub( %r{(\W|^)\w\.}i, ' ' )

  # transform: Surname, N.M. => Surname, N. M.
  # The result has to be assigned: a bare gsub leaves +trans+ unchanged.
  trans.gsub( /([^\s,])?\.([^\s,])/, '\1. \2' )
end

.normalize_range(range) ⇒ Object



40
41
42
# File 'lib/math_metadata_lookup/tools.rb', line 40

# Returns +range+ as a string in which every en dash and every double
# hyphen is replaced by a single "-".
def normalize_range( range )
  dash_variants = /–|--/
  range.to_s.gsub( dash_variants, '-' )
end

.normalize_text(s) ⇒ Object



76
77
78
79
80
81
82
83
# File 'lib/math_metadata_lookup/tools.rb', line 76

# Normalizes free text (e.g. a title) for comparison and returns the new
# string: LaTeX accents are converted, the text is transliterated and
# downcased, punctuation is removed, every run of non-word characters
# becomes a separator, and the stop words "the", "a" and "of" are dropped.
# The remaining words are joined by single spaces.
#
# Stop words are matched as whole words only, so word endings such as the
# "a" in "algebra" or the "of" in "proof" are left intact.
#
# +s+ may be anything responding to +to_s+ (nil yields "").
def normalize_text( s )
  stop_words = %w[the a of]

  str = latex_to_utf8( s.to_s )
  str = I18n.transliterate( str ).downcase
  str = remove_punctuation( str )

  words = str.gsub( %r{\W+}, ' ' ).split
  words.reject { |word| stop_words.include?( word ) }.join( ' ' )
end

.remove_punctuation(s) ⇒ Object



69
70
71
72
73
# File 'lib/math_metadata_lookup/tools.rb', line 69

# Returns a stripped copy of +s+ in which runs of "." and "," that follow a
# word character or a whitespace character and are themselves followed by a
# space or a line end are replaced by a single space.
def remove_punctuation( s )
  # first pass: punctuation attached to a word ("word, " / "word.")
  after_words = s.gsub( %r{(\w)[.,]+( |$)}i, '\1 ' )
  # second pass: punctuation standing after whitespace (" , ")
  after_gaps = after_words.gsub( %r{(\s)[.,]+( |$)}i, '\1 ' )
  after_gaps.strip
end