Class: Alphabet

Inherits:
Object
  • Object
show all
Defined in:
lib/alphabets/version.rb,
lib/alphabets/utils.rb,
lib/alphabets/reader.rb,
lib/alphabets/alphabets.rb

Overview

todo/check: use a module named Alphabets (with a trailing s) to keep version and banner separate - why? why not?

Defined Under Namespace

Classes: Reader, Unaccenter

Constant Summary collapse

MAJOR =

todo: namespace inside version or something - why? why not??

0
MINOR =
1
PATCH =
2
VERSION =
[MAJOR,MINOR,PATCH].join('.')
UNACCENT =

“simple” unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping

Reader.parse( "    \u00C4 A   \u00E4 a\n    \u00C1 A   \u00E1 a\n    \u00C0 A   \u00E0 a\n    \u00C3 A   \u00E3 a\n    \u00C2 A   \u00E2 a\n    \u00C5 A   \u00E5 a\n    \u00C6 AE  \u00E6 ae   # ae ligature\n          \u0101 a\n          \u0103 a\n    \u0104 A   \u0105 a    # \u0105 - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK\n\n    \u00C7 C   \u00E7 c    # \u00E7 - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA\n    \u0106 C   \u0107 c\n    \u010C C   \u010D c\n\n    \u010E D   \u010F d\n    \u00D0 D   \u00F0 d    # iceland - d\n\n    \u00C9 E   \u00E9 e\n    \u00C8 E   \u00E8 e\n    \u00CA E   \u00EA e\n    \u00CB E   \u00EB e\n          \u0117 e\n    \u0118 E   \u0119 e\n    \u011A E   \u011B e\n\n          \u011F g\n\n    \u0130 I\n    \u00CD I   \u00ED i\n    \u00CC I   \u00EC i\n    \u00CE I   \u00EE i\n          \u012B i\n          \u0131 i    # \u0131 - U+0131 (305) - LATIN SMALL LETTER DOTLESS I\n    \u00CF I   \u00EF i\n\n    \u0141 L   \u0142 l\n\n    \u00D1 N   \u00F1 n\n    \u0143 N   \u0144 n\n    \u0147 N   \u0148 n\n\n    \u00D6 O   \u00F6 o\n    \u0150 OE  \u0151 oe    # hungarian - just use O/o  - why? (it's not a ligature) why not?\n    \u00D3 O   \u00F3 o\n    \u00D2 O   \u00F2 o\n    \u00D5 O   \u00F5 o\n    \u00D4 O   \u00F4 o\n          \u00F8 o\n    \u0152 OE  \u0153 oe   # oe ligature\n\n    \u0158 R   \u0159 r\n\n    \u015A S   \u015B s\n    \u015E S   \u015F s   # \u015F - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA\n    \u0218 S   \u0219 s   # \u0219 - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW\n    \u0160 S   \u0161 s\n          \u00DF ss  # \u00DF - U+00DF (223) - LATIN SMALL LETTER SHARP S\n\n    \u0162 T   \u0163 t   # \u0163 - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA\n    \u021A T   \u021B t   # \u021B - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW\n    \u0164 T   \u0165 t\n\n    \u00DE P   \u00FE p   # \u00FE - U+00FE (254) - LATIN SMALL LETTER THORN\n                #### fix/check!!!! 
icelandic - use p is p or th - why? why not?\n\n    \u00DC U   \u00FC u\n    \u00DA U   \u00FA u\n    \u00D9 U   \u00F9 u\n          \u016B u\n    \u016E U   \u016F u\n    \u00DB U   \u00FB u\n\n    \u00DD Y   \u00FD y\n    \u0178 Y   \u00FF y\n\n    \u0179 Z   \u017A z\n    \u017B Z   \u017C z\n    \u017D Z   \u017E z\n" )
UNACCENT_DE =

de,at,ch translation for umlauts

Reader.parse( "    \u00C4 AE  \u00E4 ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.V\u00D6ST => VOEST or \u00D6 => OE\n    \u00D6 OE  \u00F6 oe\n    \u00DC UE  \u00FC ue\n          \u00DF ss\n" )
DOWNCASE =

todo/check: add UNACCENT_ES - why? why not? Is Espanyol the Catalan spelling or the Spanish (Castilian) one? ‘ñ’=>‘ny’, ## e.g. Español => Espanyol

%w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z].reduce({}) do |h,ch|
    h[ch] = ch.downcase
    h
  end.merge( Reader.parse( "    \u00C4 \u00E4\n    \u00C1 \u00E1\n    \u00C0 \u00E0\n    \u00C2 \u00E2\n    \u00C5 \u00E5\n    \u00C6 \u00E6   # LATIN LETTER AE  - ae ligature\n    \u0104 \u0105\n    \u00C3 \u00E3\n\n    \u00C7 \u00E7   # LATIN LETTER C WITH CEDILLA\n    \u010C \u010D\n    \u0106 \u0107\n\n    \u010E \u010F\n\n    \u00D0 \u00F0    # iceland - d\n\n    \u00C9 \u00E9\n    \u00C8 \u00E8\n    \u00CB \u00EB\n    \u00CA \u00EA\n    \u0118 \u0119\n    \u011A \u011B\n\n    \u0130 i\n    \u00CD \u00ED\n    \u00CC \u00EC\n    \u00CF \u00EF\n    \u00CE \u00EE\n\n    \u0141 \u0142\n\n    \u0143 \u0144\n    \u0147 \u0148\n    \u00D1 \u00F1\n\n    \u00D6 \u00F6\n    \u0150 \u0151\n    \u0152 \u0153   # LATIN LIGATURE OE\n    \u00D3 \u00F3\n    \u00D2 \u00F2\n    \u00D4 \u00F4\n    \u00D5 \u00F5\n\n    \u00DE \u00FE    # iceland - p\n\n    \u0158 \u0159\n\n    \u015A \u015B\n    \u015E \u015F   # LATIN LETTER S WITH CEDILLA\n    \u0218 \u0219   # LATIN LETTER S WITH COMMA BELOW\n    \u0160 \u0161\n\n    \u0162 \u0163   # LATIN LETTER T WITH CEDILLA\n    \u021A \u021B   # LATIN LETTER T WITH COMMA BELOW\n    \u0164 \u0165\n\n    \u00DC \u00FC\n    \u00DA \u00FA\n    \u00D9 \u00F9\n    \u016E \u016F\n    \u00DB \u00FB\n\n    \u00DD \u00FD\n    \u0178 \u00FF\n\n    \u017D \u017E\n    \u017B \u017C\n    \u0179 \u017A\n" ) )

Class Method Summary collapse

Class Method Details



16
17
18
# File 'lib/alphabets/version.rb', line 16

## Builds the one-line banner: the library name and VERSION followed by
##   the running Ruby's version, release date and platform.
def self.banner
  ruby_info = "Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  "alphabets/#{VERSION} on #{ruby_info}"
end

.count(freq, mapping_or_chars) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/alphabets/utils.rb', line 14

## Sums up the frequencies of the given characters.
##
##   freq             - frequency table indexed by character
##                        (e.g. the Hash built by frequency_table);
##                        freq[ch] must return a number for every character looked up
##   mapping_or_chars - the characters to look up:
##                        a Hash   => its keys are used,
##                        a String => split into single characters,
##                        anything else is assumed to be an array/list of characters
##
## Returns the sum of freq[ch] over all characters (0 if there are none).
def self.count( freq, mapping_or_chars )
  chars = case mapping_or_chars
          when Hash   then mapping_or_chars.keys
          when String then mapping_or_chars.chars   ## String has no reduce - split one char at a time
          else             mapping_or_chars         ## assume it's an array/list of characters
          end

  chars.reduce( 0 ) { |total, ch| total + freq[ch] }
end

.downcase_i18n(name) ⇒ Object

our very own downcase for int’l characters / letters



70
71
72
# File 'lib/alphabets/utils.rb', line 70

## Our very own downcase for int'l characters / letters:
##   replaces every character of name that has an entry in the DOWNCASE table
##   (via sub) and passes all other characters through.
def self.downcase_i18n( name )
  sub( name, DOWNCASE )
end

.find_unaccenter(key) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/alphabets/utils.rb', line 51

## Looks up the unaccenter for a language key.
##   :de           => unaccenter built from UNACCENT_DE
##   any other key => unaccenter built from UNACCENT
## Each unaccenter is created on first use and then reused (memoized).
def self.find_unaccenter( key )
  return (@de ||= Unaccenter.new( UNACCENT_DE ))  if key == :de

  ## use uni(versal) or unicode or something - why? why not?
  ##  use all or int'l (international) - why? why not?
  ##  use en  (english) - why? why not?
  @default ||= Unaccenter.new( UNACCENT )
end

.frequency_table(name) ⇒ Object

todo/check: use/rename to char_frequency_table



4
5
6
7
8
9
10
11
# File 'lib/alphabets/utils.rb', line 4

## Calculates the frequency table of letters, digits, etc. in name.
## Returns a Hash (character => number of occurrences);
##   the hash defaults to 0 for characters that never occur.
def self.frequency_table( name )   ## todo/check: use/rename to char_frequency_table
  name.each_char.each_with_object( Hash.new(0) ) do |letter, table|
    table[letter] += 1
  end
end

.root ⇒ Object



20
21
22
# File 'lib/alphabets/version.rb', line 20

## Returns the absolute path of the directory three levels up from this file.
def self.root
  parent      = File.dirname( __FILE__ )
  grandparent = File.dirname( parent )
  File.expand_path( File.dirname( grandparent ) )
end

.sub(name, mapping) ⇒ Object

todo/check: use a different/better name - gsub/map/replace/fold/… - why? why not?



28
29
30
31
32
33
34
35
36
37
38
# File 'lib/alphabets/utils.rb', line 28

## Builds a new string from name, one character at a time:
##   a character with a (truthy) entry in mapping is replaced by that entry,
##   every other character is copied over as is.
def self.sub( name, mapping )   ## todo/check: use a different/better name - gsub/map/replace/fold/... - why? why not?
  name.each_char.each_with_object( String.new ) do |ch, out|
    out << ( mapping[ch] || ch )
  end
end

.unaccent(name) ⇒ Object



64
65
66
67
# File 'lib/alphabets/utils.rb', line 64

## Unaccents name with the default unaccenter (built from the UNACCENT table).
##   The unaccenter is created on first use and kept in @default for reuse.
def self.unaccent( name )
  unaccenter = ( @default ||= Unaccenter.new( UNACCENT ) )
  unaccenter.unaccent( name )
end

.version ⇒ Object



12
13
14
# File 'lib/alphabets/version.rb', line 12

## Returns the library version string held in the VERSION constant.
def self.version
  VERSION
end