Class: Alphabet

Inherits:
Object
  • Object
show all
Defined in:
lib/alphabets/version.rb,
lib/alphabets/utils.rb,
lib/alphabets/reader.rb,
lib/alphabets/alphabets.rb

Overview

todo/check: use a module named Alphabets (plural, with a trailing s) to keep version and banner separate from this class - why? why not?

Defined Under Namespace

Classes: Reader, Unaccenter

Constant Summary collapse

MAJOR =

todo: namespace inside version or something - why? why not??

1
MINOR =
0
PATCH =
0
VERSION =
[MAJOR,MINOR,PATCH].join('.')
UNACCENT =

“simple” unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping

Reader.parse( <<TXT )
    Ä A   ä a
    Á A   á a
    À A   à a
    Ã A   ã a
    Â A   â a
    Å A   å a
    Æ AE  æ ae   # ae ligature
          ā a
          ă a
    Ą A   ą a    # ą - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK

    Ç C   ç c    # ç - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA
    Ć C   ć c
    Č C   č c

    Ď D   ď d
    Ð D   ð d    # iceland - d

    É E   é e
    È E   è e
    Ê E   ê e
    Ë E   ë e
          ė e
    Ę E   ę e
    Ě E   ě e

          ğ g

    İ I
    Í I   í i
    Ì I   ì i
    Î I   î i
          ī i
          ı i    # ı - U+0131 (305) - LATIN SMALL LETTER DOTLESS I
    Ï I   ï i

    Ł L   ł l

    Ñ N   ñ n
    Ń N   ń n
    Ň N   ň n

    Ö O   ö o
    Ő O   ő o    # hungarian - use OE/oe  - why? (it's not a ligature) why not?
    Ó O   ó o
    Ò O   ò o
    Õ O   õ o
    Ô O   ô o
          ø o
    Œ OE  œ oe   # oe ligature

    Ř R   ř r

    Ś S   ś s
    Ş S   ş s   # ş - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA
    Ș S   ș s   # ș - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW
    Š S   š s
          ß ss  # ß - U+00DF (223) - LATIN SMALL LETTER SHARP S

    Ţ T   ţ t   # ţ - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA
    Ț T   ț t   # ț - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW
    Ť T   ť t

    Þ P   þ p   # þ - U+00FE (254) - LATIN SMALL LETTER THORN
                #### fix/check!!!! icelandic - use p is p or th - why? why not?

    Ü U   ü u
    Ú U   ú u
    Ù U   ù u
          ū u
    Ů U   ů u
    Û U   û u

    Ý Y   ý y
    Ÿ Y   ÿ y

    Ź Z   ź z
    Ż Z   ż z
    Ž Z   ž z
TXT
UNACCENT_DE =

de,at,ch translation for umlauts

Reader.parse( <<TXT )
    Ä AE  ä ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.VÖST => VOEST or Ö => OE
    Ö OE  ö oe
    Ü UE  ü ue
          ß ss
TXT
## Lookup table (uppercase letter => lowercase letter) used for downcasing names:
##   the plain ASCII letters A-Z are generated, the int'l letters are listed
##   by hand in the text table below and merged in on top.
##   note: Reader.parse is defined elsewhere - presumably it returns a Hash
##         with one entry per letter pair; confirm against the Reader class.
DOWNCASE =
  ('A'..'Z').each_with_object({}) do |letter, table|
    table[letter] = letter.downcase
  end.merge( Reader.parse( <<TXT ) )
    Ä ä
    Á á
    À à
    Â â
    Å å
    Æ æ   # LATIN LETTER AE  - ae ligature
    Ą ą
    Ã ã

    Ç ç   # LATIN LETTER C WITH CEDILLA
    Č č
    Ć ć

    Ď ď

    Ð ð    # iceland - d

    É é
    È è
    Ë ë
    Ê ê
    Ę ę
    Ě ě

    İ i
    Í í
    Ì ì
    Ï ï
    Î î

    Ł ł

    Ń ń
    Ň ň
    Ñ ñ

    Ö ö
    Ő ő
    Œ œ   # LATIN LIGATURE OE
    Ó ó
    Ò ò
    Ô ô
    Õ õ

    Þ þ    # iceland - p

    Ř ř

    Ś ś
    Ş ş   # LATIN LETTER S WITH CEDILLA
    Ș ș   # LATIN LETTER S WITH COMMA BELOW
    Š š

    Ţ ţ   # LATIN LETTER T WITH CEDILLA
    Ț ț   # LATIN LETTER T WITH COMMA BELOW
    Ť ť

    Ü ü
    Ú ú
    Ù ù
    Ů ů
    Û û

    Ý ý
    Ÿ ÿ

    Ž ž
    Ż ż
    Ź ź
TXT

Class Method Summary collapse

Class Method Details



16
17
18
# File 'lib/alphabets/version.rb', line 16

## Builds the one-line banner for this library:
##   the library version followed by details about the running Ruby
##   (version, release date and platform).
def self.banner
  ruby_info = "Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  "alphabets/#{VERSION} on #{ruby_info}"
end

.count(freq, mapping_or_chars) ⇒ Object



14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/alphabets/utils.rb', line 14

## Sums up how often the given characters occur according to a frequency table.
##
##  freq             - frequency table (char => count);
##                     characters missing from the table count as zero
##  mapping_or_chars - the characters to look up, passed in as
##                       a Hash   (its keys are used) or
##                       a String (split into its characters) or
##                       any other list of characters (e.g. an Array)
##
##  returns the total of all looked-up counts (0 if there are no characters)
def self.count( freq, mapping_or_chars )
  chars = case mapping_or_chars
          when Hash   then mapping_or_chars.keys
          when String then mapping_or_chars.each_char
          else             mapping_or_chars   ## assume it's an array/list of characters
          end

  ## note: fall back to 0 so that a plain Hash (without a default value)
  ##        works as a frequency table too
  chars.reduce( 0 ) { |total, ch| total + (freq[ch] || 0) }
end

.downcase_i18n(name) ⇒ Object

our very own downcase for int’l characters / letters



70
71
72
# File 'lib/alphabets/utils.rb', line 70

## Our very own downcase for int'l characters / letters:
##   every character of name that has an entry in the DOWNCASE table
##   gets replaced by its lowercase letter (via sub).
def self.downcase_i18n( name )
  table = DOWNCASE
  sub( name, table )
end

.find_unaccenter(key) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/alphabets/utils.rb', line 51

## Looks up the unaccenter for a language key;
##   unaccenters get built on first use and are cached for later calls.
##
##  key - :de selects the unaccenter built from UNACCENT_DE;
##        any other key falls back to the default one built from UNACCENT
def self.find_unaccenter( key )
  return (@de ||= Unaccenter.new( UNACCENT_DE ))   if key == :de

  ## todo/check: what to call the fallback?
  ##  use uni(versal) or unicode or something - why? why not?
  ##  use all or int'l (international) - why? why not?
  ##  use en  (english) - why? why not?
  @default ||= Unaccenter.new( UNACCENT )
end

.frequency_table(name) ⇒ Object

todo/check: use/rename to char_frequency_table



4
5
6
7
8
9
10
11
# File 'lib/alphabets/utils.rb', line 4

## Calculates the frequency table of the characters (letters, digits, etc.) in name.
##   todo/check: use/rename to char_frequency_table
##
##  returns a Hash (char => number of occurrences);
##    looking up a character that does not occur returns 0
def self.frequency_table( name )
  name.each_char.each_with_object( Hash.new(0) ) do |ch, table|
    table[ch] += 1
  end
end

.root ⇒ Object



20
21
22
# File 'lib/alphabets/version.rb', line 20

## Returns the absolute path of the directory three levels up from this file,
##   that is, for lib/alphabets/version.rb the directory that holds lib/.
def self.root
  file_dir   = File.dirname( __FILE__ )
  parent_dir = File.dirname( file_dir )
  File.expand_path( File.dirname( parent_dir ) )
end

.sub(name, mapping) ⇒ Object

todo/check: use a different/better name - gsub/map/replace/fold/… - why? why not?



28
29
30
31
32
33
34
35
36
37
38
# File 'lib/alphabets/utils.rb', line 28

## Replaces every character of name that has an entry in mapping.
##   todo/check: use a different/better name - gsub/map/replace/fold/... - why? why not?
##
##  name    - the String to translate (is not modified)
##  mapping - lookup table (char => replacement string);
##            characters without a (truthy) entry are copied over as-is
##
##  returns a new String
def self.sub( name, mapping )
  ## note: a bare String.new is tagged as binary (ASCII-8BIT);
  ##        start out with the encoding of the input instead, so that
  ##        empty or ascii-only results do not come back as binary strings
  buf = String.new.force_encoding( name.encoding )
  name.each_char do |ch|
    buf << (mapping[ch] || ch)
  end
  buf
end

.unaccent(name) ⇒ Object



64
65
66
67
# File 'lib/alphabets/utils.rb', line 64

## Unaccents name using the default unaccenter (built from the UNACCENT table);
##   the unaccenter gets built on first use and is cached in @default.
def self.unaccent( name )
  unaccenter = (@default ||= Unaccenter.new( UNACCENT ))
  unaccenter.unaccent( name )
end

.version ⇒ Object



12
13
14
# File 'lib/alphabets/version.rb', line 12

## Returns the library version, that is, the VERSION constant
##   (the MAJOR, MINOR and PATCH numbers joined by dots).
def self.version
  VERSION
end