Class: Alphabet

Inherits:

Object

Object
Alphabet

show all

Defined in: lib/alphabets/version.rb,
lib/alphabets/utils.rb,
lib/alphabets/reader.rb,
lib/alphabets/alphabets.rb

Overview

todo/check: use a module named Alphabets (plural, with a trailing "s") to keep version and banner separate from this class - why? why not?

Defined Under Namespace

Classes: Reader, Unaccenter

Constant Summary collapse

MAJOR = todo: namespace inside version or something - why? why not??

MINOR =

PATCH =

VERSION =

# Version string: MAJOR, MINOR and PATCH joined with dots.
[MAJOR,MINOR,PATCH].join('.')

UNACCENT = “simple” unaccent (remove accents/diacritics and unfold ligatures) translation table / mapping

# "Simple" unaccent table handed to Reader.parse as text. Each data line appears to
# list "character replacement" pairs (an uppercase pair first, then a lowercase one),
# with "#" starting a trailing note - NOTE(review): confirm against Reader.
# The capitals U+0162 and U+021A now map to "T" (was lowercase "t"), matching every
# other uppercase entry in the table.
Reader.parse( "    \u00C4 A   \u00E4 a\n    \u00C1 A   \u00E1 a\n          \u00E0 a\n          \u00E3 a\n          \u00E2 a\n    \u00C5 A   \u00E5 a\n    \u00C6 AE  \u00E6 ae   # ae ligature\n          \u0101 a\n          \u0103 a\n          \u0105 a    # \u0105 - U+0105 (261) - LATIN SMALL LETTER A WITH OGONEK\n\n    \u00C7 C   \u00E7 c    # \u00E7 - U+00E7 (231) - LATIN SMALL LETTER C WITH CEDILLA\n          \u0107 c\n    \u010C C   \u010D c\n\n    \u00C9 E   \u00E9 e\n          \u00E8 e\n          \u00EA e\n          \u00EB e\n          \u0117 e\n          \u0119 e\n\n          \u011F g\n\n    \u0130 I\n    \u00CD I   \u00ED i\n          \u00EE i\n          \u012B i\n          \u0131 i    # \u0131 - U+0131 (305) - LATIN SMALL LETTER DOTLESS I\n\n    \u0141 L   \u0142 l\n\n          \u00F1 n\n          \u0144 n\n          \u0148 n\n\n    \u00D6 O   \u00F6 o\n          \u00F3 o\n          \u00F2 o\n          \u00F5 o\n          \u00F4 o\n          \u00F8 o\n          \u0151 o\n    \u0152 OE  \u0153 oe   # oe ligature\n\n          \u0159 r\n\n    \u015A S   \u015B s\n    \u015E S   \u015F s   # \u015F - U+015F (351) - LATIN SMALL LETTER S WITH CEDILLA\n    \u0218 S   \u0219 s   # \u0219 - U+0219 (537) - LATIN SMALL LETTER S WITH COMMA BELOW\n    \u0160 S   \u0161 s\n          \u00DF ss  # \u00DF - U+00DF (223) - LATIN SMALL LETTER SHARP S\n\n    \u0162 T   \u0163 t   # \u0163 - U+0163 (355) - LATIN SMALL LETTER T WITH CEDILLA\n    \u021A T   \u021B t   # \u021B - U+021B (539) - LATIN SMALL LETTER T WITH COMMA BELOW\n\n          \u00FE p   # \u00FE - U+00FE (254) - LATIN SMALL LETTER THORN\n                #### fix/check!!!! icelandic - use p is p or th - why? why not?\n\n    \u00DC U   \u00FC u\n    \u00DA U   \u00FA u\n          \u016B u\n\n          \u00FD y\n\n          \u017A z\n          \u017C z\n    \u017D Z   \u017E z\n" )

UNACCENT_DE = de,at,ch translation for umlauts

# de,at,ch table handed to Reader.parse as text: umlauts expand to two letters
# (upcase AE/OE/UE for the capitals, per the note inside the data) and sharp s to "ss".
Reader.parse( "    \u00C4 AE  \u00E4 ae    ### note: Use upcase AE, OE, UE and NOT titlecase Ae, Oe, Ue - why? why not? e.g.V\u00D6ST => VOEST or \u00D6 => OE\n    \u00D6 OE  \u00F6 oe\n    \u00DC UE  \u00FC ue\n          \u00DF ss\n" )

DOWNCASE = todo/check: add UNACCENT_ES - why? why not? Is "Espanyol" the Catalan spelling or the Spanish (Castilian) one? ‘ñ’=>‘ny’, ## e.g. Español => Espanyol

# Downcase table: the plain letters A-Z mapped to a-z, merged with the
# uppercase => lowercase pairs that Reader.parse returns for the text below.
('A'..'Z').each_with_object( {} ) do |letter, table|
  table[letter] = letter.downcase
end.merge( Reader.parse( "    \u00C4 \u00E4\n    \u00C1 \u00E1\n    \u00C5 \u00E5\n    \u00C6 \u00E6   # LATIN LETTER AE  - ae ligature\n\n    \u00C7 \u00E7   # LATIN LETTER C WITH CEDILLA\n    \u010C \u010D\n\n    \u00C9 \u00E9\n\n    \u0130 i\n    \u00CD \u00ED\n\n    \u0141 \u0142\n\n    \u00D6 \u00F6\n    \u0152 \u0153   # LATIN LIGATURE OE\n\n    \u015A \u015B\n    \u015E \u015F   # LATIN LETTER S WITH CEDILLA\n    \u0218 \u0219   # LATIN LETTER S WITH COMMA BELOW\n    \u0160 \u0161\n\n    \u0162 \u0163   # LATIN LETTER T WITH CEDILLA\n    \u021A \u021B   # LATIN LETTER T WITH COMMA BELOW\n\n    \u00DC \u00FC\n    \u00DA \u00FA\n\n    \u017D \u017E\n" ) )

Class Method Summary collapse

.banner ⇒ Object
.count(freq, mapping_or_chars) ⇒ Object
.downcase_i18n(name) ⇒ Object

our very own downcase for int’l characters / letters.
.find_unaccenter(key) ⇒ Object
.frequency_table(name) ⇒ Object

todo/check: use/rename to char_frequency_table.
.root ⇒ Object
.sub(name, mapping) ⇒ Object

todo/check: use a different/better name - gsub/map/replace/fold/…
.unaccent(name) ⇒ Object
.version ⇒ Object

Class Method Details



16
17
18

# File 'lib/alphabets/version.rb', line 16

# Builds a one-line banner naming this library's version and the running Ruby.
#
# @return [String] "alphabets/<VERSION> on Ruby <version> (<release date>) [<platform>]"
def self.banner
  ruby_details = "#{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]"
  "alphabets/#{VERSION} on Ruby #{ruby_details}"
end

.count(freq, mapping_or_chars) ⇒ `Object`

# File 'lib/alphabets/utils.rb', line 14

# Sums the frequencies of a set of characters.
#
# @param freq [Hash{String=>Integer}] per-character counts (e.g. the result of
#   frequency_table); characters missing from the table count as zero
# @param mapping_or_chars [Hash, Array<String>, String] the characters to look up:
#   the keys of a mapping, a list of characters, or a string (one lookup per character)
# @return [Integer] total number of occurrences of the given characters
def self.count( freq, mapping_or_chars )
  chars = case mapping_or_chars
          when Hash   then mapping_or_chars.keys
          when String then mapping_or_chars.chars   ## a String has no #reduce - split into characters
          else             mapping_or_chars         ## assume it's an array/list of characters
          end

  ## "|| 0" keeps a table without a default value from raising on unseen characters
  chars.reduce( 0 ) { |total, ch| total + (freq[ch] || 0) }
end

.downcase_i18n(name) ⇒ `Object`

our very own downcase for int’l characters / letters



70
71
72

# File 'lib/alphabets/utils.rb', line 70

# Our very own downcase for int'l characters / letters: hands +name+ to sub
# together with the DOWNCASE table.
#
# @param name [String] text to downcase
# @return [Object] whatever sub returns for +name+ and DOWNCASE
def self.downcase_i18n( name )    ## our very own downcase for int'l characters / letters
  sub( name, DOWNCASE )
end

.find_unaccenter(key) ⇒ `Object`

# File 'lib/alphabets/utils.rb', line 51

# Looks up the unaccenter for a language key, building each one on first use.
#
# @param key [Symbol] :de selects the UNACCENT_DE table; any other value
#   falls back to the default UNACCENT table
# @return [Unaccenter] the memoized instance for the selected table
def self.find_unaccenter( key )
  return (@de ||= Unaccenter.new( UNACCENT_DE ))  if key == :de

  ## use uni(versal) or unicode or something - why? why not?
  ##  use all or int'l (international) - why? why not?
  ##  use en  (english) - why? why not?
  @default ||= Unaccenter.new( UNACCENT )
end

.frequency_table(name) ⇒ `Object`

todo/check: use/rename to char_frequency_table

# File 'lib/alphabets/utils.rb', line 4

# Tallies how often each character (letter, digit, etc.) occurs in +name+.
#
# @param name [String] text to analyse
# @return [Hash{String=>Integer}] character => occurrences; unseen characters read as 0
def self.frequency_table( name )   ## todo/check: use/rename to char_frequency_table
  name.each_char.each_with_object( Hash.new(0) ) do |letter, tally|
    tally[letter] += 1
  end
end

.root ⇒ `Object`



20
21
22

# File 'lib/alphabets/version.rb', line 20

# Absolute path of the directory three levels above this source file.
#
# @return [String] expanded path
def self.root
  file_dir   = File.dirname( __FILE__ )
  parent_dir = File.dirname( file_dir )
  File.expand_path( File.dirname( parent_dir ) )
end

.sub(name, mapping) ⇒ `Object`

todo/check: use a different/better name - gsub/map/replace/fold/… - why? why not?

# File 'lib/alphabets/utils.rb', line 28

# Replaces every character of +name+ that has an entry in +mapping+ with the
# mapped text; all other characters are copied through unchanged.
#
# todo/check: use a different/better name - gsub/map/replace/fold/... - why? why not?
#
# @param name [String] text to translate, read one character at a time
# @param mapping [Hash{String=>String}] character => replacement
#   (a replacement may be longer than one character, e.g. a ligature => "AE")
# @return [String] a new string in the encoding of +name+; +name+ is not modified
def self.sub( name, mapping )
  ## start in name's encoding: a bare String.new is ASCII-8BIT and stays that way
  ## when only ASCII characters are appended (or nothing at all)
  buf = String.new.force_encoding( name.encoding )
  name.each_char do |ch|
    buf << (mapping[ch] || ch)
  end
  buf
end

.unaccent(name) ⇒ `Object`

# File 'lib/alphabets/utils.rb', line 64

# Unaccents +name+ with the default unaccenter, which is built from the
# UNACCENT table on first use and then reused.
#
# @param name [String] text to unaccent
# @return [Object] whatever Unaccenter#unaccent returns for +name+
def self.unaccent( name )
  unaccenter = (@default ||= Unaccenter.new( UNACCENT ))
  unaccenter.unaccent( name )
end

.version ⇒ `Object`



12
13
14

# File 'lib/alphabets/version.rb', line 12

# Accessor for the library version.
#
# @return [Object] the VERSION constant
def self.version
  VERSION
end

Class: Alphabet

Overview

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.banner ⇒ Object

.count(freq, mapping_or_chars) ⇒ Object

.downcase_i18n(name) ⇒ Object

.find_unaccenter(key) ⇒ Object

.frequency_table(name) ⇒ Object

.root ⇒ Object

.sub(name, mapping) ⇒ Object

.unaccent(name) ⇒ Object

.version ⇒ Object

.banner ⇒ `Object`