Module: TextUtils::StringFilter

Included in:
TextUtils, Filter
Defined in:
lib/textutils/filter/string_filter.rb

Constant Summary collapse

# Lookup table used to transliterate accented / non-ASCII Latin characters
# into plain ASCII: each key is ONE character (one Unicode codepoint), each
# value is the ASCII text that replaces it (zero, one or two letters).
#
# NOTE: keys must be exactly one character long -- a key that is empty or
#   that carries a stray space can never match a single character.
#   Characters that editors / converters tend to mangle are written as
#   \u{....} codepoint escapes instead of literal characters.
#
# todo: add unicode codepoint name ???
ASCIIFY_MAPPINGS = {
    'ß' => 'ss',  # -- Latin small letter sharp s (ess-zed); see German Eszett // &szlig

    'æ' => 'ae',  # -- Latin small letter ae (Latin small ligature ae) // &aelig
    'ä' => 'ae',  # -- Latin small letter a with diaeresis // &auml
    'ā' => 'a',  # e.g. Liepājas, Kāṭhmāḍaũ
    'á' => 'a',  # e.g. Bogotá, Králové
    'à' => 'a',  # e.g. Thành Phố Hồ Chí Minh [Saigon]
    'ã' => 'a',  # e.g  São Paulo
    'â' => 'a',  # e.g  Goiânia
    'å' => 'a',  # e.g. Vålerenga
    'ą' => 'a',  # e.g. Śląsk
    'ă' => 'a',  # e.g. Chișinău

    'ç' => 'c',  # e.g. São Gonçalo, Iguaçu, Neftçi
    'ć' => 'c',  # e.g. Budućnost
    'č' => 'c',  # e.g. Tradiční, Výčepní

    "\u{1E0D}" => 'd',  # ḍ (d with dot below) -- e.g. Kāṭhmāḍaũ [Kathmandu]

    'é' => 'e',  # e.g. Vélez, Králové
    'è' => 'e',  # e.g. Rivières
    'ê' => 'e',  # e.g. Grêmio
    'ě' => 'e',  # e.g. Budějovice
    'ĕ' => 'e',  # e.g. Svĕtlý
    'ė' => 'e',  # e.g. Vėtra
    'ë' => 'e',  # e.g. Skënderbeu

    'ğ' => 'g',  # e.g. Qarabağ

    "\u{1E25}" => 'h',  # ḥ (h with dot below) -- e.g. Ad-Dawḥah [Doha]

    'ì' => 'i',  # e.g. Potosì
    'í' => 'i',  # e.g. Ústí
    'ï' => 'i',  # e.g. El Djazaïr
    'ī' => 'i',  # e.g. Al-Iskandarīyah [Alexandria]

    'ł' => 'l',  # e.g. Wisła, Wrocław
    'ñ' => 'n',  # e.g. Porteño
    'ň' => 'n',  # e.g. Plzeň, Třeboň

    'ö' => 'oe',
    'ő' => 'o',  # e.g. Győri
    'ó' => 'o',  # e.g. Colón, Łódź, Kraków
    'õ' => 'o',  # e.g. Nõmme
    'ô' => 'o',  # e.g. Amazônia (pt)
    'ō' => 'o',  # e.g. Tōkyō, Pishōr
    'ŏ' => 'o',  # e.g. P'yŏngyang [Pyongyang]
    'ø' => 'o',  # e.g. Fuglafjørdur, København
    "\u{1ED1}" => 'o',  # ố (o with circumflex and acute) -- e.g. Thành Phố Hồ Chí Minh [Saigon]
    "\u{1ED3}" => 'o',  # ồ (o with circumflex and grave) -- e.g. Thành Phố Hồ Chí Minh [Saigon]
    "\u{1ED9}" => 'o',  # ộ (o with circumflex and dot below) -- e.g. Hà Nội [Hanoi]

    'ř' => 'r',  # e.g. Třeboň

    'ș' => 's',  # e.g. Chișinău, București
    'ş' => 's',  # e.g. Beşiktaş
    'š' => 's',  # e.g. Košice
    "\u{1E63}" => 's',  # ṣ (s with dot below) -- e.g. Al-Mawṣil [Mosul]

    'ť' => 't',  # e.g. Měšťan
    "\u{1E6D}" => 't',  # ṭ (t with dot below) -- e.g. Al-Kharṭūm [Khartoum], Kāṭhmāḍaũ
    'ț' => 't',  # e.g. Bistrița (ro)

    'ü' => 'ue',
    'ú' => 'u',  # e.g. Fútbol
    'ù' => 'u',  # e.g. Xyauyù (it)
    'ū' => 'u',  # e.g. Sūduva
    'ů' => 'u',  # e.g. Sládkův
    'ũ' => 'u',  # e.g. Kāṭhmāḍaũ [Kathmandu]

    'ı' => 'u',  # e.g. Bakı   # use u?? (Baku) why-why not?

    'ý' => 'y',  # e.g. Nefitrovaný
    'ź' => 'z',  # e.g. Łódź
    'ž' => 'z',  # e.g. Domžale, Petržalka
    'ż' => 'z',  # e.g. Lomża  (polish)

    'Æ' => 'Ae', # -- Latin capital letter AE
    'Á' => 'A',  # e.g. Águila (es)
    'Å' => 'A',  # e.g. Åland Islands -- Latin capital letter A with ring above // Å

    'Ç' => 'C',  # --  Latin capital letter C with cedilla -- &Ccedil
    'Č' => 'C',  # e.g. České

    "\u{1E24}" => 'H',  # Ḥ (H with dot below) -- e.g. Ḥalab [Aleppo]
    "\u{1E2A}" => 'H',  # Ḫ (H with breve below) -- e.g. Ḫamīs Mušayṭ
    'İ' => 'I',  # e.g. İnter
    'Í' => 'I',  # e.g. ÍBV
    'Ł' => 'L',  # e.g. Łódź

    'Ö' => 'Oe', # e.g. Örebro -- Latin capital letter O with diaeresis // Ö
    'Ō' => 'O',  # e.g. Ōsaka [Osaka]
    'Ø' => 'O',  # e.g. Nogne Ø Imperial Stout (no) -- Latin capital letter O with stroke (Latin capital letter O slash) // &Oslash

    'Ř' => 'R',  # e.g. Řezák

    'Ś' => 'S',  # e.g. Śląsk
    'Š' => 'S',  # e.g. MŠK   -- Latin capital letter S with caron // Š
    'Ş' => 'S',  # e.g. Şüvälan
    "\u{1E62}" => 'S',  # Ṣ (S with dot below) -- e.g. Ṣan'ā' [Sana'a]

    "\u{1E6C}" => 'T',  # Ṭ (T with dot below) -- e.g. Ṭarābulus [Tripoli]

    'Ü' => 'Ue', # e.g. Übelbach
    'Ú' => 'U',  # e.g. Ústí, Újpest

    'Ž' => 'Z',  # e.g. Žilina
    'Ż' => 'Z',   # e.g. Żywiec (polish) -- Latin capital letter Z with dot above

    # Combining caron, e.g. the mark on the x in Pex̌awar [Peshawar].
    # The base letter is a separate character that is already kept as is,
    # so the mark itself is dropped; mapping it to 'x' would double the
    # letter ("Pexxawar").
    # note: use unicode codepoint as some editors mess up 'x̌'
    "\u{030C}" => '',
}

Instance Method Summary collapse

Instance Method Details

#asciify(content, options = {}) ⇒ Object



132
133
134
135
136
137
138
139
140
141
142
# File 'lib/textutils/filter/string_filter.rb', line 132

# Transliterates +content+ character by character: a character that has an
# entry in ASCIIFY_MAPPINGS is replaced by that entry's value, every other
# character is copied through untouched.
#
# @param content [String] text to transliterate
# @param options [Hash] accepted but not used by this method
# @return [String] a newly built string holding the result
def asciify( content, options={} )
  content.each_char.inject( '' ) do |ascii, char|
    # no mapping => keep the character as it is
    ascii << ASCIIFY_MAPPINGS.fetch( char, char )
  end
end

#slugify(content, options = {}) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/textutils/filter/string_filter.rb', line 144

# Builds a lowercase, dash-separated slug from +content+.
#
# NOTE: non-word characters in general are NOT stripped -- only the
#   punctuation listed in the patterns below is touched. An accented
#   character that should be converted has to be handled by asciify first.
#
# Steps:
#   1) asciify and downcase
#   2) turn the listed special characters into a space, drop quotes
#   3) trim leading/trailing whitespace, collapse runs of spaces
#   4) turn every remaining (inner) space into a dash (-)
#
# @param content [String] text to turn into a slug
# @param options [Hash] accepted but not used by this method
# @return [String] the slug
def slugify( content, options={} )
  separators = /[$&%?!§#=*+._\/\(\)\[\]\{\}]/   # special chars that act as a word break
  quotes     = /["']/                            # removed without leaving a gap
  space_runs = / {2,}/                           # two or more spaces in a row

  asciify( content )
    .downcase
    .gsub( separators, ' ' )
    .gsub( quotes, '' )
    .strip
    .gsub( space_runs, ' ' )
    .gsub( ' ', '-' )
end