Module: TextUtils::StringFilter
Constant Summary collapse
# Single-character transliteration table used by asciify: each key is exactly
# one character (codepoint) and each value is the ASCII text that replaces it.
# Lookups are done character by character, so a key that is longer than one
# character can never match.
#
# TODO: add unicode codepoint name ???
ASCIIFY_MAPPINGS = {
  'ß' => 'ss',  # -- Latin small letter sharp s (ess-zed); see German Eszett // ß
  'æ' => 'ae',  # -- Latin small letter ae (Latin small ligature ae) // æ
  'ä' => 'ae',  # -- Latin small letter a with diaeresis // ä
  'ā' => 'a',   # e.g. Liepājas, Kāṭhmāḍaũ
  'á' => 'a',   # e.g. Bogotá, Králové
  'à' => 'a',   # e.g. Thành Phố Hồ Chí Minh [Saigon]
  'ã' => 'a',   # e.g. São Paulo
  'â' => 'a',   # e.g. Goiânia
  'å' => 'a',   # e.g. Vålerenga
  'ą' => 'a',   # e.g. Śląsk
  'ă' => 'a',   # e.g. Chișinău
  'ç' => 'c',   # e.g. São Gonçalo, Iguaçu, Neftçi
  'ć' => 'c',   # e.g. Budućnost
  'č' => 'c',   # e.g. Tradiční, Výčepní
  'ḍ' => 'd',   # e.g. Kāṭhmāḍaũ [Kathmandu]
  'é' => 'e',   # e.g. Vélez, Králové
  'è' => 'e',   # e.g. Rivières
  'ê' => 'e',   # e.g. Grêmio
  'ě' => 'e',   # e.g. Budějovice
  'ĕ' => 'e',   # e.g. Svĕtlý
  'ė' => 'e',   # e.g. Vėtra
  'ë' => 'e',   # e.g. Skënderbeu
  'ğ' => 'g',   # e.g. Qarabağ
  'ḥ' => 'h',   # e.g. Ad-Dawḥah [Doha]
  'ì' => 'i',   # e.g. Potosì
  'í' => 'i',   # e.g. Ústí
  'ï' => 'i',   # e.g. El Djazaïr
  'ī' => 'i',   # e.g. Al-Iskandarīyah [Alexandria]
  'ł' => 'l',   # e.g. Wisła, Wrocław
  'ñ' => 'n',   # e.g. Porteño
  'ň' => 'n',   # e.g. Plzeň, Třeboň
  'ö' => 'oe',
  'ő' => 'o',   # e.g. Győri
  'ó' => 'o',   # e.g. Colón, Łódź, Kraków
  'õ' => 'o',   # e.g. Nõmme
  'ô' => 'o',   # e.g. Amazônia (pt)
  'ō' => 'o',   # e.g. Tōkyō, Pishōr
  'ŏ' => 'o',   # e.g. P'yŏngyang [Pyongyang]
  'ø' => 'o',   # e.g. Fuglafjørdur, København
  'ố' => 'o',   # e.g. Thành Phố Hồ Chí Minh [Saigon]
  'ồ' => 'o',   # e.g. Thành Phố Hồ Chí Minh [Saigon]
  'ộ' => 'o',   # e.g. Hà Nội [Hanoi]
  'ř' => 'r',   # e.g. Třeboň
  'ș' => 's',   # e.g. Chișinău, București
  'ş' => 's',   # e.g. Beşiktaş
  'š' => 's',   # e.g. Košice
  'ṣ' => 's',   # e.g. Al-Mawṣil [Mosul]
  'ť' => 't',   # e.g. Měšťan
  'ṭ' => 't',   # e.g. Al-Kharṭūm [Khartoum], Kāṭhmāḍaũ
  'ț' => 't',   # e.g. Bistrița (ro)
  'ü' => 'ue',
  'ú' => 'u',   # e.g. Fútbol
  'ù' => 'u',   # e.g. Xyauyù (it)
  'ū' => 'u',   # e.g. Sūduva
  'ů' => 'u',   # e.g. Sládkův
  'ũ' => 'u',   # e.g. Kāṭhmāḍaũ [Kathmandu]
  'ı' => 'u',   # e.g. Bakı  # use u?? (Baku) why-why not?
  'ý' => 'y',   # e.g. Nefitrovaný
  'ź' => 'z',   # e.g. Łódź
  'ž' => 'z',   # e.g. Domžale, Petržalka
  'ż' => 'z',   # e.g. Lomża (polish)

  'Æ' => 'Ae',  # -- Latin capital letter AE
  'Á' => 'A',   # e.g. Águila (es)
  'Å' => 'A',   # e.g. Åland Islands -- Latin capital letter A with ring above // Å
  'Ç' => 'C',   # -- Latin capital letter C with cedilla -- Ç
  'Č' => 'C',   # e.g. České
  'Ḥ' => 'H',   # e.g. Ḥalab [Aleppo]
  'Ḫ' => 'H',   # e.g. Ḫamīs Mušayṭ
  'İ' => 'I',   # e.g. İnter
  'Í' => 'I',   # e.g. ÍBV
  'Ł' => 'L',   # e.g. Łódź
  'Ö' => 'Oe',  # e.g. Örebro -- Latin capital letter O with diaeresis // Ö
  'Ō' => 'O',   # e.g. Ōsaka [Osaka]
  'Ø' => 'O',   # e.g. Nogne Ø Imperial Stout (no) -- Latin capital letter O with stroke (Latin capital letter O slash) // Ø
  'Ř' => 'R',   # e.g. Řezák
  'Ś' => 'S',   # e.g. Śląsk
  'Š' => 'S',   # e.g. MŠK -- Latin capital letter S with caron // Š
  'Ş' => 'S',   # e.g. Şüvälan
  'Ṣ' => 'S',   # e.g. Ṣan'ā' [Sana'a]
  'Ṭ' => 'T',   # e.g. Ṭarābulus [Tripoli]
  'Ü' => 'Ue',  # e.g. Übelbach
  'Ú' => 'U',   # e.g. Ústí, Újpest
  'Ž' => 'Z',   # e.g. Žilina
  'Ż' => 'Z',   # e.g. Żywiec (polish) -- Latin capital letter Z with dot above

  # Combining caron (U+030C), e.g. the second codepoint of the "x with caron"
  # in Pexawar [Peshawar]. It is written as an escape because some editors
  # mess up the combined form. Character iteration yields the base letter x
  # and this mark separately, and the base letter is already copied through
  # as is, so the mark itself must map to nothing (mapping it to 'x' would
  # double the letter).
  "\u{030C}" => ''
}
Instance Method Summary collapse
Instance Method Details
#asciify(content, options = {}) ⇒ Object
132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/textutils/filter/string_filter.rb', line 132

# Transliterates +content+ character by character: every character that has
# an entry in ASCIIFY_MAPPINGS is replaced by the mapped text, every other
# character is copied through as is (no mapping).
#
# @param content [String] the text to convert
# @param options [Hash] accepted for interface compatibility; not read here
# @return [String] a newly built string; +content+ is not modified
def asciify( content, options={} )
  # fetch with a default does the lookup once per character; building the
  # result with map/join avoids appending to a string literal.
  content.each_char.map { |c| ASCIIFY_MAPPINGS.fetch( c, c ) }.join
end
#slugify(content, options = {}) ⇒ Object
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/textutils/filter/string_filter.rb', line 144

# Builds a dash-separated, lowercase slug from +content+.
#
# NOTE: for now this does NOT strip arbitrary non-word characters; only the
# characters listed below are handled. If an accented char should be
# converted, add it to the asciify mappings first.
#
# Steps:
#   1. asciify and downcase
#   2. replace the special chars $&%?!§#=*+._/()[]{} with a space and
#      remove double and single quotes
#   3. strip leading and trailing whitespace; squeeze runs of spaces
#   4. replace the remaining (inner) spaces with a dash (-)
#
# @param content [String] the text to convert
# @param options [Hash] accepted for interface compatibility; not read here
# @return [String] the slug
def slugify( content, options={} )
  # 1) asciify and downcase
  slug = asciify( content ).downcase

  # 2) special chars become a space (so they act as word separators);
  #    quotes are removed without leaving a gap
  slug = slug.gsub( /[$&%?!§#=*+._\/\(\)\[\]\{\}]/, ' ' )
  slug = slug.delete( %q{"'} )

  # 3) trim the ends, then collapse runs of spaces into a single space
  slug = slug.strip.squeeze( ' ' )

  # 4) the remaining single spaces become dashes
  slug.tr( ' ', '-' )
end