Module: TextUtils::TitleHelper

Included in: TextUtils

Defined in: lib/textutils/helper/title_helper.rb

Instance Method Summary collapse

#strip_part_markers(title) ⇒ Object
- TODO: consider moving these helpers into a new, additional submodule, e.g. TextUtils::Reader::TagHelper.
#strip_special_chars(title) ⇒ Object
#strip_subtitles(title) ⇒ Object
#strip_tags(title) ⇒ Object

TODO: use an alias or rename #strip_tags to something more descriptive?
#strip_translations(title) ⇒ Object
#strip_whitespaces(title) ⇒ Object
#title_esc_regex(title_unescaped) ⇒ Object
#title_to_key(title) ⇒ Object

Instance Method Details

#strip_part_markers(title) ⇒ `Object`

TODO: move these helpers into a new, additional submodule?
```
e.g. TextUtils::Reader::TagHelper
```
This would let us use “classic” web helpers à la Rails. A good name for the submodule still needs to be found: Reader? Fixtures? Values? Parser?

# File 'lib/textutils/helper/title_helper.rb', line 19

# Removes the optional part markers from a title; only the marker characters
# themselves ( < > ‹ › ) are dropped, the enclosed text is kept.
#
#   Bock ‹Damm›              =>  Bock Damm
#   ‹Estrella› ‹Damm› Inedit =>  Estrella Damm Inedit
#
# TODO: use a different name, e.g. strip_name_markers / strip_name_enclosure?
#
# @param title [String] title that may contain part markers
# @return [String] new string without any of the marker characters
def strip_part_markers(title)
  marker_chars = /[<>‹›]/
  title.gsub(marker_chars, '')
end

#strip_special_chars(title) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 57

# Removes the special characters % & ° + : $ from a title; every other
# character is left as is.
#
#   +Malta         =>  Malta
#   Minerva 8:60   =>  Minerva 860
#   $Alianz$ Arena =>  Alianz Arena
#
# @param title [String] title that may contain special characters
# @return [String] new string without the special characters
def strip_special_chars(title)
  special_chars = /[%&°+:$]/
  title.gsub(special_chars, '')
end

#strip_subtitles(title) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 34

# Removes every optional (longer) title part enclosed in round brackets.
#
#   Las Palmas (de Gran Canaria)  =>  Las Palmas
#   Palma (de Mallorca)           =>  Palma
#
# Only the bracketed part itself is removed: whitespace around it stays in
# place (the first example keeps its trailing space), and an empty pair "()"
# is not removed because at least one enclosed character is required.
#
# @param title [String] title that may contain parts in ()
# @return [String] new string with the bracketed parts removed
def strip_subtitles(title)
  subtitle_part = /\([^\)]+\)/
  title.gsub(subtitle_part, '')
end

#strip_tags(title) ⇒ `Object`

TODO: use an alias or rename this method to something more descriptive?

# File 'lib/textutils/helper/title_helper.rb', line 42

# Removes every optional title part enclosed in curly brackets.
#
#   Ottakringer {Bio}         =>  Ottakringer
#   Ottakringer {Alkoholfrei} =>  Ottakringer
#
# Only the bracketed part itself is removed; whitespace around it stays in
# place and an empty pair "{}" is left untouched.
#
# TODO: use an alias or rename for a better name?
# TODO: use for autotags? e.g. {Bio} => bio
#
# @param title [String] title that may contain parts in {}
# @return [String] new string with the bracketed parts removed
def strip_tags(title)
  tag_part = /\{[^\}]+\}/
  title.gsub(tag_part, '')
end

#strip_translations(title) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 27

# Removes every optional (english) translation enclosed in square brackets.
#
#   Wien [Vienna]  =>  Wien
#
# Only the bracketed part itself is removed; whitespace around it stays in
# place and an empty pair "[]" is left untouched.
#
# @param title [String] title that may contain translations in []
# @return [String] new string with the bracketed parts removed
def strip_translations(title)
  translation_part = /\[[^\]]+\]/
  title.gsub(translation_part, '')
end

#strip_whitespaces(title) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 52

# Removes spaces, tabs and a fixed set of punctuation characters from a title:
# underscore, hyphen, dot, exclamation mark, round and square brackets,
# single and double quotes, the typographic apostrophe (’) and the slash.
# Characters outside this list (e.g. newlines or commas) are kept.
#
# @param title [String] title to compact
# @return [String] new string without the listed characters
def strip_whitespaces(title)
  space_and_punctuation = /[ \t_\-\.!()\[\]'"’\/]/
  title.gsub(space_and_punctuation, '')
end

#title_esc_regex(title_unescaped) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 90

# Turns a plain title into the source string of a regex that matches it.
#
# Step 1 backslash-escapes the regex metacharacters in the title so that they
# match literally:  . ( ) ? * + $ ^ [ ] { } | \
#
# Step 2 widens a hyphen and a fixed set of accented letters into alternation
# groups, so that the pattern matches the title with or without accents and
# with a hyphen or a space, e.g. Blau-Weiß == Blau Weiss.
#
#   Benfica Lis.                    =>  Benfica Lis\.
#   Club Atlético Colón (Santa Fe)  =>  Club Atl(é|e)tico Col(ó|o)n \(Santa Fe\)
#   Bauer Anton (????)              =>  Bauer Anton \(\?\?\?\?\)
#
# @param title_unescaped [String] raw title
# @return [String] regex source (a String, not a Regexp)
def title_esc_regex( title_unescaped )
  ## NB: cannot use Regexp.escape! it would also escape the space ' ' to '\ '
  ##
  ## Escape all metacharacters in ONE pass: with one gsub per character the
  ## backslash itself could not be escaped without re-escaping the backslashes
  ## inserted by the other substitutions. Leaving [ ] { } | \ unescaped (as
  ## before) produced wrong or even invalid patterns, e.g. for "Wien [Vienna]".
  ## The block form is used so the replacement is taken literally.
  title = title_unescaped.gsub( /[.()?*+$^\[\]{}|\\]/ ) { |char| "\\#{char}" }

  ##  match accented chars with or without accents, e.g. (ü|ue);
  ##  also make the hyphen optional, i.e. (-| ), e.g. Blau-Weiss == Blau Weiss
  ##
  ## todo: add some more
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references  for more
  ##
  ##  reuse for all readers!
  alternatives = {
    '-' => '(-| )',   ## e.g. Blau-Weiß Linz
    'æ' => '(æ|ae)',
    'ä' => '(ä|ae)',
    'Ö' => '(Ö|Oe)',  ## e.g. Österreich
    'ö' => '(ö|oe)',  ## e.g. Mönchengladbach
    'ß' => '(ß|ss)',  ## e.g. Blau-Weiß Linz
    'ü' => '(ü|ue)',

    'á' => '(á|a)',   ## e.g. Bogotá, Sársfield
    'ã' => '(ã|a)',   ## e.g. São Paulo
    'ç' => '(ç|c)',   ## e.g. Fenerbahçe
    'é' => '(é|e)',   ## e.g. Vélez
    'ê' => '(ê|e)',   ## e.g. Grêmio
    'ï' => '(ï|i)',   ## e.g. El Djazaïr
    'ñ' => '(ñ|n)',   ## e.g. Porteño
    'ň' => '(ň|n)',   ## e.g. Plzeň
    'ó' => '(ó|o)',   ## e.g. Colón
    'ō' => '(ō|o)',   ## e.g. Tōkyō
    'ș' => '(ș|s)',   ## e.g. Bucarești
    'ú' => '(ú|u)'    ## e.g. Fútbol
  }

  ### fix/todo:  check for  dot+space e.g. . and make dot optional
  ##
  #  e.g. make  dot (.) optional plus allow alternative optional space e.g.
  #   -- for U.S.A. => allow USA or U S A
  #
  ##    e.g. U. de G. or U de G or U.de G. ??
  ##   collect some more (real-world) examples first!!!!!

  ## No replacement text contains another key, so a single hash-driven pass
  ## yields the same result as substituting the keys one after another.
  title.gsub( Regexp.union( alternatives.keys ), alternatives )
end

#title_to_key(title) ⇒ `Object`

# File 'lib/textutils/helper/title_helper.rb', line 65

# Derives a key from a title: the title is downcased, run through the strip
# helpers in a fixed order and finally passed through TextUtils.asciify and
# downcased once more.
#
# NB: used in/moved from readers/values_reader.rb
#
# @param title [String] title to convert
# @return [Object] the downcased result of TextUtils.asciify (see filter/string_filter)
def title_to_key(title)
  # Order matters: the bracketed parts ([], (), {}) have to be removed before
  # strip_whitespaces drops the round and square bracket characters themselves.
  cleanup_steps = [
    :strip_part_markers,   # e.g. ‹Estrella› ‹Damm› Inedit becomes =>  Estrella Damm Inedit
    :strip_translations,
    :strip_subtitles,
    :strip_tags,
    :strip_whitespaces,
    :strip_special_chars
  ]

  ## NB (original author): downcase does NOT work for accented chars
  ##   (thus, include in alternatives)
  stripped = cleanup_steps.inject(title.downcase) { |text, step| send(step, text) }

  TextUtils.asciify(stripped).downcase
end

Module: TextUtils::TitleHelper

Instance Method Summary collapse

Instance Method Details

#strip_part_markers(title) ⇒ Object

#strip_special_chars(title) ⇒ Object

#strip_subtitles(title) ⇒ Object

#strip_tags(title) ⇒ Object

#strip_translations(title) ⇒ Object

#strip_whitespaces(title) ⇒ Object

#title_esc_regex(title_unescaped) ⇒ Object

#title_to_key(title) ⇒ Object

#strip_part_markers(title) ⇒ `Object`

#strip_special_chars(title) ⇒ `Object`

#strip_subtitles(title) ⇒ `Object`

#strip_tags(title) ⇒ `Object`

#strip_translations(title) ⇒ `Object`

#strip_whitespaces(title) ⇒ `Object`

#title_esc_regex(title_unescaped) ⇒ `Object`

#title_to_key(title) ⇒ `Object`