Class: SportDb::Import::CountryReader

Inherits:
Object
  • Object
show all
Defined in:
lib/sportdb/formats/country/country_reader.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(txt) ⇒ CountryReader

Returns a new instance of CountryReader.



21
22
23
# File 'lib/sportdb/formats/country/country_reader.rb', line 21

def initialize( txt )
  @txt = txt
end

Class Method Details

.parse(txt) ⇒ Object



16
17
18
# File 'lib/sportdb/formats/country/country_reader.rb', line 16

def self.parse( txt )
  new( txt ).parse
end

.read(path) ⇒ Object

use - rename to read_file or from_file etc. - why? why not?



11
12
13
14
# File 'lib/sportdb/formats/country/country_reader.rb', line 11

def self.read( path )   ## use - rename to read_file or from_file etc. - why? why not?
  txt = File.open( path, 'r:utf-8' ) { |f| f.read }
  parse( txt )
end

Instance Method Details

#parseObject



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/sportdb/formats/country/country_reader.rb', line 25

def parse
  countries    = []
  last_country = nil    ## note/check/fix: use countries[-1] - why? why not?

  OutlineReader.parse( @txt ).each do |node|

     node_type = node[0]

     if [:h1, :h2].include?( node_type )
       ## skip headings (and headings) for now too
     elsif node_type == :p    ## paragraph
      lines = node[1]
      lines.each do |line|
      if line.start_with?( '|' )
        ## assume continuation with line of alternative names
        ##  note: skip leading pipe
        values = line[1..-1].split( '|' )   # team names - allow/use pipe(|)
        ## strip and squish (white)spaces
        #   e.g. East Germany        (-1989)  => East Germany (-1989)
        values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }
        last_country.alt_names += values
      elsif line =~ /^-[ ]*(\d{4})
                        [ ]+
                       (.+)$
                    /x     ## check for historic lines e.g. -1989
         year   = $1.to_i
         parts  = $2.split( /=>|⇒/ )
         values = parts[0].split( ',' )
         values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }

         name = values[0]
         code = values[1]

         last_country = country = Country.new( name: "#{name} (-#{year})",
                                               code: code )
         ## country.alt_names << name    ## note: for now do NOT add name without year to alt_names - gets auto-add by index!!!

         countries << country
         ## todo/fix: add reference to country today (in parts[1] !!!!)
      else
        ## assume "regular" line
        ##  check if starts with id  (todo/check: use a more "strict"/better regex capture pattern!!!)
        ##   note: allow country codes upto 4 (!!) e.g. Northern Cyprus
        if line =~ /^([a-z]{2,4})
                        [ ]+
                       (.+)$/x
          key    = $1
          values = $2.split( ',' )
          ## strip and squish (white)spaces
          #   e.g. East Germany        (-1989)  => East Germany (-1989)
          values = values.map { |value| value.strip.gsub( /[ \t]+/, ' ' ) }

          ## note: remove "overlords" from geo-tree marked territories  e.g. UK, US, etc. from name
          ##    e.g. England › UK      => England
          ##         Puerto Rico › US  => Puerto Rico
          geos = split_geo( values[0] )
          name = geos[0]    ## note: ignore all other geos for now

          ##   note: allow country codes up to 4 (!!) e.g. Northern Cyprus
          code = if values[1] && values[1] =~ /^[A-Z]{3,4}$/   ## note: also check format
                   values[1]
                 else
                   if values[1]
                     puts "** !!! ERROR !!! wrong code format >#{values[1]}<; expected three (or four)-letter all up-case"
                   else
                     puts "** !!! ERROR !!! missing code for (canonical) country name"
                   end
                   exit 1
                 end

          tags = if values[2]   ## check if tags presents
                   split_tags( values[2] )
                 else
                   []
                 end

          last_country = country = Country.new( key: key,
                                                name: name,
                                                code: code,
                                                tags: tags )
          countries << country
        else
          puts "** !! ERROR - missing key for (canonical) country name"
          exit 1
        end
      end
      end  # each line
    else
      puts "** !! ERROR - unknown node type / (input) source line:"
      pp node
      exit 1
    end
  end    # each node

  countries
end

#split_geo(str) ⇒ Object

todo/check: rename to parse_geo(s) - why? why not?



132
133
134
135
136
137
# File 'lib/sportdb/formats/country/country_reader.rb', line 132

def split_geo( str )   ## todo/check: rename to parse_geo(s) - why? why not?
  ## split into geo tree
  geos = str.split( /[<>‹›]/ )          ## note: allow > < or › ‹ for now
  geos = geos.map { |geo| geo.strip }   ## remove all whitespaces
  geos
end

#split_tags(str) ⇒ Object

helpers



126
127
128
129
130
# File 'lib/sportdb/formats/country/country_reader.rb', line 126

def split_tags( str )
  tags = str.split( /[|<>‹›]/ )   ## allow pipe (|) and (<>‹›) as divider for now - add more? why? why not?
  tags = tags.map { |tag| tag.strip }
  tags
end