Class: Rsssf::Page

Inherits:
Object
  • Object
show all
Includes:
Utils
Defined in:
lib/rsssf/page.rb

Overview

note:

 a rsssf page may contain:
  many leagues, cups
  - tables, schedules (rounds), notes, etc.

An rsssf page MUST be in plain text (.txt); utf-8 character encoding is assumed.

Constant Summary collapse

# Round heading marker used for league schedules:
# the whole word "Round" (case-insensitive).
LEAGUE_ROUND_REGEX = /\b Round \b/ix

# Round heading marker used for (knockout) cup schedules:
# "Round" plus the named knockout stages, each as a whole word
# (case-insensitive). The matched marker is available as capture group 1.
CUP_ROUND_REGEX = %r{
  \b
  (
    Round | 1/8\sFinals | 1/16\sFinals | Quarterfinals | Semifinals | Final
  )
  \b
}ix

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Utils

#archive_dir_for_year, #year_from_file, #year_from_name, #year_to_season

Constructor Details

#initialize(txt) ⇒ Page

Returns a new instance of Page.



46
47
48
# File 'lib/rsssf/page.rb', line 46

# Wraps the given page text.
#
# @param txt [String] the page text; stored as-is in @txt
def initialize(txt)
  @txt = txt
end

Class Method Details

.from_file(path) ⇒ Object



37
38
39
40
# File 'lib/rsssf/page.rb', line 37

# Builds a page from a text file.
#
# note: always assume sources (already) converted to utf-8
#
# @param path [String] path of the file to read (via File.read_utf8)
# @return [Object] whatever from_string returns for the file's text
def self.from_file(path)
  from_string(File.read_utf8(path))
end

.from_string(txt) ⇒ Object



42
43
44
# File 'lib/rsssf/page.rb', line 42

# Builds a page from text already held in memory.
#
# @param txt [String] the page text
# @return [Object] a new instance created with the given text
def self.from_string(txt)
  new(txt)
end

.from_url(src) ⇒ Object



31
32
33
34
# File 'lib/rsssf/page.rb', line 31

# Builds a page from text obtained through a PageFetcher.
#
# @param src [String] the source passed on to PageFetcher#fetch
# @return [Object] whatever from_string returns for the fetched text
def self.from_url(src)
  fetcher = PageFetcher.new
  from_string(fetcher.fetch(src))
end

Instance Method Details

#build_stat ⇒ Object



215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
# File 'lib/rsssf/page.rb', line 215

# Collects summary statistics about the page text (@txt) into a PageStat record.
#
# Looks for
#   - a "source: <url>" reference,
#   - an "author(s): ... last updated: <day> <month> <year>" footer and
#   - section headings marked with "####",
# and counts lines and characters.
#
# The basename, year and season are derived from the path of the source url
# (using year_from_name / year_to_season). If no source reference is found,
# a warning is printed and these three fields are left nil (instead of
# failing in URI.parse with a nil argument).
#
# @return [PageStat] the filled-in record
# @raise [URI::InvalidURIError] if a source reference is found but cannot be parsed
def build_stat
  source       = nil
  authors      = nil
  last_updated = nil

  ### find source ref
  if (m = /source: ([^ \n]+)/im.match( @txt ))
    source = m[1]
    puts "source: >#{source}<"
  end

  ##
  ## fix/todo: move authors n last updated  whitespace cleanup to sanitize - why? why not??

  if (m = /authors?:\s+(.+?)\s+last updated:\s+(\d{1,2} [a-z]{3,10} \d{4})/im.match( @txt ))
    last_updated = m[2]
    authors      = m[1].strip.gsub( /\s+/, ' ' )        # cleanup whitespace; squish-style
    authors      = authors.gsub( /[ ]*,[ ]*/, ', ' )    # prettify commas - always single space after comma (no space before)
    puts "authors: >#{authors}<"
    puts "last updated: >#{last_updated}<"
  end

  puts "*** !!! missing source"  if source.nil?
  puts "*** !!! missing authors n last updated"   if authors.nil? || last_updated.nil?

  sections   = []
  line_count = 0
  @txt.each_line do |line|
    line_count += 1

    ### find sections
    ## todo: add more patterns? how? why?
    next unless (m = /####\s+(.+)/.match( line ))

    puts "  found section >#{m[1]}<"
    sections << m[1].strip
  end

  ## derive basename, year and season from the source url (only possible if present)
  basename = nil
  year     = nil
  season   = nil
  if source
    path     = URI.parse( source ).path
    extname  = File.extname( path )
    basename = File.basename( path, extname )  ## e.g. duit92.txt or duit92.html => duit92
    year     = year_from_name( basename )
    season   = year_to_season( year )
  end

  rec = PageStat.new
  rec.source       = source         # e.g. http://rsssf.org/tabled/duit89.html   -- use source_url - why?? why not??
  rec.basename     = basename       # e.g. duit89
  rec.year         = year           # e.g. 89 => 1989  -- note: always four digits
  rec.season       = season
  rec.authors      = authors
  rec.last_updated = last_updated
  rec.line_count   = line_count
  rec.char_count   = @txt.size      # note: String#size counts characters (not bytes)
  rec.sections     = sections

  rec
end

#find_schedule(opts = {}) ⇒ Object

change to build_schedule - why? why not???



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/rsssf/page.rb', line 64

# Extracts the match schedule (round headings plus the lines that follow them)
# from the page text (@txt).
#
# The text is scanned line by line in four stages:
#   1. (only if opts[:header] is given) skip lines until the league/cup header is found
#   2. skip lines until the first round heading is found
#   3. collect a round block - heading plus all lines up to the next blank line
#      (blank lines directly after the heading are skipped)
#   4. between blocks: start a new block on the next round heading, continue the
#      block on a date marker (e.g. [Mar 13]) or "First Legs"/"Second Legs" if it
#      directly follows the blank line, and stop on a section marker (=-=-=-=)
#      or a bold stage header (e.g. **FA Cup**)
#
# @param opts [Hash] options
# @option opts [String] :header  header title to look for; matched as heading
#   (## to #### Title) or as bold text (**Title**) at the start of a line,
#   ignoring case. note: the title is inserted into the regex as-is
#   (only spaces get replaced by \s), that is, it is NOT regex-escaped.
#   if missing, the page is assumed to hold a single league.
# @option opts [Boolean] :cup  if truthy use CUP_ROUND_REGEX for round headings,
#   otherwise LEAGUE_ROUND_REGEX
# @return [Schedule] built with Schedule.from_string from the collected text;
#   its rounds attribute is set to the number of round headings found
def find_schedule( opts={} )     ## change to build_schedule - why? why not???

  ## find match schedule/fixtures in multi-league doc
  new_txt = ''

  ## note: keep track of statistics
  ##   e.g. number of rounds found
  round_count = 0

  header = opts[:header]
  if header
    league_header_found = false

    ## header:
    ##  - assumes heading 4 e.g. #### Premier League or
    ##  - bold e.g. **FA Cup** for now
    ##  note: markers must start line (^)

    ## note:
    ## header gsub spaces to \s otherwise no match in regex (using free-form x-flag)!!!
    header_esc = header.gsub( ' ', '\s' )

    ## note: somehow #{2,4} will not work with free-form /xi defined (picked up as comment?)
    ##  use [#] hack ??
    ## note: alternation binds weaker than ^ - group both alternatives (?:...)
    ##  otherwise the bold variant matches anywhere in the line
    header_regex = /^
                     (?:
                       ([#]{2,4}\s+(#{header_esc}))
                         |
                       (\*{2}(#{header_esc})\*{2})
                     )
                   /ix

    ## todo:
    ##   use new stage_regex e.g. **xxx** - why? why not?
    ##  allow more than one stage in one schedule (e.g. regular stage,playoff stage etc)
  else
    league_header_found = true   # default (no header; assume single league file)
    header_regex = /^---dummy---$/  ## non-matching dummy regex
  end

  ## note: only allow final, quaterfinals, etc. if knockout cup
  round_regex = opts[:cup] ? CUP_ROUND_REGEX : LEAGUE_ROUND_REGEX

  ## stages
  first_round_header_found   = false
  round_header_found         = false
  round_body_found           = false   ## allow round header followed by blank lines

  blank_found = false

  @txt.each_line do |line|

    if !league_header_found
      ## first find start of league header/section
      if line =~ header_regex
        puts "!!! bingo - found header >#{line}<"
        league_header_found = true
        title = line.gsub( /[#*]/, '' ).strip   ##  quick hack: extract title from header
        new_txt << "## #{title}\n\n"
      else
        puts "  searching for header >#{header}<; skipping line >#{line}<"
      end
    elsif !first_round_header_found
      ## next look for first round (starting w/ Round)
      if line =~ round_regex
        puts "!!! bingo - found first round >#{line}<"
        round_count += 1
        first_round_header_found = true
        round_header_found       = true
        round_body_found         = false
        new_txt << line
      elsif line =~ /^=-=-=-=/
        puts "*** no rounds found; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
        puts "*** no rounds found; hit section/stage header: #{line}"
        break
      else
        puts "  searching for first round; skipping line >#{line}<"
      end
    elsif round_header_found
      ## collect rounds;
      ##   assume text block until next blank line
      ##   new block must always start w/ round
      if line =~ /^\s*$/   ## blank line?
        ## note: skip blanks following header (no body yet)
        next unless round_body_found

        round_header_found = false
        blank_found        = true    ## keep track of blank (lines) - allow inside round block (can continue w/ date header/marker)
        new_txt << line
      else
        round_body_found = true
        new_txt << line   ## keep going until next blank line
      end
    else
      ## in-between round blocks
      if line =~ /^\s*$/
        next  ## continue; skip extra blank line
      elsif line =~ round_regex
        puts "!!! bingo - found new round >#{line}<"
        round_count += 1
        round_header_found = true   # more rounds; continue
        round_body_found   = false
        blank_found        = false  # reset blank tracker
        new_txt << line
      elsif blank_found && (line =~ /\[[a-z]{3} \d{1,2}\]/i || line =~ /First Legs|Second Legs/i)
        ## e.g. [Mar 13] or [May 5] or First Legs with leading blank line; continue round
        puts "!!! bingo - continue round >#{line}<"
        round_header_found = true
        blank_found        = false  # reset blank tracker
        new_txt << line
      elsif line =~ /=-=-=-=/
        puts "!!! stop schedule; hit section marker (horizontal rule)"
        break
      elsif line =~ /^\*{2}[^*]+\*{2}/   ## e.g. **FA Cup**
        puts "!!! stop schedule; hit section/stage header: #{line}"
        break
      else
        blank_found  = false
        puts "skipping line in schedule >#{line}<"
      end
    end
  end  # each line

  schedule = Schedule.from_string( new_txt )
  schedule.rounds = round_count

  schedule
end

#save(path) ⇒ Object



281
282
283
284
285
# File 'lib/rsssf/page.rb', line 281

# Writes the page text (@txt) to the given file,
# creating the file or overwriting an existing one.
#
# @param path [String] path of the file to write
# @return [Integer] number of bytes written
def save(path)
  File.write(path, @txt)
end