Class: Webcache::DiskCache

Inherits:

Object

Object
Webcache::DiskCache

show all

Defined in: lib/webget/webcache.rb

Instance Method Summary collapse

#cached?(url) ⇒ Boolean (also: #exist?)
#read(url) ⇒ Object
#read_csv(url) ⇒ Object
#read_json(url) ⇒ Object
#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ Object

add more save / put / etc.
#url_to_id(str) ⇒ Object

Note: uses the file path as the id for DiskCache (may be different for DbCache/SqlCache). Use file:// instead of disk:// - why? why not?
#url_to_path(str, path: nil) ⇒ Object

helpers.

Instance Method Details

#cached?(url) ⇒ `Boolean` Also known as: exist?

Returns:

(Boolean)

# File 'lib/webget/webcache.rb', line 78

## Checks whether a body file for the given url is present in the disk cache.
##
##   Looks for "<Webcache.root>/<url_to_path( url )>" on disk.
##
## @param url [String] url to look up
## @return [Boolean] true if the body file exists
def cached?( url )
  File.exist?( "#{Webcache.root}/#{url_to_path( url )}" )
end

#read(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 85

## Reads the cached body for the given url as utf-8 text.
##
##   The file is looked up at "<Webcache.root>/<url_to_path( url )>".
##
## @param url [String] url whose cached body should be loaded
## @return [String] complete file content
def read( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  File.read( cache_file, mode: 'r:utf-8' )
end

#read_csv(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 97

## Reads the cached body for the given url (as utf-8 text)
##   and hands it to CsvHash.parse.
##
## @param url [String] url whose cached body should be loaded
## @return [Object] whatever CsvHash.parse returns for the text
def read_csv( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  CsvHash.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end

#read_json(url) ⇒ `Object`

# File 'lib/webget/webcache.rb', line 90

## Reads the cached body for the given url (as utf-8 text)
##   and parses it with JSON.parse.
##
## @param url [String] url whose cached body should be loaded
## @return [Object] the parsed json data
def read_json( url )
  cache_file = "#{Webcache.root}/#{url_to_path( url )}"
  JSON.parse( File.read( cache_file, mode: 'r:utf-8' ) )
end

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ `Object`

add more save / put / etc. aliases - why? why not?

rename to record_html - why? why not?

# File 'lib/webget/webcache.rb', line 107

## Stores a response body plus its headers in the disk cache.
##
##   The body is written to "<Webcache.root>/<url_to_path( url, path: path )>"
##   and the headers to a sibling "<body>.meta.txt" file
##   (one "key: value" line per header).
##
## @param url [String] url the response belongs to
## @param response [#text, #json, #headers] response object; only these
##   three methods are called here
## @param path [String, nil] custom (file)path, passed on to url_to_path
## @param encoding [String] passed on to response.text (not used for 'json')
## @param format [String] 'json', 'csv' or anything else (handled as html/txt)
## @return [Object] whatever response.headers.each returns
def record( url, response,
            path: nil,
            encoding: 'UTF-8',
            format: 'html' )

  body_path = "#{Webcache.root}/#{url_to_path( url, path: path )}"
  meta_path = "#{body_path}.meta.txt"

  ## todo/check: verify content-type - why? why not?

  ## note - for now response.text always assume (converted) to utf8!!!!!!!!!

  ## note: build the payload BEFORE touching the file system;
  ##   otherwise a failing response.json / response.text leaves an
  ##   empty (truncated) body file behind that cached? reports as a hit
  body = case format
         when 'json'
           JSON.pretty_generate( response.json )
         when 'csv'
           ## fix: newlines - always use "unix" style - why? why not?

           ## fix:  use :newline => :universal option? translates to universal "\n"

           response.text( encoding: encoding ).gsub( "\r\n", "\n" )
         else   ## html or txt
           response.text( encoding: encoding )
         end

  ## make sure path exists

  FileUtils.mkdir_p( File.dirname( body_path ) )

  puts "[cache] saving #{body_path}..."

  File.open( body_path, 'w:utf-8' ) {|f| f.write( body ) }

  File.open( meta_path, 'w:utf-8' ) do |f|
    ## todo/check:

    ##  do headers also need to converted (like text) if encoding is NOT utf-8 ???

    response.headers.each do |key, value|  # iterate all response headers
      f.write( "#{key}: #{value}" )
      f.write( "\n" )
    end
  end
end

#url_to_id(str) ⇒ `Object`

note: use file path as id for DiskCache (is different for DbCache/SqlCache?)

use file:// instead of disk:// - why? why not?

# File 'lib/webget/webcache.rb', line 150

## Builds the cache id for a url:
##   the string "disk://" followed by the result of url_to_path( str ).
##
## @param str [String] url to map
## @return [String] cache id
def url_to_id( str )
  "disk://#{url_to_path( str )}"
end

#url_to_path(str, path: nil) ⇒ `Object`

helpers

# File 'lib/webget/webcache.rb', line 154

## Maps a url to a relative file path ("<host>/<request path>") for cache storage.
##
##   The scheme (e.g. http/https) and port (e.g. 80, 8080, etc.) are ignored;
##   the host is always downcased (internet domain is case insensitive).
##   Some hosts get special "prettify" rules - see below.
##
## note: for weltfussball.de / worldfootball.net a request path NOT ending
##   with '/' prints an error and exits the process (exit 1).
##
## @param str [String] absolute url (must include a host)
## @param path [String, nil] "custom" (file)path used instead of the url's request path
## @return [String] relative page path
## @raise [ArgumentError] if the url has no host (e.g. scheme is missing)
def url_to_path( str, path: nil )
  ## map url to file path

  uri = URI.parse( str )

  ## note: a url without scheme (e.g. "example.com/page") parses with host nil;
  ##         fail with a clear message instead of a NoMethodError on nil
  host = uri.host
  if host.nil? || host.empty?
    raise ArgumentError, "expected absolute url with host; got: >#{str}<"
  end

  ## note: ignore scheme (e.g. http/https)
  ##         and  port  (e.g. 80, 8080, etc.) for now
  ##    always downcase for now (internet domain is case insensitive)
  host_dir = host.downcase

  ## use "custom" (file)path for cache storage if passed in;
  ##   otherwise use "/this/is/everything?query=params"
  ##     with leading slash cut-off
  req_path = path || uri.request_uri[1..-1]


  ### special "prettify" rule for weltfussball
  ##   /eng-league-one-2019-2020/  => /eng-league-one-2019-2020.html

  if host_dir.include?( 'weltfussball.de' ) ||
     host_dir.include?( 'worldfootball.net' )
    if req_path.end_with?( '/' )
      req_path = "#{req_path[0..-2]}.html"
    else
      puts "ERROR: expected request_uri for >#{host_dir}< ending with '/'; got: >#{req_path}<"
      exit 1
    end
  elsif host_dir.include?( 'tipp3.at' )
    req_path = req_path.sub( '.jsp', '' )  # shorten - cut off .jsp extension

    ##   change ? to -I-
    ##   change = to ~
    ##   Example:
    ##   sportwetten/classicresults.jsp?oddsetProgramID=888
    ##     =>
    ##   sportwetten/classicresults-I-oddsetProgramID~888
    req_path = req_path.gsub( '?', '-I-' )
                       .gsub( '=', '~')

    req_path = "#{req_path}.html"
  elsif host_dir.include?( 'fbref.com' )
    ## note: only cut off the LEADING en/ segment;
    ##   a plain sub( 'en/', '' ) would also hit e.g. .../Bremen/...
    req_path = req_path.sub( %r{\Aen/}, '' )      # shorten - cut off en/

    req_path = "#{req_path}.html"                 # auto-add html extension
  elsif host_dir.include?( 'football-data.co.uk' )
    req_path = req_path.sub( %r{\Ammz4281/}, '' )  # shorten - cut off (leading) mmz4281/

    req_path = req_path.sub( %r{\Anew/}, '' )      # shorten - cut off (leading) new/
  elsif host_dir.include?( 'football-data.org' )
    req_path = req_path.sub( %r{\Av2/}, '' )  # shorten - cut off (leading) v2/

    ## flatten - make a file path - for auto-save
    ##   change ? to -I-
    ##   change / to ~~
    ##   change = to ~
    req_path = req_path.gsub( '?', '-I-' )
                       .gsub( '/', '~~' )
                       .gsub( '=', '~')

    req_path = "#{req_path}.json"
  else
    ## no special rule
  end

  "#{host_dir}/#{req_path}"
end

Class: Webcache::DiskCache

Instance Method Summary collapse

Instance Method Details

#cached?(url) ⇒ Boolean Also known as: exist?

#read(url) ⇒ Object

#read_csv(url) ⇒ Object

#read_json(url) ⇒ Object

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ Object

#url_to_id(str) ⇒ Object

#url_to_path(str, path: nil) ⇒ Object

#cached?(url) ⇒ `Boolean` Also known as: exist?

#read(url) ⇒ `Object`

#read_csv(url) ⇒ `Object`

#read_json(url) ⇒ `Object`

#record(url, response, path: nil, encoding: 'UTF-8', format: 'html') ⇒ `Object`

#url_to_id(str) ⇒ `Object`

#url_to_path(str, path: nil) ⇒ `Object`