Class: NHKore::Scraper

Inherits: Object
Extended by:
AttrBool::Ext
Defined in:
lib/nhkore/scraper.rb

Direct Known Subclasses

ArticleScraper, DictScraper, SearchScraper

Constant Summary collapse

# Default HTTP header fields sent with each request (merged with any
# user-supplied +header+ in Scraper#initialize; user entries win).
# Mimics a real browser so that scraper-hostile sites (e.g. search engines)
# respond normally.
DEFAULT_HEADER =
{
  # See for better ones:
  # - https://www.useragentstring.com/pages/Chrome/
  # Presumably a randomly-sampled browser user-agent string (see UserAgents).
  'user-agent' => UserAgents.sample,

  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;' \
              'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  'accept-language' => 'en;q=0.9,ja-JP;q=0.8,ja',
  'cache-control' => 'max-age=0',
  'dnt' => '1', # Do Not Track.
  'ect' => '4g', # Effective connection type hint.
  'priority' => 'u=0, i',
  'upgrade-insecure-requests' => '1',
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, eat_cookie: false, header: nil, is_file: false, max_redirects: 3, max_retries: 3, redirect_rule: :strict, str_or_io: nil, **kargs) ⇒ Scraper

max_redirects defaults to 3 for safety (infinite-loop attack).

All URL options: https://ruby-doc.org/stdlib-2.7.0/libdoc/open-uri/rdoc/OpenURI/OpenRead.html

Pass in header: {} for the default HTTP header fields to be set.

Parameters:

  • eat_cookie (true, false) (defaults to: false)

    true to set the HTTP header field ‘cookie’, which can be an expensive (time-consuming) operation since it opens the URL again, but necessary for some URLs.

  • redirect_rule (nil, :lenient, :strict) (defaults to: :strict)


56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/nhkore/scraper.rb', line 56

# Creates a new Scraper and immediately opens +url+ (or adopts +str_or_io+
# when given, in which case nothing is fetched).
#
# Pass in +header: {}+ for the default HTTP header fields to be set; any
# entries you supply override the defaults. Header fields are ignored when
# scraping a local file.
#
# @param url [String,URI] the URL (or file path) to scrape
# @param eat_cookie [true,false] true to fetch & store the HTTP 'cookie'
#   header field first, which can be an expensive (time-consuming) operation
#   since it opens the URL again, but necessary for some URLs
# @param header [nil,Hash] HTTP header fields merged over DEFAULT_HEADER
# @param is_file [true,false] true to treat +url+ as a local file path
# @param max_redirects [Integer,nil] defaults to 3 for safety (infinite-loop attack)
# @param max_retries [Integer,nil] retry budget for socket errors
# @param redirect_rule [nil,:lenient,:strict] how strictly redirects are checked
# @param str_or_io [nil,String,IO] pre-fetched content to use instead of opening
# @param kargs [Hash] extra options passed through to open-uri / File.open
def initialize(url,eat_cookie: false,header: nil,is_file: false,max_redirects: 3,max_retries: 3,
               redirect_rule: :strict,str_or_io: nil,**kargs)
  super()

  unless header.nil? || is_file
    # Some sites (Search Engines) hate scrapers, so need HTTP header fields.
    # If this isn't enough, look at googler for more header fields to set:
    # - https://github.com/jarun/googler
    # If necessary, can use Faraday, HTTParty, or RestClient gem and
    #   pass in to str_or_io.
    kargs.merge!(DEFAULT_HEADER.merge(header))
  end

  @eat_cookie = eat_cookie
  @is_file = is_file
  @kargs = kargs
  @max_redirects = max_redirects
  @max_retries = max_retries
  @redirect_rule = redirect_rule

  self.open(url,str_or_io,is_file: is_file)
end

Instance Attribute Details

#kargsObject (readonly)

Returns the value of attribute kargs.



40
41
42
# File 'lib/nhkore/scraper.rb', line 40

# @return [Hash] the extra keyword args (incl. any HTTP header fields)
#   passed through when opening the URL/file.
def kargs; @kargs; end

#max_redirectsObject

Returns the value of attribute max_redirects.



41
42
43
# File 'lib/nhkore/scraper.rb', line 41

# @return [Integer,nil] the redirect budget; nil/negative means effectively
#   unlimited (see #open_url).
def max_redirects; @max_redirects; end

#max_retriesObject

Returns the value of attribute max_retries.



42
43
44
# File 'lib/nhkore/scraper.rb', line 42

# @return [Integer,nil] the socket-error retry budget; nil/negative means
#   effectively unlimited (see #open_url).
def max_retries; @max_retries; end

#redirect_ruleObject

Returns the value of attribute redirect_rule.



43
44
45
# File 'lib/nhkore/scraper.rb', line 43

# @return [nil,:lenient,:strict] the redirect-validation rule (see #open_url).
def redirect_rule; @redirect_rule; end

#str_or_ioObject

Returns the value of attribute str_or_io.



44
45
46
# File 'lib/nhkore/scraper.rb', line 44

# @return [String,IO,nil] the scraped content (an IO until #read collapses
#   it to a String).
def str_or_io; @str_or_io; end

#urlObject

Returns the value of attribute url.



45
46
47
# File 'lib/nhkore/scraper.rb', line 45

# @return [String,URI] the currently-open URL (or file path).
def url; @url; end

Instance Method Details



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/nhkore/scraper.rb', line 81

# Opens +url+ and stores its 'set-cookie' response data in @kargs['cookie'],
# so subsequent requests send the cookie back.
#
# This is expensive (time-consuming) since it opens the URL an extra time,
# but some URLs require it.
#
# @param url [String,URI] the URL to pull cookies from
# @return [self]
def fetch_cookie(url)
  require 'http-cookie'

  open_url(url)

  # meta['set-cookie'] may be nil; Array(nil) is [].
  set_cookies = Array(@str_or_io.meta['set-cookie'])

  unless set_cookies.empty?
    uri = URI(url)
    jar = HTTP::CookieJar.new

    set_cookies.each { |sc| jar.parse(sc,uri) }

    @kargs['cookie'] = HTTP::Cookie.cookie_value(jar.cookies(uri))
  end

  return self
end

#html_docObject



102
103
104
# File 'lib/nhkore/scraper.rb', line 102

# Parses the scraped content as HTML.
#
# @return [Nokogiri::HTML::Document] the parsed document
def html_doc
  Nokogiri::HTML(@str_or_io)
end

#join_url(relative_url) ⇒ Object



106
107
108
109
110
111
112
113
# File 'lib/nhkore/scraper.rb', line 106

# Joins +relative_url+ onto the currently-open URL.
#
# For a local file, returns nil: joining could be unsafe — e.g. a long
# "../../../" chain could resolve up to your root dir — so don't guess.
#
# @param relative_url [String] the relative URL to append
# @return [URI,nil] the absolute URL, or nil when scraping a file
def join_url(relative_url)
  return nil if @is_file

  URI.join(@url,relative_url)
end

#open(url, str_or_io = nil, is_file: @is_file) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/nhkore/scraper.rb', line 115

# (Re)opens +url+, or adopts +str_or_io+ as the content when given
# (in which case nothing is actually opened).
#
# @param url [String,URI] the URL (or file path) to open
# @param str_or_io [nil,String,IO] pre-fetched content; non-nil skips opening
# @param is_file [true,false] true to treat +url+ as a local file path
# @return [self]
def open(url,str_or_io = nil,is_file: @is_file)
  @is_file = is_file
  @str_or_io = str_or_io
  @url = url

  return self unless str_or_io.nil?

  if @is_file
    open_file(url)
  else
    # Eating the cookie opens the URL twice (see #fetch_cookie).
    fetch_cookie(url) if @eat_cookie
    open_url(url)
  end

  return self
end

#open_file(file) ⇒ Object



132
133
134
135
136
137
138
139
140
# File 'lib/nhkore/scraper.rb', line 132

# Opens a local file for scraping.
#
# @param file [String] path to the file
# @return [self]
def open_file(file)
  @url = file
  @is_file = true

  # NHK's website tends to always use UTF-8.
  @str_or_io = File.open(file,'rt:UTF-8',**@kargs)

  self
end

#open_url(url) ⇒ Object



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/nhkore/scraper.rb', line 142

# Opens +url+ over HTTP(S), following redirects manually (bounded by
# @max_redirects) and retrying socket errors (bounded by @max_retries).
#
# Redirect policy (@redirect_rule):
# - nil (or anything else): follow redirects unchecked.
# - :lenient: redirect must keep the original URI scheme.
# - :strict:  redirect must keep the original scheme AND domain.
# Redirects are always validated against the ORIGINAL scheme/domain
# (top_uri/top_domain are computed once and not updated across hops).
#
# @param url [String,URI] the URL to open
# @return [self]
# @raise [Http404Error] when the response includes '404 Not Found'
# @raise [OpenURI::HTTPRedirect] when a redirect violates the rule or the
#   redirect budget is exhausted
# @raise [OpenURI::HTTPError] for other HTTP errors
# @raise [SocketError] when the retry budget is exhausted
def open_url(url)
  # nil/negative budgets mean "effectively unlimited" (capped at 10,000).
  max_redirects = (@max_redirects.nil? || @max_redirects < 0) ? 10_000 : @max_redirects
  max_retries = (@max_retries.nil? || @max_retries < 0) ? 10_000 : @max_retries

  top_uri = URI(url)
  top_domain = Util.domain(top_uri.host)

  begin
    # Use URI().open() instead of URI.open()/(Kernel.)open() for safety (code-injection attack).
    # Use URI() instead of URI.parse() because url can be a URI (not just a string).
    # redirect: false makes open-uri raise HTTPRedirect so we can vet each hop.
    @str_or_io = URI(url).open(redirect: false,**@kargs)
    @url = url
  rescue OpenURI::HTTPRedirect => redirect
    redirect_uri = redirect.uri

    # Out of redirect budget? Re-raise with context.
    if (max_redirects -= 1) < 0
      raise redirect.exception("redirected to URL[#{redirect_uri}]: #{redirect}")
    end

    case @redirect_rule
    when :lenient,:strict
      # Both rules require the scheme to stay the same (e.g. no https -> http).
      if redirect_uri.scheme != top_uri.scheme
        raise redirect.exception(
          "redirect scheme[#{redirect_uri.scheme}] does not match original " \
          "scheme[#{top_uri.scheme}] at redirect URL[#{redirect_uri}]: #{redirect}"
        )
      end

      # :strict additionally pins the domain to the original one.
      if @redirect_rule == :strict
        redirect_domain = Util.domain(redirect_uri.host)

        if redirect_domain != top_domain
          raise redirect.exception(
            "redirect domain[#{redirect_domain}] does not match original " \
            "domain[#{top_domain}] at redirect URL[#{redirect_uri}]: #{redirect}"
          )
        end
      end
    end

    # Follow the (validated) redirect.
    url = redirect_uri

    retry
  # Must come after HTTPRedirect since a subclass of HTTPError.
  rescue OpenURI::HTTPError => e
    msg = "HTTP error[#{e}] at URL[#{url}]"

    if e.to_s.include?('404 Not Found')
      raise Http404Error,msg
    else
      raise e.exception(msg)
    end
  rescue SocketError => e
    # Out of retry budget? Re-raise with context.
    if (max_retries -= 1) < 0
      raise e.exception("Socket error[#{e}] at URL[#{url}]")
    end

    retry
  end

  return self
end

#readObject



205
206
207
208
209
# File 'lib/nhkore/scraper.rb', line 205

# Reads the scraped content and returns it.
#
# If @str_or_io is an IO-like object (responds to #read), it is read once
# and replaced by the resulting String, so repeat calls are cheap.
#
# @return [String,Object] the content (whatever @str_or_io holds/reads)
def read
  if @str_or_io.respond_to?(:read)
    @str_or_io = @str_or_io.read
  end

  @str_or_io
end

#reopenObject



211
212
213
# File 'lib/nhkore/scraper.rb', line 211

# Re-opens the previously-opened URL (or file).
#
# @return [self]
def reopen
  self.open(@url)
end

#rss_docObject



215
216
217
218
219
# File 'lib/nhkore/scraper.rb', line 215

# Parses the scraped content as an RSS feed (validation disabled).
#
# @return [RSS::Rss,nil] the parsed feed, or nil if unparsable
def rss_doc
  require 'rss'

  RSS::Parser.parse(@str_or_io,validate: false)
end