Module: NHKore::CLI::NewsCmd

Included in:
App
Defined in:
lib/nhkore/cli/news_cmd.rb

Constant Summary

DEFAULT_NEWS_SCRAPE = 1

Instance Method Summary

  • #build_news_cmd ⇒ Object
  • #run_news_cmd(type) ⇒ Object
  • #scrape_news_article(url, link:, new_articles:, news:) ⇒ Object
  • #scraped_news_article?(news, link) ⇒ Boolean

Instance Method Details

#build_news_cmd ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 26

def build_news_cmd
  app = self

  @news_cmd = @app_cmd.define_command do
    name    'news'
    usage   'news [OPTIONS] [COMMAND]...'
    aliases :n
    summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"

    description "      Scrape NHK News Web (Easy) articles &\n      save to folder: \#{News::DEFAULT_DIR}\n    DESC\n\n    option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|\n      date time to use as a fallback in cases when an article doesn't have one;\n      format: YYYY-mm-dd H:M; example: 2020-03-30 15:30\n    DESC\n      value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))\n      value = Util.jst_time(value)\n      value\n    }\n    option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|\n      HTML file of article to read instead of URL (for offline testing and/or slow internet;\n      see '--no-dict' option)\n    DESC\n      app.check_empty_opt(:in,value)\n    }\n    flag :L,:lenient,<<-DESC\n      leniently (not strict) scrape articles:\n      body & title content without the proper HTML/CSS classes/IDs and no futsuurl;\n      example URLs that need this flag:\n      -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html\n      -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html\n    DESC\n    option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|\n      text to fuzzy search links for; for example, \"--like '00123'\" will only scrape links containing\n      text '00123' -- like '*00123*'\n    DESC\n      value = Util.strip_web_str(value).downcase\n      value\n    }\n    option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|\n      'directory/file' of article links to scrape (see '\#{App::NAME} search';\n      defaults: \#{SearchLinks::DEFAULT_YASASHII_FILE}, \#{SearchLinks::DEFAULT_FUTSUU_FILE})\n    DESC\n      app.check_empty_opt(:links,value)\n    }\n    flag :M,:missingno,<<-DESC\n      very rarely an article will not have kana or kanji for a Ruby tag;\n      to not raise an error, this will use previously scraped data to fill it in;\n      example URL:\n      -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html\n    DESC\n    flag :D,:'no-dict',<<-DESC\n      do not try to parse the dictionary files for the articles; useful in case of errors trying to load\n      the dictionaries (or for offline testing)\n    DESC\n    flag :H,'no-sha256',<<-DESC\n      do not check the SHA-256 of the content to see if an article has already been scraped;\n      for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;\n      this is useful if 2 articles have the same SHA-256, but different content (unlikely)\n    DESC\n    option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|\n      'directory/file' to save words to; if you only specify a directory or a file, it will attach\n      the appropriate default directory/file name\n      (defaults: \#{YasashiiNews::DEFAULT_FILE}, \#{FutsuuNews::DEFAULT_FILE})\n    DESC\n      app.check_empty_opt(:out,value)\n    }\n    flag :r,:redo,'scrape article links even if they have already been scraped'\n    option :s,:scrape,'number of unscraped article links to scrape',argument: :required,\n        default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|\n          value = value.to_i\n          value = 1 if value < 1\n          value\n        }\n    option nil,:'show-dict',<<-DESC\n      show dictionary URL and contents for the first article and exit;\n      useful for debugging dictionary errors (see '--no-dict' option);\n      implies '--dry-run' option\n    DESC\n    option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|\n      URL of 
article to scrape, instead of article links file (see '--links' option)\n    DESC\n      app.check_empty_opt(:url,value)\n    }\n\n    run do |_opts,_args,cmd|\n      puts cmd.help\n    end\n  end\n\n  @news_easy_cmd = @news_cmd.define_command do\n    name    'easy'\n    usage   'easy [OPTIONS] [COMMAND]...'\n    aliases :e,:ez\n    summary \"Scrape NHK News Web Easy (Yasashii) articles (aliases: \#{app.color_alias('e ez')})\"\n\n    description <<-DESC\n      Search for NHK News Web Easy (Yasashii) links &\n      save to file: \#{YasashiiNews::DEFAULT_FILE}\n    DESC\n\n    run do |opts,args,cmd|\n      app.refresh_cmd(opts,args,cmd)\n      app.run_news_cmd(:yasashii)\n    end\n  end\n\n  @news_regular_cmd = @news_cmd.define_command do\n    name    'regular'\n    usage   'regular [OPTIONS] [COMMAND]...'\n    aliases :r,:reg\n    summary \"Scrape NHK News Web Regular (Futsuu) articles (aliases: \#{app.color_alias('r reg')})\"\n\n    description <<-DESC\n      Search for NHK News Web Regular (Futsuu) links &\n      save to file: \#{FutsuuNews::DEFAULT_FILE}\n    DESC\n\n    run do |opts,args,cmd|\n      app.refresh_cmd(opts,args,cmd)\n      app.run_news_cmd(:futsuu)\n    end\n  end\nend\n"
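
The '--datetime' transform above is built on Ruby's Time.strptime with an explicit format string, plus NHKore-specific post-processing. A minimal sketch of the parsing step on its own (the DatetimeParser.guess_year block and the Util.jst_time conversion are intentionally left out here):

require 'time'

# Parse the fallback date/time in the 'YYYY-mm-dd H:M' format that the
# '--datetime' option documents.
raw = '2020-03-30 15:30'
fallback = Time.strptime(raw,'%Y-%m-%d %H:%M')

puts fallback.strftime('%Y-%m-%d %H:%M') # => "2020-03-30 15:30"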

#run_news_cmd(type) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 154

def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil

  build_in_file(:in)

  case type
  when :futsuu
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)

    news_name = 'Regular'
  when :yasashii
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)

    news_name = 'Easy'
  else
    raise ArgumentError,"invalid type[#{type}]"
  end

  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)

  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  lenient = @cmd_opts[:lenient]
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
  max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]

  # Favor in_file option over url option.
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
  url = nil if url.empty?

  # If no URL or file was given, then we must have a links file that exists.
  return if url.nil? && !check_in_file(:links,empty_ok: false)

  start_spin("Scraping NHK News Web #{news_name} articles")

  is_file = !in_file.nil?
  link_count = -1
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  scrape_count = 0

  news = if File.exist?(out_file)
           (type == :yasashii) ? YasashiiNews.load_file(out_file,overwrite: no_sha256)
                               : FutsuuNews.load_file(out_file,overwrite: no_sha256)
         else
           (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
         end

  @news_article_scraper_kargs = @scraper_kargs.merge({
    datetime: datetime,
    dict: dict,
    is_file: is_file,
    missingno: missingno ? Missingno.new(news) : nil,
    strict: !lenient,
  })
  @news_dict_scraper_kargs = @scraper_kargs.merge({
    is_file: is_file,
  })

  if url.nil?
    # Why store each() and do `links_len` instead of `links_len - 1`?
    #
    # If links contains 5 entries and you scrape all 5, then the output of
    # update_spin_detail() will end on 4, so all of this complexity is so
    # that update_spin_detail() only needs to be written/updated on one line.

    links_each = links.links.values.each
    links_len = links.length

    0.upto(links_len) do |i|
      update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")

      break if i >= links_len || scrape_count >= max_scrapes

      link = links_each.next

      next if !like.nil? && !link.url.to_s.downcase.include?(like)
      next if !redo_scrapes && scraped_news_article?(news,link)

      url = link.url
      result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)

      if result == :scraped
        scrape_count += 1
      elsif result == :unscraped
        next
      else
        # --show-dict
        url = result
        scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
      end

      # Break on next iteration for update_spin_detail().
      next if scrape_count >= max_scrapes
      sleep_scraper
    end
  else
    link = links[url]

    if link.nil?
      link = SearchLink.new(url)
      links.add_link(link)
    end

    result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
    scrape_count += 1 if result != :unscraped
  end

  stop_spin
  puts

  if scrape_count <= 0
    puts 'Nothing scraped!'

    if !dry_run && !show_dict
      puts
      start_spin('Saving updated links to file')

      links.save_file(links_file)

      stop_spin
      puts "> #{links_file}"
    end
  else
    puts 'Last URL scraped:'
    puts "> #{url}"
    puts

    if show_dict
      puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
    elsif dry_run
      if new_articles.empty?
        raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " \
                       'internal code is broken'
      elsif new_articles.length == 1
        puts new_articles.first
      else
        # Don't show the words (mini), too verbose for more than 1.
        new_articles.each do |article|
          puts article.to_s(mini: true)
        end
      end
    else
      start_spin('Saving scraped data to files')

      links.save_file(links_file)
      news.save_file(out_file)

      stop_spin
      puts "> #{out_file}"
      puts "> #{links_file}"
    end
  end
end
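
The link loop above deliberately iterates one index past the end (0.upto(links_len), not links_len - 1) so that update_spin_detail() runs one last time with the final counts before breaking out. A minimal sketch of that pattern, using a plain Array and puts in place of SearchLinks and update_spin_detail():

links = %w[url_a url_b url_c]
links_each = links.each # Enumerator, advanced manually with next().
links_len = links.length
considered = -1

0.upto(links_len) do |i|
  puts "considered=#{considered += 1}" # stand-in for update_spin_detail()
  break if i >= links_len              # final count is printed before breaking
  links_each.next
end
# Prints considered=0 through considered=3 for the 3 links.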

#scrape_news_article(url, link:, new_articles:, news:) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 324

def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]

  if show_dict
    scraper = DictScraper.new(url,**@news_dict_scraper_kargs)

    @cmd_opts[:show_dict] = scraper.scrape.to_s

    return scraper.url
  end

  scraper = nil

  begin
    scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
  rescue Http404Error
    # - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
    Util.warn("Ignoring URL with 404 error: #{url}.")
    return :unscraped
  end

  article = scraper.scrape
  # run_news_cmd() handles overwriting with --redo or not
  #   using scraped_news_article?().
  news.add_article(article,overwrite: true)

  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)

  new_articles << article

  return :scraped # No --show-dict
end
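
The begin/rescue around ArticleScraper.new is what lets a single dead link be skipped (with a warning) instead of aborting the whole run. A self-contained sketch of the same guard pattern, using a hypothetical fetch_article() and NotFoundError in place of ArticleScraper and NHKore's Http404Error:

class NotFoundError < StandardError; end

# Hypothetical stand-in for ArticleScraper.new + scrape.
def fetch_article(url)
  raise NotFoundError,"404: #{url}" if url.include?('dead')
  "article for #{url}"
end

def try_scrape(url)
  article = begin
    fetch_article(url)
  rescue NotFoundError
    warn "Ignoring URL with 404 error: #{url}." # mirrors Util.warn(...) above
    return :unscraped
  end

  # ...store `article` somewhere, as news.add_article() does above...
  :scraped
end

p try_scrape('https://example.com/dead_link') # => :unscraped
p try_scrape('https://example.com/ok_link')   # => :scraped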

#scraped_news_article?(news, link) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/nhkore/cli/news_cmd.rb', line 358

def scraped_news_article?(news,link)
  return true if link.scraped?

  no_sha256 = @cmd_opts[:no_sha256]

  article = news.article(link.url)

  if !no_sha256 && article.nil?
    if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
      article = news.article_with_sha256(link.sha256)
    end

    if article.nil?
      scraper = nil

      begin
        scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
      rescue Http404Error
        return false
      end

      sha256 = scraper.scrape_sha256_only
      article = news.article_with_sha256(sha256) if news.sha256?(sha256)
    end
  end

  if article
    news.update_article(article,link.url) # Favors https
    link.update_from_article(article)

    return true
  end

  return false
end
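
The duplicate check above ultimately compares SHA-256 digests of article content (news.sha256?, scrape_sha256_only), so two URLs serving the same body are treated as one article unless '--no-sha256' is given. A minimal, purely illustrative sketch of that idea with Ruby's standard Digest library (NHKore's own hashing may normalize the scraped content differently):

require 'digest'

# Byte-identical bodies fetched from an 'http' and an 'https' URL hash to
# the same digest, so the second fetch can be treated as already scraped.
body_http  = '<p>同じ記事の本文</p>'
body_https = '<p>同じ記事の本文</p>'

puts Digest::SHA256.hexdigest(body_http) == Digest::SHA256.hexdigest(body_https) # => true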