Module: NHKore::CLI::NewsCmd

Included in:
App
Defined in:
lib/nhkore/cli/news_cmd.rb

Constant Summary

DEFAULT_NEWS_SCRAPE = 1

Instance Method Summary

  • #build_news_cmd ⇒ Object
  • #run_news_cmd(type) ⇒ Object
  • #scrape_news_article(url, link:, new_articles:, news:) ⇒ Object
  • #scraped_news_article?(news, link) ⇒ Boolean

Instance Method Details

#build_news_cmd ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 26

def build_news_cmd
  app = self

  @news_cmd = @app_cmd.define_command do
    name    'news'
    usage   'news [OPTIONS] [COMMAND]...'
    aliases :n
    summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"

    description "      Scrape NHK News Web (Easy) articles &\n      save to folder: \#{News::DEFAULT_DIR}\n    DESC\n\n    option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|\n      date time to use as a fallback in cases when an article doesn't have one;\n      format: YYYY-mm-dd H:M; example: 2020-03-30 15:30\n    DESC\n      value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))\n      value = Util.jst_time(value)\n      value\n    }\n    option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|\n      HTML file of article to read instead of URL (for offline testing and/or slow internet;\n      see '--no-dict' option)\n    DESC\n      app.check_empty_opt(:in,value)\n    }\n    flag :L,:lenient,<<-DESC\n      leniently (not strict) scrape articles:\n      body & title content without the proper HTML/CSS classes/IDs and no futsuurl;\n      example URLs that need this flag:\n      -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html\n      -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html\n    DESC\n    option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|\n      text to fuzzy search links for; for example, \"--like '00123'\" will only scrape links containing\n      text '00123' -- like '*00123*'\n    DESC\n      value = Util.strip_web_str(value).downcase\n      value\n    }\n    option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|\n      'directory/file' of article links to scrape (see '\#{App::NAME} search';\n      defaults: \#{SearchLinks::DEFAULT_YASASHII_FILE}, \#{SearchLinks::DEFAULT_FUTSUU_FILE})\n    DESC\n      app.check_empty_opt(:links,value)\n    }\n    flag :M,:missingno,<<-DESC\n      very rarely an article will not have kana or kanji for a Ruby tag;\n      to not raise an error, this will use previously scraped data to fill it in;\n      example URL:\n      -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html\n    DESC\n    flag :D,:'no-dict',<<-DESC\n      do not try to parse the dictionary files for the articles; useful in case of errors trying to load\n      the dictionaries (or for offline testing)\n    DESC\n    flag :H,'no-sha256',<<-DESC\n      do not check the SHA-256 of the content to see if an article has already been scraped;\n      for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;\n      this is useful if 2 articles have the same SHA-256, but different content (unlikely)\n    DESC\n    option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|\n      'directory/file' to save words to; if you only specify a directory or a file, it will attach\n      the appropriate default directory/file name\n      (defaults: \#{YasashiiNews::DEFAULT_FILE}, \#{FutsuuNews::DEFAULT_FILE})\n    DESC\n      app.check_empty_opt(:out,value)\n    }\n    flag :r,:redo,'scrape article links even if they have already been scraped'\n    option :s,:scrape,'number of unscraped article links to scrape',argument: :required,\n        default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|\n          value = value.to_i\n          value = 1 if value < 1\n          value\n        }\n    option nil,:'show-dict',<<-DESC\n      show dictionary URL and contents for the first article and exit;\n      useful for debugging dictionary errors (see '--no-dict' option);\n      implies '--dry-run' option\n    DESC\n    option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|\n      URL of 
article to scrape, instead of article links file (see '--links' option)\n    DESC\n      app.check_empty_opt(:url,value)\n    }\n\n    run do |_opts,_args,cmd|\n      puts cmd.help\n    end\n  end\n\n  @news_easy_cmd = @news_cmd.define_command do\n    name    'easy'\n    usage   'easy [OPTIONS] [COMMAND]...'\n    aliases :e,:ez\n    summary \"Scrape NHK News Web Easy (Yasashii) articles (aliases: \#{app.color_alias('e ez')})\"\n\n    description <<-DESC\n      Search for NHK News Web Easy (Yasashii) links &\n      save to file: \#{YasashiiNews::DEFAULT_FILE}\n    DESC\n\n    run do |opts,args,cmd|\n      app.refresh_cmd(opts,args,cmd)\n      app.run_news_cmd(:yasashii)\n    end\n  end\n\n  @news_regular_cmd = @news_cmd.define_command do\n    name    'regular'\n    usage   'regular [OPTIONS] [COMMAND]...'\n    aliases :r,:reg\n    summary \"Scrape NHK News Web Regular (Futsuu) articles (aliases: \#{app.color_alias('r reg')})\"\n\n    description <<-DESC\n      Search for NHK News Web Regular (Futsuu) links &\n      save to file: \#{FutsuuNews::DEFAULT_FILE}\n    DESC\n\n    run do |opts,args,cmd|\n      app.refresh_cmd(opts,args,cmd)\n      app.run_news_cmd(:futsuu)\n    end\n  end\nend\n"
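
The '--datetime' transform above is built on Ruby's Time.strptime with an explicit format string, plus NHKore-specific post-processing. A minimal sketch of the parsing step on its own (the DatetimeParser.guess_year block and the Util.jst_time conversion are intentionally left out here):

require 'time'

# Parse the fallback date/time in the 'YYYY-mm-dd H:M' format that the
# '--datetime' option documents.
raw = '2020-03-30 15:30'
fallback = Time.strptime(raw,'%Y-%m-%d %H:%M')

puts fallback.strftime('%Y-%m-%d %H:%M') # => "2020-03-30 15:30"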

#run_news_cmd(type) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 154

def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil

  build_in_file(:in)

  case type
  when :futsuu
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)

    news_name = 'Regular'
  when :yasashii
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)

    news_name = 'Easy'
  else
    raise ArgumentError,"invalid type[#{type}]"
  end

  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)

  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  lenient = @cmd_opts[:lenient]
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
  max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]

  # Favor in_file option over url option.
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
  url = nil if url.empty?

  # If no URL or file was given, then we must have a links file that exists.
  return if url.nil? && !check_in_file(:links,empty_ok: false)

  start_spin("Scraping NHK News Web #{news_name} articles")

  is_file = !in_file.nil?
  link_count = -1
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  scrape_count = 0

  news = if File.exist?(out_file)
           (type == :yasashii) ? YasashiiNews.load_file(out_file,overwrite: no_sha256)
                               : FutsuuNews.load_file(out_file,overwrite: no_sha256)
         else
           (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
         end

  @news_article_scraper_kargs = @scraper_kargs.merge({
    datetime: datetime,
    dict: dict,
    is_file: is_file,
    missingno: missingno ? Missingno.new(news) : nil,
    strict: !lenient,
  })
  @news_dict_scraper_kargs = @scraper_kargs.merge({
    is_file: is_file,
  })

  if url.nil?
    # Why store each() and do `links_len` instead of `links_len - 1`?
    #
    # If links contains 5 entries and you scrape all 5, then the output of
    # update_spin_detail() will end on 4, so all of this complexity is so
    # that update_spin_detail() only needs to be written/updated on one line.

    links_each = links.links.values.each
    links_len = links.length

    0.upto(links_len) do |i|
      update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")

      break if i >= links_len || scrape_count >= max_scrapes

      link = links_each.next

      next if !like.nil? && !link.url.to_s.downcase.include?(like)
      next if !redo_scrapes && scraped_news_article?(news,link)

      url = link.url
      result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)

      if result == :scraped
        scrape_count += 1
      elsif result == :unscraped
        next
      else
        # --show-dict
        url = result
        scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
      end

      # Break on next iteration for update_spin_detail().
      next if scrape_count >= max_scrapes
      sleep_scraper
    end
  else
    link = links[url]

    if link.nil?
      link = SearchLink.new(url)
      links.add_link(link)
    end

    result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
    scrape_count += 1 if result != :unscraped
  end

  stop_spin
  puts

  if scrape_count <= 0
    puts 'Nothing scraped!'

    if !dry_run && !show_dict
      puts
      start_spin('Saving updated links to file')

      links.save_file(links_file)

      stop_spin
      puts "> #{links_file}"
    end
  else
    puts 'Last URL scraped:'
    puts "> #{url}"
    puts

    if show_dict
      puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
    elsif dry_run
      if new_articles.empty?
        raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}]; " \
                       'internal code is broken'
      elsif new_articles.length == 1
        puts new_articles.first
      else
        # Don't show the words (mini), too verbose for more than 1.
        new_articles.each do |article|
          puts article.to_s(mini: true)
        end
      end
    else
      start_spin('Saving scraped data to files')

      links.save_file(links_file)
      news.save_file(out_file)

      stop_spin
      puts "> #{out_file}"
      puts "> #{links_file}"
    end
  end
end
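
The link loop above deliberately iterates one index past the end (0.upto(links_len), not links_len - 1) so that update_spin_detail() runs one last time with the final counts before breaking out. A minimal sketch of that pattern, using a plain Array and puts in place of SearchLinks and update_spin_detail():

links = %w[url_a url_b url_c]
links_each = links.each # Enumerator, advanced manually with next().
links_len = links.length
considered = -1

0.upto(links_len) do |i|
  puts "considered=#{considered += 1}" # stand-in for update_spin_detail()
  break if i >= links_len              # final count is printed before breaking
  links_each.next
end
# Prints considered=0 through considered=3 for the 3 links.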

#scrape_news_article(url, link:, new_articles:, news:) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 324

def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]

  if show_dict
    scraper = DictScraper.new(url,**@news_dict_scraper_kargs)

    @cmd_opts[:show_dict] = scraper.scrape.to_s

    return scraper.url
  end

  scraper = nil

  begin
    scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
  rescue Http404Error
    # - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
    Util.warn("Ignoring URL with 404 error: #{url}.")
    return :unscraped
  end

  article = scraper.scrape
  # run_news_cmd() handles overwriting with --redo or not
  #   using scraped_news_article?().
  news.add_article(article,overwrite: true)

  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)

  new_articles << article

  return :scraped # No --show-dict
end
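
The begin/rescue around ArticleScraper.new is what lets a single dead link be skipped (with a warning) instead of aborting the whole run. A self-contained sketch of the same guard pattern, using a hypothetical fetch_article() and NotFoundError in place of ArticleScraper and NHKore's Http404Error:

class NotFoundError < StandardError; end

# Hypothetical stand-in for ArticleScraper.new + scrape.
def fetch_article(url)
  raise NotFoundError,"404: #{url}" if url.include?('dead')
  "article for #{url}"
end

def try_scrape(url)
  article = begin
    fetch_article(url)
  rescue NotFoundError
    warn "Ignoring URL with 404 error: #{url}." # mirrors Util.warn(...) above
    return :unscraped
  end

  # ...store `article` somewhere, as news.add_article() does above...
  :scraped
end

p try_scrape('https://example.com/dead_link') # => :unscraped
p try_scrape('https://example.com/ok_link')   # => :scraped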

#scraped_news_article?(news, link) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/nhkore/cli/news_cmd.rb', line 358

def scraped_news_article?(news,link)
  return true if link.scraped?

  no_sha256 = @cmd_opts[:no_sha256]

  article = news.article(link.url)

  if !no_sha256 && article.nil?
    if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
      article = news.article_with_sha256(link.sha256)
    end

    if article.nil?
      scraper = nil

      begin
        scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
      rescue Http404Error
        return false
      end

      sha256 = scraper.scrape_sha256_only
      article = news.article_with_sha256(sha256) if news.sha256?(sha256)
    end
  end

  if article
    news.update_article(article,link.url) # Favors https
    link.update_from_article(article)

    return true
  end

  return false
end
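
The duplicate check above ultimately compares SHA-256 digests of article content (news.sha256?, scrape_sha256_only), so two URLs serving the same body are treated as one article unless '--no-sha256' is given. A minimal, purely illustrative sketch of that idea with Ruby's standard Digest library (NHKore's own hashing may normalize the scraped content differently):

require 'digest'

# Byte-identical bodies fetched from an 'http' and an 'https' URL hash to
# the same digest, so the second fetch can be treated as already scraped.
body_http  = '<p>同じ記事の本文</p>'
body_https = '<p>同じ記事の本文</p>'

puts Digest::SHA256.hexdigest(body_http) == Digest::SHA256.hexdigest(body_https) # => true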