Class: Bioroebe::DownloadFasta

Inherits:

CommandlineApplication

Object
Base
CommandlineApplication
Bioroebe::DownloadFasta

show all

Defined in:: lib/bioroebe/fasta_and_fastq/download_fasta.rb

Overview

Bioroebe::DownloadFasta

Constant Summary collapse

BASE_URL = # BASE_URL #

'https://www.ncbi.nlm.nih.gov'

DEFAULT_URL_TO_USE = # DEFAULT_URL_TO_USE: designates the default file to use. This file should contain URLs pointing to target nucleotide sequences, one URL per line, each terminated by a newline. #

'/FASTA_FILE_URL.md'

Instance Method Summary collapse

#download_remote_file(url) ⇒ Object

# === download_remote_file.
#enter_log_directory(be_verbose = true) ⇒ Object

# === enter_log_directory ========================================================================= #.
#file_is_stored_here? ⇒ Boolean (also: #location?)

# === file_is_stored_here? ========================================================================= #.
#initialize(i = nil, run_already = true) ⇒ DownloadFasta constructor

# === initialize ========================================================================= #.
#rename_to_fasta(i) ⇒ Object

# === rename_to_fasta ========================================================================= #.
#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ Object

# === report_where_the_file_is_stored_at ========================================================================= #.
#reset ⇒ Object

# === reset (reset tag) ========================================================================= #.
#run ⇒ Object

# === run (run tag) ========================================================================= #.

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(i = nil, run_already = true) ⇒ `DownloadFasta`

#

initialize

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 45

# ========================================================================= #
# === initialize
#
# Resets the instance, registers the given commandline arguments and,
# unless run_already is falsy, immediately invokes run.
# ========================================================================= #
def initialize(i = nil, run_already = true)
  reset
  set_commandline_arguments(i)
  return unless run_already
  run
end

Instance Method Details

#download_remote_file(url) ⇒ `Object`

#

download_remote_file

The URL should be in the form of a NM_ file.

We will then find the proper fasta entry from that.

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 114

# ========================================================================= #
# === download_remote_file
#
# Resolve the given input to a FASTA file and remember its location in
# @file_is_stored_here.
#
# The input may be:
#
#   - an Array: every entry is processed in turn
#   - a String made up only of digits, or one starting with 'NC_'
#   - a String containing '&id=XP_', '&id=NP_', '&id=YP_', '&id=NM_'
#     or '&id=WP_'
#   - any other String: it is opened via URI.open and the page content
#     is inspected
#
# For a String input the value of @file_is_stored_here is returned. For
# an Array input the Array itself is returned (the result of .each).
# ========================================================================= #
def download_remote_file(url)
  if url.is_a? Array
    url.each {|entry| download_remote_file(entry) }
  else
    # ===================================================================== #
    # Work on a cleaned-up copy: no surrounding whitespace and no newlines.
    # The String handed in by the caller is left untouched.
    # ===================================================================== #
    url = url.strip.delete("\n")
    # ===================================================================== #
    # Plain identifiers and efetch-style '&id=' URLs need no page lookup,
    # so most of the logic below can be bypassed for them.
    # ===================================================================== #
    direct_lookup = url.match?(/\A\d+\z/) ||
                    url.start_with?('NC_') ||
                    %w[XP_ NP_ YP_ NM_ WP_].any? {|prefix| url.include?("&id=#{prefix}") }
    remote_file_content = nil # Only filled when a remote page is read.
    if direct_lookup
      match = url
    else
      opne "#{rev}We will use the URL `#{simp(url)}#{rev}`"
      remote_file_content = ''.dup
      # =================================================================== #
      # Store the content of the remote page in remote_file_content.
      # =================================================================== #
      begin
        URI.open(url) {|file|
          file.each_line {|line| remote_file_content << line }
        }
        if url.include?('ncbi')
          # =============================================================== #
          # Look for a fasta or genbank report entry in the page content.
          # See https://rubular.com/r/hiOp9ZPBOfDGJq
          # =============================================================== #
          regex = /(\d{1,18}\.?\d{0,5})\?report=(fasta|genbank)/
          remote_file_content =~ regex
          match = Regexp.last_match(1).to_s.dup
        else
          # =============================================================== #
          # Use the first line, without '>', as the name. A header such as
          # "sp|Q9CR68|..." is reduced to its second '|' separated field.
          # =============================================================== #
          match = remote_file_content.split("\n").first.to_s.delete('>').strip
          if match.include? '|' # sp|Q9CR68
            match = match.split('|')[1]
          end
        end
      rescue Errno::ENOENT => error
        # Only a missing local target is handled here; other errors raised
        # by URI.open propagate to the caller.
        opne tomato('We encountered an error, which we will feedback next.')
        pp error
        match = url
      end
    end
    if direct_lookup || url.include?('ncbi')
      # =================================================================== #
      # The url may look like this:
      #
      #   https://www.ncbi.nlm.nih.gov/protein/NP_001065289.1?report=fasta
      #
      # Inputs that bypassed the page lookup are also routed here, because
      # no page content exists that could be written to a local file.
      # =================================================================== #
      @file_is_stored_here = ::Bioroebe::Ncbi.efetch_by_url(File.basename(match))
    else
      @file_is_stored_here = log_dir?+match.to_s+'.fasta'
      write_what_into(remote_file_content, @file_is_stored_here)
    end
    report_where_the_file_is_stored_at(@file_is_stored_here)
    return @file_is_stored_here
  end
end

#enter_log_directory(be_verbose = true) ⇒ `Object`

#

enter_log_directory

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 65

# ========================================================================= #
# === enter_log_directory
#
# Change into the directory returned by log_dir?, unless return_pwd
# already equals it. The directory is created via mkdir when it does not
# exist yet. With be_verbose set, the change is announced first.
# ========================================================================= #
def enter_log_directory(be_verbose = true)
  target_directory = log_dir?
  return if return_pwd == target_directory
  opne "#{rev}Now changing directory to `#{sdir(target_directory)}#{rev}`." if be_verbose
  mkdir(target_directory) unless File.exist?(target_directory)
  cd target_directory
end

#file_is_stored_here? ⇒ `Boolean` Also known as: location?

#

file_is_stored_here?

#

Returns:

(Boolean)



94
95
96

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 94

# ========================================================================= #
# === file_is_stored_here?
#
# Query method returning whatever is currently held in
# @file_is_stored_here; that variable is assigned by
# download_remote_file.
# ========================================================================= #
def file_is_stored_here?
  return @file_is_stored_here
end

#rename_to_fasta(i) ⇒ `Object`

#

rename_to_fasta

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 101

# ========================================================================= #
# === rename_to_fasta
#
# Appends '.fa' to the given path, stores the new path (after passing it
# through rds) in @file, announces the rename and then moves the file
# via mv.
# ========================================================================= #
def rename_to_fasta(i)
  new_location = "#{File.dirname(i)}/#{File.basename(i)}.fa"
  @file = rds(new_location)
  opnerev "Next renaming `#{sfile(i)}` to `#{sfile(@file)}`."
  mv(i, @file)
end

#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ `Object`

#

report_where_the_file_is_stored_at

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 83

# ========================================================================= #
# === report_where_the_file_is_stored_at
#
# Announces the location of the given file, but only when a path was
# given and a file exists there. Defaults to @file_is_stored_here.
# ========================================================================= #
def report_where_the_file_is_stored_at(i = @file_is_stored_here)
  return unless i && File.exist?(i)
  opnerev "The file is now stored at `#{sfile(i)}#{rev}`."
end

#reset ⇒ `Object`

#

reset (reset tag)

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 57

def reset
  # Run the parent class' reset first. The explicit parentheses make sure
  # that no arguments are forwarded.
  super()
  # Helper listed under InferTheNamespaceModule; what it stores is not
  # visible from this method.
  infer_the_namespace
end

#run ⇒ `Object`

#

run (run tag)

#

# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 198

# ========================================================================= #
# === run (run tag)
#
# Enters the log directory and then hands every commandline argument to
# download_remote_file. When no argument was given, DEFAULT_URL_TO_USE is
# used instead.
#
# An argument that names an existing file is read line by line, ignoring
# blank lines. If the first remaining line contains no '/', every line
# is treated as a protein identifier and expanded into an NCBI efetch
# URL; otherwise the lines are passed on unchanged.
# ========================================================================= #
def run
  enter_log_directory
  if commandline_arguments?.empty?
    # ===================================================================== #
    # Assign a default URL to use in the event that the user did not
    # provide a valid URL for us to download & fetch.
    # ===================================================================== #
    commandline_arguments? << DEFAULT_URL_TO_USE
  end
  commandline_arguments?.each {|this_file|
    this_file = this_file.to_s # To avoid follow-up problems.
    if File.exist? this_file # This here is mostly done for testing-purposes.
      this_file = File.readlines(this_file).reject {|entry| entry.strip.empty? }
      first_entry = this_file.first
      # =================================================================== #
      # first_entry is nil for an empty or all-blank file. The then-empty
      # Array is passed on as it is rather than raising a NoMethodError.
      # =================================================================== #
      if first_entry && !first_entry.include?('/')
        opnerev 'Now working on '+this_file.size.to_s+' entries.'
        this_file.map! {|entry|
          # Strip the identifier before embedding it, so that trailing
          # spaces or "\r" characters do not end up inside the URL.
          'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id='+
          entry.strip+'&rettype=fasta&retmode=text'
        }
      end
    end
    download_remote_file(this_file)
  }
end

Class: Bioroebe::DownloadFasta

Overview

Bioroebe::DownloadFasta

Constant Summary collapse

#

BASE_URL

#

#

DEFAULT_URL_TO_USE

#

Constants inherited from CommandlineApplication

Constants included from ColoursForBase

Constants inherited from Base

Instance Method Summary collapse

# === download_remote_file.

# === enter_log_directory ========================================================================= #.

# === file_is_stored_here? ========================================================================= #.

# === initialize ========================================================================= #.

# === rename_to_fasta ========================================================================= #.

# === report_where_the_file_is_stored_at ========================================================================= #.

# === reset (reset tag) ========================================================================= #.

# === run (run tag) ========================================================================= #.