Class: Bioroebe::DownloadFasta

Inherits:
CommandlineApplication show all
Defined in:
lib/bioroebe/fasta_and_fastq/download_fasta.rb

Overview

Bioroebe::DownloadFasta

Constant Summary collapse

NAMESPACE =
#

NAMESPACE

#
inspect
BASE_URL =
#

BASE_URL

#
'https://www.ncbi.nlm.nih.gov'
DEFAULT_URL_TO_USE =
#

DEFAULT_URL_TO_USE

Here you can designate a default file to use.

This file should contain URLs to a target nucleotide sequence.

The format is just one URL per line, then a newline.

#
'/FASTA_FILE_URL.md'

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Instance Method Summary collapse

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opne, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #set_be_verbose, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #file_readlines, #infer_the_namespace, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #namespace?, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Constructor Details

#initialize(i = nil, run_already = true) ⇒ DownloadFasta

#

initialize

#

50
51
52
53
54
55
56
57
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 50

# ========================================================================= #
# === initialize
#
# Resets the internal state, registers the given commandline arguments
# and - unless run_already is false or nil - immediately invokes run().
# ========================================================================= #
def initialize(i = nil, run_already = true)
  reset
  set_commandline_arguments(i)
  return unless run_already
  run
end

Instance Method Details

#download_remote_file(url) ⇒ Object

#

download_remote_file

The URL should be in the form of an NM_ file.

We will then find the proper fasta entry from that.

#

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 120

# ========================================================================= #
# === download_remote_file
#
# Resolve the given input to a FASTA dataset and store it locally.
#
# The argument may be:
#
#   - an Array: every entry is processed in turn and the Array itself
#     is returned.
#   - a String that consists only of digits, starts with 'NC_', or
#     contains '&id=XP_', '&id=NP_', '&id=YP_', '&id=NM_' or '&id=WP_'.
#     Such an input is used as-is, without reading any page first, and
#     is handed to ::Bioroebe::Ncbi.efetch_by_url().
#   - any other String: it is opened via URI.open() and read completely.
#     If it contains 'ncbi', the identifier in front of '?report=fasta'
#     or '?report=genbank' is extracted and handed to efetch_by_url().
#     Otherwise the content that was read is written into
#     "<log_dir><identifier>.fasta", where the identifier is taken from
#     the first line of that content.
#
# For a single String the return value is whatever was assigned to
# @file_is_stored_here.
# ========================================================================= #
def download_remote_file(url)
  if url.is_a? Array
    url.each {|entry| download_remote_file(entry) }
  else
    url = url.dup if url.frozen?
    url.strip! # We don't want trailing newlines or ' ' characters.
    url.delete!("\n")
    remote_file_content = ''.dup # Holds the content of the remote page.
    # ===================================================================== #
    # Numbers are easier to handle, so we can bypass most of the next
    # logic. The same holds for NC_ entries and for URLs that already
    # carry an '&id=' accession.
    # ===================================================================== #
    is_a_direct_identifier =
      url.match?(/^\d+$/) ||
      url.start_with?('NC_') ||
      url.include?('&id=XP_') ||
      url.include?('&id=NP_') ||
      url.include?('&id=YP_') ||
      url.include?('&id=NM_') ||
      url.include?('&id=WP_')
    if is_a_direct_identifier
      match = url
    else
      opnn; e "We will use the URL `#{simp(url)}`"
      # =================================================================== #
      # We store the content of the remote page in the variable called
      # remote_file_content, a String.
      # =================================================================== #
      begin
        URI.open(url) {|file|
          file.each_line {|line|
            remote_file_content << line
          }
        }
        if url.include?('ncbi')
          # =============================================================== #
          # Determine the regex we will use to grab more information.
          # Right now we look for two entries: fasta or genbank entry.
          # =============================================================== #
          regex = /(\d{1,18}\.?\d{0,5})\?report=(fasta|genbank)/ # See https://rubular.com/r/hiOp9ZPBOfDGJq
          # =============================================================== #
          # Match it against the page-content. If nothing matches we
          # end up with an empty String.
          # =============================================================== #
          match = remote_file_content[regex, 1].to_s.dup
        else
          # =============================================================== #
          # Use the first line, without the leading '>', as identifier.
          # =============================================================== #
          match = remote_file_content.split("\n").first.to_s.delete('>').strip
          if match.include? '|' # sp|Q9CR68
            match = match.split('|')[1]
          end
        end
      rescue Errno::ENOENT => error
        opnn; e tomato('We encountered an error, which we will feedback next.')
        pp error
        match = url
      end
    end
    # ===================================================================== #
    # Direct identifiers must be fetched as well. They were never read
    # from anywhere, so there is no content that could be written into
    # a local file for them. The NCBI-page branch above already passes
    # a bare identifier to efetch_by_url(), so we treat them the same.
    # ===================================================================== #
    if url.include?('ncbi') || is_a_direct_identifier
      # =================================================================== #
      # The url may look like this:
      #
      #   https://www.ncbi.nlm.nih.gov/protein/NP_001065289.1?report=fasta
      #
      # =================================================================== #
      _ = File.basename(match)
      @file_is_stored_here = ::Bioroebe::Ncbi.efetch_by_url(_)
    else
      @file_is_stored_here = log_dir?+match.to_s+'.fasta'
      write_what_into(remote_file_content, @file_is_stored_here)
    end
    report_where_the_file_is_stored_at(@file_is_stored_here)
    return @file_is_stored_here
  end
end

#enter_log_directory(be_verbose = true) ⇒ Object

#

enter_log_directory

#

73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 73

# ========================================================================= #
# === enter_log_directory
#
# Change into the directory returned by log_dir?, unless return_pwd
# already reports that very directory. The directory is created first
# if it does not exist yet. When be_verbose is true, the change is
# announced to the user.
# ========================================================================= #
def enter_log_directory(be_verbose = true)
  target_dir = log_dir?
  return if return_pwd == target_dir # Already there, nothing to do.
  if be_verbose
    opnn
    e "Now changing directory to `#{sdir(target_dir)}`."
  end
  mkdir(target_dir) unless File.exist?(target_dir)
  cd target_dir
end

#file_is_stored_here? ⇒ Boolean Also known as: location?

#

file_is_stored_here?

#

Returns:

  • (Boolean)

100
101
102
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 100

# ========================================================================= #
# === file_is_stored_here?
#
# Query method over @file_is_stored_here. The value is returned as-is,
# so this yields nil for as long as nothing was assigned to it.
# ========================================================================= #
def file_is_stored_here?
  return @file_is_stored_here
end

#rename_to_fasta(i) ⇒ Object

#

rename_to_fasta

#

107
108
109
110
111
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 107

# ========================================================================= #
# === rename_to_fasta
#
# Append '.fa' to the name of the file at i, remember the new location
# in @file, announce the rename and then move the file via mv().
# ========================================================================= #
def rename_to_fasta(i)
  new_location = "#{File.dirname(i)}/#{File.basename(i)}.fa"
  @file = rds(new_location)
  opnn
  erev "Next renaming `#{sfile(i)}` to `#{sfile(@file)}`."
  mv(i, @file)
end

#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ Object

#

report_where_the_file_is_stored_at

#

89
90
91
92
93
94
95
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 89

# ========================================================================= #
# === report_where_the_file_is_stored_at
#
# Tell the user where the file is stored. Nothing is reported when i is
# nil or false, or when no file exists at that location.
# ========================================================================= #
def report_where_the_file_is_stored_at(i = @file_is_stored_here)
  return unless i && File.exist?(i)
  opnn
  message = "The file is now stored at `#{sfile(i)}#{rev}`."
  erev message
end

#reset ⇒ Object

#

reset (reset tag)

#

62
63
64
65
66
67
68
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 62

# ========================================================================= #
# === reset
#
# Run the parent's reset() first, then tag this instance with the
# NAMESPACE constant.
# ========================================================================= #
def reset
  super()
  # ======================================================================= #
  # === @namespace
  #
  # Assigned after super(), so this value is the one in effect once
  # reset() has finished.
  # ======================================================================= #
  @namespace = NAMESPACE
end

#run ⇒ Object

#

run (run tag)

#

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 204

# ========================================================================= #
# === run
#
# Enter the log directory and then process every commandline argument.
# DEFAULT_URL_TO_USE is used if no argument was given at all.
#
# An argument that names an existing local file is read line by line;
# blank lines are ignored. If the first remaining line contains no '/',
# every line is treated as an identifier and turned into an efetch URL.
# Otherwise the lines are passed on unchanged. Arguments that are not
# local files are passed on as they are.
#
# The result is handed to download_remote_file() in each case.
# ========================================================================= #
def run
  enter_log_directory
  if commandline_arguments?.empty?
    # ===================================================================== #
    # Assign a default URL to use in the event that the user did not
    # provide a valid URL for us to download & fetch.
    # ===================================================================== #
    commandline_arguments? << DEFAULT_URL_TO_USE
  end
  commandline_arguments?.each {|this_file|
    this_file = this_file.to_s # To avoid follow-up problems.
    if File.exist? this_file # This here is mostly done for testing-purposes.
      this_file = File.readlines(this_file).reject {|entry| entry.strip.empty? }
      # =================================================================== #
      # A file holding only blank lines leaves us with an empty Array,
      # so we must check for that before we look at the first entry.
      # =================================================================== #
      if !this_file.empty? && !this_file.first.include?('/')
        opnn; erev 'Now working on '+this_file.size.to_s+' entries.'
        this_file.map! {|entry|
          # =============================================================== #
          # Strip the identifier before it is embedded, so that neither
          # "\r" nor surrounding spaces end up inside the URL.
          # =============================================================== #
          'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id='+
          entry.strip+'&rettype=fasta&retmode=text'
        }
      end
    end
    download_remote_file(this_file)
  }
end