Class: Bioroebe::DownloadFasta

Inherits:
CommandlineApplication show all
Defined in:
lib/bioroebe/fasta_and_fastq/download_fasta.rb

Overview

Bioroebe::DownloadFasta

Constant Summary collapse

BASE_URL =
#

BASE_URL

#
'https://www.ncbi.nlm.nih.gov'
DEFAULT_URL_TO_USE =
#

DEFAULT_URL_TO_USE

Here you can designate a default file to use.

This file should contain URLs to a target nucleotide sequence.

The format is just one URL per line, then a newline.

#
'/FASTA_FILE_URL.md'

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Constants inherited from Base

Base::NAMESPACE

Instance Method Summary collapse

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opnerev, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #runmode?, #set_be_verbose, #set_runmode, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from BaseModule

#absolute_path, #default_file_read, #file_readlines

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_will_we_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Methods included from InternalHashModule

#internal_hash?, #reset_the_internal_hash

Methods included from InferTheNamespaceModule

#infer_the_namespace, #namespace?

Constructor Details

#initialize(i = nil, run_already = true) ⇒ DownloadFasta

#

initialize

#


45
46
47
48
49
50
51
52
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 45

# Builds a new downloader.
#
# i           - handed to set_commandline_arguments as-is (defaults to nil).
# run_already - when truthy, #run is invoked right away at the end of
#               construction.
def initialize(i = nil, run_already = true)
  reset
  set_commandline_arguments(i)
  return unless run_already

  run
end

Instance Method Details

#download_remote_file(url) ⇒ Object

#

download_remote_file

The URL should be in the form of an NM_ file.

We will then find the proper fasta entry from that.

#


114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 114

# Downloads the sequence behind +url+ and stores it as a local file.
#
# +url+ is either a String or an Array of Strings. An Array is handled by
# calling this method once per entry; in that case the Array itself is
# returned (the result of #each).
#
# For a single String the steps are:
#
#   1. A stripped, newline-free copy of the input is made; the caller's
#      String is never modified.
#   2. Plain numbers, values starting with 'NC_' and URLs carrying one of
#      the '&id=XP_/NP_/YP_/NM_/WP_' parameters are used as-is; nothing is
#      read from the network for them in this method.
#   3. Everything else is read via URI.open. For URLs containing 'ncbi' the
#      identifier in front of '?report=fasta' / '?report=genbank' is taken
#      from the page content; otherwise the first line of the content
#      (minus '>') is used, reduced to its second '|'-separated field when
#      a '|' is present.
#   4. URLs containing 'ncbi' are handed to Bioroebe::Ncbi.efetch_by_url;
#      for anything else the collected content is written to
#      "<log_dir?><match>.fasta".
#
# Returns the value stored in @file_is_stored_here for a String input.
def download_remote_file(url)
  if url.is_a? Array
    url.each { |entry| download_remote_file(entry) }
  else
    # Work on a copy: we don't want trailing newlines or ' ' characters,
    # but the caller's String must stay untouched.
    url = url.strip.delete("\n")
    # ===================================================================== #
    # The content of the remote page is collected in this String. It is
    # initialised up front so that the write step below always receives a
    # String, even when no download happens.
    # ===================================================================== #
    remote_file_content = ''.dup
    id_markers = %w[&id=XP_ &id=NP_ &id=YP_ &id=NM_ &id=WP_]
    # ===================================================================== #
    # Numbers are easier to handle, so we can bypass most of the next
    # logic. The same holds for NC_ values and efetch-style '&id=' URLs.
    # ===================================================================== #
    if url.match?(/\A\d+\z/) || url.start_with?('NC_') ||
       id_markers.any? { |marker| url.include?(marker) }
      # NOTE(review): a bare number or NC_ value does not contain 'ncbi',
      # so it ends up in the plain write branch below with empty content.
      # Confirm whether such ids should go through efetch instead.
      match = url
    else
      opne "#{rev}We will use the URL `#{simp(url)}#{rev}`"
      begin
        URI.open(url) { |file|
          file.each_line { |line| remote_file_content << line }
        }
        if url.include?('ncbi')
          # =============================================================== #
          # Right now we look for two entries: fasta or genbank entry.
          # See https://rubular.com/r/hiOp9ZPBOfDGJq
          # =============================================================== #
          regex = /(\d{1,18}\.?\d{0,5})\?report=(fasta|genbank)/
          # No match yields nil, which becomes an empty String here.
          match = remote_file_content[regex, 1].to_s.dup
        else
          match = remote_file_content.split("\n").first.to_s.delete('>').strip
          match = match.split('|')[1] if match.include?('|') # sp|Q9CR68
        end
      rescue Errno::ENOENT => error
        opne tomato('We encountered an error, which we will feedback next.')
        pp error
        match = url # Fall back to the input itself.
      end
    end
    if url.include? 'ncbi'
      # =================================================================== #
      # The url may look like this:
      #
      #   https://www.ncbi.nlm.nih.gov/protein/NP_001065289.1?report=fasta
      #
      # Only the last path component is passed on.
      # =================================================================== #
      @file_is_stored_here = ::Bioroebe::Ncbi.efetch_by_url(File.basename(match))
    else
      @file_is_stored_here = log_dir?+match.to_s+'.fasta'
      write_what_into(remote_file_content, @file_is_stored_here)
    end
    report_where_the_file_is_stored_at(@file_is_stored_here)
    return @file_is_stored_here
  end
end

#enter_log_directory(be_verbose = true) ⇒ Object

#

enter_log_directory

#


65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 65

# Switches the working directory to the one reported by log_dir?.
#
# Nothing happens when return_pwd already equals that directory. Otherwise
# the change is announced (only if +be_verbose+ is truthy), the directory
# is created via mkdir when File.exist? says it is missing, and cd is
# called on it.
def enter_log_directory(be_verbose = true)
  target = log_dir?
  return if return_pwd == target

  opne "#{rev}Now changing directory to `#{sdir(target)}#{rev}`." if be_verbose
  mkdir(target) unless File.exist?(target)
  cd target
end

#file_is_stored_here? ⇒ Boolean Also known as: location?

#

file_is_stored_here?

#

Returns:

  • (Boolean)


94
95
96
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 94

# Reader for @file_is_stored_here.
#
# Despite the trailing '?', this hands back whatever is currently stored in
# that instance variable (nil when nothing has been assigned yet), so treat
# the result as truthy/falsy rather than as a strict true/false.
def file_is_stored_here?
  @file_is_stored_here
end

#rename_to_fasta(i) ⇒ Object

#

rename_to_fasta

#


101
102
103
104
105
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 101

# Renames the file at +i+ by appending '.fa' to its name.
#
# The new path is assembled from the directory and base name of +i+, passed
# through rds and remembered in @file. The rename is announced via opnerev
# and then carried out with mv, whose result is returned.
def rename_to_fasta(i)
  directory = File.dirname(i)
  base_name = File.basename(i)
  @file = rds("#{directory}/#{base_name}.fa")
  opnerev "Next renaming `#{sfile(i)}` to `#{sfile(@file)}`."
  mv(i, @file)
end

#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ Object

#

report_where_the_file_is_stored_at

#


83
84
85
86
87
88
89
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 83

# Announces, via opnerev, where a file has been stored.
#
# i - path to report; defaults to @file_is_stored_here.
#
# Stays silent (and returns nil) when +i+ is nil/false or when no file
# exists at that path.
def report_where_the_file_is_stored_at(i = @file_is_stored_here)
  return unless i && File.exist?(i)

  opnerev "The file is now stored at `#{sfile(i)}#{rev}`."
end

#reset ⇒ Object

#

reset (reset tag)

#


57
58
59
60
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 57

# Resets this object.
#
# The parent implementation runs first; it is invoked with an explicit
# empty argument list. Afterwards infer_the_namespace is called.
def reset
  super()
  infer_the_namespace
end

#run ⇒ Object

#

run (run tag)

#


198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 198

# Entry point: enters the log directory and downloads every commandline
# argument.
#
# When no argument was given, DEFAULT_URL_TO_USE is appended to the
# commandline arguments first.
#
# Each argument is handled like this:
#
#   - If it names an existing file, the file is read line by line; every
#     line is stripped and blank lines are dropped. If the first remaining
#     line contains no '/', all lines are treated as ids and turned into
#     NCBI efetch URLs (db=protein, rettype=fasta, retmode=text).
#     Otherwise the lines are passed on unchanged as URLs. A file without
#     any usable line results in an empty list, i.e. nothing is downloaded.
#   - Anything else is passed to download_remote_file directly.
def run
  enter_log_directory
  # ======================================================================= #
  # Assign a default URL to use in the event that the user did not
  # provide a valid URL for us to download & fetch.
  # ======================================================================= #
  commandline_arguments? << DEFAULT_URL_TO_USE if commandline_arguments?.empty?
  efetch_prefix = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id='
  efetch_suffix = '&rettype=fasta&retmode=text'
  commandline_arguments?.each do |argument|
    target = argument.to_s # To avoid follow-up problems.
    if File.exist?(target) # This here is mostly done for testing-purposes.
      # Strip every line before use, so that trailing blanks or "\r" from
      # CRLF files never end up inside a generated URL.
      target = File.readlines(target).map(&:strip).reject(&:empty?)
      # An empty list has no first entry to inspect, so skip it here.
      unless target.empty? || target.first.include?('/')
        opnerev "Now working on #{target.size} entries."
        target.map! { |entry| "#{efetch_prefix}#{entry}#{efetch_suffix}" }
      end
    end
    download_remote_file(target)
  end
end