Class: Bioroebe::DownloadFasta
- Inherits:
-
CommandlineApplication
- Object
- Base
- CommandlineApplication
- Bioroebe::DownloadFasta
- Defined in:
- lib/bioroebe/fasta_and_fastq/download_fasta.rb
Overview
Bioroebe::DownloadFasta
Constant Summary collapse
- NAMESPACE =
#
NAMESPACE
#
inspect
- BASE_URL =
#
BASE_URL
#
'https://www.ncbi.nlm.nih.gov'
- DEFAULT_URL_TO_USE =
#
DEFAULT_URL_TO_USE
Here you can designate a default file to use.
This file should contain URLs to a target nucleotide sequence.
The format is just one URL per line, then a newline.
#
'/FASTA_FILE_URL.md'
Constants inherited from CommandlineApplication
CommandlineApplication::OLD_VERBOSE_VALUE
Constants included from ColoursForBase
ColoursForBase::ARRAY_HTML_COLOURS_IN_USE
Instance Method Summary collapse
-
#download_remote_file(url) ⇒ Object
# === download_remote_file.
-
#enter_log_directory(be_verbose = true) ⇒ Object
# === enter_log_directory ========================================================================= #.
-
#file_is_stored_here? ⇒ Boolean
(also: #location?)
# === file_is_stored_here? ========================================================================= #.
-
#initialize(i = nil, run_already = true) ⇒ DownloadFasta
constructor
# === initialize ========================================================================= #.
-
#rename_to_fasta(i) ⇒ Object
# === rename_to_fasta ========================================================================= #.
-
#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ Object
# === report_where_the_file_is_stored_at ========================================================================= #.
-
#reset ⇒ Object
# === reset (reset tag) ========================================================================= #.
-
#run ⇒ Object
# === run (run tag) ========================================================================= #.
Methods inherited from CommandlineApplication
#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opne, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #set_be_verbose, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into
Methods included from CommandlineArguments
#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments
Methods included from ColoursForBase
#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?
Methods inherited from Base
#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #file_readlines, #infer_the_namespace, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #namespace?, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into
Constructor Details
#initialize(i = nil, run_already = true) ⇒ DownloadFasta
#
initialize
#
50 51 52 53 54 55 56 57 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 50

# ========================================================================= #
# === initialize
#
# Resets the internal state via reset, hands the given input to
# set_commandline_arguments and then, unless run_already is falsy,
# invokes run right away.
#
# i           - the commandline arguments to register (nil by default).
# run_already - when truthy (the default), run is called from here.
# ========================================================================= #
def initialize(i = nil, run_already = true)
  reset
  set_commandline_arguments(i)
  return unless run_already
  run
end
Instance Method Details
#download_remote_file(url) ⇒ Object
#
download_remote_file
The URL should refer to an NM_ entry.
The proper FASTA entry is then determined from it.
#
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 120

# ========================================================================= #
# === download_remote_file
#
# Obtains a FASTA file for the given input and remembers where it was
# stored in @file_is_stored_here.
#
# url - either a String or an Array of Strings. For an Array, every
#       entry is processed in turn and the Array itself is returned.
#       A String may be:
#         * a bare identifier (digits only, or starting with 'NC_'),
#         * an efetch-style URL containing '&id=XP_', '&id=NP_',
#           '&id=YP_', '&id=NM_' or '&id=WP_',
#         * any other URL, whose content is downloaded first.
#
# Returns the value stored in @file_is_stored_here for a String input.
#
# The passed-in String is never modified; we work on a cleaned-up copy.
# ========================================================================= #
def download_remote_file(url)
  if url.is_a? Array
    url.each {|entry| download_remote_file(entry) }
  else
    # ===================================================================== #
    # We don't want surrounding whitespace or newline characters. A new
    # String is built here, so neither frozen input nor the caller's
    # object is a concern.
    # ===================================================================== #
    url = url.strip.delete("\n")
    # ===================================================================== #
    # This stays nil when nothing is downloaded by this method itself,
    # that is when the shortcut below is taken.
    # ===================================================================== #
    remote_file_content = nil
    # ===================================================================== #
    # Bare identifiers and efetch-style URLs are easier to handle, so we
    # can bypass most of the next logic.
    # ===================================================================== #
    efetch_id_markers = %w( &id=XP_ &id=NP_ &id=YP_ &id=NM_ &id=WP_ )
    if url.match?(/\A\d+\z/) ||
       url.start_with?('NC_') ||
       efetch_id_markers.any? {|marker| url.include?(marker) }
      match = url
    else
      opnn; e "We will use the URL `#{simp(url)}`"
      # =================================================================== #
      # We store the content of the remote page in the variable called
      # remote_file_content, a String.
      # =================================================================== #
      remote_file_content = ''.dup
      begin
        URI.open(url) {|file|
          file.each_line {|line| remote_file_content << line }
        }
        if url.include?('ncbi')
          # =============================================================== #
          # Right now we look for two entries on the page: a fasta or a
          # genbank report. The first capture group is the identifier.
          #
          # See https://rubular.com/r/hiOp9ZPBOfDGJq
          # =============================================================== #
          regex = /(\d{1,18}\.?\d{0,5})\?report=(fasta|genbank)/
          match = remote_file_content[regex, 1].to_s # '' if nothing matched.
        else
          # =============================================================== #
          # Use the first line of the downloaded content, without '>',
          # as the name.
          # =============================================================== #
          match = remote_file_content.split("\n").first.to_s.delete('>').strip
          if match.include? '|' # sp|Q9CR68
            match = match.split('|')[1]
          end
        end
      rescue Errno::ENOENT => error
        opnn; e tomato('We encountered an error, which we will feedback next.')
        pp error
        match = url
      end
    end
    # ===================================================================== #
    # Decide how the FASTA file is obtained.
    #
    # If the shortcut above was taken then nothing has been downloaded,
    # so there is no content that could be written into a local file:
    # such input must be resolved via efetch as well. Previously a bare
    # identifier such as "12345" ended up in the else-branch and nil was
    # passed to write_what_into().
    #
    # NOTE(review): efetch_by_url() already receives bare numeric ids
    # on the NCBI-page path; confirm it also accepts 'NC_' accessions.
    # ===================================================================== #
    if url.include?('ncbi') || remote_file_content.nil?
      # =================================================================== #
      # The url may look like this:
      #
      #   https://www.ncbi.nlm.nih.gov/protein/NP_001065289.1?report=fasta
      #
      # =================================================================== #
      _ = File.basename(match)
      @file_is_stored_here = ::Bioroebe::Ncbi.efetch_by_url(_)
    else
      @file_is_stored_here = log_dir?+match.to_s+'.fasta'
      write_what_into(remote_file_content, @file_is_stored_here)
    end
    report_where_the_file_is_stored_at(@file_is_stored_here)
    return @file_is_stored_here
  end
end
#enter_log_directory(be_verbose = true) ⇒ Object
#
enter_log_directory
#
73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 73

# ========================================================================= #
# === enter_log_directory
#
# Changes into the directory returned by log_dir?, unless return_pwd
# already reports that directory. The directory is created first if it
# does not exist yet.
#
# be_verbose - when truthy (the default), the change is announced.
# ========================================================================= #
def enter_log_directory(be_verbose = true)
  target = log_dir?
  return if return_pwd == target # Already there, nothing to do.
  if be_verbose
    opnn; e "Now changing directory to `#{sdir(target)}`."
  end
  mkdir(target) unless File.exist?(target)
  cd target
end
#file_is_stored_here? ⇒ Boolean Also known as: location?
#
file_is_stored_here?
#
100 101 102 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 100

# ========================================================================= #
# === file_is_stored_here?
#
# Returns the current value of @file_is_stored_here. This variable is
# assigned by download_remote_file; before that it is nil.
# ========================================================================= #
def file_is_stored_here?
  @file_is_stored_here
end
#rename_to_fasta(i) ⇒ Object
#
rename_to_fasta
#
107 108 109 110 111 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 107

# ========================================================================= #
# === rename_to_fasta
#
# Appends '.fa' to the name of the given file: the new path is built
# from the directory and the basename of i, passed through rds() and
# stored in @file. The rename is announced and then done via mv().
#
# i - path of the file that is to be renamed.
# ========================================================================= #
def rename_to_fasta(i)
  directory = File.dirname(i)
  name      = File.basename(i)
  @file = rds("#{directory}/#{name}.fa")
  opnn
  erev "Next renaming `#{sfile(i)}` to `#{sfile(@file)}`."
  mv(i, @file)
end
#report_where_the_file_is_stored_at(i = @file_is_stored_here) ⇒ Object
#
report_where_the_file_is_stored_at
#
89 90 91 92 93 94 95 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 89

# ========================================================================= #
# === report_where_the_file_is_stored_at
#
# Tells the user where the file is stored. Nothing is reported if no
# path is given or if no file exists at that path.
#
# i - the path to report; defaults to @file_is_stored_here.
# ========================================================================= #
def report_where_the_file_is_stored_at(i = @file_is_stored_here)
  return unless i && File.exist?(i)
  opnn
  erev "The file is now stored at `#{sfile(i)}#{rev}`."
end
#reset ⇒ Object
#
reset (reset tag)
#
62 63 64 65 66 67 68 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 62

# ========================================================================= #
# === reset                                                     (reset tag)
#
# Calls the reset method of the parent class (explicitly without
# arguments) and then sets @namespace to the NAMESPACE constant.
# ========================================================================= #
def reset
  super()
  # ======================================================================= #
  # === @namespace
  # ======================================================================= #
  @namespace = NAMESPACE
end
#run ⇒ Object
#
run (run tag)
#
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 |
# File 'lib/bioroebe/fasta_and_fastq/download_fasta.rb', line 204

# ========================================================================= #
# === run                                                         (run tag)
#
# Enters the log directory and then hands every commandline argument to
# download_remote_file().
#
# An argument that names an existing local file is read line by line;
# blank lines are ignored. If the first remaining line contains no '/'
# then all lines are treated as identifiers and are turned into efetch
# URLs (db=protein, rettype=fasta). Otherwise the lines are passed on
# unchanged. A file without any non-blank line is skipped.
# ========================================================================= #
def run
  enter_log_directory
  if commandline_arguments?.empty?
    # ===================================================================== #
    # Assign a default URL to use in the event that the user did not
    # provide a valid URL for us to download & fetch.
    # ===================================================================== #
    commandline_arguments? << DEFAULT_URL_TO_USE
  end
  commandline_arguments?.each {|this_file|
    this_file = this_file.to_s # To avoid follow-up problems.
    if File.exist? this_file # This here is mostly done for testing-purposes.
      entries = File.readlines(this_file).reject {|entry| entry.strip.empty? }
      # =================================================================== #
      # Nothing to download from an empty file. Without this guard the
      # check on entries.first below would fail on nil.
      # =================================================================== #
      next if entries.empty?
      unless entries.first.include? '/'
        opnn; erev 'Now working on '+entries.size.to_s+' entries.'
        entries.map! {|entry|
          # =============================================================== #
          # Strip the identifier itself, so that no stray whitespace
          # ends up in the middle of the URL.
          # =============================================================== #
          'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id='+
          entry.strip+'&rettype=fasta&retmode=text'
        }
      end
      this_file = entries
    end
    download_remote_file(this_file)
  }
end