Class: Bioroebe::GenbankParser

Inherits:
CommandlineApplication show all
Defined in:
lib/bioroebe/parsers/genbank_parser.rb

Overview

Bioroebe::GenbankParser

Constant Summary collapse

UPCASE_THE_SEQUENCE =
#

UPCASE_THE_SEQUENCE

Setting this constant to true causes this class to store each FASTA sequence upcased, e.g. “ACGATCAG” rather than “acgatcag”.

#
true
TEST_STRING =
#

TEST_STRING

An example test string that shows what a GenBank file usually looks like.

This will contain two different FASTA sequences.

#
'                   /note="internal transcribed spacer 2"
ORIGIN      
        1 cgtaacaagg tttccgtagg tgaaccttcg gaaggatcat tgttgagacc cccaaaaaaa
       61 cgatcgagtt aatccggagg accggtgtag tttggtctcc caggggcttt ggctactgtg
      121 gtggccgtga atttccgtcg aacctccttg ggagaattct tgatggcaat tgaacccttg
      181 gcccggcgca gtttcgcccc aagtcaaatg agatggaacc ggcggagggc atcgtcctcc
      241 atggaaccgg ggagggccgg cgttcttccg ttccccccat gaattttttt ttgacaactc
      301 tcggcaacgg atatctcggc tctttgcatc cgatgaaaga acccagcgaa atgtgataag
      361 tggtgtgaat tgcagaatcc cgtgaaccat cgagtctttg aacgcaagtt gcgcccgagg
      421 ccatcaggct aagggcacgc ctgcctgggc gttgcgtgct gcatctctct cccattgcta
      481 aggctgaaca ggcatactgt tcggccggcg cggatgagtg tttggcccct tgttcttcgg
      541 tgcgatgggt ccaagacctg ggcttttgac ggccggaaat ccggcaagag gtggacggac
      601 ggtggctgcg acgaagctgt cgtgcgaatg ccctacgctg tcgtatttga tgggccggaa
      661 taaatccctt ttgagcccca ttggaggcac gtcaacccgt gggcggtcga cggccatttg
      721 gatgcaaccc caggtcaggt gagga
//
LOCUS       Z78510                   750 bp    DNA     linear   PLN 30-NOV-2006
DEFINITION  P.caricinum 5.8S rRNA gene and ITS1 and ITS2 DNA.
ACCESSION   Z78510
VERSION     Z78510.1  GI:2765635
KEYWORDS    5.8S ribosomal RNA; 5.8S rRNA gene; internal transcribed spacer;
            ITS1; ITS2.
SOURCE      Phragmipedium caricinum
  ORGANISM  Phragmipedium caricinum
            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
            Spermatophyta; Magnoliophyta; Liliopsida; Asparagales; Orchidaceae;
            Cypripedioideae; Phragmipedium.
REFERENCE   1
  AUTHORS   Cox,A.V., Pridgeon,A.M., Albert,V.A. and Chase,M.W.
  TITLE     Phylogenetics of the slipper orchids (Cypripedioideae:
            Orchidaceae): nuclear rDNA ITS sequences
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 750)
  AUTHORS   Cox,A.V.
  TITLE     Direct Submission
  JOURNAL   Submitted (19-AUG-1996) Cox A.V., Royal Botanic Gardens, Kew,
            Richmond, Surrey TW9 3AB, UK
FEATURES             Location/Qualifiers
     source          1..750
                     /organism="Phragmipedium caricinum"
                     /mol_type="genomic DNA"
                     /db_xref="taxon:53127"
     misc_feature    1..380
                     /note="internal transcribed spacer 1"
     gene            381..550
                     /gene="5.8S rRNA"
     rRNA            381..550
                     /gene="5.8S rRNA"
                     /product="5.8S ribosomal RNA"
     misc_feature    551..750
                     /note="internal transcribed spacer 2"
ORIGIN      
        1 ctaaccaggg ttccgaggtg accttcggga ggattccttt ttaagccccc gaaaaaacga
       61 tcgaattaaa ccggaggacc ggtttaattt ggtctcccca ggggctttcc ccccttggtg
      121 gccgtgaatt tccatcgaac ccccctggga gaattcttgg tggccaatgg acccttggcc
      181 cggcgcaatt tcccccccaa tcaaatgaga taggaccggc agggggcgtc cccccccatg
      241 gaaccgggga gggccggcat tcttccgttc ccccctcgga ttttttgaca actctcgcaa
      301 cggatatctc gcctctttgc atcggatgga agaacgcagc gaaatgtgat aagtggtgtg
      361 aattgcagaa tcccgtgaac catcgagtct ttgaacgcaa gttgcgcccg aggccatcag
      421 gctaagggca cgcctgcctg ggcgttgcgt gctgcatctc tcccattgct aaggttgaac
      481 gggcatactg ttcggccggc gcggatgaga gattggcccc ttgttcttcg gtgcgatggg
      541 tccaagacct gggcttttga cggtccaaaa tccggcaaga ggtggacgga cggtggctgc
      601 gacaaagctg tcgtgcgaat gccctgcgtt gtcgtttttg atgggccgga ataaatccct
      661 tttgaacccc attggaggca cgtcaaccca tgggcggttg acggccattt ggatgcaacc
      721 ccaggtcagg tgagccaccc gctgagttta
//
LOCUS       Z78509                   731 bp    DNA     linear   PLN 30-NOV-2006
DEFINITION  P.pearcei 5.8S rRNA gene and ITS1 and ITS2 DNA.
ACCESSION   Z78509
VERSION     Z78509.1  GI:2765634
KEYWORDS    5.8S ribosomal RNA; 5.8S rRNA gene; internal transcribed spacer;
            ITS1; ITS2.
SOURCE      Phragmipedium pearcei
  ORGANISM  Phragmipedium pearcei
            Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
            Spermatophyta; Magnoliophyta; Liliopsida; Asparagales; Orchidaceae;
            Cypripedioideae; Phragmipedium.
REFERENCE   1
  AUTHORS   Cox,A.V., Pridgeon,A.M., Albert,V.A. and Chase,M.W.
  TITLE     Phylogenetics of the slipper orchids (Cypripedioideae:
            Orchidaceae): nuclear rDNA ITS sequences
  JOURNAL   Unpublished
REFERENCE   2  (bases 1 to 731)
  AUTHORS   Cox,A.V.
  TITLE     Direct Submission
  JOURNAL   Submitted (19-AUG-1996) Cox A.V., Royal Botanic Gardens, Kew,
            Richmond, Surrey TW9 3AB, UK
FEATURES             Location/Qualifiers
     source          1..731
                     /organism="Phragmipedium pearcei"
                     /mol_type="genomic DNA"
                     /db_xref="taxon:53135"
     misc_feature    1..380
                     /note="internal transcribed spacer 1"
     gene            381..550
                     /gene="5.8S rRNA"
     rRNA            381..550
                     /gene="5.8S rRNA"
                     /product="5.8S ribosomal RNA"
     misc_feature    551..731
                     /note="internal transcribed spacer 2"
ORIGIN      
        1 cgtaacaagg tttccgtagg tgaacctgcg gaaggatcat tgttgagacc gccaaatata
       61 cgatcgagtt aatccggagg accggtgtag tttggtctcc caggggcttt cgccgctgtg
      121 gtgaccgtga tttgccatcg agcctccttg ggagatttct tgatggcaat tgaacccttg
      181 gcccggcgca gtttcgcgcc aagtcatatg agatagaacc ggcggagggc gtcgtcctcc
      241 atggagcggg gagggccggc atgctccgtg cccccccatg aatttttctg acaactctcg
      301 gcaacggacg taacaaggtt taaatgtgat aagcaggtgt gaattgcaga atcccgtgaa
      361 ccatcgagtc tttgaacgca agttgcgccc gaggccatca ggttaagggc acgcctgcct
      421 gggcgttgcg tgctgcatct ctcccattgc taaggttgaa cgggcatact gttcggccgg
      481 cgcggatgag agtttggccc cttgttcttc ggtgcgatgg gtccaagacc tgggcttttg
      541 acggtccaaa atccggcaag aggtggacgg acggtggctg cgacagagct gtcgtgcgaa
      601 tgccctacgt tgtcgttttt gatgggccag aataaatccc ttttgaaccc cattggaggc
      661 acgtcaaccc aatggggggt gacgggcatt tggttaaccc cggcaagtta aggcacccgt
      721 taattttagg a
//
LOCUS       Z78508                   741 bp    DNA     linear   PLN 30-NOV-2006'

Constants inherited from CommandlineApplication

CommandlineApplication::OLD_VERBOSE_VALUE

Constants included from ColoursForBase

ColoursForBase::ARRAY_HTML_COLOURS_IN_USE

Constants inherited from Base

Base::NAMESPACE

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from CommandlineApplication

#all_aminoacids?, #append_what_into, #at_home?, #be_silent, #be_verbose?, #cat, #ccliner, #change_directory, #cliner, #codon_table_dataset?, #codon_to_aminoacid, #codons_for?, #colourize_this_dna_sequence, #complement, #cp, #disable_warnings, #download_dir?, #editor?, #enable_warnings, #ensure_that_the_base_directories_exist, #esystem, #extract, #is_this_a_start_codon?, #is_this_a_stop_codon?, #leading_five_prime, #load_bioroebe_yaml_file, #log_directory?, #one_letter_to_long_name, #one_to_three, #only_numbers?, #open_in_browser, #opne, #opnn, #pad_with_double_quotes, #pad_with_single_quotes, #partner_nucleotide, #remove_numbers, #remove_trailing_ansii_escape_code, #return_all_possible_start_codons, #return_array_of_one_letter_aminoacids, #return_cheerful_person, #return_chunked_display, #return_ubiquitin_sequence, #set_be_verbose, #start_codon?, #stop_codons?, #strict_filter_away_invalid_aminoacids, #taxonomy_download_directory?, #three_to_one, #to_rna, #trailing_three_prime, #use_opn?, #verbose_truth, #was_or_were, #without_extname, #write_what_into

Methods included from CommandlineArguments

#commandline_arguments?, #commandline_arguments_that_are_files?, #e, #first?, #first_non_hyphen_argument?, #remove_hyphens_from_the_commandline_arguments, #return_commandline_arguments_as_string, #return_commandline_arguments_that_are_not_files, #return_entries_without_two_leading_hyphens, #select_commandline_arguments, #select_entries_starting_with_two_hyphens, #set_commandline_arguments

Methods included from ColoursForBase

#colourize_this_aminoacid_sequence_for_the_commandline, #colourize_this_nucleotide_sequence, #disable_colours, #ecomment, #efancy, #egold, #enable_colours, #eorange, #eparse, #erev, #red, #remove_trailing_escape_part, #return_colour_for_nucleotides, #rev, #sdir, #set_use_colours, #sfancy, #sfile, #simp, #swarn, #use_colours?, #use_colours_within_the_bioroebe_namespace?

Methods inherited from Base

#append_what_into, #can_base_pair?, #convert_global_env, #delete_file, #directory_to_the_codon_tables?, #file_readlines, #infer_the_namespace, #is_on_roebe?, #is_palindrome?, #main_encoding?, #mkdir, #move_file, #mv, #namespace?, #no_file_exists_at, #no_newlines, #project_yaml_directory?, #rds, #register_sigint, #return_pwd, #return_the_first_line_of_this_file, #word_wrap, #write_what_into

Constructor Details

#initialize(commandline_arguments = nil, run_already = true) ⇒ GenbankParser

#

initialize

#

175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 175

# Sets up a new parser: resets the internal state, registers the
# commandline arguments, evaluates the menu and optionally starts the run.
#
# @param commandline_arguments [Object, nil] passed on unchanged to
#   set_commandline_arguments
# @param run_already [Boolean] when truthy, run is invoked as the last step
# @yieldreturn [Symbol] :do_not_report_anything disables the report; every
#   other value returned by the block is ignored
def initialize(commandline_arguments = nil, run_already = true)
  reset
  set_commandline_arguments(commandline_arguments)
  # The menu is evaluated before the block is consulted.
  menu
  # ======================================================================= #
  # === :do_not_report_anything
  # ======================================================================= #
  if block_given? && yield.equal?(:do_not_report_anything)
    @internal_hash[:report_the_dataset] = false
  end
  run if run_already
end

Class Method Details

.[](i = '') ⇒ Object

#

Bioroebe::GenbankParser[]

#

384
385
386
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 384

# Shorthand constructor, so that Bioroebe::GenbankParser[i] can be written
# instead of Bioroebe::GenbankParser.new(i).
#
# @param i [Object] handed to new as its only argument (default: '')
# @return [Object] whatever new returns
def self.[](i = '')
  self.new(i)
end

Instance Method Details

#analyse_this_dataset(dataset) ⇒ Object Also known as: determine_dataset

#

analyse_this_dataset

#

287
288
289
290
291
292
293
294
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 287

# Splits a genbank dataset into its records, hands the VERSION id and the
# ORIGIN (sequence) block of every record to
# discover_the_corresponding_FASTA_entries_from_this_dataset and then
# triggers the optional report.
#
# The previous implementation matched from an ORIGIN keyword up to the
# *following* VERSION keyword. Within one record VERSION is listed before
# ORIGIN (see TEST_STRING), so every sequence was paired with the id of
# the next record, and a file holding a single record produced no match
# at all. Records are now cut at the "//" terminator line and the id and
# the sequence are taken from the same record.
#
# @param dataset [String] the content of a genbank file
# @return [Object] the return value of
#   consider_reporting_our_findings_to_the_user
def analyse_this_dataset(dataset)
  scanned = dataset.split(%r{^//\s*$}).map { |record|
    version_line = record[/^VERSION[ \t]+\S+/]
    # Everything from the ORIGIN keyword to the end of the record.
    origin_block = record[/^ORIGIN.*\z/m]
    # Fragments lacking either part (e. g. a truncated first or last
    # record) cannot yield an id => sequence pair and are skipped.
    "#{version_line}\n#{origin_block}" if version_line && origin_block
  }.compact
  # This key is documented and initialised (in reset) as a counter, so
  # store the number of entries rather than the entries themselves.
  @internal_hash[:n_FASTA_entries_in_the_file] = scanned.size
  discover_the_corresponding_FASTA_entries_from_this_dataset(scanned)
  consider_reporting_our_findings_to_the_user
end

#consider_reporting_our_findings_to_the_user ⇒ Object Also known as: report, report_the_dataset

#

consider_reporting_our_findings_to_the_user (report tag)

#

352
353
354
355
356
357
358
359
360
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 352

# Prints every id together with its sequence, provided that reporting is
# enabled and the dataset is present and not empty.
#
# @return [Object, nil] the result of each_pair on the dataset when
#   something was reported, otherwise nil
def consider_reporting_our_findings_to_the_user
  return unless report_the_dataset?
  return unless dataset?
  return if dataset?.empty?

  main_dataset?.each_pair do |key, value|
    e steelblue("#{key}:")
    e lightblue(value)
    e # Emit an empty line between two entries.
  end
end

#dataset? ⇒ Hash Also known as: main_dataset?

#

dataset?

#

Returns:

  • (Hash) — the parsed sequences, keyed by their VERSION id

299
300
301
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 299

# Query method for the main dataset. Despite the trailing question mark
# this returns the stored object itself, not true or false.
#
# @return [Hash, nil] the id => sequence entries kept under
#   :dataset_from_all_FASTA_entries_as_a_hash
def dataset?
  key = :dataset_from_all_FASTA_entries_as_a_hash
  @internal_hash[key]
end

#discover_the_corresponding_FASTA_entries_from_this_dataset(i) ⇒ Object

#

discover_the_corresponding_FASTA_entries_from_this_dataset

#

321
322
323
324
325
326
327
328
329
330
331
332
333
334
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 321

# Extracts the VERSION id and the nucleotide sequence from every chunk in
# +i+ and stores the result as id => sequence in
# @internal_hash[:dataset_from_all_FASTA_entries_as_a_hash].
#
# @param i [Array<String>] chunks that each contain a "VERSION <id>" entry
#   and the numbered sequence lines of an ORIGIN block; any input that is
#   not an Array is ignored
# @return [Array<String>, nil] +i+ itself, or nil if +i+ is not an Array
def discover_the_corresponding_FASTA_entries_from_this_dataset(i)
  return unless i.is_a?(Array)

  regex_to_use_for_the_id = /VERSION\s*([\.A-Za-z0-9]+)/
  # Matches one numbered sequence line such as
  # "       61 cgatcgagtt aatccggagg". The capture group must not be able
  # to match a newline: the former pattern used \s there, consumed the
  # line break plus the indentation of the following line, and the ^
  # anchor then failed on that line - every second line of the sequence
  # was silently lost.
  regex_for_one_sequence_line = /^[ \t]*\d+[ \t]+([a-zA-Z \t]+)\r?$/
  i.each { |this_dataset|
    # An absent VERSION entry yields the empty String as id.
    use_this_id = this_dataset[regex_to_use_for_the_id, 1].to_s
    use_this_FASTA_sequence = this_dataset.scan(
      regex_for_one_sequence_line
    ).flatten.join.delete(" \t")
    use_this_FASTA_sequence = use_this_FASTA_sequence.upcase if UPCASE_THE_SEQUENCE
    @internal_hash[:dataset_from_all_FASTA_entries_as_a_hash][use_this_id] = use_this_FASTA_sequence
  }
end
#

menu (menu tag)

#

235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 235

# Evaluates the commandline flags. Arrays are processed element by
# element; a single entry is compared against the known flags.
#
# @param i [Array, String, nil] one flag or a (possibly nested) Array of
#   flags; defaults to commandline_arguments_containing_leading_hyphens?
# @return [Array, nil] the Array itself for Array input, nil when a single
#   entry matched no flag; a matching flag terminates the process via exit
def menu(i = commandline_arguments_containing_leading_hyphens?)
  return i.each { |entry| menu(entry) } if i.is_a?(Array)

  # ======================================================================= #
  # === gparser --help
  # ======================================================================= #
  help_flag = /^-?-?help$/i
  # ======================================================================= #
  # === gparser --test
  #
  # This entry point can be used to test the default TEST_STRING.
  # ======================================================================= #
  test_flags = [/^-?-?test$/i, /^-?-?test(-|_)?string$/i]
  if help_flag === i
    show_help
    exit
  elsif test_flags.any? { |flag| flag === i }
    analyse_this_dataset(TEST_STRING)
    exit
  end
end

#report_the_dataset? ⇒ Boolean

#

report_the_dataset?

#

Returns:

  • (Boolean)

271
272
273
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 271

# Tells whether the parsed dataset shall be reported to the user.
#
# @return [Boolean, nil] the value kept under :report_the_dataset
def report_the_dataset?
  key = :report_the_dataset
  @internal_hash[key]
end

#reset ⇒ Object

#

reset (reset tag)

#

200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 200

# Restores the default state of this class. @internal_hash itself is not
# created here - presumably super() takes care of that; TODO confirm.
#
# @return [Hash] the fresh, empty Hash stored under
#   :dataset_from_all_FASTA_entries_as_a_hash
def reset
  super()
  infer_the_namespace
  {
    # ===================================================================== #
    # === :work_on_this_file
    # ===================================================================== #
    work_on_this_file: nil,
    # ===================================================================== #
    # === :report_the_dataset
    # ===================================================================== #
    report_the_dataset: true,
    # ===================================================================== #
    # === :n_FASTA_entries_in_the_file
    #
    # Keeps track of how many FASTA entries are in the genbank file
    # at hand.
    # ===================================================================== #
    n_FASTA_entries_in_the_file: 0
  }.each_pair { |key, default_value| @internal_hash[key] = default_value }
  # ======================================================================= #
  # === :dataset_from_all_FASTA_entries_as_a_hash
  #
  # This Hash holds all the FASTA sequences found in the genbank file
  # at hand. It constitutes the main dataset of this class.
  # ======================================================================= #
  @internal_hash[:dataset_from_all_FASTA_entries_as_a_hash] = {}
end

#run ⇒ Object

#

run (run tag)

#

366
367
368
369
370
371
372
373
374
375
376
377
378
379
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 366

# Main entry point: registers the first argument as the file to work on,
# reads that file and analyses it if it contains both keywords.
#
# @return [Object, nil] nil when the file check fails; otherwise the
#   result of analyse_this_dataset, or of the notification via e when
#   the keywords are missing
def run
  set_work_on_this_file(first_argument?)
  # ======================================================================= #
  # Bail out early if the given file does not exist:
  # ======================================================================= #
  return unless verbose_check_whether_the_file_exists

  # The file is read in one go and kept completely.
  content = File.read(@internal_hash[:work_on_this_file])
  unless content.include?('ORIGIN') && content.include?('VERSION ')
    opnn
    return e('No keywords ORIGIN and VERSION were found in this file.')
  end
  analyse_this_dataset(content)
end

#sequence? ⇒ String Also known as: coding_sequence?, cds

#

sequence?

#

Returns:

  • (String, nil) — the first parsed sequence, or nil if there is none

313
314
315
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 313

# Returns the first of the parsed sequences.
#
# @return [Object, nil] the first element of sequences?, nil if that
#   collection is empty
def sequence?
  all_sequences = sequences?
  all_sequences.first
end

#sequences? ⇒ Array

#

sequences?

#

Returns:

  • (Array) — all parsed sequences, without their ids

306
307
308
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 306

# Returns all parsed sequences, that is the values of the main dataset
# without their ids.
#
# @return [Array] the values of dataset?
def sequences?
  entries = dataset?
  entries.values
end

#set_work_on_this_file(i = first_argument? ) ⇒ Object

#

set_work_on_this_file

#

278
279
280
281
282
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 278

# Remembers which file this class shall work on.
#
# @param i [Object] the file to use; defaults to first_argument?
# @return [Object] +i+
def set_work_on_this_file(i = first_argument?)
  @internal_hash[:work_on_this_file] = i
end

#verbose_check_whether_the_file_exists ⇒ Object

#

verbose_check_whether_the_file_exists

#

339
340
341
342
343
344
345
346
347
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 339

# Checks whether the file registered under :work_on_this_file exists and
# notifies the user if it does not.
#
# @return [Boolean] true if a file is registered and exists, else false
def verbose_check_whether_the_file_exists
  path = @internal_hash[:work_on_this_file]
  return true if path && File.exist?(path)

  opnn
  e('No file exists at ' + sfile(path))
  false
end

#work_on_which_file? ⇒ Object

#

work_on_which_file?

#

Returns:

  • (Object, nil) — the file registered via set_work_on_this_file, nil if none was set

264
265
266
# File 'lib/bioroebe/parsers/genbank_parser.rb', line 264

# Query method for the file this class works on. Despite the trailing
# question mark this returns the stored value, not true or false.
#
# @return [Object, nil] the value kept under :work_on_this_file
def work_on_which_file?
  key = :work_on_this_file
  @internal_hash[key]
end