Class: SequenceServer::Database

Inherits:
Struct
  • Object
show all
Extended by:
Enumerable, Forwardable
Defined in:
lib/sequenceserver/database.rb,
lib/sequenceserver/database.rb

Overview

Model Database's eigenclass as a collection of Database objects.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#nameObject

Returns the value of attribute name

Returns:

  • (Object)

    the current value of name



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +name+ attribute.
#
# @return [Object] the current value of the @name instance variable
def name
  instance_variable_get(:@name)
end

#ncharactersObject

Returns the value of attribute ncharacters

Returns:

  • (Object)

    the current value of ncharacters



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +ncharacters+ attribute.
#
# @return [Object] the current value of the @ncharacters instance variable
def ncharacters
  instance_variable_get(:@ncharacters)
end

#nsequencesObject

Returns the value of attribute nsequences

Returns:

  • (Object)

    the current value of nsequences



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +nsequences+ attribute.
#
# @return [Object] the current value of the @nsequences instance variable
def nsequences
  instance_variable_get(:@nsequences)
end

#titleObject

Returns the value of attribute title

Returns:

  • (Object)

    the current value of title



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +title+ attribute.
#
# @return [Object] the current value of the @title instance variable
def title
  instance_variable_get(:@title)
end

#typeObject

Returns the value of attribute type

Returns:

  • (Object)

    the current value of type



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +type+ attribute.
#
# @return [Object] the current value of the @type instance variable
def type
  instance_variable_get(:@type)
end

#updated_onObject

Returns the value of attribute updated_on

Returns:

  • (Object)

    the current value of updated_on



21
22
23
# File 'lib/sequenceserver/database.rb', line 21

# Reader for the +updated_on+ attribute.
#
# @return [Object] the current value of the @updated_on instance variable
def updated_on
  instance_variable_get(:@updated_on)
end

Class Method Details

.<<(database) ⇒ Object



58
59
60
# File 'lib/sequenceserver/database.rb', line 58

# Adds +database+ to the collection, keyed by the value of its +id+.
# An entry already stored under the same id is replaced.
#
# @param database [#id] object to register
# @return [Object] the +database+ that was passed in
def <<(database)
  key = database.id
  collection[key] = database
end

.[](ids) ⇒ Object



62
63
64
65
# File 'lib/sequenceserver/database.rb', line 62

# Looks up one or more databases by id.
#
# @param ids [Object, Array] a single id or a list of ids; coerced with
#   Kernel#Array, so +nil+ behaves like an empty list
# @return [Array] one entry per requested id, in request order; +nil+ for ids
#   that are not in the collection
def [](ids)
  collection.values_at(*Array(ids))
end

._make_blast_database(file, type, title, quiet = false) ⇒ Object



145
146
147
148
149
150
# File 'lib/sequenceserver/database.rb', line 145

# Runs +makeblastdb+ on the given FASTA file.
#
# The command is passed to Kernel#system as an argument list rather than as
# one interpolated shell string, so no shell is involved: file names and
# titles containing spaces, quotes or shell metacharacters are handed to
# makeblastdb verbatim and cannot break or inject into the command line.
# Silencing is done with Kernel#system redirection options rather than
# '&>', which is a bashism that /bin/sh may interpret as "run in background".
#
# NOTE(review): how makeblastdb itself treats blanks inside the -in value is
# outside the control of this method - confirm against the BLAST+ manual.
#
# @param file [#to_s] path of the FASTA file to format
# @param type [#to_s] sequence type; only its first four characters are
#   passed to -dbtype (e.g. :nucleotide -> 'nucl', :protein -> 'prot')
# @param title [#to_s] title to record in the BLAST database
# @param quiet [Boolean] when true, discard makeblastdb's stdout and stderr
# @return [Boolean, nil] the result of Kernel#system: true on zero exit
#   status, false on non-zero exit status, nil if the command could not be
#   executed
def _make_blast_database(file, type, title, quiet = false)
  argv = ['makeblastdb', '-parse_seqids', '-hash_index',
          '-in', file.to_s,
          '-dbtype', type.to_s.slice(0, 4),
          '-title', title.to_s]
  return system(*argv) unless quiet

  system(*argv, out: File::NULL, err: File::NULL)
end

.allObject



71
72
73
# File 'lib/sequenceserver/database.rb', line 71

# Lists every database currently held in the collection.
#
# @return [Array] the collection's values
def all
  collection.each_value.to_a
end

.clearObject

Intended to be used only for testing.



93
94
95
# File 'lib/sequenceserver/database.rb', line 93

# Empties the collection. Intended to be used only for testing.
#
# @return [Object] whatever +collection.clear+ returns
def clear
  registry = collection
  registry.clear
end

.each(&block) ⇒ Object



75
76
77
# File 'lib/sequenceserver/database.rb', line 75

# Iterates over every database returned by +all+, forwarding the block.
#
# @yield [database] each element of +all+
# @return [Object] whatever +all.each+ returns for the given block
def each(&block)
  databases = all
  databases.each(&block)
end

.firstObject

Intended to be used only for testing.



88
89
90
# File 'lib/sequenceserver/database.rb', line 88

# Returns the first element of +all+. Intended to be used only for testing.
#
# @return [Object, nil] the first database, or nil when there is none
def first
  databases = all
  databases.first
end

.get_database_title(path) ⇒ Object

Generate a title for the given database and show it to the user for confirmation.

Returns user input if any. Auto-generated title otherwise.



171
172
173
174
175
176
# File 'lib/sequenceserver/database.rb', line 171

# Suggests a title for the database at +path+ and asks the user to confirm
# or override it.
#
# The suggestion is built by make_db_title from the file's basename. One line
# is then read from standard input; surrounding whitespace - including the
# trailing newline left by +gets+ - is stripped so it cannot leak into the
# returned title.
#
# @param path [String] path of the FASTA file
# @return [String] the stripped user input, or the auto-generated title when
#   the input is blank or standard input is at EOF
def get_database_title(path)
  default = make_db_title(File.basename(path))
  print "Enter a database title or will use '#{default}': "
  # $stdin (not STDIN) so that redirected input is honoured.
  from_user = $stdin.gets.to_s.strip
  from_user.empty? ? default : from_user
end

.group_by(&block) ⇒ Object



83
84
85
# File 'lib/sequenceserver/database.rb', line 83

# Groups the databases returned by +all+ by the block's result.
#
# @yield [database] each element of +all+
# @return [Object] whatever +all.group_by+ returns for the given block
def group_by(&block)
  databases = all
  databases.group_by(&block)
end

.guess_sequence_type_in_fasta(file) ⇒ Object

Guess whether FASTA file contains protein or nucleotide sequences based on first 32768 characters.

NOTE: 2^15 == 32768. Approximately 546 lines, assuming 60 characters on each line.



217
218
219
220
221
222
# File 'lib/sequenceserver/database.rb', line 217

# Guesses the sequence type of a FASTA file from the sample returned by
# sample_sequences.
#
# Every sampled sequence is classified with Sequence.guess_type; nil guesses
# are ignored. A type is reported only when all remaining guesses agree.
#
# @param file [String] path of the file to inspect
# @return [Object, false] the single agreed-upon type, or false when there is
#   no guess at all or the guesses disagree
def guess_sequence_type_in_fasta(file)
  guesses = sample_sequences(file).map { |seq| Sequence.guess_type(seq) }
  distinct = guesses.compact.uniq
  distinct.length == 1 ? distinct.first : false
end

.idsObject



67
68
69
# File 'lib/sequenceserver/database.rb', line 67

# Lists the ids under which databases are stored in the collection.
#
# @return [Array] the collection's keys
def ids
  collection.each_key.to_a
end

.include?(path) ⇒ Boolean

Returns:

  • (Boolean)


79
80
81
# File 'lib/sequenceserver/database.rb', line 79

# Tells whether the collection holds an entry keyed by the MD5 hex digest of
# +path+.
#
# @param path [String] string to digest and look up
# @return [Boolean] result of +collection.include?+ for that digest
def include?(path)
  digest = Digest::MD5.hexdigest(path)
  collection.include?(digest)
end

.make_blast_database(file, type) ⇒ Object

Create BLAST database, given FASTA file and sequence type in FASTA file.



139
140
141
142
143
# File 'lib/sequenceserver/database.rb', line 139

# Creates a BLAST database from a FASTA file after asking the user.
#
# Nothing is done unless make_blast_database? returns a truthy value.
# Otherwise the title is obtained from get_database_title and the work is
# delegated to _make_blast_database.
#
# @param file [String] path of the FASTA file
# @param type [Object] sequence type of the FASTA file
# @return [Object, nil] the result of _make_blast_database, or nil when the
#   user declined
def make_blast_database(file, type)
  if make_blast_database?(file, type)
    _make_blast_database(file, type, get_database_title(file))
  end
end

.make_blast_database?(file, type) ⇒ Boolean

Show file path and guessed sequence type to the user and obtain a y/n response.

Returns true if the user entered anything but 'n' or 'N'.

Returns:

  • (Boolean)


156
157
158
159
160
161
162
163
164
165
# File 'lib/sequenceserver/database.rb', line 156

# Shows the file path and guessed sequence type, then asks the user whether
# to proceed.
#
# Only an answer that *starts* with 'n' or 'N' (e.g. "n", "N", "no") is a
# refusal. The pattern is anchored on purpose: an unanchored /n/i would also
# reject answers that merely contain the letter, such as "fine" or
# "continue". A blank answer or EOF on standard input means "yes".
#
# @param file [Object] path of the FASTA file, interpolated into the prompt
# @param type [Object] guessed sequence type, interpolated into the prompt
# @return [Boolean] false if the user declined, true otherwise
def make_blast_database?(file, type)
  puts
  puts
  puts "FASTA file: #{file}"
  puts "FASTA type: #{type}"
  print 'Proceed? [y/n] (Default: y): '

  # $stdin (not STDIN) so that redirected input is honoured.
  response = $stdin.gets.to_s.strip
  response !~ /\An/i
end

.make_blast_databasesObject

Recursively scan `database_dir` for un-formatted FASTA and format them for use with BLAST+.



112
113
114
115
116
# File 'lib/sequenceserver/database.rb', line 112

# Offers to format every entry returned by unformatted_fastas.
#
# make_blast_database is called once per [file, sequence_type] pair, in
# order.
#
# @return [Array] the pairs for which make_blast_database returned a truthy
#   value
def make_blast_databases
  formatted, _skipped = unformatted_fastas.partition do |file, sequence_type|
    make_blast_database(file, sequence_type)
  end
  formatted
end

.make_db_title(db_name) ⇒ Object

Suggests improved titles when generating database names from files, for better appearance and readability in the web interface. For example: Cobs1.4.proteins.fasta -> Cobs 1.4 proteins; S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl.



199
200
201
202
203
204
205
206
207
208
209
210
# File 'lib/sequenceserver/database.rb', line 199

# Derives a human-friendly database title from a file name, e.g.
#
#   Cobs1.4.proteins.fasta         -> Cobs 1.4 proteins
#   S_invicta.xx.2.5.small.nucl.fa -> S invicta xx 2.5 small nucl
#
# The argument is never modified (so frozen strings are accepted); a new
# String is returned.
#
# @param db_name [String] file name to turn into a title
# @return [String] the suggested title, without leading or trailing blanks
def make_db_title(db_name)
  title = db_name.tr('"', "'")
  # Drop only the trailing extension. Substituting the extension text
  # globally would also delete it from the middle of the name
  # ("my.fasta.fa" would become "mysta").
  extension = File.extname(title)
  title = title.chomp(extension) unless extension.empty?
  # Underscores become blanks.
  title = title.tr('_', ' ')
  # A '.' becomes a blank when there is no digit on either side of it.
  title = title.gsub(/(\D)\.(?=\D)/, '\1 ')
  # Version numbers (1.4, 2-5-1, ...) are kept intact and set off by blanks.
  title = title.gsub(/\W*(\d+([.-]\d+)+)\W*/, ' \1 ')
  # The previous step pads with blanks even at either end of the name.
  title.strip
end

.multipart_database_name?(db_name) ⇒ Boolean

Returns true if the database name appears to be a multi-part database name.

e.g. /home/ben/pd.ben/sequenceserver/db/nr.00 => yes; /home/ben/pd.ben/sequenceserver/db/nr => no; /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01 => yes

Returns:

  • (Boolean)


185
186
187
# File 'lib/sequenceserver/database.rb', line 185

# Tells whether +db_name+ looks like one volume of a multi-part BLAST
# database, i.e. a path whose last component ends in a dot followed by a
# two- or three-digit volume number:
#
#   /home/ben/pd.ben/sequenceserver/db/nr.00                     => true
#   /home/ben/pd.ben/sequenceserver/db/nr                        => false
#   /home/ben/pd.ben/sequenceserver/db/img3.5.finished.faa.01    => true
#   /home/ben/pd.ben/sequenceserver/db/est2013                   => false
#
# The dot is required so that ordinary names that merely end in digits
# (years, version numbers) are not mistaken for volumes.
#
# @param db_name [String] path of the database
# @return [Boolean] true if the name appears to be a multi-part volume
def multipart_database_name?(db_name)
  !db_name.match(%r{.+/\S+\.\d{2,3}$}).nil?
end

.probably_fasta?(file) ⇒ Boolean

Returns true if first character of the file is '>'.

Returns:

  • (Boolean)


190
191
192
# File 'lib/sequenceserver/database.rb', line 190

# Cheap FASTA check: reads a single byte from the start of the file and
# compares it with '>'.
#
# @param file [String] path of the file to inspect
# @return [Boolean] true if the first byte is '>'; false otherwise, including
#   for an empty file
def probably_fasta?(file)
  first_byte = File.open(file) { |io| io.read(1) }
  first_byte == '>'
end

.sample_sequences(file) ⇒ Object

Read first 32768 characters of the file. Split on fasta def line pattern and return.

If the given file is FASTA, returns an Array of the sequences found in the portion of the file read. Otherwise, returns the portion of the file read wrapped in an Array.



230
231
232
# File 'lib/sequenceserver/database.rb', line 230

# Reads up to the first 32768 bytes of +file+ and splits them on FASTA
# definition lines (lines starting with '>').
#
# @param file [String] path of the file to sample
# @return [Array<String>] the non-empty pieces between definition lines. If
#   the portion read contains no definition line, it is returned as the only
#   element. An empty file yields an empty Array.
def sample_sequences(file)
  # File.read with a length returns nil (not "") for an empty file.
  File.read(file, 32_768).to_s.split(/^>.+$/).reject(&:empty?)
end

.scan_databases_dirObject

Recursively scan `database_dir` for blast databases.



98
99
100
101
102
103
104
105
106
107
108
# File 'lib/sequenceserver/database.rb', line 98

# Recursively scans the configured database directory for BLAST databases
# and registers each one found.
#
# +blastdbcmd+ is run through the shell with stderr merged into stdout; every
# output line is split on tabs and the fields are passed, in order, to
# Database.new. Lines whose first field looks like a volume of a multi-part
# database (see multipart_database_name?) are skipped.
#
# The directory is shell-escaped before interpolation so that a path
# containing spaces or shell metacharacters is passed to blastdbcmd as a
# single, literal argument.
#
# @return [String] the raw output of blastdbcmd
def scan_databases_dir
  # Required here because this method is the only user of Shellwords.
  require 'shellwords'

  database_dir = Shellwords.escape(config[:database_dir])
  cmd = "blastdbcmd -recursive -list #{database_dir}" \
        " -list_outfmt \"%f\t%t\t%p\t%n\t%l\t%d\" 2>&1"
  `#{cmd}`.each_line do |line|
    # NOTE(review): the line is not chomped, so the last field keeps its
    # trailing newline - confirm whether Database.new expects that.
    fields = line.split("\t")
    next if multipart_database_name?(fields.first)
    self << Database.new(*fields)
  end
end

.unformatted_fastasObject

Returns an Array of FASTA files that may require formatting, and the type of sequence contained in each FASTA.

> unformatted_fastas
=> [['/foo/bar.fasta', :nucleotide], ...]


123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/sequenceserver/database.rb', line 123

# Walks the configured database directory and collects the files that may
# still need formatting, together with their guessed sequence type.
#
# A file is reported only when it is not a directory, is not already known
# to Database, passes probably_fasta?, and guess_sequence_type_in_fasta
# returns :protein or :nucleotide for it.
#
# @return [Array<Array>] [path, sequence_type] pairs, in traversal order
def unformatted_fastas
  Find.find(config[:database_dir]).each_with_object([]) do |path, candidates|
    next if File.directory?(path)
    next if Database.include?(path)
    next unless probably_fasta?(path)

    kind = guess_sequence_type_in_fasta(path)
    candidates << [path, kind] if [:protein, :nucleotide].include?(kind)
  end
end