Top Level Namespace
Defined Under Namespace
Classes: Allele, Annotation, Feature, Genotype, Snp, Strain
Instance Method Summary collapse
- #db_schema ⇒ Object
- #establish_connection(db_location) ⇒ Object
- #find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp) ⇒ Object
- #get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path) ⇒ Object
-
#guess_sequence_format(reference_genome) ⇒ Object
This method guesses the reference sequence file format.
- #information(out, cutoff_genotype, cutoff_snp) ⇒ Object
-
#output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true) ⇒ Object
This method finds info_snps from a list of strains given by user.
-
#populate_features_and_annotations(sequence_file) ⇒ Object
A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file.
-
#populate_snps_alleles_genotypes(vcf_file, cutoff_ad) ⇒ Object
This method populates the rest of the information, i.e.
Instance Method Details
#db_schema ⇒ Object
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# File 'lib/snp_db_schema.rb', line 1 def db_schema ActiveRecord::Schema.define do unless table_exists? :strains create_table :strains do |t| t.column :name, :string t.column :description, :string end end unless table_exists? :features_snps create_table :features_snps, :id => false do |t| t.column :feature_id, :integer, :null => false t.column :snp_id, :integer, :null => false end end unless table_exists? :features create_table :features do |t| t.column :name, :string t.column :start, :integer t.column :end, :integer t.column :strand, :integer t.column :sequence, :string end end unless table_exists? :snps create_table :snps do |t| t.column :ref_pos, :integer t.column :qual, :float t.column :reference_allele_id, :integer end end unless table_exists? :alleles create_table :alleles do |t| t.column :snp_id, :integer t.column :base, :string end end unless table_exists? :genotypes create_table :genotypes do |t| t.column :allele_id, :integer t.column :strain_id, :integer t.column :geno_qual, :float end end unless table_exists? :annotations create_table :annotations do |t| t.column :qualifier, :string t.column :value, :string t.column :feature_id, :integer end end # indices unless index_exists? :strains, :id add_index :strains, :id end unless index_exists? :strains, :name add_index :strains, :name end unless index_exists? :features, :id add_index :features, :id end unless index_exists? :features, :name add_index :features, :name end unless index_exists? :features, :start add_index :features, :start end unless index_exists? :features, :end add_index :features, :end end unless index_exists? :features, :strand add_index :features, :strand end unless index_exists? :features, :sequence add_index :features, :sequence end unless index_exists? :snps, :id add_index :snps, :id end unless index_exists? :snps, :ref_pos add_index :snps, :ref_pos end unless index_exists? :snps, :qual add_index :snps, :qual end unless index_exists? :snps, :reference_allele_id add_index :snps, :reference_allele_id end unless index_exists? :features_snps, :feature_id add_index :features_snps, :feature_id end unless index_exists? :features_snps, :snp_id add_index :features_snps, :snp_id end unless index_exists? :snps, :qual add_index :snps, :qual end unless index_exists? :alleles, :snp_id add_index :alleles, :snp_id end unless index_exists? :alleles, :base add_index :alleles, :base end unless index_exists? :genotypes, :id add_index :genotypes, :id end unless index_exists? :genotypes, :allele_id add_index :genotypes, :allele_id end unless index_exists? :genotypes, :strain_id add_index :genotypes, :strain_id end unless index_exists? :genotypes, :geno_qual add_index :genotypes, :geno_qual end unless index_exists? :annotations, :feature_id add_index :annotations, :feature_id end unless index_exists? :annotations, :qualifier add_index :annotations, :qualifier end unless index_exists? :annotations, :value add_index :annotations, :value end end end |
#establish_connection(db_location) ⇒ Object
5 6 7 8 9 |
# File 'lib/snp_db_connection.rb', line 5 def establish_connection(db_location) ActiveRecord::Base.establish_connection( :adapter => "sqlite3", :database => db_location) end |
#find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 |
# File 'lib/snp-search.rb', line 10 def find_unqiue_snps(strain_names, out, cutoff_genotype, cutoff_snp) *strain_names = strain_names where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "") outfile = File.open(out, "w") snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cutoff_genotype} AND snps.qual >= #{cutoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}") # puts "The number of unique snps are #{snps.size}" output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, false) end |
#get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# File 'lib/filter_ignore_snps_methods.rb', line 7 def get_snps(out, ignore_snps_on_annotation, ignore_snps_in_range, ignore_strains, remove_non_informative_snps, fasta_output, tabular_output, cutoff_genotype, cutoff_snp, tree, fasttree_path) strains = Strain.all sequence_hash = Hash.new sequence_hash["ref"] = Array.new strains.each do |strain| sequence_hash[strain.name] = Array.new end snps_array = Array.new snp_positions = Array.new # output opened for data input output = File.open(out, "w") tab_delim_file_name = File.basename(out, File.extname(out)) + "_snps.tsv" tab_delim_file = File.open(tab_delim_file_name, "w") position_map_file_name = File.basename(out, File.extname(out)) + "_snps_positions.txt" position_map_file = File.open(position_map_file_name, "w") snps_within_features_with_annotation = "" # Perform query # puts ignore_snps_on_annotation.inspect if ignore_snps_on_annotation annotations_where = ignore_snps_on_annotation.split(",").map{|annotation| "annotations.value LIKE '%#{annotation}%'"}.join(" OR ") features_with_annotation = Feature.joins(:annotations).where(annotations_where) snps_within_features_with_annotation = Snp.joins(:features).where("features.id IN (?)", features_with_annotation.collect{|feature| feature.id}) end if snps_within_features_with_annotation.empty? snps = Snp.all else snps = Snp.where("snps.id NOT IN (?)", snps_within_features_with_annotation.collect{|snp| snp.id}) end positions_to_ignore = Array.new if ignore_snps_in_range range_strings = ignore_snps_in_range.split(",") range_strings.each do |range| start_position, end_position = range.split("..") positions_to_ignore += (start_position.to_i..end_position.to_i).to_a end end if ignore_strains strains_to_ignore = ignore_strains.split(",") end i = 0 puts "Your Query is submitted and is being processed......." strains = Strain.find(:all) if ignore_strains strains_to_ignore = ignore_strains.split(",") strains.reject!{|strain| strains_to_ignore.include?(strain.name)} end snps.each do |snp| ActiveRecord::Base.transaction do i += 1 next if positions_to_ignore.include?(snp.ref_pos) # Ignore positions that user specified alleles = snp.alleles genotypes = snp.alleles.collect{|allele| allele.genotypes}.flatten snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}") # ignore snp if the snp qual is less than cutoff. next if snp_qual.any?{|snps_quality| snps_quality.qual < cutoff_snp.to_i} next if alleles.any?{|allele| allele.base.length > 1} # indel next unless genotypes.all?{|genotype| genotype.geno_qual >= cutoff_genotype.to_f} # all geno quals > cutoff # puts "#{i} SNPs processed so far" if i % 100 == 0 strain_alleles = Hash.new strains.each do |strain| strain_genotype = genotypes.select{|genotype| genotype.strain_id == strain.id}.first # next if strain_genotype == nil puts strain_genotype.inspect strain_allele = alleles.select{|allele| allele.id == strain_genotype.allele_id}.first strain_alleles[strain.name] = strain_allele.base end if remove_non_informative_snps next if strain_alleles.values.uniq.size == 1 # remove non-informative SNPs end snp_positions << snp.ref_pos snps_array << snp strain_alleles.each do |strain_name, allele_base| sequence_hash[strain_name] << allele_base end sequence_hash["ref"] << snp.reference_allele.base end end # If user has specified a tabular output if tabular_output output_information_methods(snps_array, output, cutoff_genotype, cutoff_snp, true) # If user has specified a fasta output elsif fasta_output # generate FASTA file output.puts ">ref\n#{sequence_hash["ref"].join("")}" tab_delim_file.puts "\t#{snp_positions.join("\t")}" tab_delim_file.puts "ref\t#{sequence_hash["ref"].join("\t")}" strains.each do |strain| output.puts ">#{strain.name}\n#{sequence_hash[strain.name].join("")}" tab_delim_file.puts "#{strain.name}\t#{sequence_hash[strain.name].join("\t")}" end snp_positions.each_with_index do |snp_position, index| position_map_file.puts "#{index+1} => #{snp_position}" end end # If user has chosen a newick output. if tree nwk_out_file_name = File.basename(out, File.extname(out)) + ".nwk" puts "running phylogeny" `#{fasttree_path} -fastest -nt #{output} > #{nwk_out_file_name}` end output.close tab_delim_file.close end |
#guess_sequence_format(reference_genome) ⇒ Object
This method guesses the reference sequence file format
7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/create_methods.rb', line 7 def guess_sequence_format(reference_genome) file_extension = File.extname(reference_genome).downcase file_format = nil case file_extension when ".gbk", ".genbank", ".gb" file_format = :genbank when ".embl", ".emb" file_format = :embl end return file_format end |
#information(out, cutoff_genotype, cutoff_snp) ⇒ Object
25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/snp-search.rb', line 25 def information(out, cutoff_genotype, cutoff_snp) puts "outputting SNP info....." strains = Strain.all # snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id") snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id") outfile = File.open(out, "w") output_information_methods(snps, outfile, cutoff_genotype, cutoff_snp, true) end |
#output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true) ⇒ Object
This method finds info_snps from a list of strains given by user. Its is called in lib/snp-search.rb
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# File 'lib/output_information_methods.rb', line 5 def output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, info = true) strains = Strain.all outfile.puts "pos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tGene_annotation\tpossible_pseudogene?\tamino_acid_original\tamino_acid_change\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?\t#{strains.map{|strain| strain.name}.join("\t") if info}" snps_counter = 0 cds_snps_counter = 0 total_number_of_syn_snps = 0 total_number_of_non_syn_snps = 0 total_number_of_pseudo = 0 snps.each do |snp| ActiveRecord::Base.transaction do snp.alleles.each do |allele| next if snp.alleles.any?{|allele| allele.base.length > 1} # indel if allele.id != snp.reference_allele_id # get annotation (if there is any) for each SNP features = Feature.joins(:snps).where("snps.id = ?", snp.id) # get snp quality for each snp snp_qual = Snp.find_by_sql("select qual from snps where snps.id = #{snp.id}") # ignore snp if the snp qual is less than cuttoff. next if snp_qual.any?{|snps_quality| snps_quality.qual < cuttoff_snp.to_i} # get all genotype qualities for each snp. gqs = Genotype.find_by_sql("select geno_qual from genotypes inner join alleles on alleles.id = genotypes.allele_id inner join snps on snps.id = alleles.snp_id where snps.id = #{snp.id}") # ignore snp if any of its genotype qualities is lower than the cuttoff. next if gqs.any?{|genotype_quality| genotype_quality.geno_qual < cuttoff_genotype.to_i} ref_base = Bio::Sequence.auto(Allele.find(snp.reference_allele_id).base) snp_base = Bio::Sequence.auto(allele.base) # count snps now: after you have selected the snps with gqs and snp_qual greater than the threshold. snps_counter += 1 # If the feature is empty then just output basic information about the snp. if features.empty? outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}" else features.each do |feature| if feature.name == "CDS" cds_snps_counter +=1 annotation = Annotation.where("annotations.qualifier = 'product' and annotations.feature_id = ?", feature.id).first #if annotation is nil, or empty if annotation.nil? outfile.puts "#{snp.ref_pos}\t#{feature.strand == 1 ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{feature.strand == 1 ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}" else feature_sequence = feature.sequence feature_sequence_bio = Bio::Sequence::NA.new(feature_sequence) #Mutate sequence with SNP feature_sequence_mutated = feature.sequence feature_sequence_snp_pos = (snp.ref_pos-1) - (feature.start-1) feature_sequence_mutated[feature_sequence_snp_pos] = allele.base feature_sequence_mutated_bio = Bio::Sequence::NA.new(feature_sequence_mutated) # Translate the sequences if feature.strand == -1 mutated_seq_translated = feature_sequence_mutated_bio.reverse_complement.translate original_seq_translated = feature_sequence_bio.reverse_complement.translate else mutated_seq_translated = feature_sequence_mutated_bio.translate original_seq_translated = feature_sequence_bio.translate end # Remove the star at the end of each translated sequence. mutated_seq_translated_clean = mutated_seq_translated.gsub(/\*$/,"") original_seq_translated_clean = original_seq_translated.gsub(/\*$/,"") # Amino acid properties hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"] non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"] polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"] non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"] small = ["V","C","A","G","D","N","S","T","P"] non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"] # Get alleles for each strain bases_from_alleles = [] strains.each do |strain| allele_for_strains = Allele.joins(:genotypes => :strain).where("strains.id = ? AND alleles.snp_id = ?", strain.id, snp.id).first # allele_for_strains = Allele.find_by_sql("select * from alleles inner join genotypes on genotypes.allele_id = alleles.id inner join strains on strains.id = genotypes.strain_id where strains.id = #{strain.id} and alleles.snp_id = #{snp.id}") puts allele_for_strains.inspect # next if bases_from_alleles.empty? bases_from_alleles << allele_for_strains.base end # If no difference between the amino acids then its synonymous SNP, if different then its non-synonymous. if original_seq_translated_clean == mutated_seq_translated_clean total_number_of_syn_snps +=1 if mutated_seq_translated_clean =~ /\*/ total_number_of_pseudo +=1 outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tYes\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}" else outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tsynonymous\t#{annotation.value}\tNo\tN/A\tN/A\tN/A\tN/A\tN/A\t#{bases_from_alleles.join("\t") if info}" end else total_number_of_non_syn_snps +=1 diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean) if mutated_seq_translated_clean =~ /\*/ total_number_of_pseudo +=1 outfile.puts "#{snp.ref_pos}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tYes\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}" else outfile.puts "#{snp.ref_pos-1}\t#{features.map{|feature| feature.strand == 1} ? "#{ref_base.upcase}" : "#{ref_base.reverse_complement.upcase}"}\t#{features.map{|feature| feature.strand == 1} ? "#{snp_base.upcase}" : "#{snp_base.reverse_complement.upcase}"}\tnon-synonymous\t#{annotation.value}\tNo\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}#{'No' if (hydrophobic.include? diffs[0][0].element) != (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}#{'No' if (polar.include? diffs[0][0].element) != (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}#{'No' if (small.include? diffs[0][0].element) != (non_small.include? diffs[0][1].element)}\t#{bases_from_alleles.join("\t") if info}" end end end end end end puts "Total SNPs added so far: #{snps_counter}" if snps_counter % 100 == 0 end end end end puts "Total number of snps: #{snps_counter} with Genotype quality cutoff at #{cuttoff_genotype} and SNP quality cutoff at #{cuttoff_snp}" puts "Total number of snps in CDS region: #{cds_snps_counter}" puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}" puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}" puts "Total number of pseudogenes: #{total_number_of_pseudo}" outfile.puts "Total number of snps: #{snps_counter}" outfile.puts "Total number of snps in CDS region: #{cds_snps_counter}" outfile.puts "Total number of synonymous SNPs: #{total_number_of_syn_snps}" outfile.puts "Total number of non-synonymous SNPs: #{total_number_of_non_syn_snps}" outfile.puts "Total number of possible pseudogenes: #{total_number_of_pseudo}" end |
#populate_features_and_annotations(sequence_file) ⇒ Object
A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file.
We include all features that are not ‘source’ or ‘gene’ as they are repetitive info. ‘CDS’ is the gene. The annotation table includes also the start and end coordinates of the CDS. The strand is also included. the ‘locations’ method is defined in bioruby under genbank. It must be required at the top (bio). Also, the qualifier and value are extracted from the gbk/embl file and added to the database.
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/create_methods.rb', line 23 def populate_features_and_annotations(sequence_file) puts "Adding features and their annotations...." ActiveRecord::Base.transaction do counter = 0 sequence_file.features.each do |feature| unless feature.feature == "source" || feature.feature == "gene" counter += 1 puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0 db_feature = Feature.new db_feature.name = feature.feature db_feature.start = feature.locations.first.from db_feature.end = feature.locations.first.to db_feature.strand = feature.locations.first.strand #Add nucleotide sequence from ORIGIN of genbank file. db_feature.sequence = sequence_file.seq[feature.locations.first.from-1..feature.locations.first.to-1] db_feature.save # Populate the Annotation table with qualifier information from the genbank file feature.qualifiers.each do |qualifier| a = Annotation.new a.qualifier = qualifier.qualifier a.value = qualifier.value a.save db_feature.annotations << a end end end end end |
#populate_snps_alleles_genotypes(vcf_file, cutoff_ad) ⇒ Object
This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
# File 'lib/create_methods.rb', line 53 def populate_snps_alleles_genotypes(vcf_file, cutoff_ad) puts "Adding SNPs........" # open vcf file and parse each line File.open(vcf_file) do |f| # header names while line = f.gets if line =~ /CHROM/ line.chomp! column_headings = line.split("\t") potential_strain_names = column_headings[9..-1] potential_strain_names.map!{|name| name.sub(/\..*/, '')} # strain_names = column_headings[9..-1] # strain_names.map!{|name| name.sub(/\..*/, '')} strains = Array.new # strain_names.each do |str| # ss = Strain.new # ss.name = str # ss.save # end # strains = Array.new # strain_names.each do |strain_name| # strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first # strains << strain # end good_snps = 0 # start parsing snps while line = f.gets line.chomp! details = line.split("\t") ref = details[0] ref_pos = details[1].to_i ref_base = details[3] snp_bases = details[4].split(",") snp_qual = details [5] number_of_colons_in_format = details[8].count ":" format = details[8].split(":") gt_array_position = format.index("GT") gq_array_position = format.index("GQ") ad_array_position = format.index("AD") # dp = format.index("DP") # columns_after_format = details[9..-1] # samples = columns_after_format.select{|column_after_format| column_after_format.count(":") == number_of_colons_in_format} # puts samples samples = details[9..-1] if strains.empty? samples.zip(potential_strain_names).each do |column_after_format, potential_strain_name| if column_after_format.count(":") == number_of_colons_in_format ss = Strain.new ss.name = potential_strain_name ss.save strains << ss end end end gts = [] gqs = [] ad_ratios = [] next if samples.any?{|sample| sample =~ /\.\/\./} # no coverage in at least one sample samples.map do |sample| format_values = sample.split(":") # output (e.g.): ["0/0 ", "0,255,209", "99"] if gt_array_position gt = format_values[gt_array_position] # e.g. gt = gt.split("/") next if gt.size > 1 && (gt.first != gt.last) # if its 0/1, 1/2 etc then ignore next if gt.first == "." # no coverage gt = gt.first.to_i else puts "GT field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting" exit end if gq_array_position gq = format_values[gq_array_position].to_f else puts "GQ field in the FORMAT section of the vcf file is missing....snp-search requires this field to assess the SNP quality.....sorry aborting" exit end if ad_array_position # If there is AD in vcf. Typically AD is Allele specific depth. i.e. if ref is 'A' and alt is 'G' and AD is '6,9' you got 6 A reads and 9 G reads. # ad below is 6 and 9 in the example above. ad = format_values[ad_array_position].split(",").map{|ad_value| ad_value.to_i} # Find the sum of all bases (sum_of_ad) reported by the ad, so its 15 in the example. sum_of_ad = ad.inject{|sum,x| sum + x } ad_ratios << ad[gt]/sum_of_ad.to_f end gqs << gq gts << gt end next if ad_ratios.any?{|ad_ratio| ad_ratio < cutoff_ad.to_i} # exclude if any samples have a call ratio of less than a cutoff set by user if gts.size == samples.size # if some gts have been rejected due to heterozygote or no coverage good_snps +=1 # populate snps ActiveRecord::Base.transaction do s = Snp.new s.ref_pos = ref_pos s.qual = snp_qual s.save # create ref allele ref_allele = Allele.new ref_allele.base = ref_base ref_allele.snp = s ref_allele.save s.reference_allele = ref_allele s.save snp_alleles = Array.new gts.uniq.select{|gt| gt > 0}.each do |gt| # create snp allele snp_allele = Allele.new snp_bases_index = gt - 1 snp_allele.base = snp_bases[snp_bases_index] snp_allele.snp = s snp_allele.save snp_alleles << snp_allele end genos = [] gts.each_with_index do |gt, index| genotype = Genotype.new genotype.strain = strains[index] #Adding the genotype quality with Genotype genotype.geno_qual = gqs[index] if gt == 0# wild type genotype.allele = ref_allele else # snp type genotype.allele = snp_alleles[gt - 1] end genos << genotype end # Using activerecord-import to speed up importing Genotype.import genos, :validate => false puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0 # puts "Total SNPs added so far: #{good_snps}" end end end end end end #Here we link the features to snps. puts "Linking features to SNPs" ActiveRecord::Base.transaction do Snp.all.each_with_index do |snp, index| puts "Total SNPs linked to features added so far: #{index}" if index % 100 == 0 features = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos) unless features.empty? features.each do |feature| snp.features << feature end end end end end |