Class: PubliSci::Generators::MAF
- Inherits:
-
Object
- Object
- PubliSci::Generators::MAF
show all
- Extended by:
- Base
- Defined in:
- lib/publisci/generators/maf.rb
Constant Summary
collapse
- COLUMN_NAMES =
# Column names of one MAF row, in positional order. The final two entries
# (patient_id, sample_id) are not read from the input row: process_line
# appends them from the output of parse_barcode.
%w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}
- COMPONENT_RANGES =
# Datatype ranges keyed by column name; `write` installs this table as the
# default for options[:ranges]. Keys must use the exact spelling found in
# COLUMN_NAMES ("Start_Position", capital P), otherwise the entry can never
# match a column.
# NOTE(review): "Chromosome" => "xsd:int" presumably does not cover
# non-numeric chromosome labels — confirm against real input.
{ "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
- TCGA_CODES =
{
# Allowed values for each coded column; `structure` hands this table to
# code_lists and concept_codes.
# "IGR" (intergenic region) is the value itself; the trailing "1" seen in
# some copies of the specification is a footnote marker.
"Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR Intron RNA Targeted_Region},
"Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
"dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
# %w splits on whitespace only, so a comma would become part of the value.
"Verification_Status" => %w{Verified Unknown},
"Validation_Status" => %w{Untested Inconclusive Valid Invalid},
# Written as a plain array because one value contains a space.
"Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
"Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
"Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
}
- BARCODE_INDEX =
# Position of the 'Tumor_Sample_Barcode' column within COLUMN_NAMES;
# process_line uses it to pick the barcode that is passed to parse_barcode.
COLUMN_NAMES.index('Tumor_Sample_Barcode')
Class Method Summary
collapse
-
.column_replace(entry, column, prefix, value = nil) ⇒ Object
-
.official_symbol(hugo_symbol) ⇒ Object
-
.parse_barcode(code) ⇒ Object
-
.post_process(file) ⇒ Object
-
.process_line(entry, label, options) ⇒ Object
-
.process_options(options) ⇒ Object
-
.sio_values(entry) ⇒ Object
-
.structure(options = {}) ⇒ Object
-
.write(record, out, label, options = {}) ⇒ Object
-
.write_structure(input, output, options) ⇒ Object
Methods included from Base
close_output, write, write_to
#abbreviate_known, #code_lists, #component_gen, #component_specifications, #concept_codes, #data_structure_definition, #dataset, #defaults, #dimension_properties, #encode_data, #generate, #generate_resources, #measure_properties, #observations, #prefixes, #vocabulary
Methods included from RDFParser
#add_node, #bnode_value, #encode_value, #get_ary, #get_hashes, #is_complex?, #is_uri?, #load_string, #observation_hash, #sanitize, #sanitize_hash, #strip_prefixes, #strip_uri, #to_literal, #to_resource, #turtle_indent
Class Method Details
.column_replace(entry, column, prefix, value = nil) ⇒ Object
140
141
142
143
144
145
146
|
# File 'lib/publisci/generators/maf.rb', line 140
# Rewrites one cell of a MAF row in place so that it starts with +prefix+.
#
# @param entry [Array<String>] row values ordered like COLUMN_NAMES; modified in place
# @param column [String] a name listed in COLUMN_NAMES
# @param prefix [String] text placed in front of the cell content
# @param value [String, nil] replacement content; when nil the current cell
#   content is kept and only prefixed
# @return [String] the new cell content
def column_replace(entry,column,prefix,value=nil)
  idx = COLUMN_NAMES.index(column)
  # The prefix always goes in front; previously the no-value branch appended
  # it to the end of the existing cell.
  entry[idx] = prefix + (value || entry[idx])
end
|
.official_symbol(hugo_symbol) ⇒ Object
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
# File 'lib/publisci/generators/maf.rb', line 148
# Asks the Bio2RDF HGNC SPARQL endpoint for the approved symbol of every
# record whose approved symbol or synonym equals +hugo_symbol+.
#
# @param hugo_symbol [#to_s] symbol interpolated verbatim into the query text
# @return [String] the first ?official binding converted with to_s; an empty
#   string when the query yields no solutions
def official_symbol(hugo_symbol)
  endpoint = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
  lookup = <<-EOF
SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}
EOF
  matches = endpoint.query(lookup).map { |solution| solution.official }
  matches.first.to_s
end
|
.parse_barcode(code) ⇒ Object
165
166
167
168
|
# File 'lib/publisci/generators/maf.rb', line 165
# Slices a sample barcode into the two values process_line stores in the
# patient_id and sample_id columns.
#
# @param code [String, nil] barcode text
# @return [Array] two elements: characters 5..11 and characters 13..end.
#   Each element is nil when the string is too short to contain that slice,
#   and both are nil when no barcode is given.
def parse_barcode(code)
  # A missing barcode behaves like a too-short one instead of raising.
  return [nil, nil] if code.nil?

  [code[5..11], code[13..-1]]
end
|
.post_process(file) ⇒ Object
131
132
133
134
135
136
137
138
|
# File 'lib/publisci/generators/maf.rb', line 131
# Rewrites HGNC symbol URIs in +file+ so they carry the official symbol
# returned by official_symbol. The file is passed to
# PubliSci::PostProcessor.process as both input and output.
#
# @param file [String] file handed to the post processor
# @return [Object] whatever PubliSci::PostProcessor.process returns
def post_process(file)
  prefix = 'http://identifiers.org/hgnc.symbol/'
  # Dots are escaped so they only match literal dots in the URI.
  reg = %r{http://identifiers\.org/hgnc\.symbol/(\w+)}
  # One remote lookup per distinct symbol for the duration of this call.
  hugo_cache = {}
  # NOTE(review): assumes the block receives the captured symbol — confirm
  # against PubliSci::PostProcessor.
  PubliSci::PostProcessor.process(file,file,reg){|g|
    hugo_cache[g] ||= begin
      official = official_symbol(g)
      # official_symbol returns "" when nothing is found; keep the original
      # symbol rather than emitting a URI with no symbol at all.
      official.empty? ? g : official
    end
    prefix + hugo_cache[g]
  }
end
|
.process_line(entry, label, options) ⇒ Object
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
# File 'lib/publisci/generators/maf.rb', line 50
# Turns one MAF row into a single observation.
#
# The row is copied, cut or padded to the number of input columns (every
# COLUMN_NAMES entry except the last two), extended with the two values from
# parse_barcode, and selected cells are rewritten as identifiers.org URIs
# before everything is handed to +observations+.
#
# @param entry [Array<String, nil>] raw row values in COLUMN_NAMES order; not modified
# @param label [Object] label passed to +observations+ for this row
# @param options [Hash] reads :complex_objects, :measures, :dimensions,
#   :codes and :dataset_name; also forwarded whole to +observations+
# @return [Object] the first element returned by +observations+
def process_line(entry,label,options)
  input_width = COLUMN_NAMES.length - 2

  # Work on a copy so the caller's array is left alone. Extra trailing
  # columns are dropped so the barcode parts always land in the last two slots.
  row = entry.first(input_width)
  row.fill(nil, row.length...input_width)
  row = (row + parse_barcode(row[BARCODE_INDEX])).flatten

  row[0] = "http://identifiers.org/hgnc.symbol/#{row[0]}" if row[0]

  gene_col = 1
  # A gene id of '0' is treated as "no gene id".
  row[gene_col] = nil if row[gene_col] == '0'
  row[gene_col] = "http://identifiers.org/ncbigene/#{row[gene_col]}" if row[gene_col]

  rs_col = COLUMN_NAMES.index('dbSNP_RS')
  if row[rs_col] && row[rs_col][0..1] == "rs"
    row[rs_col] = "http://identifiers.org/dbsnp/#{row[rs_col].gsub('rs','')}"
  end

  row = sio_values(row) if options[:complex_objects]

  # observations receives each column as a one-element array.
  data = {}
  COLUMN_NAMES.each_with_index { |name, i| data[name] = [row[i]] }

  observations(options[:measures],options[:dimensions],options[:codes],data,[label],options[:dataset_name],options).first
end
|
.process_options(options) ⇒ Object
41
42
43
44
45
46
47
48
|
# File 'lib/publisci/generators/maf.rb', line 41
# Fills in the generator settings on +options+ and returns that same hash.
#
# :dimensions and :codes are always overwritten with one shared list of
# coded columns, :measures becomes every other COLUMN_NAMES entry, and
# :dataset_name gets a generated "MAF_..." value only when it is not
# already set.
#
# @param options [Hash] modified in place
# @return [Hash] the hash that was passed in
def process_options(options)
  coded_columns = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}

  options[:dimensions] = coded_columns
  options[:codes] = coded_columns
  options[:measures] = COLUMN_NAMES - coded_columns

  unless options[:dataset_name]
    options[:dataset_name] = "MAF_#{Time.now.nsec.to_s(32)}"
  end

  options
end
|
.sio_values(entry) ⇒ Object
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
# File 'lib/publisci/generators/maf.rb', line 79
# Replaces selected cells of a MAF row, in place, with the results of
# sio_value / sio_attribute calls that tag each cell with a type URI.
#
# The first two cells are only wrapped when present; every other listed
# column is wrapped unconditionally, including nil cells.
#
# NOTE(review): sio_value and sio_attribute are not defined in this block —
# confirm where they come from before enabling :complex_objects.
#
# @param entry [Array] row values in COLUMN_NAMES order; modified in place
# @return [Array] the same +entry+ array
def sio_values(entry)
  entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]
  entry[1] = sio_value("http://identifiers.org/ncbigene",entry[1]) if entry[1]

  allele_columns = %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}
  typed_values = [
    ['dbSNP_RS', "http://identifiers.org/dbsnp"],
    ['Chromosome', "http://purl.org/obo/owl/SO#SO_0000340"]
  ]
  typed_values += allele_columns.map { |name| [name, "http://purl.org/obo/owl/SO#SO_0001023"] }

  typed_values.each do |name, type_uri|
    idx = COLUMN_NAMES.index(name)
    entry[idx] = sio_value(type_uri, entry[idx])
  end

  strand_idx = COLUMN_NAMES.index("Strand")
  entry[strand_idx] = sio_attribute("http://edamontology.org/data_0853",entry[strand_idx])

  center_idx = COLUMN_NAMES.index("Center")
  entry[center_idx] = sio_attribute("foaf:homepage",entry[center_idx])

  # Start and end share the same third argument.
  position_type = "http://biohackathon.org/resource/faldo#Position"
  start_idx = COLUMN_NAMES.index("Start_Position")
  entry[start_idx] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[start_idx], position_type)

  end_idx = COLUMN_NAMES.index("End_Position")
  entry[end_idx] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[end_idx], position_type)

  entry
end
|
.structure(options = {}) ⇒ Object
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
# File 'lib/publisci/generators/maf.rb', line 115
# Assembles the structure text for a MAF dataset: the result of +prefixes+
# followed by the data structure definition, the dataset, and then every
# item returned by the component specification, measure property,
# dimension property, code list and concept code generators, in that order.
#
# @param options [Hash] passed through process_options before use
# @return [Object] the value returned by +prefixes+ with every other piece
#   appended via <<
def structure(options={})
  opts = process_options(options)
  name = opts[:dataset_name]
  measures = opts[:measures]
  dimensions = opts[:dimensions]
  codes = opts[:codes]

  output = prefixes(name, opts)
  output << data_structure_definition(measures, dimensions, codes, name, opts)
  output << dataset(name, opts)

  component_specifications(measures, dimensions, codes, name, opts).each { |spec| output << spec }
  measure_properties(measures, name, opts).each { |prop| output << prop }
  dimension_properties(dimensions, codes, name, opts).each { |prop| output << prop }
  code_lists(codes, TCGA_CODES, name, opts).each { |list| output << list }
  concept_codes(codes, TCGA_CODES, name, opts).each { |concept| output << concept }

  output
end
|
.write(record, out, label, options = {}) ⇒ Object
25
26
27
28
29
30
31
32
33
34
35
|
# File 'lib/publisci/generators/maf.rb', line 25
# Converts one MAF record with process_line and sends the result to +out+
# through write_to.
#
# Defaults are applied only to settings the caller left unset, so an explicit
# +false+ (for example no_labels: false) is honoured.
#
# @param record [Array] one MAF row
# @param out [Object] destination passed to write_to
# @param label [Object] label passed to process_line
# @param options [Hash] generator options; run through process_options first
# @return [Object] whatever write_to returns
def write(record, out, label, options={})
  options = process_options(options)

  defaults = {
    :no_labels => true,
    :lookup_hugo => false,
    :complex_objects => false,
    :ranges => COMPONENT_RANGES
  }
  # `||=` cannot be used here: it would turn an explicit false for
  # :no_labels back into true.
  defaults.each do |key, default|
    options[key] = default if options[key].nil?
  end

  write_to(out, process_line(record, label, options))
end
|
.write_structure(input, output, options) ⇒ Object
37
38
39
|
# File 'lib/publisci/generators/maf.rb', line 37
# Builds the dataset structure for +options+ and sends it to +output+ via
# write_to.
#
# @param input [Object] accepted but not used by this method
# @param output [Object] destination passed to write_to
# @param options [Hash] passed to +structure+
# @return [Object] whatever write_to returns
def write_structure(input, output, options)
  definition = structure(options)
  write_to(output, definition)
end
|