Class: PubliSci::Readers::MAF
- Inherits:
-
Base
- Object
- Base
- PubliSci::Readers::MAF
show all
- Defined in:
- lib/bio-publisci/readers/maf.rb
Constant Summary
collapse
- COLUMN_NAMES =
# Column order used by this reader. The trailing patient_id and sample_id
# entries are not read from the input row; process_line appends them from
# the parsed Tumor_Sample_Barcode.
%w[
  Hugo_Symbol Entrez_Gene_Id Center NCBI_Build
  Chromosome Start_Position End_Position Strand
  Variant_Classification Variant_Type
  Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
  dbSNP_RS dbSNP_Val_Status
  Tumor_Sample_Barcode Matched_Norm_Sample_Barcode
  Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2
  Tumor_Validation_Allele1 Tumor_Validation_Allele2
  Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2
  Verification_Status Validation_Status Mutation_Status
  Sequencing_Phase Sequence_Source Validation_Method Score
  BAM_File Sequencer
  Tumor_Sample_UUID Matched_Norm_Sample_UUID
  patient_id sample_id
]
- COMPONENT_RANGES =
# Default value for options[:ranges] in generate_n3, keyed by COLUMN_NAMES entry.
# The position key is spelled "Start_Position" so that it matches COLUMN_NAMES
# (it was previously "Start_position", which matches no column).
# NOTE(review): "Chromosome" is declared xsd:int, but MAF files may carry
# non-numeric chromosomes (X, Y, MT) - confirm the intended range.
{ "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }
- TCGA_CODES =
{
# Controlled vocabularies for the coded columns, keyed by column name.
# NOTE(review): "IGR1" looks like "IGR" fused with a footnote marker from the
# MAF specification table - confirm against the spec before changing it.
"Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
"Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
"dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
# %w separates on whitespace only, so a comma here would become part of the code ("Verified,").
"Verification_Status" => %w{Verified Unknown},
"Validation_Status" => %w{Untested Inconclusive Valid Invalid},
# Written as a plain array because one status contains a space, which %w would split in two.
"Mutation_Status" => ["None", "Germline", "Somatic", "LOH", "Post-transcriptional modification", "Unknown"],
"Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
"Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
}
Instance Method Summary
collapse
Methods inherited from Base
#automatic, #next_label, #sio_attribute, #sio_value
#abbreviate_known, #code_lists, #component_gen, #component_specifications, #concept_codes, #data_structure_definition, #dataset, #defaults, #dimension_properties, #encode_data, #generate, #generate_resources, #measure_properties, #observations, #prefixes, #vocabulary
Methods included from Parser
#add_node, #bnode_value, #encode_value, #get_ary, #get_hashes, #is_complex?, #is_uri?, #load_string, #observation_hash, #sanitize, #sanitize_hash, #strip_prefixes, #strip_uri, #to_literal, #to_resource, #turtle_indent
#interact
Methods included from Analyzer
#check_integrity, #dirty?, #recommend_range, #recommend_range_strings
Methods included from Query
#execute, #execute_from_file, #property_names, #property_values, #row_names, #vocabulary
Instance Method Details
#column_replace(entry, column, prefix, value = nil) ⇒ Object
146
147
148
149
150
151
152
|
# File 'lib/bio-publisci/readers/maf.rb', line 146
# Rewrites one cell of +entry+ in place, addressed by its COLUMN_NAMES name.
#
# @param entry [Array] row values laid out in COLUMN_NAMES order
# @param column [String] name of the column to rewrite
# @param prefix [String] text combined with the cell value
# @param value [String, nil] replacement value; when given the cell becomes
#   prefix + value, otherwise +prefix+ is appended to the existing cell
# @return [String] the new cell content
#
# NOTE(review): without +value+ the "prefix" ends up *after* the current
# content; confirm that this is intended rather than prefix + current.
def column_replace(entry,column,prefix,value=nil)
  position = COLUMN_NAMES.index(column)
  entry[position] = value ? (prefix + value) : (entry[position] + prefix)
end
|
#generate_n3(input_file, options = {}) ⇒ Object
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
# File 'lib/bio-publisci/readers/maf.rb', line 20
# Converts a MAF file into Turtle: the structure block produced by #structure
# followed by one observation chunk per data line (see #process_line).
#
# @param input_file [String] path of the MAF file; opened with Kernel#open
# @param options [Hash] conversion options (the caller's hash is not modified)
# @option options [String] :dataset_name name to use; when absent, a name already
#   set on this reader is kept, else the input file's basename is used
# @option options [Symbol] :output :print returns the Turtle as a String; any
#   other value (default :file) writes "<base>.ttl"
# @option options [String] :output_base base path of the output file
#   (defaults to the dataset name)
# @option options [Boolean] :no_labels defaults to true; an explicit false is honoured
# @option options [Boolean] :lookup_hugo run #post_process on the written file
# @option options [Boolean] :complex_objects passed on to #process_line
# @option options [Hash] :ranges defaults to COMPONENT_RANGES
# @return [String, File, Object] the Turtle text for :print; otherwise the
#   still-open (flushed) output File, or the result of #post_process when
#   :lookup_hugo is set
def generate_n3(input_file, options={})
  # Work on a copy so the defaults filled in below do not leak into the caller's hash.
  options = options.dup
  output = options[:output] || :file
  output_base = options[:output_base]

  @dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
  @codes = @dimensions
  @measures = (COLUMN_NAMES - @dimensions - @codes)
  # An explicit :dataset_name wins (it used to be read and then ignored).
  @dataset_name = options[:dataset_name] || @dataset_name || File.basename(input_file,'.*')
  @barcode_index = COLUMN_NAMES.index('Tumor_Sample_Barcode')

  # `||= true` would overwrite an explicit false, so only fill in a missing value.
  options[:no_labels] = true if options[:no_labels].nil?
  options[:lookup_hugo] ||= false
  options[:complex_objects] ||= false
  options[:ranges] ||= COMPONENT_RANGES

  # Appends every processed line to +sink+ (a String or an IO; both accept <<).
  # The block form of open closes the input handle when the loop finishes.
  # Every physical line advances the label counter, including skipped ones.
  append_rows = lambda do |sink|
    open(input_file) do |maf|
      maf.each_line.with_index do |line, n|
        processed = process_line(line, n.to_s, options)
        sink << processed.first if processed
      end
    end
  end

  if output == :print
    str = structure(options)
    append_rows.call(str)
    str
  else
    file_base = output_base || @dataset_name
    out = open("#{file_base}.ttl",'w')
    out.write(structure(options))
    append_rows.call(out)
    # The handle is returned open (as before), so flush to make sure the
    # content is on disk for post-processing and for callers reading the path.
    out.flush
    options[:lookup_hugo] ? post_process(out) : out
  end
end
|
#official_symbol(hugo_symbol) ⇒ Object
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
# File 'lib/bio-publisci/readers/maf.rb', line 154
# Looks up the approved HGNC symbol for a gene symbol or synonym by querying
# the Bio2RDF HGNC SPARQL endpoint.
#
# @param hugo_symbol [#to_s] symbol or synonym to resolve
# @return [String] the first approved symbol returned, or "" when the query
#   yields no result
def official_symbol(hugo_symbol)
  # The symbol is embedded in a double-quoted SPARQL literal, so escape the
  # characters that would terminate or corrupt that literal.
  escapes = { "\\" => "\\\\", '"' => '\\"', "\n" => '\\n', "\r" => '\\r' }
  literal = hugo_symbol.to_s.gsub(/[\\"\n\r]/, escapes)
  qry = <<-EOF
SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{literal}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{literal}"}
?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}
EOF
  sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
  sparql.query(qry).map(&:official).first.to_s
end
|
#parse_barcode(code) ⇒ Object
171
172
173
174
|
# File 'lib/bio-publisci/readers/maf.rb', line 171
# Cuts two fixed-position pieces out of a sample barcode: characters 5..11 and
# everything from character 13 on. process_line appends the pair after the
# source columns, where COLUMN_NAMES lists patient_id and sample_id.
#
# @param code [String, nil] barcode text; nil when the row has no barcode column
# @return [Array(String, String), Array(nil, nil)] the two pieces; a piece is
#   nil when the barcode is too short, and both are nil for a nil barcode
def parse_barcode(code)
  # process_line pads short rows with nil, so a missing barcode must not raise.
  return [nil, nil] if code.nil?

  [code[5..11], code[13..-1]]
end
|
#post_process(file) ⇒ Object
189
190
191
192
193
194
195
196
|
# File 'lib/bio-publisci/readers/maf.rb', line 189
# Rewrites every HGNC symbol URI in +file+ so that it carries the symbol
# returned by #official_symbol. Lookups are memoised in the class-level
# @@hugo_cache so each distinct symbol is resolved only once.
#
# @param file [Object] handed to PubliSci::PostProcessor.process as both its
#   first and second argument
# @return [Object] whatever PubliSci::PostProcessor.process returns
def post_process(file)
  reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
  @@hugo_cache ||= {}
  PubliSci::PostProcessor.process(file,file,reg){|g|
    # Read the memoised value back from @@hugo_cache; the former bare `cache`
    # was an undefined local and raised NameError on the first match.
    official = (@@hugo_cache[g] ||= official_symbol(g))
    'http://identifiers.org/hgnc.symbol/' + official
  }
end
|
#process_line(line, label, options) ⇒ Object
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
# File 'lib/bio-publisci/readers/maf.rb', line 71
# Turns one line of a MAF file into observation output.
#
# @param line [String] raw tab-separated line
# @param label [String] label handed to #observations for this row
# @param options [Hash] :complex_objects wraps values via #sio_values; the
#   whole hash is also passed on to #observations
# @return [Object, nil] the result of #observations, or nil for blank lines,
#   "#" comment lines and the "Hugo..." header line
def process_line(line,label,options)
  # Blank lines carry no barcode and would otherwise fail in parse_barcode.
  return if line.strip.empty? || line.start_with?("#", "Hugo")

  # The last two COLUMN_NAMES entries are derived from the barcode, not read
  # from the file.
  source_width = COLUMN_NAMES.length - 2
  # Keyword form of col_sep: a positional options hash is rejected by
  # CSV.parse on Ruby 3.
  entry = ::CSV.parse(line, col_sep: "\t").flatten[0...source_width]
  # Pad short rows with nil so every source column has a slot.
  entry.fill(nil, entry.length...source_width)
  entry = (entry + parse_barcode(entry[@barcode_index])).flatten

  entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

  col = 1
  # A gene id of '0' is treated as "no id".
  entry[col] = nil if entry[col] == '0'
  entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

  col = COLUMN_NAMES.index('dbSNP_RS')
  if entry[col] && entry[col][0..1] == "rs"
    entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
  end

  entry = sio_values(entry) if options[:complex_objects]

  # observations receives each column as a one-element list.
  data = {}
  COLUMN_NAMES.each_with_index{|name,i|
    data[name] = [entry[i]]
  }
  observations(@measures,@dimensions,@codes,data,[label],@dataset_name,options)
end
|
#sio_values(entry) ⇒ Object
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
|
# File 'lib/bio-publisci/readers/maf.rb', line 104
# Replaces selected cells of +entry+ with the results of #sio_value /
# #sio_attribute, modifying the array in place.
#
# The gene symbol (column 0) and gene id (column 1) are only wrapped when
# present; every other listed column is wrapped unconditionally. The helper
# calls are made in a fixed column order.
#
# @param entry [Array] row values laid out in COLUMN_NAMES order
# @return [Array] the same +entry+ array
def sio_values(entry)
  if entry[0]
    entry[0] = sio_value('http://edamontology.org/data_1791',entry[0])
  end
  if entry[1]
    entry[1] = sio_value("http://identifiers.org/ncbigene",entry[1])
  end

  # Columns wrapped through sio_value, paired with the type URI they receive.
  typed_columns = [
    ['dbSNP_RS', "http://identifiers.org/dbsnp"],
    ['Chromosome', "http://purl.org/obo/owl/SO#SO_0000340"]
  ]
  %w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each do |allele|
    typed_columns << [allele, "http://purl.org/obo/owl/SO#SO_0001023"]
  end
  typed_columns.each do |name, type_uri|
    at = COLUMN_NAMES.index(name)
    entry[at] = sio_value(type_uri,entry[at])
  end

  # Columns wrapped through sio_attribute with just a predicate.
  [["Strand", "http://edamontology.org/data_0853"], ["Center", "foaf:homepage"]].each do |name, predicate|
    at = COLUMN_NAMES.index(name)
    entry[at] = sio_attribute(predicate,entry[at])
  end

  # Both positions additionally carry the FALDO Position type.
  [
    ["Start_Position", "http://biohackathon.org/resource/faldo#begin"],
    ["End_Position", "http://biohackathon.org/resource/faldo#end"]
  ].each do |name, predicate|
    at = COLUMN_NAMES.index(name)
    entry[at] = sio_attribute(predicate, entry[at],"http://biohackathon.org/resource/faldo#Position")
  end

  entry
end
|
#structure(options = {}) ⇒ Object
176
177
178
179
180
181
182
183
184
185
186
187
|
# File 'lib/bio-publisci/readers/maf.rb', line 176
# Assembles the non-observation part of the Turtle output: prefixes, the data
# structure definition, the dataset resource, and then the component
# specifications, measure properties, dimension properties, code lists and
# concept codes, in that order.
#
# Reads @dataset_name, @measures, @dimensions and @codes, which generate_n3
# sets before calling this method.
#
# @param options [Hash] passed through unchanged to every helper
# @return [String] the string returned by #prefixes with all other parts appended
def structure(options={})
  name = @dataset_name

  turtle = prefixes(name,options)
  turtle << data_structure_definition(@measures,@dimensions,@codes,name,options)
  turtle << dataset(name,options)

  # Each helper below yields a list of chunks; they are appended one by one.
  component_specifications(@measures, @dimensions, @codes, name, options).each do |spec|
    turtle << spec
  end
  measure_properties(@measures,name,options).each do |property|
    turtle << property
  end
  dimension_properties(@dimensions,@codes, name,options).each do |property|
    turtle << property
  end
  code_lists(@codes,TCGA_CODES,name,options).each do |list|
    turtle << list
  end
  concept_codes(@codes,TCGA_CODES,name,options).each do |concept|
    turtle << concept
  end

  turtle
end
|