Class: Bio::Iprscan::Report

Inherits:
Object show all
Defined in:
lib/bio/appl/iprscan/report.rb

Overview

DESCRIPTION

Class for InterProScan report. It is used to parse results and reformat results from (raw|xml|txt) into (html, xml, ebihtml, txt, gff3) format.

See ftp.ebi.ac.uk/pub/software/unix/iprscan/README.html

USAGE

# Read a merged.txt and split it into entries.
Bio::Iprscan::Report.parse_txt(File.open("merged.txt")) do |report|
  report.query_id
  report.matches.size
  report.matches.each do |match|
    match.ipr_id #=> 'IPR...'
    match.ipr_description
    match.method
    match.accession
    match.description
    match.match_start
    match.match_end
    match.evalue    
  end
  # report.to_gff3 
  # report.to_html
end

Bio::Iprscan::Report.parse_raw(File.open("merged.raw")) do |report|
  report.class #=> Bio::Iprscan::Report
end

Defined Under Namespace

Classes: Match

Constant Summary

RS =

Entry delimiter pattern.

DELIMITER = "\n\/\/\n"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initializeReport



236
237
238
239
240
241
# File 'lib/bio/appl/iprscan/report.rb', line 236

# Creates an empty report: the query attributes start out as nil and the
# match list starts out as an empty Array.
def initialize
  # Chained assignment runs right to left, so @query_id is still created first.
  @crc64 = @query_length = @query_id = nil
  @matches = []
end

Instance Attribute Details

#crc64Object

CRC64 checksum of query sequence.



59
60
61
# File 'lib/bio/appl/iprscan/report.rb', line 59

# Reader for @crc64, the CRC64 checksum of the query sequence.
# The value is nil after #initialize; parse_txt_entry assigns it from the
# "crc64 checksum:" field of the entry header.
def crc64
  @crc64
end

#matchesObject

Matched InterPro motifs, held in an Array of Match objects. Depending on the parser used, each Match carries values such as ipr_id, ipr_description, method, accession, description, evalue, status, match_start, match_end and go_terms.



65
66
67
# File 'lib/bio/appl/iprscan/report.rb', line 65

# Reader for @matches: the Array (empty after #initialize) to which the
# parse_*_entry class methods append one Match object per parsed hit.
def matches
  @matches
end

#query_idObject Also known as: entry_id

Query sequence name (entry_id).



52
53
54
# File 'lib/bio/appl/iprscan/report.rb', line 52

# Reader for @query_id, the name of the query sequence (documented as also
# available under the alias entry_id). nil until one of the parsers sets it.
def query_id
  @query_id
end

#query_lengthObject

Query sequence length.



56
57
58
# File 'lib/bio/appl/iprscan/report.rb', line 56

# Reader for @query_length, the length of the query sequence.
# nil until one of the parsers sets it (they store an Integer via to_i).
def query_length
  @query_length
end

Class Method Details

.parse_ptxt(io) ⇒ Object

Splits entry stream.

Usage

Bio::Iprscan::Report.parse_ptxt(File.open("merged.txt")) do |report|
  report
end


194
195
196
197
198
# File 'lib/bio/appl/iprscan/report.rb', line 194

# Reads +io+ in chunks terminated by a "//" line and yields one parsed
# report per chunk.
#
# io:: an object whose #each accepts a record separator (e.g. an open File).
#
# Yields the value parse_ptxt_entry returns for each chunk.
def self.parse_ptxt(io)
  io.each("\n\/\/\n") { |chunk| yield parse_ptxt_entry(chunk) }
end

.parse_ptxt_entry(str) ⇒ Object

Parser method for a pseudo-txt formatted entry. Returns a Bio::Iprscan::Report object.

Usage

File.open("merged.txt").each(Bio::Iprscan::Report::RS) do |e|
  report = Bio::Iprscan::Report.parse_ptxt_entry(e)
end


209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/bio/appl/iprscan/report.rb', line 209

# Parses one pseudo-txt entry (tab-separated lines) into a report.
#
# Line kinds, decided after splitting on tabs:
# * exactly two fields       -> query id and query length
# * first field "//"         -> entry terminator, ignored
# * first field "InterPro"   -> remembered; its 2nd and 3rd fields become the
#   ipr_id / ipr_description of the match lines that follow
# * anything else            -> a match line: method, accession, description,
#   evalue and a "start-end" position in the 5th field
#
# Blank or whitespace-only lines are skipped. They carry no fields and
# would otherwise be treated as match lines and fail on the missing
# position field.
#
# str:: text of a single entry.
#
# Returns the populated report (an instance of the receiving class).
def self.parse_ptxt_entry(str)
  report = self.new
  ipr_line = ''
  str.split(/\n/).each do |raw_line|
    next if raw_line.strip.empty?

    line = raw_line.split("\t")
    if line.size == 2
      report.query_id = line[0]
      report.query_length = line[1].to_i
    elsif line.first == '//'
      # entry terminator: nothing to record
    elsif line.first == 'InterPro'
      ipr_line = line
    else
      startp, endp = line[4].split("-")
      report.matches << Match.new(:ipr_id => ipr_line[1],
                                  :ipr_description => ipr_line[2],
                                  :method => line[0],
                                  :accession => line[1],
                                  :description => line[2],
                                  :evalue => line[3],
                                  :match_start => startp.to_i,
                                  :match_end => endp.to_i)
    end
  end
  report
end

.parse_raw(io) {|Bio::Iprscan::Report.parse_raw_entry(entry)| ... } ⇒ Object

USAGE

Bio::Iprscan::Report.parse_raw(File.open("merged.raw")) do |report|
  report
end


72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/bio/appl/iprscan/report.rb', line 72

# Splits a raw-format stream into entries and yields one parsed report per
# entry. Consecutive lines whose first tab-separated field (the query id)
# is the same belong to one entry.
#
# The id of the entry being accumulated is remembered, so each input line
# is split only once; the accumulated text is never re-split.
#
# io:: an object responding to #gets (e.g. an open File).
#
# Yields the value of Bio::Iprscan::Report.parse_raw_entry for each entry.
def self.parse_raw(io)
  entry = nil     # text of the entry currently being accumulated
  entry_id = nil  # first field of that entry's first line
  while (line = io.gets)
    line_id = line.split("\t").first
    if entry && line_id == entry_id
      entry << line
    else
      # A new query id starts here: flush the finished entry, if any.
      yield Bio::Iprscan::Report.parse_raw_entry(entry) if entry
      entry = line.dup # dup so appending never mutates the string from #gets
      entry_id = line_id
    end
  end
  yield Bio::Iprscan::Report.parse_raw_entry(entry) if entry
end

.parse_raw_entry(str) ⇒ Object

Parser method for a raw formatted entry. Returns a Bio::Iprscan::Report object.



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/bio/appl/iprscan/report.rb', line 89

# Parses one raw-format entry (all lines of a single query) into a report.
#
# Each line is split on tabs and becomes one Match built from columns
# 0-10: query_id, crc64, query_length, method, accession, description,
# match_start, match_end, evalue, status, date. When present, columns 11
# and 12 supply ipr_id / ipr_description, and column 13 is scanned for GO
# terms, which are stored as an Array of Strings.
#
# The report's query_id and query_length are taken from the first match.
# Its crc64 is taken from the first line, so that #format_raw can emit it
# again. An empty +str+ returns a report with no matches and nil query
# attributes.
#
# str:: text of a single entry.
#
# Returns the populated report (an instance of the receiving class).
def self.parse_raw_entry(str)
  report = self.new
  crc64 = nil
  str.split(/\n/).each do |line|
    fields = line.split("\t")
    crc64 = fields[1] if report.matches.empty?
    match = Match.new(:query_id => fields[0],
                      :crc64    => fields[1],
                      :query_length => fields[2].to_i,
                      :method       => fields[3],
                      :accession    => fields[4],
                      :description => fields[5],
                      :match_start => fields[6].to_i,
                      :match_end   => fields[7].to_i,
                      :evalue => fields[8],
                      :status => fields[9],
                      :date   => fields[10])
    if fields[11]
      match.ipr_id = fields[11]
      match.ipr_description = fields[12]
    end
    match.go_terms = fields[13].scan(/(\w+ \w+\:.+? \(GO:\d+\))/).flatten if fields[13]
    report.matches << match
  end
  first = report.matches.first
  if first
    report.query_id = first.query_id
    report.query_length = first.query_length
    report.crc64 = crc64
  end
  report
end

.parse_txt(io) ⇒ Object

Splits the entry stream.

Usage

Bio::Iprscan::Report.parse_txt(File.open("merged.txt")) do |report|
  report.class #=> Bio::Iprscan::Report
end


130
131
132
133
134
135
136
137
138
139
140
# File 'lib/bio/appl/iprscan/report.rb', line 130

# Splits a txt-format stream into entries and yields one parsed report per
# entry.
#
# The stream is read with "\n\nSequence" as the record separator, so every
# chunk except the last ends with the "Sequence" keyword of the NEXT entry,
# and every chunk except the first has lost its own leading "Sequence".
# Both are repaired before parsing.
#
# String anchors (\A, \z) are used rather than line anchors (^, $), so that
# only the chunk's real start and end are inspected. A data line that merely
# begins or ends with "Sequence" is left untouched.
#
# io:: an object whose #each accepts a record separator (e.g. an open File).
#
# Yields the value parse_txt_entry returns for each entry.
def self.parse_txt(io)
  io.each("\n\nSequence") do |entry|
    # Drop the next entry's keyword left behind by the separator.
    entry = entry.sub(/Sequence\z/, '')
    # Restore this entry's own keyword if the separator consumed it.
    entry = 'Sequence' + entry unless entry =~ /\ASequence/
    yield self.parse_txt_entry(entry)
  end
end

.parse_txt_entry(str) ⇒ Object

Parser method for a txt formatted entry. Returns a Bio::Iprscan::Report object.



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/bio/appl/iprscan/report.rb', line 147

# Builds a report from one txt-format entry.
#
# The entry is cut at blank lines. The first piece is the "Sequence ..."
# header, from which the query id, length and crc64 checksum are extracted.
# Every later piece is read row by row, with columns separated by two or
# more spaces:
# * "method" rows are column headings and are ignored
# * rows naming a GO category set the GO annotation for later matches
# * "InterPro" rows set the InterPro id/description for later matches
# * any other row is a match row; its 4th column may hold several
#   "S[from-to] evalue" hits, and one Match is appended per hit
#
# str:: text of one entry; some line must begin with "Sequence ".
#
# Returns the populated report (an instance of the receiving class).
# Raises ArgumentError when no line of +str+ begins with "Sequence ".
def self.parse_txt_entry(str)
  raise ArgumentError, "Invalid format:  \n\n#{str}" unless str =~ /^Sequence /

  header, *sections = str.split(/\n\n/)
  report = self.new
  report.query_id = header[/Sequence \"(.+)\" /, 1] || ''
  length_text = header[/length: (\d+) aa./, 1]
  report.query_length = length_text ? length_text.to_i : nil
  report.crc64 = header[/crc64 checksum: (\S+) /, 1]

  ipr_line = ''
  go_annotation = ''
  sections.each do |section|
    section.split(/\n/).each do |row|
      columns = row.split(/  +/)
      case columns[0]
      when 'method'
        # column-heading row: nothing to record
      when /(Molecular Function|Cellular Component|Biological Process):/
        go_annotation = columns[0].scan(/([MCB]\w+ \w+): (\S.+?\S) \((GO:\d+)\),*/)
      when 'InterPro'
        ipr_line = columns
      else
        hits = columns[3].scan(/(\S)\[(\d+)-(\d+)\] (\S+) */)
        hits.each do |status, from, to, evalue|
          report.matches << Match.new(:ipr_id          => ipr_line[1],
                                      :ipr_description => ipr_line[2],
                                      :method      => columns[0],
                                      :accession   => columns[1],
                                      :description => columns[2],
                                      :evalue      => evalue,
                                      :status      => status,
                                      :match_start => from.to_i,
                                      :match_end   => to.to_i,
                                      :go_terms => go_annotation)
        end
      end
    end
  end
  report
end

Instance Method Details

#format_rawObject

def format_txt

end


266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/bio/appl/iprscan/report.rb', line 266

# Renders the report in raw format: one tab-separated line per match,
# lines joined with "\n". The columns are query_id, crc64, query_length,
# method_name, accession, description, match_start, match_end, evalue,
# status, date, ipr_id, ipr_description and the GO terms joined by ", ".
#
# GO terms are accepted in every shape the parsers in this class produce:
# * nil or an empty String (no annotation) -> empty column
# * [category, name, id] triples           -> "category: name (id)"
# * plain Strings                          -> written unchanged
#
# Returns a String (empty when the report has no matches).
def format_raw
  @matches.map { |match|
    go_column = Array(match.go_terms).map { |term|
      if term.is_a?(Array)
        term[0] + ': ' + term[1] + ' (' + term[2] + ')'
      else
        term.to_s
      end
    }.join(', ')
    [self.query_id,
     self.crc64,
     self.query_length,
     match.method_name,
     match.accession,
     match.description,
     match.match_start,
     match.match_end,
     match.evalue,
     match.status,
     match.date,
     match.ipr_id,
     match.ipr_description,
     go_column
    ].join("\t")
  }.join("\n")
end

#output(format_type) ⇒ Object

Output interpro matches in the format_type.



245
246
247
248
249
250
251
252
# File 'lib/bio/appl/iprscan/report.rb', line 245

# Renders the report in the requested format.
#
# format_type:: 'raw' or :raw, the only format handled here.
#
# Returns the result of #format_raw.
# Raises NameError for any other +format_type+.
def output(format_type)
  return format_raw if ['raw', :raw].include?(format_type)

  raise NameError, "Invalid format_type."
end

#to_hashObject

Returns a Hash (key as an InterPro ID and value as an Array of the Match objects having that ID).

report.to_hash.each do |ipr_id, matches|
  matches.each do |match|
    match.ipr_id == ipr_id #=> true
  end
end


298
299
300
301
302
303
304
305
306
307
308
309
# File 'lib/bio/appl/iprscan/report.rb', line 298

# Groups the matches by InterPro id.
#
# Returns a Hash mapping each ipr_id to the Array of matches carrying it,
# in match order. The Hash is built on the first call and cached in
# @ipr_ids; later calls return that same object without rebuilding it.
def to_hash
  return @ipr_ids if @ipr_ids

  @ipr_ids = {}
  @matches.each do |match|
    (@ipr_ids[match.ipr_id] ||= []) << match
  end
  @ipr_ids
end