Class: Bio::Stockholm::Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/bio-stockholm/stockholm.rb

Class Method Summary collapse

Class Method Details

.parse_from_file(filename) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/bio-stockholm/stockholm.rb', line 5

# Parse a Stockholm 1.0 file into one Bio::Stockholm::Store per alignment.
#
# Example of the expected input:
#
#   # STOCKHOLM 1.0
#
#   #=GS ABK77038.1 DE ammonia monooxygenase subunit A [Cenarchaeum symbiosum A]
#
#   ABK77038.1         --------------------------------------LTMVWLRRCTHY
#   #=GR ABK77038.1 PP ......................................67889999****
#   #=GC PP_cons       ......................................67889999****
#   #=GC RF            xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#
#   ABK77038.1         LFIAVVAVNSTLLTINAGDYIFYTDWAWTS--F..TVFSISQTLML....
#   #=GR ABK77038.1 PP **************************9886..4..699********....
#   #=GC PP_cons       **************************9886..4..699********....
#   #=GC RF            xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx..xxxxxxxxxxx....
#   //
#
# Values of a feature that is split over several lines or blocks are joined:
# GF values with a single space, GC/GS/GR values and sequences with nothing.
#
# @param filename [String] path of the Stockholm file to read
# @return [Array] one Bio::Stockholm::Store per alignment closed by a "//"
#   line, in file order; content after the last "//" is not returned
# @raise [FormatException] if the first line (or any later "# STOCKHOLM"
#   header line) is not version 1.0, or if a line cannot be parsed
def self.parse_from_file(filename)
  header = '# STOCKHOLM 1.0'
  version_message = "Currently unable to parse stockholm format files unless they are version 1.0"

  # Store +value+ under +key+, appending to any value already collected.
  append = lambda do |features, key, value, separator|
    features[key] = features[key] ? features[key] + separator + value : value
  end

  state = :first
  returns = []
  to_return = Bio::Stockholm::Store.new

  # File.foreach closes the file handle when the block finishes or raises.
  File.foreach(filename) do |raw_line|
    # chomp drops "\n" as well as "\r\n", so CRLF files parse cleanly and
    # no stray "\r" ends up inside feature values.
    line = raw_line.chomp
    stripped = line.strip

    if state == :first
      # The very first line of the file must be the version header.
      raise FormatException, version_message unless stripped == header

      to_return.header = stripped
      state = :first_block
      next
    end

    next if stripped.empty?

    if line.start_with?('# STOCKHOLM')
      # Header of a further alignment in the same file (after a "//").
      raise FormatException, version_message unless stripped == header

      to_return.header = stripped
    elsif (matches = line.match(/^\#=(..) (\S+)\s+(.*)/))
      # A GF, GC, GS or GR "markup" line.
      tag, name, rest = matches.captures

      case tag
      when 'GF'
        to_return.gf_features ||= {}
        append.call(to_return.gf_features, name, rest, ' ')
      when 'GC'
        to_return.gc_features ||= {}
        append.call(to_return.gc_features, name, rest, '')
      else
        # GS, GR, or bad parsing: +name+ is the sequence identifier and
        # +rest+ must still split into "<feature> <text>".
        unless (feature = rest.match(/(.*?)\s+(.*)/))
          raise FormatException, "Unable to parse stockholm GS or GR format line: #{raw_line}"
        end

        record = (to_return.records[name] ||= Record.new)

        case tag
        when 'GS'
          record.gs_features ||= {}
          append.call(record.gs_features, feature[1], feature[2], '')
        when 'GR'
          record.gr_features ||= {}
          append.call(record.gr_features, feature[1], feature[2], '')
        else
          raise FormatException, "Unable to parse stockholm format line: #{raw_line}"
        end
      end
    elsif line.start_with?('//')
      # End of the current alignment; start collecting the next one.
      returns.push to_return
      to_return = Bio::Stockholm::Store.new
    else
      # Else this is just plain old sequence, aligned
      unless (matches = line.match(/^(\S+)\s+(.+)$/))
        raise FormatException, "Unable to parse stockholm format line: #{raw_line}"
      end

      record = (to_return.records[matches[1]] ||= Record.new)
      record.sequence = "#{record.sequence}#{matches[2].rstrip}"
    end
  end

  returns
end