Module: Mspire::Mzml::Reader

Included in:
Mspire::Mzml
Defined in:
lib/mspire/mzml/reader.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

Returns the value of attribute link.



28
29
30
# File 'lib/mspire/mzml/reader.rb', line 28

# Reader for the @link instance variable (nil until something assigns it).
def link; @link; end

Instance Method Details

#get_default_data_processing_ids(io, index_list, lookback = 300) ⇒ Object

Returns a hash keyed by :spectrum or :chromatogram that gives the default data processing id (aka ref) as a string.



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/mspire/mzml/reader.rb', line 43

# Looks up the defaultDataProcessingRef attribute of each <xxxList> element
# by reading the bytes that immediately precede the first indexed entry.
#
# io         - IO-like object responding to bookmark (which yields an IO),
#              pos= and read.
# index_list - responds to each_pair, yielding a list name (e.g. :spectrum
#              or :chromatogram) and an index whose first element is the
#              byte offset of the first entry of that list.
# lookback   - maximum number of bytes before the first entry to scan
#              (default 300).
#
# Returns a hash mapping each list name that has at least one entry to the
# captured ref String (nil if the attribute was not found in the window).
def get_default_data_processing_ids(io, index_list, lookback=300)
  hash = {}
  index_list.each_pair do |name, index|
    # we cannot quickly retrieve a defaultDataProcessingRef unless there
    # is at least one spectrum/chromatogram to start with.  However, if
    # there is no spectrum/chromatogram, then the defaultDataProcessingRef
    # will not be needed either.
    next unless index.size > 0

    first_offset = index[0]
    # never seek before the start of the stream: small files can have their
    # first entry closer to byte 0 than lookback bytes
    window_start = [first_offset - lookback, 0].max
    io.bookmark do |marked_io|
      marked_io.pos = window_start
      # read only up to the first entry, so the window never runs past it
      window = marked_io.read(first_offset - window_start)
      hash[name] = window[/<#{name}List.*defaultDataProcessingRef=['"](.*?)['"]/m, 1]
    end
  end
  hash
end

#get_header_string(io) ⇒ Object

Scraping off the header string first saves ~3 seconds when reading an 83M mzML file (even though we’re just handing in an IO object to Nokogiri::XML::Document.parse and we are very careful to not parse too far).



64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/mspire/mzml/reader.rb', line 64

# Reads +io+ in 4096-byte chunks, starting at its current position, until
# the text read so far contains "<spectrum" or "<chromatogram".
#
# io - object responding to read(length) that returns a String, or nil at
#      end of stream.
#
# Returns everything read so far as a String: this includes the remainder of
# the chunk in which the tag was found, or the whole stream if no tag occurs.
def get_header_string(io)
  chunk_size = 2**12
  loc = 0
  string = ''
  # read from the io argument (not @io) so the method honours its parameter
  while (chunk = io.read(chunk_size))
    string << chunk
    # re-scan the last 20 characters of the previous chunk so a tag that
    # straddles a chunk boundary is still detected
    start_looking = [loc - 20, 0].max
    break if string[start_looking..-1] =~ /<(spectrum|chromatogram)/
    loc += chunk_size
  end
  string
end

#read_header!(list_type_to_default_data_processing_id) ⇒ Object

list_type_to_default_data_processing_id is a hash keyed by :spectrum or :chromatogram that gives the id of the default data processing object for the SpectrumList and/or the ChromatogramList. This information is not obtainable from the header string, so it must be obtained beforehand.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# File 'lib/mspire/mzml/reader.rb', line 81

# Parses the header portion of the mzML document held in @io and fills in
# this object's header attributes, finishing with the run.
#
# list_type_to_default_data_processing_id - hash whose keys are list types
#   (e.g. :spectrum, :chromatogram) and whose values are data processing
#   ids; each id is looked up in @link[:data_processing_hash] once the
#   'run' element is reached.
#
# Side effects: rewinds @io, replaces @link with a fresh hash, sets @id and
# assigns cvs, file_description, run and whichever optional lists are
# encountered (referenceable_param_groups, samples, software_list,
# instrument_configurations, data_processing_list).
def read_header!(list_type_to_default_data_processing_id)
  @io.rewind

  # only the leading header text is parsed, not the whole document
  string = get_header_string(@io)
  doc = Nokogiri::XML.parse(string, nil, @encoding, Mspire::Mzml::Parser::NOBLANKS)

  doc.remove_namespaces!
  mzml_n = doc.root
  # for an indexedmzML root, step into its first child (presumably the
  # wrapped mzML element)
  if mzml_n.name == 'indexedmzML'
    mzml_n = mzml_n.child
  end

  @id = mzml_n[:id]

  # the first child of the mzML node is treated as the cvList
  cv_list_n = mzml_n.child
  self.cvs = cv_list_n.children.map do |cv_n|
    Mspire::Mzml::CV.from_xml(cv_n)
  end

  # get the file description node but deal with it after getting ref_hash
  file_description_n = cv_list_n.next

  xml_n = file_description_n.next

  # a hash of referenceable_param_groups indexed by id
  # (@link accumulates id-keyed lookup tables and is passed to each
  # subsequent from_xml call)
  @link = {}

  # referenceableParamGroupList is handled only if it is the very next sibling
  if xml_n.name == 'referenceableParamGroupList'
    self.referenceable_param_groups = xml_n.children.map do |rpg_n|
      Mspire::Mzml::ReferenceableParamGroup.from_xml(rpg_n) # <- no ref_hash (not made yet)
    end
    # index_by is defined outside this block; presumably builds an id => object hash
    @link[:ref_hash] = self.referenceable_param_groups.index_by(&:id)
    xml_n = xml_n.next
  end

  # now we can set the file description because we have the ref_hash
  self.file_description = Mspire::Mzml::FileDescription.from_xml(file_description_n, @link)
  @link[:source_file_hash] = self.file_description.source_files.index_by(&:id)


  # Walk the remaining sibling elements in order.  Each recognised list is
  # parsed and then registered in @link before moving on; elements with any
  # other name are skipped.  The loop ends at the 'run' element.
  # NOTE(review): if no 'run' sibling exists, xml_n presumably becomes nil
  # and xml_n.name raises NoMethodError -- confirm whether that is acceptable.
  loop do
    case xml_n.name
    when 'sampleList'
      self.samples = xml_n.children.map do |sample_n|
        Mspire::Mzml::Sample.from_xml(sample_n, @link)
      end
      @link[:sample_hash] = self.samples.index_by(&:id)
    when 'softwareList'  # required
      self.software_list = xml_n.children.map do |software_n|
        Mspire::Mzml::Software.from_xml(software_n, @link)
      end
      @link[:software_hash] = self.software_list.index_by(&:id)
    when 'instrumentConfigurationList'
      self.instrument_configurations = xml_n.children.map do |inst_config_n|
        Mspire::Mzml::InstrumentConfiguration.from_xml(inst_config_n, @link)
      end
      @link[:instrument_configuration_hash] = self.instrument_configurations.index_by(&:id)
    when 'dataProcessingList'
      self.data_processing_list = xml_n.children.map do |data_processing_n|
        Mspire::Mzml::DataProcessing.from_xml(data_processing_n, @link)
      end
      @link[:data_processing_hash] = self.data_processing_list.index_by(&:id)
    when 'run'
      @link[:index_list] = @index_list
      # resolve each default data processing id to its object and store it
      # under e.g. :spectrum_default_data_processing
      # NOTE(review): relies on @link[:data_processing_hash] having been set
      # by an earlier 'dataProcessingList' element
      list_type_to_default_data_processing_id.each do |type, process_id|
        @link["#{type}_default_data_processing".to_sym] = @link[:data_processing_hash][process_id]
      end
      self.run = Mspire::Mzml::Run.from_xml(@io, xml_n, @link)
      break
    end
    xml_n = xml_n.next
  end
end

#set_from_xml_io!(xml_io) ⇒ Object



30
31
32
33
34
35
36
37
38
39
# File 'lib/mspire/mzml/reader.rb', line 30

# Attaches +xml_io+ as the backing IO (@io), records the encoding declared
# on its first line, builds the index list and then reads the header.
#
# xml_io - IO-like object responding to bookmark (called with true and a
#          block that receives an IO; presumably rewinds first and restores
#          the position afterwards -- defined outside this block).
#
# Raises RuntimeError when the input is empty or its first line carries no
# encoding="..." attribute.
def set_from_xml_io!(xml_io)
  @io = xml_io
  @encoding =
    begin
      # String#[] with a capture index yields nil when nothing matches, so a
      # first line without an encoding attribute no longer blows up with
      # NoMethodError on nil
      @io.bookmark(true) {|marked_io| marked_io.readline[/encoding=["'](.*?)["']/, 1] }
    rescue EOFError
      nil
    end
  unless @encoding
    raise RuntimeError, "no encoding present in XML!  (Is this even an xml file?)"
  end
  @index_list = Mspire::Mzml::IndexList.from_io(@io)
  read_header!( get_default_data_processing_ids(@io, @index_list) )
end