Class: PepXML

Inherits:

Object

Object
PepXML

show all

Defined in: lib/protk/pepxml.rb

Overview

require ‘rexml/document’ require ‘rexml/xpath’

Instance Attribute Summary collapse

#file_name ⇒ Object

Returns the value of attribute file_name.

Instance Method Summary collapse

#extract_db ⇒ Object

Obtain the database name from the given input file.
#extract_engine ⇒ Object

Obtain the search engine name from the input file. The name of the engine is returned in lowercase and should contain no spaces. Names of common engines are searched for and extracted in simplified form if possible.
#extract_enzyme ⇒ Object
#find_runs ⇒ Object

TODO: Make this faster and more memory efficient by using XML::Reader as in the functions above.
#initialize(file_name) ⇒ PepXML constructor

A new instance of PepXML.
#is_valid_type(type) ⇒ Object
#type_from_base_name(basename) ⇒ Object
#type_from_summary_attributes(atts) ⇒ Object

Constructor Details

#initialize(file_name) ⇒ `PepXML`

Returns a new instance of PepXML.

# File 'lib/protk/pepxml.rb', line 13

# Parse the given file and remember which namespace settings to use when
# querying the parsed document.
#
# file_name:: path of the XML file to load; it is also kept for the
#             file_name reader.
def initialize(file_name)
  @file_name = file_name

  # Install libxml's QUIET_HANDLER as the error handler before parsing
  # (presumably to silence parser diagnostics -- confirm against libxml-ruby).
  XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
  parser = XML::Parser.file("#{file_name}")

  # Start from the pepXML namespace settings ...
  @pepxml_ns_prefix = "xmlns:"
  @pepxml_ns = "xmlns:http://regis-web.systemsbiology.net/pepXML"
  @pepxml_doc = parser.parse

  # ... and keep them only when the root element declares a default namespace.
  return if @pepxml_doc.root.namespaces.default

  # No default namespace: use an empty prefix and no namespace instead.
  @pepxml_ns_prefix = ""
  @pepxml_ns = nil
end

Instance Attribute Details

#file_name ⇒ `Object`

Returns the value of attribute file_name.



11
12
13

# File 'lib/protk/pepxml.rb', line 11

# Reader for the @file_name instance variable, which the constructor
# assigns from its file_name argument.
def file_name
  @file_name
end

Instance Method Details

#extract_db ⇒ `Object`

Obtain the database name from the given input file

# File 'lib/protk/pepxml.rb', line 32

# Obtain the database name from the given input file.
#
# Reads the file node by node with XML::Reader and stops at the first node
# whose name identifies a database:
# * "search_database" (pep.xml files): value of its 'local_path' attribute
# * "protein_summary_header" (prot.xml files): value of its
#   'reference_database' attribute
#
# Returns the attribute value of the first such node, or nil when the file
# contains neither element.
# Raises RuntimeError when no reader could be obtained for the file.
def extract_db()
  reader = XML::Reader.file(self.file_name)
  # raise, not throw: throw is for catch/throw control flow and would surface
  # as an unrelated "uncaught throw" error here.
  raise "Failed to open xml file #{file_name}" if reader.nil?

  # Attribute that holds the database for each recognised element name
  db_attribute_for = {
    "search_database" => "local_path",               # pep.xml files
    "protein_summary_header" => "reference_database" # prot.xml files
  }

  begin
    while reader.read
      attribute = db_attribute_for[reader.name]
      next unless attribute

      # The value is read before the ensure clause closes the reader
      return reader.expand[attribute]
    end
    nil
  ensure
    # Close the reader on every exit path, including "nothing found"
    reader.close
  end
end

#extract_engine ⇒ `Object`

Obtain the search engine name from the input file. The name of the engine is returned in lowercase and should contain no spaces. Names of common engines are searched for and extracted in simplified form if possible.

# File 'lib/protk/pepxml.rb', line 67

# Obtain the search engine name from the input file.
#
# Reads the file node by node with XML::Reader and takes the 'search_engine'
# attribute of the first "search_summary" node. The value is simplified:
# spaces become underscores, the characters ( ) ! are removed and the result
# is lowercased.
#
# Returns the simplified name, or nil when the file has no "search_summary"
# node.
# Raises RuntimeError when no reader could be obtained for the file.
def extract_engine()
  reader = XML::Reader.file(self.file_name)
  # raise, not throw: throw is for catch/throw control flow and would surface
  # as an unrelated "uncaught throw" error here.
  raise "Failed to open xml file #{file_name}" if reader.nil?

  begin
    while reader.read
      next unless reader.name == "search_summary"

      engine = reader.expand['search_engine']
      return engine.tr(" ", "_").delete("()!").downcase
    end
    nil
  ensure
    # Close the reader on every exit path, including "nothing found"
    reader.close
  end
end

#extract_enzyme ⇒ `Object`

# File 'lib/protk/pepxml.rb', line 86

# Obtain the enzyme name from the input file.
#
# Reads the file node by node with XML::Reader and takes the 'name'
# attribute of the first "sample_enzyme" node.
#
# Returns the name in lowercase, or nil when the file has no "sample_enzyme"
# node.
# Raises RuntimeError when no reader could be obtained for the file.
def extract_enzyme()
  reader = XML::Reader.file(self.file_name)
  # raise, not throw: throw is for catch/throw control flow and would surface
  # as an unrelated "uncaught throw" error here.
  raise "Failed to open xml file #{file_name}" if reader.nil?

  begin
    while reader.read
      next unless reader.name == "sample_enzyme"

      return reader.expand['name'].downcase
    end
    nil
  ensure
    # Close the reader on every exit path, including "nothing found"
    reader.close
  end
end

#find_runs ⇒ `Object`

TODO: Make this faster and more memory efficient by using XML::Reader as in the functions above

# File 'lib/protk/pepxml.rb', line 146

# Collect the runs described by the msms_run_summary elements of the parsed
# document.
#
# Returns a hash keyed by each summary's base_name attribute. Each value is
# a hash with :base_name and :type. Only the first summary seen for a given
# base_name is used. The type comes from the summary attributes when they
# give a valid type, otherwise from the base name, otherwise "mzML".
#
# TODO: Make this faster and more memory efficient by using XML::Reader as
# the extract_* methods do.
def find_runs()
  summaries = @pepxml_doc.find("//#{@pepxml_ns_prefix}msms_run_summary", @pepxml_ns)

  runs = {}
  summaries.each do |summary|
    atts = summary.attributes
    key = atts["base_name"]
    next if runs.key?(key)

    declared_type = type_from_summary_attributes(atts)
    run_type =
      if is_valid_type(declared_type)
        declared_type
      else
        guessed_type = type_from_base_name(key)
        # Fall back to the same guess as peptide prophet makes
        is_valid_type(guessed_type) ? guessed_type : "mzML"
      end

    runs[key] = { :base_name => key, :type => run_type }
  end
  runs
end

#is_valid_type(type) ⇒ `Object`

# File 'lib/protk/pepxml.rb', line 130

# Tell whether +type+ names one of the recognised raw data formats.
#
# type:: candidate type name; nil and other non-matching values are accepted
#        and simply yield false.
#
# Returns true when the whole value is "mgf", "mzML" or "mzXML" (compared
# case-insensitively), false otherwise.
def is_valid_type(type)
  # \A and \z anchor the entire value. ^ and $ only anchor a line, so they
  # would also accept values such as "junk\nmgf" or "mzML\n".
  case type
  when /\Amgf\z/i, /\AmzML\z/i, /\AmzXML\z/i
    true
  else
    false
  end
end

#type_from_base_name(basename) ⇒ `Object`

# File 'lib/protk/pepxml.rb', line 102

# Guess the raw data type from a run's base_name.
#
# A common error is for tools to include the extension in the base_name
# attribute. We exploit this to guess the type.
#
# basename:: the base_name value; nil is accepted and yields "".
#
# Returns "mgf", "mzML" or "mzXML" when the name ends in that extension
# (case-sensitive), otherwise "".
def type_from_base_name(basename)
  # The dot is escaped so only a real extension matches (an unescaped dot
  # would accept e.g. "run_mzML"); \z anchors at the true end of the value.
  case basename
  when /\.mgf\z/
    "mgf"
  when /\.mzML\z/
    "mzML"
  when /\.mzXML\z/
    "mzXML"
  else
    ""
  end
end

#type_from_summary_attributes(atts) ⇒ `Object`

# File 'lib/protk/pepxml.rb', line 119

# Pick the raw data type out of a run summary's attributes.
#
# atts:: attribute collection indexed by attribute name.
#
# Returns the "raw_data_type" value when is_valid_type accepts it, otherwise
# the "raw_data" value when that is accepted, otherwise "".
def type_from_summary_attributes(atts)
  # Candidates in order of preference
  ["raw_data_type", "raw_data"].each do |attribute_name|
    candidate = atts[attribute_name]
    return candidate if is_valid_type(candidate)
  end
  ""
end

Class: PepXML

Overview

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_name) ⇒ PepXML

Instance Attribute Details

#file_name ⇒ Object

Instance Method Details

#extract_db ⇒ Object

#extract_engine ⇒ Object

#extract_enzyme ⇒ Object

#find_runs ⇒ Object

#is_valid_type(type) ⇒ Object

#type_from_base_name(basename) ⇒ Object

#type_from_summary_attributes(atts) ⇒ Object