Class: Bio::SOFT

Inherits:
Object show all
Defined in:
lib/bio/db/soft.rb

Overview

bio/db/soft.rb - Interface for SOFT formatted files

Author

Trevor Wennblom <[email protected]>

Copyright

Copyright © 2007 Midwinter Laboratories, LLC (midwinterlabs.com)

License

The Ruby License

Description

“SOFT (Simple Omnibus in Text Format) is a compact, simple, line-based, ASCII text format that incorporates experimental data and metadata.” – GEO, National Center for Biotechnology Information

The Bio::SOFT class reads SOFT Series or Platform formatted files that contain information describing one database, one series, one platform, and many samples (GEO accessions). The data from the file can then be viewed with Ruby methods.

Bio::SOFT also supports the reading of SOFT DataSet files which contain one database, one dataset, and many subsets.

Format specification is located here:

SOFT data files may be directly downloaded here:

NCBI’s Gene Expression Omnibus (GEO) is here:

Usage

If an attribute has more than one value then the values are stored in an Array of String objects. Otherwise the attribute is stored as a String.

The platform and each sample may contain a table of data. A dataset from a DataSet file may also contain a table.

Attributes are dynamically created based on the data in the file. Predefined keys have not been created in advance due to the variability of SOFT files in-the-wild.

Keys are generally stored as Symbols. Keys for samples and table headings may alternatively be accessed with Strings. The names of samples (GEO accessions) are case sensitive. Table headers are case insensitive.

require 'bio'

lines = IO.readlines('GSE3457_family.soft') 
soft = Bio::SOFT.new(lines)

soft.platform[:geo_accession]             # => "GPL2092"
soft.platform[:organism]                  # => "Populus"
soft.platform[:contributor]               # => ["Jingyi,,Li", "Olga,,Shevchenko", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.platform[:data_row_count]            # => "240"
soft.platform.keys.sort {|a,b| a.to_s <=> b.to_s}[0..2] # => [:contact_address, :contact_city, :contact_country]
soft.platform[:"contact_zip/postal_code"] # => "97331"
soft.platform[:table].header              # => ["ID", "GB_ACC", "SPOT_ID", "Function/Family", "ORGANISM", "SEQUENCE"]
soft.platform[:table].header_description  # => {"ORGANISM"=>"sequence sources", "SEQUENCE"=>"oligo sequence used", "Function/Family"=>"gene functions and family", "ID"=>"", "SPOT_ID"=>"", "GB_ACC"=>"Gene bank accession number"}
soft.platform[:table].rows.size           # => 240
soft.platform[:table].rows[5]             # => ["A039P68U", "AI163321", "", "TF, flowering protein CONSTANS", "P. tremula x P. tremuloides", "AGAAAATTCGATATACTGTCCGTAAAGAGGTAGCACTTAGAATGCAACGGAATAAAGGGCAGTTCACCTC"]
soft.platform[:table].rows[5][4]          # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5][:organism]  # => "P. tremula x P. tremuloides"
soft.platform[:table].rows[5]['ORGANISM'] # => "P. tremula x P. tremuloides"

soft.series[:geo_accession]               # => "GSE3457"
soft.series[:contributor]                 # => ["Jingyi,,Li", "Olga,,Shevchenko", "Ove,,Nilsson", "Steve,H,Strauss", "Amy,M,Brunner"]
soft.series[:platform_id]                 # => "GPL2092"
soft.series[:sample_id].size              # => 74
soft.series[:sample_id][0..4]             # => ["GSM77557", "GSM77558", "GSM77559", "GSM77560", "GSM77561"]

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.samples.size                         # => 74
soft.samples[:GSM77600][:series_id]       # => "GSE3457"
soft.samples['GSM77600'][:series_id]      # => "GSE3457"
soft.samples[:GSM77600][:platform_id]     # => "GPL2092"
soft.samples[:GSM77600][:type]            # => "RNA"
soft.samples[:GSM77600][:title]           # => "jst2b2"
soft.samples[:GSM77600][:table].header    # => ["ID_REF", "VALUE"]
soft.samples[:GSM77600][:table].header_description # => {"ID_REF"=>"", "VALUE"=>"normalized signal intensities"}
soft.samples[:GSM77600][:table].rows.size # => 217
soft.samples[:GSM77600][:table].rows[5]   # => ["A039P68U", "8.19"]
soft.samples[:GSM77600][:table].rows[5][0]        # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5][:id_ref]  # => "A039P68U"
soft.samples[:GSM77600][:table].rows[5]['ID_REF'] # => "A039P68U"

lines = IO.readlines('GDS100.soft') 
soft = Bio::SOFT.new(lines)

soft.database[:name]                      # => "Gene Expression Omnibus (GEO)"
soft.database[:ref]                       # => "Nucleic Acids Res. 2005 Jan 1;33 Database Issue:D562-6"
soft.database[:institute]                 # => "NCBI NLM NIH"

soft.subsets.size                         # => 8
soft.subsets.keys                         # => ["GDS100_1", "GDS100_2", "GDS100_3", "GDS100_4", "GDS100_5", "GDS100_6", "GDS100_7", "GDS100_8"]
soft.subsets[:GDS100_7]                   # => {:dataset_id=>"GDS100", :type=>"time", :sample_id=>"GSM548,GSM543", :description=>"60 minute"}
soft.subsets['GDS100_7'][:sample_id]      # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:sample_id]       # => "GSM548,GSM543"
soft.subsets[:GDS100_7][:dataset_id]      # => "GDS100"

soft.dataset[:order]                      # => "none"
soft.dataset[:sample_organism]            # => "Escherichia coli"
soft.dataset[:table].header               # => ["ID_REF", "IDENTIFIER", "GSM549", "GSM542", "GSM543", "GSM547", "GSM544", "GSM545", "GSM546", "GSM548"]
soft.dataset[:table].rows.size            # => 5764
soft.dataset[:table].rows[5]              # => ["6", "EMPTY", "0.097", "0.217", "0.242", "0.067", "0.104", "0.162", "0.104", "0.154"]
soft.dataset[:table].rows[5][4]           # => "0.242"
soft.dataset[:table].rows[5][:gsm549]     # => "0.097"
soft.dataset[:table].rows[5][:GSM549]     # => "0.097"
soft.dataset[:table].rows[5]['GSM549']    # => "0.097"

Defined Under Namespace

Classes: Database, Dataset, Entity, Platform, Sample, Samples, Series, Subset, Subsets, Table

Constant Summary collapse

LINE_TYPE_ENTITY_INDICATOR =
'^'
LINE_TYPE_ENTITY_ATTRIBUTE =
'!'
LINE_TYPE_TABLE_HEADER =
'#'
TABLE_COLUMN_DELIMITER =

data table row defined by absence of line type character

"\t"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lines = nil) ⇒ SOFT

Constructor


Arguments

  • lines: (required) contents of SOFT formatted file

Returns

Bio::SOFT



147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/bio/db/soft.rb', line 147

# Creates an empty container for every kind of SOFT entity and then hands
# +lines+ to +process+.
#
# ---
# *Arguments*
# * +lines+: contents of a SOFT formatted file; passed to +process+
#   unchanged (the parameter defaults to +nil+)
# *Returns*:: Bio::SOFT
def initialize(lines = nil)
  @database = Database.new

  # Entities described by Series / Platform formatted files.
  @series   = Series.new
  @platform = Platform.new
  @samples  = Samples.new

  # Entities described by DataSet formatted files.
  @dataset = Dataset.new
  @subsets = Subsets.new

  # NOTE(review): +process+ is defined outside this view; presumably it
  # fills the containers created above -- confirm.
  process(lines)
end

Instance Attribute Details

#database ⇒ Object

Returns the value of attribute database.



130
131
132
# File 'lib/bio/db/soft.rb', line 130

# Reader for the +@database+ instance variable, which the constructor
# initialises with Database.new.
def database
  @database
end

#dataset ⇒ Object

Returns the value of attribute dataset.



132
133
134
# File 'lib/bio/db/soft.rb', line 132

# Reader for the +@dataset+ instance variable, which the constructor
# initialises with Dataset.new.
def dataset
  @dataset
end

#platform ⇒ Object

Returns the value of attribute platform.



131
132
133
# File 'lib/bio/db/soft.rb', line 131

# Reader for the +@platform+ instance variable, which the constructor
# initialises with Platform.new.
def platform
  @platform
end

#samples ⇒ Object

Returns the value of attribute samples.



131
132
133
# File 'lib/bio/db/soft.rb', line 131

# Reader for the +@samples+ instance variable, which the constructor
# initialises with Samples.new.
def samples
  @samples
end

#series ⇒ Object

Returns the value of attribute series.



131
132
133
# File 'lib/bio/db/soft.rb', line 131

# Reader for the +@series+ instance variable, which the constructor
# initialises with Series.new.
def series
  @series
end

#subsets ⇒ Object

Returns the value of attribute subsets.



132
133
134
# File 'lib/bio/db/soft.rb', line 132

# Reader for the +@subsets+ instance variable, which the constructor
# initialises with Subsets.new.
def subsets
  @subsets
end