Class: Reader
- Inherits:
-
Object
- Object
- Reader
- Defined in:
- lib/pubchem/reader.rb
Instance Attribute Summary collapse
-
#names ⇒ Object
Returns the value of attribute names.
-
#pubchem_compound_ids ⇒ Object
Returns the value of attribute pubchem_compound_ids.
-
#pubchem_substance_ids ⇒ Object
Returns the value of attribute pubchem_substance_ids.
Instance Method Summary collapse
- #add_name(name) ⇒ Object
- #fuzzy_name_lookup(lookup_name, threshold) ⇒ Object
-
#initialize(names_filename = nil, pubchem_substance_ids_filename = nil, pubchem_compound_ids_filename = nil) ⇒ Reader
constructor
A new instance of Reader.
- #initialize_from_files(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object
- #match_list_of_names(names, threshold = 0.99) ⇒ Object
- #parse_compound(compound) ⇒ Object
- #parse_info_data(info_data) ⇒ Object
- #parse_property(property) ⇒ Object
- #parse_substance(substance) ⇒ Object
- #read(xml_filepath, type: nil) ⇒ Object
- #retrieve_compound_ids ⇒ Object
- #retrieve_ids(collection) ⇒ Object
- #retrieve_substance_ids ⇒ Object
- #save(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object
- #short_code(name) ⇒ Object
Constructor Details
#initialize(names_filename = nil, pubchem_substance_ids_filename = nil, pubchem_compound_ids_filename = nil) ⇒ Reader
Returns a new instance of Reader.
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/pubchem/reader.rb', line 12

# Builds a reader whose indexes are either restored from previously saved
# files or start out empty.
#
# A native Jaro-Winkler matcher is always created first. The three index
# hashes are only built here when #initialize_from_files returns a falsy
# value, i.e. when it did not restore them.
#
# @param names_filename [String, nil] file holding the saved name buckets
# @param pubchem_substance_ids_filename [String, nil] file holding the saved substance id index
# @param pubchem_compound_ids_filename [String, nil] file holding the saved compound id index
def initialize(names_filename = nil, pubchem_substance_ids_filename = nil, pubchem_compound_ids_filename = nil)
  @fuzzy_matcher = FuzzyStringMatch::JaroWinkler.create(:native)

  restored = initialize_from_files(
    names_filename,
    pubchem_substance_ids_filename,
    pubchem_compound_ids_filename
  )

  unless restored
    # Each index maps a key to its own Set, created on first access.
    fresh_index = -> { Hash.new { |index, key| index[key] = Set.new } }

    @names = fresh_index.call
    @pubchem_substance_ids = fresh_index.call
    @pubchem_compound_ids = fresh_index.call
  end
end
Instance Attribute Details
#names ⇒ Object
Returns the value of attribute names.
8 9 10 |
# Attribute reader: returns the current value of @names.
# NOTE(review): listed at the same source line as the other two readers, so it
# is presumably generated by a single attr_* declaration — confirm in reader.rb.
# File 'lib/pubchem/reader.rb', line 8 def names @names end |
#pubchem_compound_ids ⇒ Object
Returns the value of attribute pubchem_compound_ids.
8 9 10 |
# Attribute reader: returns the current value of @pubchem_compound_ids.
# NOTE(review): listed at the same source line as the other two readers, so it
# is presumably generated by a single attr_* declaration — confirm in reader.rb.
# File 'lib/pubchem/reader.rb', line 8 def pubchem_compound_ids @pubchem_compound_ids end |
#pubchem_substance_ids ⇒ Object
Returns the value of attribute pubchem_substance_ids.
8 9 10 |
# Attribute reader: returns the current value of @pubchem_substance_ids.
# NOTE(review): listed at the same source line as the other two readers, so it
# is presumably generated by a single attr_* declaration — confirm in reader.rb.
# File 'lib/pubchem/reader.rb', line 8 def pubchem_substance_ids @pubchem_substance_ids end |
Instance Method Details
#add_name(name) ⇒ Object
140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/pubchem/reader.rb', line 140

# Records +name+ for the record currently being parsed.
#
# The name is filed under its short code in @names and the current
# @pubchem_id is added to the id index selected by @current_type
# ("substance" or "compound").
#
# @param name [String, nil] name to register; nil and empty names are ignored
# @return [Set, nil] the id set the current id was added to, or nil when the
#   name was ignored
# @raise [RuntimeError] when @current_type is neither "substance" nor "compound"
def add_name(name)
  return if name.nil? || name.empty?

  # Pick the target index before mutating anything, so an unknown type cannot
  # leave a name in @names that has no id recorded for it.
  ids_by_name =
    case @current_type
    when "substance" then @pubchem_substance_ids
    when "compound" then @pubchem_compound_ids
    else raise "Unknown substance"
    end

  # Speed up lookups by bucketing names under their short code.
  @names[short_code(name)].add name
  ids_by_name[name].add @pubchem_id
end
#fuzzy_name_lookup(lookup_name, threshold) ⇒ Object
156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/pubchem/reader.rb', line 156

# Finds the stored name that best matches +lookup_name+.
#
# Only names filed under the same short code as +lookup_name+ are considered.
# An exact match is returned immediately; otherwise the candidate with the
# highest matcher score is returned, provided that score exceeds +threshold+.
#
# @param lookup_name [String] name to look up
# @param threshold [Numeric] score a fuzzy match must exceed; a value of 1.0
#   or more restricts the lookup to exact matches
# @return [String, nil] the matching stored name, or nil when nothing qualifies
def fuzzy_name_lookup(lookup_name, threshold)
  return nil if lookup_name.nil? || lookup_name.empty?

  # fetch bypasses any Hash default block, so a miss neither creates an empty
  # bucket nor fails when @names is a plain Hash.
  candidates = @names.fetch(short_code(lookup_name), [])

  # Optimistically check for exact name match. The name itself is returned
  # because callers use the result as a key into the id indexes.
  return lookup_name if candidates.include?(lookup_name)
  return nil if threshold >= 1.0

  closest_name = nil
  closest_distance = 0.0

  candidates.each do |name|
    distance = @fuzzy_matcher.getDistance(lookup_name, name)
    next unless distance > closest_distance

    closest_name = name
    closest_distance = distance
  end

  closest_name if closest_distance > threshold
end
#initialize_from_files(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/pubchem/reader.rb', line 30

# Restores the three indexes from files when filenames are supplied.
#
# @param names_filename [String, nil] file to load @names from
# @param pubchem_substance_ids_filename [String, nil] file to load @pubchem_substance_ids from
# @param pubchem_compound_ids_filename [String, nil] file to load @pubchem_compound_ids from
# @return [Object, nil] nil when no filename was given; otherwise the value
#   loaded for the compound id index (the last assignment)
# @raise [RuntimeError] when only some of the three filenames are given
def initialize_from_files(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename)
  filenames = [
    names_filename,
    pubchem_substance_ids_filename,
    pubchem_compound_ids_filename
  ]

  # Nothing to restore: tell the caller to build empty indexes instead.
  return nil unless filenames.any?
  raise "All three filenames required" unless filenames.all?

  # NOTE(review): Ox.load_file is called with default options — confirm they
  # round-trip what #save writes with Ox.to_file.
  @names = Ox.load_file(names_filename)
  @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
  @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)
end
#match_list_of_names(names, threshold = 0.99) ⇒ Object
182 183 184 185 186 187 |
# File 'lib/pubchem/reader.rb', line 182

# Looks up every entry of +names+ and remembers the outcome.
#
# The resulting hash (input name => result of #fuzzy_name_lookup) is stored
# in @matched_names and also returned.
#
# @param names [Enumerable<String>] names to look up
# @param threshold [Numeric] threshold handed to #fuzzy_name_lookup
# @return [Hash] input name => lookup result
def match_list_of_names(names, threshold = 0.99)
  @matched_names = names.each_with_object({}) do |candidate, matches|
    matches[candidate] = fuzzy_name_lookup(candidate, threshold)
  end
end
#parse_compound(compound) ⇒ Object
87 88 89 90 91 92 93 94 95 96 97 98 |
# File 'lib/pubchem/reader.rb', line 87

# Processes one compound node.
#
# Stores the node's compound id (text converted with to_i) in @pubchem_id,
# then hands every "PC-Compound_props" node to #parse_property.
#
# @param compound [#css] node that answers CSS queries
def parse_compound(compound)
  cid_selector = "PC-Compound_id PC-CompoundType PC-CompoundType_id PC-CompoundType_id_cid"
  @pubchem_id = compound.css(cid_selector).text.to_i

  compound.css("PC-Compound_props").each { |props| parse_property(props) }
end
#parse_info_data(info_data) ⇒ Object
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/pubchem/reader.rb', line 122

# Extracts a name from one "PC-InfoData" node and registers it.
#
# Only entries whose URN label is "SMILES" or "IUPAC Name" contribute a name
# (the node's string value). For every other entry, or when the label or
# value node is missing, #add_name receives nil.
#
# @param info_data [#css] node that answers CSS queries
# @return [Object] whatever #add_name returns
def parse_info_data(info_data)
  # Guard the lookups: an entry without a label or string value would
  # otherwise fail with NoMethodError on nil.
  label_node = info_data.css("PC-InfoData_urn PC-Urn PC-Urn_label").first
  urn_label = label_node && label_node.text

  name = nil
  case urn_label
  when "SMILES", "IUPAC Name"
    value_node = info_data.css("PC-InfoData_value PC-InfoData_value_sval").first
    name = value_node.text if value_node
  end

  add_name(name)
end
#parse_property(property) ⇒ Object
114 115 116 117 118 119 120 |
# File 'lib/pubchem/reader.rb', line 114

# Passes every "PC-InfoData" node found under +property+ to #parse_info_data.
#
# @param property [#css] node that answers CSS queries
# @return [Object] the collection of matched nodes
def parse_property(property)
  info_nodes = property.css("PC-InfoData")
  info_nodes.each { |info_node| parse_info_data(info_node) }
end
#parse_substance(substance) ⇒ Object
100 101 102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/pubchem/reader.rb', line 100

# Processes one substance node.
#
# Stores the node's substance id (text converted with to_i) in @pubchem_id,
# then registers the text of every synonym entry through #add_name.
#
# @param substance [#css] node that answers CSS queries
def parse_substance(substance)
  @pubchem_id = substance.css("PC-Substance_sid PC-ID PC-ID_id").text.to_i

  synonyms = substance.css("PC-Substance_synonyms PC-Substance_synonyms_E")
  synonyms.each { |synonym| add_name(synonym.text) }
end
#read(xml_filepath, type: nil) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/pubchem/reader.rb', line 57

# Parses a PubChem XML file and feeds every record to the matching parser.
#
# When +type+ is omitted it is inferred from the file's basename, which must
# start with "compound" or "substance" (case-insensitive). An explicitly
# given type is used as-is and may be a symbol or a string.
#
# @param xml_filepath [String] path of the XML file to read
# @param type [Symbol, String, nil] :compound or :substance; nil to infer
# @return [Object] the result of iterating the selected record nodes
# @raise [RuntimeError] "Cannot infer pubchem type" when +type+ is nil and the
#   basename has neither prefix; "Unknown type" for any other unsupported type
def read(xml_filepath, type: nil)
  # Only infer when the caller did not choose a type; an explicit type must
  # never end up in the "cannot infer" branch.
  if type.nil?
    basename = File.basename(xml_filepath).downcase
    type =
      if basename.start_with?("compound") then :compound
      elsif basename.start_with?("substance") then :substance
      else raise "Cannot infer pubchem type"
      end
  end

  type = type.to_sym if type.respond_to?(:to_sym)
  raise "Unknown type" unless %i[compound substance].include?(type)

  # The block form closes the file even when parsing raises.
  doc = File.open(xml_filepath) { |file| Nokogiri::XML(file) }

  @current_type = type.to_s
  case type
  when :compound
    doc.css("PC-Compounds PC-Compound").each { |compound| parse_compound(compound) }
  when :substance
    doc.css("PC-Substances PC-Substance").each { |substance| parse_substance(substance) }
  end
end
#retrieve_compound_ids ⇒ Object
215 216 217 |
# File 'lib/pubchem/reader.rb', line 215

# Resolves the matched names against the compound id index.
#
# @return [Object] the result of #retrieve_ids for @pubchem_compound_ids
def retrieve_compound_ids
  compound_index = @pubchem_compound_ids
  retrieve_ids(compound_index)
end
#retrieve_ids(collection) ⇒ Object
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
# File 'lib/pubchem/reader.rb', line 189

# Maps each input name from @matched_names to an id found in +collection+.
#
# Input names without a match, and matches with no id in +collection+, are
# left out of the result. When a match has several ids a warning is printed
# and the first id is used.
#
# @param collection [Hash] matched name => collection of ids
# @return [Hash] input name => first id of its matched name
# @raise [RuntimeError] when #match_list_of_names has not been run yet
def retrieve_ids(collection)
  msg = "@matched_names required, see #{self.class}#match_list_of_names"
  raise msg unless @matched_names

  @matched_names.each_with_object({}) do |(input_name, matched_name), found|
    next unless matched_name

    # fetch bypasses any Hash default block: a missing name neither creates
    # an empty entry nor yields nil for the size check below.
    ids = collection.fetch(matched_name, nil)
    next if ids.nil?

    puts "WARNING: Multiple matching sets" if ids.size > 1

    collection_id = ids.first
    found[input_name] = collection_id if collection_id
  end
end
#retrieve_substance_ids ⇒ Object
211 212 213 |
# File 'lib/pubchem/reader.rb', line 211

# Resolves the matched names against the substance id index.
#
# @return [Object] the result of #retrieve_ids for @pubchem_substance_ids
def retrieve_substance_ids
  substance_index = @pubchem_substance_ids
  retrieve_ids(substance_index)
end
#save(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object
47 48 49 50 51 52 53 54 55 |
# File 'lib/pubchem/reader.rb', line 47

# Writes the three indexes to files with Ox, using an indent of 0.
#
# @param names_filename [String] destination for @names
# @param pubchem_substance_ids_filename [String] destination for @pubchem_substance_ids
# @param pubchem_compound_ids_filename [String] destination for @pubchem_compound_ids
# @return [Object] whatever the final Ox.to_file call returns
def save(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename)
  dump_options = { indent: 0 }

  Ox.to_file(names_filename, @names, dump_options)
  Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, dump_options)
  Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, dump_options)
end
#short_code(name) ⇒ Object
219 220 221 |
# File 'lib/pubchem/reader.rb', line 219

# Derives the bucket key for +name+: its first three characters, lowercased.
# Names shorter than three characters yield a shorter key.
#
# @param name [String] name to derive the key from
# @return [String] lowercased prefix of at most three characters
def short_code(name)
  prefix = name.slice(0, 3)
  prefix.downcase
end