Class: Reader

Inherits:
Object
  • Object
show all
Defined in:
lib/pubchem/reader.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(names_filename = nil, pubchem_substance_ids_filename = nil, pubchem_compound_ids_filename = nil) ⇒ Reader

Returns a new instance of Reader.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/pubchem/reader.rb', line 12

# Builds a reader, either restoring previously saved lookup tables or
# starting with empty ones.
#
# The fuzzy matcher is always created first. When initialize_from_files
# reports that it loaded the tables, nothing else is set up here; otherwise
# @names, @pubchem_substance_ids and @pubchem_compound_ids each become a Hash
# whose missing keys are filled with a fresh Set on first access.
#
# @param names_filename [String, nil] file holding the saved names table
# @param pubchem_substance_ids_filename [String, nil] file holding the saved
#   substance ID table
# @param pubchem_compound_ids_filename [String, nil] file holding the saved
#   compound ID table
def initialize(names_filename=nil,
               pubchem_substance_ids_filename=nil,
               pubchem_compound_ids_filename=nil)
  @fuzzy_matcher = FuzzyStringMatch::JaroWinkler.create(:native)

  restored = initialize_from_files(names_filename,
                                   pubchem_substance_ids_filename,
                                   pubchem_compound_ids_filename)
  return if restored

  # Each table gets its own hash; the block gives every key its own Set.
  new_set_table = -> { Hash.new { |table, key| table[key] = Set.new } }

  @names = new_set_table.call
  @pubchem_substance_ids = new_set_table.call
  @pubchem_compound_ids = new_set_table.call
end

Instance Attribute Details

#names ⇒ Object

Returns the value of attribute names.



8
9
10
# File 'lib/pubchem/reader.rb', line 8

# Reader for the @names instance variable.
#
# @return [Object] the current value of @names. When the reader was built
#   without files this is a Hash mapping a short code to a Set of names;
#   when built from files it is whatever Ox.load_file returned
#   (NOTE(review): exact type not visible here - confirm).
def names
  @names
end

#pubchem_compound_ids ⇒ Object

Returns the value of attribute pubchem_compound_ids.



8
9
10
# File 'lib/pubchem/reader.rb', line 8

# Reader for the @pubchem_compound_ids instance variable.
#
# @return [Object] the current value of @pubchem_compound_ids. When the
#   reader was built without files this is a Hash mapping a name to a Set of
#   compound IDs; when built from files it is whatever Ox.load_file returned
#   (NOTE(review): exact type not visible here - confirm).
def pubchem_compound_ids
  @pubchem_compound_ids
end

#pubchem_substance_ids ⇒ Object

Returns the value of attribute pubchem_substance_ids.



8
9
10
# File 'lib/pubchem/reader.rb', line 8

# Reader for the @pubchem_substance_ids instance variable.
#
# @return [Object] the current value of @pubchem_substance_ids. When the
#   reader was built without files this is a Hash mapping a name to a Set of
#   substance IDs; when built from files it is whatever Ox.load_file returned
#   (NOTE(review): exact type not visible here - confirm).
def pubchem_substance_ids
  @pubchem_substance_ids
end

Instance Method Details

#add_name(name) ⇒ Object



140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/pubchem/reader.rb', line 140

# Records +name+ for the record that is currently being parsed.
#
# The name is added to the @names bucket keyed by its short code, and the
# current @pubchem_id is added under the name in the ID table selected by
# @current_type.
#
# @param name [String, nil] name to record; nil and "" are ignored
# @return [Set, nil] the ID set the current ID was added to, or nil when the
#   name was ignored
# @raise [RuntimeError] when @current_type is neither "substance" nor
#   "compound" (the name has already been added to @names at that point)
def add_name(name)
  return if name.nil? || name.empty?

  # Names are bucketed by short code so a lookup only scans one bucket.
  @names[short_code(name)].add name

  id_table =
    case @current_type
    when "substance" then @pubchem_substance_ids
    when "compound"  then @pubchem_compound_ids
    else raise "Unknown substance"
    end

  id_table[name].add @pubchem_id
end

#fuzzy_name_lookup(lookup_name, threshold) ⇒ Object



156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/pubchem/reader.rb', line 156

# Finds the stored name that best matches +lookup_name+.
#
# Only names sharing the short code of +lookup_name+ are considered. An
# exact match is returned immediately; otherwise every candidate is scored
# with @fuzzy_matcher (higher score = closer) and the best one is returned
# when its score is strictly greater than +threshold+.
#
# The returned value is a name, not an ID, because retrieve_ids uses the
# stored match as the key into the ID tables.
#
# @param lookup_name [String, nil] name to search for
# @param threshold [Float, nil] minimum score (exclusive) for a fuzzy match;
#   values >= 1.0 accept exact matches only; nil falls back to 0.99, the
#   cut-off that used to be hard-coded
# @return [String, nil] the matching stored name, or nil when nothing
#   qualifies
def fuzzy_name_lookup(lookup_name, threshold)
  return nil if lookup_name.nil? || lookup_name.empty?

  threshold ||= 0.99
  code = short_code(lookup_name)

  # Guard with key? so a miss does not create an empty bucket through the
  # hash's default block (and does not fail on a hash without one).
  return nil unless @names.key?(code)

  candidates = @names[code]

  # Optimistically check for an exact name match.
  return lookup_name if candidates.include?(lookup_name)
  return nil if threshold >= 1.0

  closest_name = nil
  closest_distance = 0.0

  candidates.each do |name|
    distance = @fuzzy_matcher.getDistance(lookup_name, name)
    next unless distance > closest_distance

    closest_name = name
    closest_distance = distance
  end

  closest_name if closest_distance > threshold
end

#initialize_from_files(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/pubchem/reader.rb', line 30

# Restores the three lookup tables from files via Ox.load_file.
#
# @param names_filename [String, nil] file to load into @names
# @param pubchem_substance_ids_filename [String, nil] file to load into
#   @pubchem_substance_ids
# @param pubchem_compound_ids_filename [String, nil] file to load into
#   @pubchem_compound_ids
# @return [Object, nil] nil when no filename was given at all; otherwise the
#   value loaded for the compound ID table (the last assignment)
# @raise [RuntimeError] when only some of the three filenames are given
def initialize_from_files(names_filename,
                          pubchem_substance_ids_filename,
                          pubchem_compound_ids_filename)
  paths = [names_filename,
           pubchem_substance_ids_filename,
           pubchem_compound_ids_filename]

  # Nothing to load: tell the caller to build empty tables instead.
  return nil if paths.none?

  # All three files are needed once any one of them is supplied.
  raise "Both filenames required" if paths.any? { |path| !path }

  @names = Ox.load_file(names_filename)
  @pubchem_substance_ids = Ox.load_file(pubchem_substance_ids_filename)
  @pubchem_compound_ids = Ox.load_file(pubchem_compound_ids_filename)
end

#match_list_of_names(names, threshold = 0.99) ⇒ Object



182
183
184
185
186
187
# File 'lib/pubchem/reader.rb', line 182

# Runs fuzzy_name_lookup for every entry of +names+ and remembers the result
# in @matched_names.
#
# @param names [Enumerable<String>] names to look up
# @param threshold [Float] passed through to fuzzy_name_lookup
# @return [Hash] each input name mapped to whatever fuzzy_name_lookup
#   returned for it; the same hash is stored in @matched_names
def match_list_of_names(names, threshold=0.99)
  @matched_names = names.each_with_object({}) do |candidate, matches|
    matches[candidate] = fuzzy_name_lookup(candidate, threshold)
  end
end

#parse_compound(compound) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/pubchem/reader.rb', line 87

# Indexes one compound node: stores its ID in @pubchem_id, then hands every
# "PC-Compound_props" node beneath it to parse_property.
#
# @param compound [#css] node answering CSS queries (a Nokogiri node in
#   #read)
# @return [Object] the collection of property nodes that was iterated
def parse_compound(compound)
  # The ID is the integer value of the text found under the nested cid path.
  @pubchem_id = compound.css("PC-Compound_id
                              PC-CompoundType
                              PC-CompoundType_id
                              PC-CompoundType_id_cid").text.to_i

  compound.css("PC-Compound_props").each { |props| parse_property(props) }
end

#parse_info_data(info_data) ⇒ Object



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/pubchem/reader.rb', line 122

# Extracts a name from one "PC-InfoData" node and passes it to add_name.
#
# Only entries labelled "SMILES" or "IUPAC Name" contribute a name; for any
# other label add_name is called with nil.
#
# @param info_data [#css] node answering CSS queries
# @return [Object] whatever add_name returns
def parse_info_data(info_data)
  urn_label = info_data.css("PC-InfoData_urn
                             PC-Urn
                             PC-Urn_label").first.text

  name = nil
  if ["SMILES", "IUPAC Name"].include?(urn_label)
    name = info_data.css("PC-InfoData_value
                          PC-InfoData_value_sval").first.text
  end

  add_name(name)
end

#parse_property(property) ⇒ Object



114
115
116
117
118
119
120
# File 'lib/pubchem/reader.rb', line 114

# Passes every "PC-InfoData" node found under +property+ to parse_info_data.
#
# @param property [#css] node answering CSS queries
# @return [Object] the collection of info-data nodes that was iterated
def parse_property(property)
  info_nodes = property.css("PC-InfoData")
  info_nodes.each { |info_node| parse_info_data(info_node) }
end

#parse_substance(substance) ⇒ Object



100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/pubchem/reader.rb', line 100

# Indexes one substance node: stores its ID in @pubchem_id, then registers
# the text of every synonym element through add_name.
#
# @param substance [#css] node answering CSS queries (a Nokogiri node in
#   #read)
# @return [Object] the collection of synonym nodes that was iterated
def parse_substance(substance)
  # The ID is the integer value of the text found under the nested sid path.
  @pubchem_id = substance.css("PC-Substance_sid
                               PC-ID
                               PC-ID_id").text.to_i

  substance.css("PC-Substance_synonyms
                 PC-Substance_synonyms_E").each { |synonym| add_name(synonym.text) }
end

#read(xml_filepath, type: nil) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/pubchem/reader.rb', line 57

# Parses a PubChem XML file and indexes every name found in it.
#
# Each matching record node is handed to parse_compound or parse_substance,
# with @current_type set to "compound" or "substance" beforehand.
#
# @param xml_filepath [String] path of the XML file to read
# @param type [Symbol, String, nil] :compound or :substance; when nil the
#   type is inferred from the file's basename, which must then start with
#   "compound" or "substance" (case-insensitive)
# @return [Object] the collection of record nodes that was iterated
# @raise [RuntimeError] "Cannot infer pubchem type" when +type+ is nil and
#   the basename gives no hint; "Unknown type" when an explicit +type+ is
#   neither compound nor substance
def read(xml_filepath, type: nil)
  # Infer only when the caller did not choose; an explicit type always wins.
  if type.nil?
    basename = File.basename(xml_filepath).downcase
    type = %i[compound substance].find { |kind| basename.start_with?(kind.to_s) }
    raise "Cannot infer pubchem type" unless type
  end

  # Accept "compound" as well as :compound, and reject anything else before
  # touching the file.
  type = type.to_s.to_sym
  raise "Unknown type" unless %i[compound substance].include?(type)

  # The block form closes the handle even when parsing raises.
  doc = File.open(xml_filepath) { |file| Nokogiri::XML(file) }

  @current_type = type.to_s
  if type == :compound
    doc.css("PC-Compounds PC-Compound").each { |compound| parse_compound(compound) }
  else
    doc.css("PC-Substances PC-Substance").each { |substance| parse_substance(substance) }
  end
end

#retrieve_compound_ids ⇒ Object



215
216
217
# File 'lib/pubchem/reader.rb', line 215

# Resolves the names stored by match_list_of_names against the compound ID
# table.
#
# @return [Object] whatever retrieve_ids returns for @pubchem_compound_ids
def retrieve_compound_ids
  retrieve_ids(@pubchem_compound_ids)
end

#retrieve_ids(collection) ⇒ Object



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# File 'lib/pubchem/reader.rb', line 189

# Maps every input name that was matched by match_list_of_names to one ID
# taken from +collection+.
#
# Input names without a match, and matches that have no ID in +collection+,
# are left out of the result. When a match has several IDs a warning is
# printed and the first one is used.
#
# @param collection [Hash] ID table keyed by matched name, each value being
#   a collection of IDs (e.g. @pubchem_substance_ids)
# @return [Hash] input name => first ID recorded for its matched name
# @raise [RuntimeError] when match_list_of_names has not been run yet
def retrieve_ids(collection)
  msg = "@matched_names required, see #{self.class}#match_list_of_names"

  raise msg unless @matched_names

  @matched_names.each_with_object({}) do |(input_name, matched_name), acc|
    next unless matched_name

    # fetch bypasses the hash's default block, so a miss neither inserts an
    # empty entry into the table nor fails on a table without a default.
    ids = collection.fetch(matched_name, nil)
    next if ids.nil?

    puts "WARNING: Multiple matching sets" if ids.size > 1

    collection_id = ids.first
    acc[input_name] = collection_id if collection_id
  end
end

#retrieve_substance_ids ⇒ Object



211
212
213
# File 'lib/pubchem/reader.rb', line 211

# Resolves the names stored by match_list_of_names against the substance ID
# table.
#
# @return [Object] whatever retrieve_ids returns for @pubchem_substance_ids
def retrieve_substance_ids
  retrieve_ids(@pubchem_substance_ids)
end

#save(names_filename, pubchem_substance_ids_filename, pubchem_compound_ids_filename) ⇒ Object



47
48
49
50
51
52
53
54
55
# File 'lib/pubchem/reader.rb', line 47

# Writes the three lookup tables to files with Ox.to_file, in the order
# names, substance IDs, compound IDs.
#
# @param names_filename [String] destination for @names
# @param pubchem_substance_ids_filename [String] destination for
#   @pubchem_substance_ids
# @param pubchem_compound_ids_filename [String] destination for
#   @pubchem_compound_ids
# @return [Object] whatever the last Ox.to_file call returns
def save(names_filename,
         pubchem_substance_ids_filename,
         pubchem_compound_ids_filename)
  # Same dump options for every table: no indentation.
  dump_options = { indent: 0 }

  Ox.to_file(names_filename, @names, dump_options)
  Ox.to_file(pubchem_substance_ids_filename, @pubchem_substance_ids, dump_options)
  Ox.to_file(pubchem_compound_ids_filename, @pubchem_compound_ids, dump_options)
end

#short_code(name) ⇒ Object



219
220
221
# File 'lib/pubchem/reader.rb', line 219

# Bucket key for a name: its first three characters (fewer when the name is
# shorter), lower-cased.
#
# @param name [String] name to derive the key from
# @return [String] the lower-cased prefix
def short_code(name)
  prefix = name.slice(0, 3)
  prefix.downcase
end