Class: Ezgff::GffDb

Inherits:
Object
  • Object
show all
Defined in:
lib/ezgff/gffsqlitedb.rb

Defined Under Namespace

Classes: Annotation

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path) ⇒ GffDb

Returns a new instance of GffDb.



184
185
186
# File 'lib/ezgff/gffsqlitedb.rb', line 184

# Creates a GffDb backed by the SQLite3 database at +path+.
#
# The database handle is stored in @db and used by the query methods.
#
# @param path [String] path handed to SQLite3::Database.new
def initialize(path)
  @db = SQLite3::Database.new(path)
end

Class Method Details

.attributes_as_json(gffline) ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/ezgff/gffsqlitedb.rb', line 151

# Serializes the attributes of one GFF3 record line as a JSON object string.
#
# The line is parsed with Bio::GFF::GFF3::Record and its attribute pairs are
# grouped by key in order of appearance. Values of "Ontology_term", "Alias"
# and "Dbxref" are always emitted as JSON arrays; the values of every other
# key are joined with "," into one string.
#
# @param gffline [String] one GFF3 record line (a trailing newline is removed)
# @return [String] JSON text of the grouped attributes
def self.attributes_as_json(gffline)
  record = Bio::GFF::GFF3::Record.new(gffline.chomp)

  # Collect every value seen for a key, keeping first-seen key order.
  grouped = record.attributes.each_with_object({}) do |(key, value), acc|
    (acc[key] ||= []) << value
  end

  json_ready = grouped.each_with_object({}) do |(key, values), acc|
    acc[key] =
      case key
      when "Dbxref2" # dummy (not used currently)
        # Splits each "db:value" entry into a {db => value} mapping.
        values.each_with_object({}) do |val, by_db|
          m = /(.+?):/.match(val)
          by_db[m[1]] = m.post_match
        end
      when "Ontology_term", "Alias", "Dbxref"
        values
      else
        values.join(",")
      end
  end

  json_ready.to_json
end

.build_db(gff_in, ezdb_base = nil) ⇒ Object

sqlite3 schema

gff_records (

line_num     integer primary key,
record       text,            # original record
id           text,
parent       text,
seqid        text not null,
source       text,
type         text,
start        integer not null,
end          integer not null,
score        real,
strand       varchar(1),
phase        integer,
attributes   text,
attributes_json json

)



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/ezgff/gffsqlitedb.rb', line 35

# Builds an ".ezdb" directory holding a filtered copy of a GFF3 file and a
# SQLite3 database of its records.
#
# Steps:
# 1. Creates the directory "<ezdb_base>/<basename of gff_in>.ezdb".
# 2. Writes a copy of +gff_in+ into it, dropping every line that starts with
#    "#" and stopping at the "##FASTA" marker.
# 3. Creates "<copy>.sqlite3" with the table +gff_records+ and inserts one
#    row per record line of the copy; +line_num+ is the 0-based line index
#    within the copy.
# 4. Creates indexes on the id, parent, source and type columns.
#
# @param gff_in [String] path of the input GFF3 file
# @param ezdb_base [String, nil] directory in which the ".ezdb" directory is
#   created; defaults to "."
# @return [String] path of the created ".ezdb" directory
# @raise [RuntimeError] when a record carries more than one ID attribute
# @raise [SystemCallError] from Dir.mkdir, e.g. when the directory exists
def self.build_db(gff_in, ezdb_base = nil)
  ezdb_base ||= "."
  gff_name = File.basename(gff_in)
  ezdb_path = "#{ezdb_base}/#{gff_name}.ezdb"
  gff_file = "#{ezdb_path}/#{gff_name}"
  Dir.mkdir(ezdb_path)

  # Copy only the record lines; File.foreach closes the input even on break.
  File.open(gff_file, "w") do |out|
    File.foreach(gff_in) do |line|
      break if line.start_with?("##FASTA") # sequence section follows
      next if line.start_with?("#")        # header / directive / comment

      out.puts line
    end
  end

  sq3_db = SQLite3::Database.new("#{gff_file}.sqlite3")
  begin
    ## Create table in sqlite3 RDBMS
    ##   table name: gff_records
    sq3_db.execute(<<-SQL)
      CREATE TABLE gff_records (
        line_num     integer primary key,
        record       text,
        id           text,
        parent       text,
        seqid        text not null,
        source       text,
        type         text,
        start        integer not null,
        end          integer not null,
        score        real,
        strand       varchar(1),
        phase        integer,
        attributes   text,
        attributes_json json
      );
    SQL

    insert_sql = <<-SQL
      INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    SQL

    ## Read the filtered GFF copy and insert its records into the table.
    ## The copy written above contains no "#" lines and no FASTA section,
    ## so no header/FASTA checks are needed in this pass.
    sq3_db.transaction do
      File.foreach(gff_file).with_index do |line, line_num|
        record = line.chomp
        # A blank line has no columns and would violate the NOT NULL columns.
        next if record.strip.empty?

        gr = Bio::GFF::GFF3::Record.new(record)

        ids = gr.attributes.select { |att| att[0] == "ID" }
        if ids.size > 1
          STDERR.puts gr.attributes
          raise "Multiple IDs found."
        end
        id = (ids[0] || [])[1]
        # Only the first Parent attribute is stored in the parent column.
        parent = (gr.attributes.find { |att| att[0] == "Parent" } || [])[1]

        cols = record.split(/\t/)
        values = [
          line_num,                           # line number
          record,                             # raw record
          id,                                 # ID
          parent,                             # parent ID
          cols[0],                            # seqid
          cols[1],                            # source
          cols[2],                            # type
          cols[3],                            # start
          cols[4],                            # end
          (cols[5] == "." ? nil : cols[5]),   # score ("." means missing)
          cols[6],                            # strand
          (cols[7] == "." ? nil : cols[7]),   # phase ("." means missing)
          cols[8],                            # attributes
          attributes_as_json(line)
        ]
        sq3_db.execute(insert_sql, values)
      end
    end

    ## Indexing the sqlite3 table
    table = "gff_records"
    %w[id parent source type].each do |col|
      sq3_db.execute("CREATE INDEX index_#{table}_on_#{col} ON #{table}(#{col})")
    end
  ensure
    # Release the database handle whether or not the build succeeded.
    sq3_db.close
  end

  ezdb_path
end

.build_tabix(gff_in) ⇒ Object



137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/ezgff/gffsqlitedb.rb', line 137

# Creates a position-sorted, bgzip-compressed copy of a GFF file and a tabix
# index for it by running external commands through the shell.
#
# Lines starting with "#" are emitted first, the remaining lines are sorted
# by column 1 and then numerically by column 4, and the result is piped to
# bgzip to produce "<gff_in>.gz"; tabix is then run on that file. Each
# command is echoed to STDERR before it runs. If a command fails, the
# failure is reported on STDERR and the remaining steps are skipped.
#
# @param gff_in [String] path of the GFF file
# @return [nil]
def self.build_tabix(gff_in)
  require "shellwords"

  gfffile_sorted = gff_in + ".gz"
  # Escape the paths so spaces and shell metacharacters are taken literally.
  src = Shellwords.escape(gff_in)
  dst = Shellwords.escape(gfffile_sorted)

  ## sort gff by position
  # The sort delimiter is a literal TAB inside single quotes; this works in
  # any POSIX sh, unlike the bash-only $'\t' form.
  sort_cmd = "(grep '^#' #{src}; grep -v '^#' #{src} | sort -t '\t' -k1,1 -k4,4n) | bgzip > #{dst}"
  tabix_cmd = "tabix -p gff #{dst}"

  [sort_cmd, tabix_cmd].each do |cmd|
    STDERR.puts cmd
    next if system(cmd)

    STDERR.puts "Command failed: #{cmd}"
    return nil
  end

  STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
end

Instance Method Details

#each_record ⇒ Object



188
189
190
191
192
193
194
195
# File 'lib/ezgff/gffsqlitedb.rb', line 188

# Yields an Annotation for every row of the gff_records table.
#
# Each Annotation is constructed with the database handle, the same way
# #get and #search construct theirs, and then populated from the row via
# build_from_db_record.
#
# @yieldparam an [Annotation] annotation built from one table row
# @return [Enumerator] when no block is given
# @return [Array] the fetched rows when a block is given
def each_record
  return enum_for(:each_record) unless block_given?

  @db.execute("SELECT * FROM gff_records").each do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    yield an
  end
end

#get(id) ⇒ Object



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/ezgff/gffsqlitedb.rb', line 197

# Fetches the single record whose id column equals +id+.
#
# The id is passed as a bound parameter, so quotes in it cannot alter the
# statement and it is never interpreted as a column name.
#
# @param id [#to_s] value to match against the id column
# @return [Annotation] annotation built from the matching row
# @raise [RuntimeError] "not found: <id>" when no row matches
# @raise [RuntimeError] "multiple hits" when more than one row matches
def get(id)
  rows = @db.execute("SELECT * FROM gff_records WHERE id = ?", [id.to_s])
  raise "not found: #{id}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end

#get_by_line_number(n) ⇒ Object



214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/ezgff/gffsqlitedb.rb', line 214

# Fetches the single record whose line_num column equals +n+.
#
# The line number is passed as a bound parameter rather than being
# interpolated into the SQL text.
#
# @param n [Integer, String] line number to look up
# @return [Annotation] annotation built from the matching row
# @raise [RuntimeError] "not found: <n>" when no row matches
# @raise [RuntimeError] "multiple hits" when more than one row matches
def get_by_line_number(n)
  rows = @db.execute("SELECT * FROM gff_records WHERE line_num = ?", [n])
  raise "not found: #{n}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end

#search(query, num_limit = 100, type = nil) ⇒ Object



230
231
232
233
234
235
236
237
238
239
240
# File 'lib/ezgff/gffsqlitedb.rb', line 230

# Finds records whose id, parent or attributes column contains +query+,
# optionally restricted to one feature type.
#
# The three LIKE conditions are grouped in parentheses so that the type
# filter applies to all of them, and every value is passed as a bound
# parameter. "%" and "_" inside +query+ keep their LIKE wildcard meaning.
# The SQL template is echoed to STDERR before it runs.
#
# @param query [#to_s] substring to look for
# @param num_limit [Integer, String] maximum number of rows to return
# @param type [#to_s, nil] when given, only rows with this type are returned
# @return [Array<Annotation>] annotations built from the matching rows
# @raise [ArgumentError, TypeError] when +num_limit+ is not integer-like
def search(query, num_limit=100, type=nil)
  pattern = "%#{query}%"
  sql = "SELECT * FROM gff_records WHERE (id LIKE ? OR parent LIKE ? OR attributes LIKE ?)"
  binds = [pattern, pattern, pattern]

  if type
    sql += " AND type = ?"
    binds << type.to_s
  end

  sql += " LIMIT ?"
  binds << Integer(num_limit)

  STDERR.puts sql
  @db.execute(sql, binds).map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end