Class: Ezgff::GffDb

Inherits:
Object
  • Object
show all
Defined in:
lib/ezgff/gffsqlitedb.rb

Defined Under Namespace

Classes: Annotation

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(path) ⇒ GffDb

Returns a new instance of GffDb.



170
171
172
# File 'lib/ezgff/gffsqlitedb.rb', line 170

# Creates a SQLite3::Database handle for the file at +path+ and stores it
# in @db.
#
# @param path [String] path handed to SQLite3::Database.new
def initialize(path)
  @db = SQLite3::Database.new(path)
end

Class Method Details

.attributes_as_json(gffline) ⇒ Object



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/ezgff/gffsqlitedb.rb', line 139

# Converts the attributes of one GFF3 record line into a JSON object string.
#
# The line is parsed with Bio::GFF::GFF3::Record and its [key, value]
# attribute pairs are grouped by key. For "Dbxref" and "Ontology_term" the
# values become a nested object that maps the text before the first ":"
# (preceded by at least one character) to the text after it. Every other key
# maps to its values joined with ",".
#
# A Dbxref/Ontology_term value that contains no such ":" is kept as a key
# with a null value instead of aborting with NoMethodError.
#
# NOTE(review): inside Dbxref/Ontology_term, values sharing a tag (for
# example two "GO:..." terms) overwrite each other, so only the last one
# survives. Left unchanged here because fixing it would alter the JSON shape.
#
# @param gffline [String] one GFF3 record line (a trailing newline is allowed)
# @return [String] JSON text describing the attributes
def self.attributes_as_json(gffline)
  record = Bio::GFF::GFF3::Record.new(gffline.chomp)

  ## Collect every value seen for a key, keeping first-seen key order.
  grouped = {}
  record.attributes.each do |key, value|
    (grouped[key] ||= []) << value
  end

  json_ready = {}
  grouped.each do |key, values|
    if key == "Dbxref" || key == "Ontology_term"
      tagged = {}
      values.each do |val|
        m = /(.+?):/.match(val)
        if m
          tagged[m[1]] = m.post_match
        else
          ## No "TAG:value" structure; keep the raw text rather than crash.
          tagged[val] = nil
        end
      end
      json_ready[key] = tagged
    else
      json_ready[key] = values.join(",")
    end
  end
  json_ready.to_json
end

.build_db(gff_in, ezdb_base = nil) ⇒ Object

sqlite3 schema

gff_records (

line_num     integer primary key,
record       text,            # original record
id           text,
parent       text,
seqid        text not null,
source       text,
type         text,
start        integer not null,
end          integer not null,
score        real,
strand       varchar(1),
phase        integer,
attributes   text,
attributes_json json

)



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/ezgff/gffsqlitedb.rb', line 35

# Builds an ezdb directory (filtered GFF copy plus SQLite3 database) from a
# GFF3 file.
#
# Steps performed:
# 1. Creates the directory "<ezdb_base>/<basename of gff_in>.ezdb".
# 2. Writes a copy of the GFF into it, dropping every line that starts with
#    "#" and everything from the "##FASTA" directive onward.
# 3. Creates "<copy>.sqlite3" with the gff_records table and inserts one row
#    per line of the copy. line_num is the 0-based line index in the copy.
# 4. Creates indexes on the id, parent, source and type columns.
#
# @param gff_in [String] path of the input GFF3 file
# @param ezdb_base [String, nil] directory in which the .ezdb directory is
#   created; defaults to "."
# @return [String] path of the created .ezdb directory
# @raise [RuntimeError] "Multiple IDs found." when a record carries more than
#   one ID attribute
# @raise [SystemCallError] from Dir.mkdir, e.g. when the directory exists
def self.build_db(gff_in, ezdb_base = nil)
  ezdb_base = (ezdb_base || ".")
  ezdb_path = ezdb_base + "/" + File.basename(gff_in) + ".ezdb"
  gff_file = ezdb_path + "/" + File.basename(gff_in)
  Dir.mkdir(ezdb_path)

  ## Copy the records only. File.foreach closes the input even when we
  ## break out at the FASTA section.
  File.open(gff_file, "w") do |out|
    File.foreach(gff_in) do |line|
      break if /^\#\#FASTA/.match(line)
      ## skip header section
      next if /^\#/.match(line)
      out.puts line
    end
  end

  sq3_file = gff_file + ".sqlite3"

  ## Create table in sqlite3 RDBMS
  ##   table name: gff_records

  create_sql = <<-SQL
  CREATE TABLE gff_records (
    line_num     integer primary key,
    record       text,
    id           text,
    parent       text,
    seqid        text not null,
    source       text,
    type         text,
    start        integer not null,
    end          integer not null,
    score        real,
    strand       varchar(1),
    phase        integer,
    attributes   text,
    attributes_json json
  );
  SQL

  insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

  sq3_db = SQLite3::Database.new(sq3_file)
  begin
    sq3_db.execute(create_sql)

    ## Read the filtered GFF copy and insert its records into the table.
    ## The copy holds no header or FASTA lines, so no filtering is needed.
    sq3_db.transaction do
      File.foreach(gff_file).with_index do |line, i|
        record_text = line.chomp
        gr = Bio::GFF::GFF3::Record.new(record_text)

        id_pairs = gr.attributes.select { |a| a[0] == "ID" }
        if id_pairs.size > 1
          STDERR.puts gr.attributes
          raise "Multiple IDs found."
        end
        id = id_pairs.empty? ? nil : id_pairs[0][1]

        ## Only the first Parent value is stored in the parent column.
        parent = (gr.attributes.find { |a| a[0] == "Parent" } || [])[1]

        cols = record_text.split(/\t/)
        ## Always bind nine column values; missing trailing columns are nil.
        gff_columns = (0..8).map { |j| cols[j] }

        values = [i, record_text, id, parent] + gff_columns + [attributes_as_json(line)]
        sq3_db.execute(insert_sql, values)
      end
    end

    ## Indexing the sqlite3 table
    table = "gff_records"
    %w{id parent source type}.each do |col|
      idxname = "index_#{table}_on_#{col}"
      sq3_db.execute("CREATE INDEX #{idxname} ON #{table}(#{col})")
    end
  ensure
    ## Release the database handle whether or not the build succeeded.
    sq3_db.close
  end

  return ezdb_path
end

.build_tabix(gff_in) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/ezgff/gffsqlitedb.rb', line 125

# Sorts a GFF file, compresses it with bgzip and indexes it with tabix.
#
# Runs two shell commands, echoing each to STDERR first:
# 1. header lines ("#"-prefixed) followed by the remaining lines sorted by
#    column 1 and then numerically by column 4, piped through bgzip into
#    "<gff_in>.gz";
# 2. "tabix -p gff" on that file.
#
# Paths are shell-escaped so names with spaces or metacharacters work. The
# sort delimiter is a literal tab inside single quotes, which does not depend
# on bash-only $'...' quoting. If a command reports failure, a message is
# written to STDERR and the remaining steps are skipped.
#
# NOTE(review): the first command is a pipeline, so its status is presumably
# that of bgzip only; a failing grep/sort may go unnoticed.
#
# @param gff_in [String] path of the GFF file
# @return [nil]
def self.build_tabix(gff_in)
  ## Required locally: the file's require list is not visible from this block.
  require "shellwords"

  gfffile_sorted = gff_in + ".gz"
  src = Shellwords.escape(gff_in)
  dst = Shellwords.escape(gfffile_sorted)

  ## sort gff by position
  sort_cmd = %Q{(grep ^"#" #{src}; grep -v ^"#" #{src} | sort -t '\t' -k1,1 -k4,4n) | bgzip > #{dst};}
  index_cmd = "tabix -p gff #{dst}"

  [sort_cmd, index_cmd].each do |cmd|
    STDERR.puts cmd
    next if system(cmd)

    STDERR.puts "Command failed: #{cmd}"
    return nil
  end

  STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
end

Instance Method Details

#each_record ⇒ Object



174
175
176
177
178
179
180
181
# File 'lib/ezgff/gffsqlitedb.rb', line 174

# Yields an Annotation for every row of the gff_records table.
#
# Each Annotation is created with the database handle, matching how #get and
# #search construct theirs. Without a block an Enumerator is returned
# instead of raising LocalJumpError.
#
# @yieldparam an [Annotation] annotation built from one table row
# @return [Enumerator] when no block is given
def each_record
  return enum_for(:each_record) unless block_given?

  sql = "SELECT * FROM gff_records"
  @db.execute(sql).each do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    yield an
  end
end

#get(id) ⇒ Object



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/ezgff/gffsqlitedb.rb', line 183

# Returns the single annotation whose id column equals +id+.
#
# The value is passed as a bound parameter. The previous string-built query
# wrapped it in double quotes, which SQLite treats as an identifier first, so
# an id such as "type" was compared against a column, and quotes in the id
# could alter the statement.
#
# @param id [#to_s] value of the GFF ID attribute to look up
# @return [Annotation] annotation built from the matching row
# @raise [RuntimeError] "not found: <id>" when no row matches,
#   "multiple hits" when two or more rows match
def get(id)
  rows = @db.execute("SELECT * FROM gff_records WHERE id = ?", [id.to_s])
  raise "not found: #{id}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end

#get_by_line_number(n) ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/ezgff/gffsqlitedb.rb', line 200

# Returns the annotation stored under line number +n+.
#
# The method name is restored from the section heading
# ("#get_by_line_number(n)"); the listing had lost it. The line number is
# passed as a bound parameter, and the "not found" message now reports +n+
# (it previously referenced an undefined +id+).
#
# @param n [Integer, String] value of the line_num column to look up
# @return [Annotation] annotation built from the matching row
# @raise [RuntimeError] "not found: <n>" when no row matches,
#   "multiple hits" when two or more rows match
def get_by_line_number(n)
  rows = @db.execute("SELECT * FROM gff_records WHERE line_num = ?", [n])
  raise "not found: #{n}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end

#search(query, num_limit = 100) ⇒ Object



216
217
218
219
220
221
222
# File 'lib/ezgff/gffsqlitedb.rb', line 216

# Finds annotations whose id, parent or attributes column contains +query+.
#
# The search uses SQL LIKE with the query wrapped in "%", so "%" and "_"
# inside +query+ still act as LIKE wildcards. The pattern and the limit are
# passed as bound parameters. +num_limit+ is now applied (it was previously
# accepted but ignored); pass nil for no limit.
#
# The SQL text is echoed to STDERR before it runs.
#
# @param query [#to_s] substring to look for
# @param num_limit [Integer, nil] maximum number of rows; nil means unlimited
# @return [Array<Annotation>] one annotation per matching row
def search(query, num_limit=100)
  pattern = "%#{query}%"
  ## SQLite treats a negative LIMIT as "no upper bound".
  limit = num_limit.nil? ? -1 : num_limit

  sql = "SELECT * FROM gff_records WHERE id LIKE ? OR parent LIKE ? OR attributes LIKE ? LIMIT ?"
  STDERR.puts sql
  rows = @db.execute(sql, [pattern, pattern, pattern, limit])

  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end