Class: Wgit::Database

Inherits:
Object
  • Object
show all
Includes:
Assertable
Defined in:
lib/wgit/database/database.rb

Overview

Class modeling a DB connection and CRUD operations for the Url and Document collections. The most common methods are: insert, update, urls, search, stats, size.

Constant Summary collapse

LOG_FILE_PATH =

Is relative to the root project folder, not this file.

"misc/mongo_log.txt"

Constants included from Assertable

Assertable::DEFAULT_DUCK_FAIL_MSG, Assertable::DEFAULT_TYPE_FAIL_MSG, Assertable::WRONG_METHOD_MSG

Instance Method Summary collapse

Methods included from Assertable

#assert_arr_types, #assert_respond_to, #assert_types

Constructor Details

#initialize ⇒ Database

Returns a new instance of Database.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# File 'lib/wgit/database/database.rb', line 21

# Opens a connection to the database described by Wgit::CONNECTION_DETAILS
# (:host, :port, :db, :uname and :pword) and logs driver output to
# LOG_FILE_PATH.
#
# @raise [RuntimeError] if Wgit::CONNECTION_DETAILS is nil or empty.
def initialize
  conn_details = Wgit::CONNECTION_DETAILS
  # Treat nil like empty so the caller always gets the descriptive error
  # below instead of a NoMethodError from nil.empty?.
  if conn_details.nil? || conn_details.empty?
    # Adjacent literals keep the message on a single line at runtime.
    raise "Wgit::CONNECTION_DETAILS must be defined and include :host, " \
          ":port, :db, :uname, :pword for a database connection to be " \
          "established."
  end

  logger = Logger.new(LOG_FILE_PATH)
  address = "#{conn_details[:host]}:#{conn_details[:port]}"
  # Kept as a class variable because #stats reads @@client.
  @@client = Mongo::Client.new([address],
                               :database => conn_details[:db],
                               :user => conn_details[:uname],
                               :password => conn_details[:pword],
                               :logger => logger,
                               :truncate_logs => false)
end

Instance Method Details

#crawled_urls(limit = 0, skip = 0, &block) ⇒ Object



103
104
105
# File 'lib/wgit/database/database.rb', line 103

# Convenience wrapper around #urls which passes true as the crawled flag.
#
# @param limit [Integer] forwarded to #urls unchanged.
# @param skip [Integer] forwarded to #urls unchanged.
# @param block [Proc] forwarded to #urls unchanged.
# @return whatever #urls returns.
def crawled_urls(limit = 0, skip = 0, &block)
  only_crawled = true
  urls(only_crawled, limit, skip, &block)
end

#insert(data) ⇒ Object

Create Data: inserts a single Url or Document, or a collection of either, into the DB.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/wgit/database/database.rb', line 40

# Dispatches data to #insert_urls or #insert_docs.
#
# A single Url goes to #insert_urls and a single Document to #insert_docs.
# Anything else responding to :first is routed by the type of its first
# element only: Url means #insert_urls, otherwise #insert_docs.
#
# @param data a Url, a Document, or an object responding to :first.
# @return whatever the chosen insert method returns.
# @raise [RuntimeError] if data is none of the above.
def insert(data)
  case data
  when Url then insert_urls(data)
  when Document then insert_docs(data)
  else
    unless data.respond_to?(:first)
      raise "data is not in the correct format (all Url's or Document's)"
    end

    data.first.is_a?(Url) ? insert_urls(data) : insert_docs(data)
  end
end

#insert_docs(doc_or_docs) ⇒ Object Also known as: insert_doc



69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/wgit/database/database.rb', line 69

# Inserts one or more documents into the :documents collection.
#
# Document objects are converted with Wgit::Model.document; Hash objects
# are passed to #create as they are.
#
# @param doc_or_docs [Document, Hash, #map] a single record or a
#   collection of records.
# @return whatever #create returns.
def insert_docs(doc_or_docs)
  # A Hash responds to :map, so it must be recognised as a single record
  # before the collection check, otherwise its key/value pairs would be
  # treated as individual documents.
  if doc_or_docs.is_a?(Hash) || !doc_or_docs.respond_to?(:map)
    assert_type(doc_or_docs, [Document, Hash])
    unless doc_or_docs.is_a?(Hash)
      doc_or_docs = Wgit::Model.document(doc_or_docs)
    end
  else
    assert_arr_types(doc_or_docs, [Document, Hash])
    doc_or_docs = doc_or_docs.map do |doc|
      # Keep Hash elements unchanged; a trailing `unless` here would turn
      # them into nil.
      doc.is_a?(Hash) ? doc : Wgit::Model.document(doc)
    end
  end
  create(:documents, doc_or_docs)
end

#insert_urls(url_or_urls) ⇒ Object Also known as: insert_url



56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/wgit/database/database.rb', line 56

# Inserts one or more urls into the :urls collection.
#
# Each Url is converted with Wgit::Model.url before being handed to
# #create. Anything responding to :map is treated as a collection.
#
# @param url_or_urls [Url, #map] a single Url or a collection of Urls.
# @return whatever #create returns.
def insert_urls(url_or_urls)
  if url_or_urls.respond_to?(:map)
    assert_arr_types(url_or_urls, Url)
    records = url_or_urls.map { |url| Wgit::Model.url(url) }
  else
    assert_type(url_or_urls, Url)
    records = Wgit::Model.url(url_or_urls)
  end

  create(:urls, records)
end

#search(text, whole_sentence = false, limit = 10, skip = 0, &block) ⇒ Array

Currently all searches are case insensitive.

Searches against the indexed docs in the DB for the given text. The searched fields are decided by the text index setup against the documents collection. Currently we search against the following fields: “author”, “keywords”, “title” and “text”.

The MongoDB search ranks/sorts the results in order (highest first) based upon each document's textScore, which records the number of text hits. We then store this textScore in each Document object for use elsewhere if needed.

When whole_sentence is false, multiple words are searched for separately. Skipped results start with the most relevant, based upon the textScore of the search.

Parameters:

  • text (String)

    the value to search the data against.

  • whole_sentence (Boolean) (defaults to: false)

    whether multiple words should be searched for together as one quoted phrase (true) or separately (false).

  • limit (Fixnum) (defaults to: 10)

    the max length/count of the results array.

  • skip (Fixnum) (defaults to: 0)

    the number of results to skip, starting with the most relevant based upon the textScore of the search.

  • block (Block)

    a block which if provided is passed to each result.

Returns:

  • (Array)

    of Document objects representing the search results.



132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# File 'lib/wgit/database/database.rb', line 132

# Runs a text search against the :documents collection.
#
# The query is built from a stripped copy of text, so the caller's string
# is never modified and frozen strings are accepted.
#
# @param text [String] the value to search for.
# @param whole_sentence [Boolean] when true the stripped text is wrapped
#   in double quotes before being sent as the $search value.
# @param limit [Integer] passed to #retrieve as the limit.
# @param skip [Integer] passed to #retrieve as the skip.
# @param block [Proc] if given, called once with each resulting Document.
# @return [Array<Wgit::Document>] the results, or [] when there are none.
def search(text, whole_sentence = false, limit = 10, skip = 0, &block)
  # Work on a copy: strip!/replace on the argument would alter the
  # caller's string and raise on frozen strings.
  search_text = text.strip
  search_text = "\"#{search_text}\"" if whole_sentence

  # The textScore sorts based on the most search hits.
  # We use the textScore hash as a sort and a projection below.
  # :$caseSensitive => case_sensitive, # 3.2+ only.
  sort_proj = { :score => { :$meta => "textScore" } }
  query = { :$text => { :$search => search_text } }
  results = retrieve(:documents, query, sort_proj, sort_proj, limit, skip)

  return [] if results.count < 1
  # results.respond_to? :map! is false so we use map and overwrite the var.
  results = results.map { |mongo_doc| Wgit::Document.new(mongo_doc) }
  return results unless block_given?
  results.each { |doc| block.call(doc) }
end

#search_p(text, whole_sentence = false, limit = 10, skip = 0, sentence_length = 80, &block) ⇒ Object Also known as: search_and_format

Performs a search and pretty prints the results.



151
152
153
154
155
# File 'lib/wgit/database/database.rb', line 151

# Runs #search with the given arguments, then hands the results to
# Wgit::Utils.printf_search_results together with text and
# sentence_length.
#
# @param text [String] forwarded to #search and to the printer.
# @param whole_sentence [Boolean] forwarded to #search.
# @param limit [Integer] forwarded to #search.
# @param skip [Integer] forwarded to #search.
# @param sentence_length [Integer] forwarded to the printer.
# @param block [Proc] forwarded to #search.
# @return whatever Wgit::Utils.printf_search_results returns.
def search_p(text, whole_sentence = false, limit = 10,
             skip = 0, sentence_length = 80, &block)
  hits = search(text, whole_sentence, limit, skip, &block)
  Wgit::Utils.printf_search_results(hits, text, false, sentence_length)
end

#size ⇒ Object Also known as: count, length



162
163
164
# File 'lib/wgit/database/database.rb', line 162

# Returns the :dataSize entry of the object returned by #stats.
def size
  db_stats = stats
  db_stats[:dataSize]
end

#stats ⇒ Object

Returns a Mongo object which can be used like a Hash to retrieve values.



158
159
160
# File 'lib/wgit/database/database.rb', line 158

# Issues the dbStats command through the shared client and returns the
# first entry of the reply's documents.
def stats
  reply = @@client.command(:dbStats => 0)
  reply.documents[0]
end

#uncrawled_urls(limit = 0, skip = 0, &block) ⇒ Object



107
108
109
# File 'lib/wgit/database/database.rb', line 107

# Convenience wrapper around #urls which passes false as the crawled flag.
#
# @param limit [Integer] forwarded to #urls unchanged.
# @param skip [Integer] forwarded to #urls unchanged.
# @param block [Proc] forwarded to #urls unchanged.
# @return whatever #urls returns.
def uncrawled_urls(limit = 0, skip = 0, &block)
  only_crawled = false
  urls(only_crawled, limit, skip, &block)
end

#update(data) ⇒ Object

Update Data: updates the DB record of the given Url or Document.



168
169
170
171
172
173
174
175
176
# File 'lib/wgit/database/database.rb', line 168

# Dispatches data to #update_url or #update_doc depending on its type.
#
# @param data [Url, Document] the record to update.
# @return whatever the chosen update method returns.
# @raise [RuntimeError] if data is neither a Url nor a Document.
def update(data)
  case data
  when Url then update_url(data)
  when Document then update_doc(data)
  else
    raise "data is not in the correct format (all Url's or Document's)"
  end
end

#update_doc(doc) ⇒ Object



186
187
188
189
190
191
192
# File 'lib/wgit/database/database.rb', line 186

# Updates the :documents record whose :url equals doc.url.
#
# The "$set" payload is Wgit::Model.document(doc) merged with
# Wgit::Model.common_update_data.
#
# @param doc [Document] the document to write.
# @return whatever #_update returns.
def update_doc(doc)
  assert_type(doc, Document)

  filter = { :url => doc.url }
  fields = Wgit::Model.document(doc).merge(Wgit::Model.common_update_data)

  _update(true, :documents, filter, { "$set" => fields })
end

#update_url(url) ⇒ Object



178
179
180
181
182
183
184
# File 'lib/wgit/database/database.rb', line 178

# Updates the :urls record whose :url equals the given url.
#
# The "$set" payload is Wgit::Model.url(url) merged with
# Wgit::Model.common_update_data.
#
# @param url [Url] the url to write.
# @return whatever #_update returns.
def update_url(url)
  assert_type(url, Url)

  filter = { :url => url }
  fields = Wgit::Model.url(url).merge(Wgit::Model.common_update_data)

  _update(true, :urls, filter, { "$set" => fields })
end

#urls(crawled = nil, limit = 0, skip = 0, &block) ⇒ Object

A crawled parameter value of nil (the default) returns all urls. A limit of 0 means all urls are returned. All urls are sorted by date_added ascending, in other words the first url in the results is the first added.



90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/wgit/database/database.rb', line 90

# Retrieves records from the :urls collection, sorted by :date_added
# ascending, and wraps each one in a Wgit::Url.
#
# @param crawled [Boolean, nil] nil applies no filter; any other value is
#   matched against the :crawled field.
# @param limit [Integer] passed to #retrieve as the limit.
# @param skip [Integer] passed to #retrieve as the skip.
# @param block [Proc] if given, called once with each resulting Wgit::Url.
# @return [Array<Wgit::Url>] the urls, or [] when there are none.
def urls(crawled = nil, limit = 0, skip = 0, &block)
  filter = crawled.nil? ? {} : { :crawled => crawled }
  ordering = { :date_added => 1 }

  records = retrieve(:urls, filter, ordering, {}, limit, skip)
  return [] if records.count < 1

  # The retrieved records are mapped into a new array of Wgit::Url objects.
  found = records.map { |record| Wgit::Url.new(record) }
  found.each { |url| block.call(url) } if block_given?
  found
end