Class: Corpus

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/corpus/corpus.rb,
lib/rbbt/corpus/sources/pubmed.rb

Constant Summary collapse

NAMESPACES =
{}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpora_path = nil) ⇒ Corpus

Returns a new instance of Corpus.



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/rbbt/corpus/corpus.rb', line 6

# Sets up a corpus rooted at +corpora_path+.
#
# With no argument the root is +Rbbt.corpora+; an argument that is not
# already a +Path+ is handed to +Path.setup+ first. The result of calling
# +find+ on that root is kept in +@corpora_path+, and +@persistence_dir+ is
# its "annotations" entry.
#
# Two stores are then opened, each inside its own +Misc.lock+ block: the
# global annotations table (which is closed again right away) and the
# document repository.
#
# @param corpora_path [Path, Object, nil] root of the corpora; nil selects
#   +Rbbt.corpora+
# @return [Corpus] a new instance of Corpus
def initialize(corpora_path = nil)
  @corpora_path = if corpora_path.nil?
                    Rbbt.corpora
                  elsif Path === corpora_path
                    corpora_path
                  else
                    Path.setup(corpora_path)
                  end

  @corpora_path = @corpora_path.find
  @persistence_dir = File.join(@corpora_path, "annotations")

  Misc.lock(@persistence_dir) do
    annotations_file = File.join(@persistence_dir, "global_annotations")
    database = Persist.open_tokyocabinet(annotations_file, false, :list)
    fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]

    @global_annotations = TSV.setup(database, :key => "ID", :fields => fields)
    @global_annotations.unnamed = true
    @global_annotations.close
  end

  Misc.lock(@corpora_path.document_repo) do
    @document_repo = DocumentRepo.open_tokyocabinet(@corpora_path.document_repo, false)
  end
end

Instance Attribute Details

#corpora_path ⇒ Object

Returns the value of attribute corpora_path.



5
6
7
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@corpora_path+ instance variable.
#
# @return [Object] the value currently held in +@corpora_path+
def corpora_path; @corpora_path; end

#document_repo ⇒ Object

Returns the value of attribute document_repo.



5
6
7
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@document_repo+ instance variable.
#
# @return [Object] the value currently held in +@document_repo+
def document_repo; @document_repo; end

#global_annotations ⇒ Object

Returns the value of attribute global_annotations.



5
6
7
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@global_annotations+ instance variable.
#
# @return [Object] the value currently held in +@global_annotations+
def global_annotations; @global_annotations; end

#persistence_dir ⇒ Object

Returns the value of attribute persistence_dir.



5
6
7
# File 'lib/rbbt/corpus/corpus.rb', line 5

# Reader for the +@persistence_dir+ instance variable.
#
# @return [Object] the value currently held in +@persistence_dir+
def persistence_dir; @persistence_dir; end

Instance Method Details

#add_document(text, namespace, id, type = nil) ⇒ Object



46
47
48
49
# File 'lib/rbbt/corpus/corpus.rb', line 46

# Adds +text+ to the document repository.
#
# The MD5 hex digest of +text+ is computed here and passed to the
# repository as the last argument, after namespace, id and type.
#
# @param text [String] document content
# @param namespace [Object] namespace forwarded to the repository
# @param id [Object] identifier forwarded to the repository
# @param type [Object, nil] document type forwarded to the repository
# @return [Object] whatever +@document_repo.add+ returns
def add_document(text, namespace, id, type = nil)
  @document_repo.add(text, namespace, id, type, Digest::MD5.hexdigest(text))
end

#add_pmid(pmid, type = nil) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/rbbt/corpus/sources/pubmed.rb', line 8

# Fetches one or more PubMed articles and adds each one as a document in
# the :pubmed namespace.
#
# For every article the abstract is stored when +type+ is :abstract, or when
# no type was requested and the article has no +pdf_url+. In every other
# case the full text is stored, and an error is raised if the article has no
# +pdf_url+.
#
# @param pmid [Object, Array] a single PubMed id or an array of them
# @param type [Symbol, String, nil] requested document type; an empty string
#   is treated like nil
# @return [Array] the result of +add_document+ for each article
# @raise [RuntimeError] when full text is required but +pdf_url+ is nil
def add_pmid(pmid, type = nil)
  pmids = if Array === pmid
            pmid
          else
            [pmid]
          end
  type = nil if String === type && type.empty?

  PubMed.get_article(pmids).collect do |article_id, article|
    # Without an explicit type, fall back to the abstract only when there is
    # no PDF to take the full text from.
    abstract_wanted = type.nil? ? article.pdf_url.nil? : type.to_sym == :abstract

    if abstract_wanted
      add_document(article.text, :pubmed, article_id, :abstract)
    else
      raise "No FullText available for #{ article_id }" if article.pdf_url.nil?
      add_document(article.full_text, :pubmed, article_id, :fulltext)
    end
  end
end

#add_pubmed_query(query, max, type = nil) ⇒ Object



22
23
24
25
# File 'lib/rbbt/corpus/sources/pubmed.rb', line 22

# Runs a PubMed query and adds every resulting article to the corpus.
#
# @param query [Object] query passed to +PubMed.query+
# @param max [Object] second argument passed to +PubMed.query+
# @param type [Symbol, String, nil] document type forwarded to +add_pmid+
# @return [Object] whatever +add_pmid+ returns
def add_pubmed_query(query, max, type = nil)
  add_pmid(PubMed.query(query, max), type)
end

#docid(docid) ⇒ Object



41
42
43
44
# File 'lib/rbbt/corpus/corpus.rb', line 41

# Loads the document stored under the given document id.
#
# @param docid [String] key of the document in the repository
# @return [Document] built from the persistence path, the id, the stored
#   repository entry and the global annotations
# @raise [RuntimeError] when the repository does not include +docid+
def docid(docid)
  unless @document_repo.include?(docid)
    raise "Document '#{ docid }' was not found."
  end

  storage = persistence_for(docid)
  content = @document_repo[docid]
  Document.new(storage, docid, content, @global_annotations)
end

#document(namespace, id, type, hash) ⇒ Object



35
36
37
38
39
# File 'lib/rbbt/corpus/corpus.rb', line 35

# Loads the document identified by its four key components.
#
# The components are joined with ":" to form the document id used to look
# the document up in the repository.
#
# @param namespace [Object] first component of the document id
# @param id [Object] second component of the document id
# @param type [Object] third component of the document id
# @param hash [Object] fourth component of the document id
# @return [Document] built from the persistence path, the id, the stored
#   repository entry and the global annotations
# @raise [RuntimeError] when the repository does not include the joined id
def document(namespace, id, type, hash)
  docid = [namespace, id, type, hash].join(":")

  unless @document_repo.include?(docid)
    raise "Document '#{ docid }' was not found."
  end

  storage = persistence_for(docid)
  content = @document_repo[docid]
  Document.new(storage, docid, content, @global_annotations)
end

#exists?(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Boolean

Returns:

  • (Boolean)


63
64
65
# File 'lib/rbbt/corpus/corpus.rb', line 63

# Tells whether the repository holds at least one document matching the
# given key components.
#
# Only the matching document ids are requested from the repository; no
# +Document+ is built and no stored entry is read, since presence is all
# that is being asked for.
#
# @param namespace [Object, nil] namespace component forwarded to the repository
# @param id [Object, nil] id component forwarded to the repository
# @param type [Object, nil] type component forwarded to the repository
# @param hash [Object, nil] hash component forwarded to the repository
# @return [Boolean] true when the repository reports at least one match
def exists?(namespace = nil, id = nil, type = nil, hash = nil)
  @document_repo.find(namespace, id, type, hash).any?
end

#find(namespace = nil, id = nil, type = nil, hash = nil) ⇒ Object



51
52
53
54
55
# File 'lib/rbbt/corpus/corpus.rb', line 51

# Looks up documents by key components and wraps each match in a Document.
#
# @param namespace [Object, nil] namespace component forwarded to the repository
# @param id [Object, nil] id component forwarded to the repository
# @param type [Object, nil] type component forwarded to the repository
# @param hash [Object, nil] hash component forwarded to the repository
# @return [Array<Document>] one Document per id returned by the repository
def find(namespace = nil, id = nil, type = nil, hash = nil)
  matches = @document_repo.find(namespace, id, type, hash)

  matches.collect do |key|
    storage = persistence_for(key)
    content = @document_repo[key]
    Document.new(storage, key, content, @global_annotations)
  end
end

#find_docid(docid) ⇒ Object



57
58
59
60
61
# File 'lib/rbbt/corpus/corpus.rb', line 57

# Looks up documents through the repository's +find_docid+ and wraps each
# returned id in a Document.
#
# @param docid [Object] value forwarded to the repository's +find_docid+
# @return [Array<Document>] one Document per id returned by the repository
def find_docid(docid)
  matches = @document_repo.find_docid(docid)

  matches.collect do |key|
    storage = persistence_for(key)
    content = @document_repo[key]
    Document.new(storage, key, content, @global_annotations)
  end
end

#persistence_for(docid) ⇒ Object



31
32
33
# File 'lib/rbbt/corpus/corpus.rb', line 31

# Builds the persistence path for a document by joining the persistence
# directory with the document id.
#
# @param docid [String] document id used as the last path component
# @return [String] result of File.join on +persistence_dir+ and +docid+
def persistence_for(docid)
  base_dir = persistence_dir
  File.join(base_dir, docid)
end