Module: PacerXml::Sample

Defined in:
lib/pacer-xml/sample.rb

Class Method Summary collapse

Class Method Details

.cleanup(fn = nil) ⇒ Object



80
81
82
83
84
# File 'lib/pacer-xml/sample.rb', line 80

# Deletes every entry under /tmp whose name starts with the prefix of +fn+,
# where the prefix is the text before the first underscore.
#
# @param fn [String, nil] name of the form "<prefix>_<rest>"; when nil the
#   value returned by +a_week+ is used instead
# @return [Array<String>] the paths matched by the glob (each was passed to
#   File.delete)
# @raise [ArgumentError] if +fn+ yields an empty prefix (e.g. "" or "_x")
def cleanup(fn = nil)
  fn ||= a_week
  name = fn.split('_').first
  # An empty prefix would turn the glob below into "/tmp/*" and delete
  # files that have nothing to do with the sample download.
  if name.nil? || name.empty?
    raise ArgumentError, "cannot derive a file prefix from #{fn.inspect}"
  end
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
end

.importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil) ⇒ Object

Sample showing how to use the XML import function with some advanced options that clean up the resulting graph.

Import can successfully be run with no options specified, but this patent xml is particularly hairy.



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/pacer-xml/sample.rb', line 52

# Builds an import route for the sample patent XML, using a few of the
# importer's options (html fields, element renames, cache stats).
#
# graph      - graph to import into; Pacer.tg is used when nil.
# fn         - passed straight through to +xml+.
# start_rule - passed straight through to +xml+.
# end_rule   - passed straight through to +xml+.
#
# Returns whatever the route's +import+ call returns.
def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
  html_elements = [:abstract]

  # Source element name => name to use in the graph.
  renamed_elements = {
    'classification-national'     => 'classification',
    'assistant-examiner'          => 'examiner',
    'primary-examiner'            => 'examiner',
    'us-term-of-grant'            => 'term',
    'addressbook'                 => 'entity',
    'document-id'                 => 'document',
    'us-related-documents'        => 'related-document',
    'us-patent-grant'             => 'patent-version',
    'us-bibliographic-data-grant' => 'patent'
  }
  cache_options = { stats: true }

  target = graph || Pacer.tg
  target.create_key_index(:type, :vertex)

  # Print one dot per element flowing through the route as a progress marker.
  source = xml(fn, start_rule, end_rule)
  with_progress = source.process { print '.' }
  with_progress.import(target, html: html_elements, rename: renamed_elements, cache: cache_options)
end

.load_100(*args) ⇒ Object

Despite its name, this actually loads 101 records, which is a side effect of prefetching. To avoid that, define the route as: xml_route.limit(100).import(…)



9
10
11
12
13
# File 'lib/pacer-xml/sample.rb', line 9

# Runs the sample import capped with limit(100) and returns the graph.
#
# args - forwarded unchanged to +importer+.
#
# Returns the +graph+ reported by the limited import route.
def load_100(*args)
  limited_import = importer(*args).limit(100)
  limited_import.run!
  limited_import.graph
end

.load_all(graph = nil, *args) ⇒ Object

Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.

Alternatively, to start the JVM with more memory, try: bundle exec jruby -J-Xmx2048m -S irb



21
22
23
24
25
26
27
28
# File 'lib/pacer-xml/sample.rb', line 21

# Imports the complete sample file and returns the resulting graph.
#
# When no graph is given, a Neo4j graph named "sample.<n>.graph" is created,
# where <n> is derived from the current time. The pacer-neo4j gem is loaded
# only in that case, so callers who pass their own graph do not need it
# installed.
#
# graph - graph to import into, or nil to create the default Neo4j graph.
# args  - forwarded to +importer+ after the graph.
#
# Returns the +graph+ reported by the import route.
def load_all(graph = nil, *args)
  unless graph
    # Lazy-load: only the default graph needs the Neo4j adapter.
    require 'pacer-neo4j'
    n = Time.now.to_i % 1_000_000
    graph = Pacer.neo4j "sample.#{n}.graph"
  end
  i = importer(graph, *args)
  i.run!
  i.graph
end

.structure(g) ⇒ Object



30
31
32
# File 'lib/pacer-xml/sample.rb', line 30

# Delegates to Pacer::Utils::GraphAnalysis.structure for the given graph.
#
# g - the graph to analyse.
#
# Returns whatever GraphAnalysis.structure returns.
def structure(g)
  analysis = Pacer::Utils::GraphAnalysis
  analysis.structure(g)
end

.structure!(g, fn = 'patent-structure.graphml') ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/pacer-xml/sample.rb', line 34

# Computes the structure of +g+ and, unless +fn+ is nil/false, exports it
# with Pacer::Utils::YFilesExport and prints the file name that was written.
#
# g  - the graph to analyse (handed to +structure+).
# fn - export target; pass nil to skip the export step.
#
# Returns the structure object produced by +structure+.
def structure!(g, fn = 'patent-structure.graphml')
  summary = structure(g)
  return summary unless fn

  exporter = Pacer::Utils::YFilesExport.new
  # Label vertices and edges with the names supplied by the structure object.
  exporter.vertex_label = summary.vertex_name
  exporter.edge_label = summary.edge_name
  exporter.export(summary, fn)
  puts
  puts "Wrote #{ fn }"
  summary
end

.xml(fn = nil, *args) ⇒ Object



74
75
76
77
78
# File 'lib/pacer-xml/sample.rb', line 74

# Builds a Pacer XML route over a patent grant file.
#
# fn   - name handed to +download_patent_grant+; defaults to +a_week+ when nil.
# args - extra arguments forwarded to Pacer.xml after the path.
#
# Returns whatever Pacer.xml returns.
def xml(fn = nil, *args)
  source_name = fn || a_week
  local_path = download_patent_grant(source_name)
  Pacer.xml(local_path, *args)
end