Module: PacerXml::Sample
- Defined in:
- lib/pacer-xml/sample.rb
Class Method Summary
- .cleanup(fn = nil) ⇒ Object
-
.importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil) ⇒ Object
Sample of using the xml import function with some advanced options to clean up the resulting graph.
-
.load_100(*args) ⇒ Object
Will actually load 101 items rather than 100, as a side-effect of prefetching.
-
.load_all(graph = nil, *args) ⇒ Object
Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.
- .structure(g) ⇒ Object
- .structure!(g, fn = 'patent-structure.graphml') ⇒ Object
- .xml(fn = nil, *args) ⇒ Object
Class Method Details
.cleanup(fn = nil) ⇒ Object
80 81 82 83 84 |
# File 'lib/pacer-xml/sample.rb', line 80
#
# Deletes the files in /tmp whose names start with the prefix of +fn+, i.e.
# the part of +fn+ that precedes its first underscore.
#
# @param fn [String, nil] a name of the form "<prefix>_<rest>"; when nil, the
#   value returned by +a_week+ (defined elsewhere) is used instead.
# @return [Array<String>] the paths matched by the glob (each one is deleted).
# @raise [ArgumentError] if no non-empty prefix can be derived from +fn+.
def cleanup(fn = nil)
  fn ||= a_week
  # Only the text before the first underscore is used; the rest is ignored.
  name = fn.split('_').first
  # An empty prefix would turn the glob below into "/tmp/*" and delete every
  # file in /tmp, so refuse to continue instead.
  if name.nil? || name.empty?
    raise ArgumentError, "cannot derive a file prefix from #{fn.inspect}"
  end
  Dir["/tmp/#{name}*"].each { |f| File.delete f }
end
.importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil) ⇒ Object
Sample of using the xml import function with some advanced options to clean up the resulting graph.
The import runs successfully with no options specified, but this patent XML is particularly hairy, so the options are used here to tidy up the result.
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/pacer-xml/sample.rb', line 52
#
# Sample use of the XML import with extra options (html:, rename:, cache:)
# intended to clean up the resulting graph.
#
# @param graph [Object, nil] target graph; when nil, +Pacer.tg+ supplies one.
# @param fn [String, nil] forwarded unchanged to +xml+.
# @param start_rule [Object, nil] forwarded unchanged to +xml+.
# @param end_rule [Object, nil] forwarded unchanged to +xml+.
# @return [Object] whatever +import+ returns (callers in this file invoke
#   +limit+, +run!+ and +graph+ on it).
def importer(graph = nil, fn = nil, start_rule = nil, end_rule = nil)
  # Element names handed to the import's html: option.
  html_elements = [:abstract]

  # Original element name => replacement name, handed to the rename: option.
  renames = {
    'classification-national'     => 'classification',
    'assistant-examiner'          => 'examiner',
    'primary-examiner'            => 'examiner',
    'us-term-of-grant'            => 'term',
    'addressbook'                 => 'entity',
    'document-id'                 => 'document',
    'us-related-documents'        => 'related-document',
    'us-patent-grant'             => 'patent-version',
    'us-bibliographic-data-grant' => 'patent'
  }

  cache_options = { stats: true }

  graph ||= Pacer.tg
  graph.create_key_index(:type, :vertex)

  # Print a dot for each element that passes through the route.
  dotted_route = xml(fn, start_rule, end_rule).process { print '.' }
  dotted_route.import(graph, html: html_elements, rename: renames, cache: cache_options)
end
.load_100(*args) ⇒ Object
Will actually load 101. To avoid this side-effect of prefetching, the route should be defined as: xml_route.limit(100).import(…)
9 10 11 12 13 |
# File 'lib/pacer-xml/sample.rb', line 9
#
# Runs the sample import with a limit of 100 applied after the import step
# and returns the graph it populated.
#
# @param args [Array] forwarded unchanged to +importer+.
# @return [Object] the +graph+ of the limited import route.
def load_100(*args)
  limited_route = importer(*args).limit(100)
  limited_route.run!
  limited_route.graph
end
.load_all(graph = nil, *args) ⇒ Object
Uses a Neo4j graph because the data is too big to fit in memory without configuring the JVM to use more than its small default footprint.
Alternatively, to start the JVM with more memory, try: bundle exec jruby -J-Xmx2048m -S irb
21 22 23 24 25 26 27 28 |
# File 'lib/pacer-xml/sample.rb', line 21
#
# Runs the full sample import and returns the graph it populated.
#
# @param graph [Object, nil] target graph; when nil, a Neo4j graph named
#   "sample.<n>.graph" is opened, where <n> is the current epoch second
#   modulo one million.
# @param args [Array] forwarded to +importer+ after the graph.
# @return [Object] the +graph+ of the import route.
def load_all(graph = nil, *args)
  # Loaded lazily so pacer-neo4j is only needed when this method is called.
  require 'pacer-neo4j'
  suffix = Time.now.to_i % 1_000_000
  graph ||= Pacer.neo4j("sample.#{suffix}.graph")
  route = importer(graph, *args)
  route.run!
  route.graph
end
.structure(g) ⇒ Object
30 31 32 |
# File 'lib/pacer-xml/sample.rb', line 30
#
# Delegates to +Pacer::Utils::GraphAnalysis.structure+ for the given graph.
#
# @param g [Object] the graph to analyse.
# @return [Object] the result of the graph analysis.
def structure(g)
  analysis = Pacer::Utils::GraphAnalysis
  analysis.structure(g)
end
.structure!(g, fn = 'patent-structure.graphml') ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/pacer-xml/sample.rb', line 34
#
# Computes the structure of +g+ and, unless +fn+ is nil or false, exports it
# to that file with +Pacer::Utils::YFilesExport+.
#
# @param g [Object] the graph to analyse.
# @param fn [String, nil, false] output file name; a falsy value skips the
#   export and the message.
# @return [Object] the structure returned by +structure+.
def structure!(g, fn = 'patent-structure.graphml')
  summary = structure(g)
  return summary unless fn

  exporter = Pacer::Utils::YFilesExport.new
  exporter.vertex_label = summary.vertex_name
  exporter.edge_label = summary.edge_name
  exporter.export(summary, fn)

  # Blank line first, then the confirmation message.
  puts
  puts "Wrote #{ fn }"
  summary
end