Class: P3

Inherits:
Object
  • Object
show all
Defined in:
lib/arxiv/references/P3.rb

Constant Summary collapse

# Root URL of the arXiv site. Not referenced by any of the methods shown here.
BASE_URL =
"https://arxiv.org"
# Matches a line that ends with "reference" or "references" (each letter in
# either case), optionally followed by trailing spaces or newlines. Used by
# fetchReference to locate the first page that carries the references heading.
REFERENCE_START_REGEXP =
Regexp.new('\n*[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]?( +|\n+)?$')
# Captures a bracketed citation marker: a one- or two-digit number such as
# "[7]" / "[42]", or any other non-empty bracketed text such as "[Knu84]".
# fetchReference puts a newline in front of every match.
REFERENCE_REGEXP =
Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')

Class Method Summary collapse

Class Method Details

.convertSingleColPdf(job_id, work_dir, file_name, use_dir) ⇒ Object



52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/arxiv/references/P3.rb', line 52

# Runs the external +k2pdfopt+ tool (with +-dev kpw+) on +file_name+ inside a
# pseudo-terminal and returns the path where the converted PDF is expected.
#
# The tool is interactive: once its option menu prompt appears (or after a
# 10 second timeout) a bare newline is sent to start the conversion. Output is
# echoed to stdout until a line containing "written" is seen or the tool's
# output ends.
#
# @param job_id [String] job identifier, forwarded to getK2Pdf
# @param work_dir [String] working directory, forwarded to getK2Pdf
# @param file_name [String] path of the PDF to convert
# @param use_dir [Boolean] layout flag, forwarded to getK2Pdf
# @return [String] path built by getK2Pdf for the converted file
def self.convertSingleColPdf(job_id, work_dir, file_name, use_dir)
  # Pass the arguments separately so no shell is involved: a path containing
  # spaces or shell metacharacters reaches k2pdfopt as one literal argument.
  PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name) do |reader, writer|
    writer.sync = true
    # IO#expect yields even when the prompt never shows up (timeout), so the
    # newline is always sent.
    reader.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
      writer.puts "\n"
      writer.flush
    end

    begin
      while (line = reader.gets)
        print line
        break if line.include?('written')
      end
    rescue Errno::EIO
      # On Linux, reading a PTY whose child has exited raises EIO instead of
      # returning EOF; treat it as the end of the tool's output.
    end
  end
  getK2Pdf(job_id, work_dir, use_dir)
end

.fetchFromPdfUrl(pdfUrl, work_dir = true, use_dir = true) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/arxiv/references/P3.rb', line 102

# Downloads a PDF, converts it with k2pdfopt, extracts its reference entries
# and removes the intermediate files again.
#
# Cleanup runs in an +ensure+ block, so the job directory (or the two job
# files) is removed even when the download, conversion or parsing raises.
#
# @param pdfUrl [String] location of the PDF to download
# @param work_dir [String] directory that holds the intermediate files.
#   NOTE(review): the default +true+ is interpolated into paths as the literal
#   directory name "true"; callers should pass a real directory.
# @param use_dir [Boolean] when truthy a per-job sub-directory is created and
#   removed; otherwise job-prefixed files are written directly into +work_dir+
# @return [Array<String>] reference entries returned by fetchReference
def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
  job_id = makeId
  makeDir(job_id, work_dir) if use_dir

  begin
    file_name = makeFile(job_id, work_dir, use_dir)
    fetchPdfFile(pdfUrl, file_name)
    executed_pdf = convertSingleColPdf(job_id, work_dir, file_name, use_dir)
    fetchReference(executed_pdf)
  ensure
    if use_dir
      removeDir(job_id, work_dir)
    else
      begin
        removeFile(job_id, work_dir)
      rescue Errno::ENOENT
        # A step failed before both files existed; do not let the missing
        # file mask the original error.
      end
    end
  end
end

.fetchPdfFile(pdfUrl, file_name) ⇒ Object



44
45
46
47
48
49
50
# File 'lib/arxiv/references/P3.rb', line 44

# Downloads the document at +pdfUrl+ and writes it, in binary mode, to
# +file_name+.
#
# The location is resolved through URI rather than Kernel#open, so a string
# such as "|command" is rejected instead of being executed, and the download
# keeps working on Ruby versions where Kernel#open no longer accepts URLs.
# The location is validated before the output file is created.
#
# @param pdfUrl [String, #open] URL of the PDF (a scheme supported by
#   open-uri), or an object that already responds to +open+ (e.g. a URI)
# @param file_name [String] path of the file to write
# @return [Integer] number of bytes written
# @raise [URI::InvalidURIError] if +pdfUrl+ is not a parsable URI
# @raise [ArgumentError] if the URI scheme cannot be opened by open-uri
def self.fetchPdfFile(pdfUrl, file_name)
  # Strings do not publicly respond to #open; URI objects do once open-uri is
  # loaded. This mirrors the check open-uri itself performs.
  source = pdfUrl.respond_to?(:open) ? pdfUrl : URI.parse(pdfUrl.to_s)
  unless source.respond_to?(:open)
    raise ArgumentError, "unsupported PDF URL: #{pdfUrl}"
  end

  File.open(file_name, 'wb') do |out|
    source.open do |data|
      out.write(data.read)
    end
  end
end

.fetchReference(file_name) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/arxiv/references/P3.rb', line 69

# Extracts the reference entries from the PDF at +file_name+.
#
# The lowest-numbered page whose text matches REFERENCE_START_REGEXP marks the
# start of the references. The text of the pages after that page is
# normalised, joined, and split into entries at bracketed markers
# (REFERENCE_REGEXP) that begin a line. Entries of 15 characters or fewer are
# discarded.
#
# @param file_name [String] path of the PDF to read
# @return [Array<String>] one string per reference entry; empty when no page
#   contains a references heading
def self.fetchReference(file_name)
  pages = PDF::Reader.new(file_name).pages

  first_ref_page = pages
    .select { |page| page.text.index(REFERENCE_START_REGEXP) }
    .map(&:number)
    .min
  # Without a references heading there is nothing to extract (comparing page
  # numbers against nil would raise).
  return [] if first_ref_page.nil?

  page_texts = pages
    .select { |page| page.number >= first_ref_page }
    .map do |page|
      # Collapse blank lines and runs of spaces, then re-join words that were
      # hyphenated across a line break.
      page.text.gsub(/\n\n+/, "\n").gsub(/ +/, ' ').gsub(/-\n +/, '')
    end

  # NOTE(review): the page that carries the heading itself is dropped, so
  # entries printed on that same page are not returned; confirm this is
  # intended.
  page_texts.shift

  # A newline is inserted before every marker; a marker that already started a
  # line therefore produces a blank line, which is where entries are split.
  page_texts
    .join(' ')
    .gsub(REFERENCE_REGEXP, "\n\\1")
    .split(/\n *\n/)
    .map { |entry| entry.gsub("\n", '') }
    .select { |entry| entry.length > 15 }
end

.getK2Pdf(id, work_dir, use_dir) ⇒ Object



32
33
34
35
36
37
38
# File 'lib/arxiv/references/P3.rb', line 32

# Builds the path of the k2pdfopt output file for a job.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: the file lives in the job's own
#   sub-directory; otherwise it is a job-prefixed file directly in +work_dir+
# @return [String] "<work_dir>/<id>/output_k2opt.pdf" or
#   "<work_dir>/<id>-output_k2opt.pdf"
def self.getK2Pdf(id, work_dir, use_dir)
  job_base = "#{work_dir}/#{id}"
  return "#{job_base}/output_k2opt.pdf" if use_dir

  "#{job_base}-output_k2opt.pdf"
end

.makeDir(id, work_dir) ⇒ Object



16
17
18
# File 'lib/arxiv/references/P3.rb', line 16

# Creates the per-job sub-directory "<work_dir>/<id>".
#
# @param id [String] job identifier, used as the directory name
# @param work_dir [String] existing parent directory
# @return [Integer] 0, the result of Dir.mkdir
# @raise [SystemCallError] if the directory cannot be created (for example it
#   already exists or +work_dir+ is missing)
def self.makeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  Dir.mkdir(job_dir)
end

.makeFile(id, work_dir, use_dir) ⇒ Object



24
25
26
27
28
29
30
# File 'lib/arxiv/references/P3.rb', line 24

# Builds the path under which the downloaded PDF of a job is stored. Only the
# path is computed; nothing is created on disk.
#
# @param id [String] job identifier
# @param work_dir [String] working directory
# @param use_dir [Boolean] truthy: the file lives in the job's own
#   sub-directory; otherwise it is a job-prefixed file directly in +work_dir+
# @return [String] "<work_dir>/<id>/output.pdf" or "<work_dir>/<id>-output.pdf"
def self.makeFile(id, work_dir, use_dir)
  separator = use_dir ? '/' : '-'
  "#{work_dir}/#{id}#{separator}output.pdf"
end

.makeId ⇒ Object



12
13
14
# File 'lib/arxiv/references/P3.rb', line 12

# Generates the identifier used to name a job's working directory or files.
#
# The digest input combines the current time down to nanoseconds, the process
# id and a fresh random seed, so jobs started within the same second (or in
# parallel processes) get distinct ids instead of colliding on one path.
#
# @return [String] 64-character lowercase hexadecimal SHA-256 digest
def self.makeId
  seed = "#{Time.now.strftime('%F %H:%M:%S.%N')}-#{Process.pid}-#{Random.new_seed}"
  Digest::SHA256.hexdigest(seed)
end

.removeDir(id, work_dir) ⇒ Object



20
21
22
# File 'lib/arxiv/references/P3.rb', line 20

# Recursively deletes the per-job sub-directory "<work_dir>/<id>" and
# everything in it. A directory that does not exist is silently ignored
# (FileUtils.rm_rf semantics).
#
# @param id [String] job identifier, i.e. the directory name
# @param work_dir [String] parent directory
def self.removeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  FileUtils.rm_rf(job_dir)
end

.removeFile(id, work_dir) ⇒ Object



40
41
42
43
# File 'lib/arxiv/references/P3.rb', line 40

# Deletes the two job-prefixed files of a job kept directly in +work_dir+:
# "<id>-output.pdf" first, then "<id>-output_k2opt.pdf".
#
# @param id [String] job identifier
# @param work_dir [String] directory that contains the files
# @return [Integer] 1, the result of the last File.delete
# @raise [Errno::ENOENT] if a file is missing; when the first one is missing
#   the second is not attempted
def self.removeFile(id, work_dir)
  job_prefix = "#{work_dir}/#{id}-"
  File.delete(job_prefix + 'output.pdf')
  File.delete(job_prefix + 'output_k2opt.pdf')
end