Class: P3

Inherits:
Object
  • Object
show all
Defined in:
lib/paper/pdf/parser/P3.rb

Constant Summary collapse

# Matches the word "reference" or "references" in any mix of letter case,
# optionally preceded by newlines and optionally followed by a run of spaces
# or a run of newlines, ending at an end of line. fetchReference uses it to
# find the pages that contain such a heading.
REFERENCE_START_REGEXP =
Regexp.new('\n*[rR][eE][fF][eE][rR][eE][nN][cC][eE][sS]?( +|\n+)?$')
# Captures one square-bracketed token: either one or two digits ("[7]",
# "[42]") or any non-empty bracketed text matched non-greedily ("[Knu84]").
# fetchReference inserts a newline in front of every such token.
REFERENCE_REGEXP =
Regexp.new('(\[[0-9]?[0-9]\]|\[.+?\])')

Class Method Summary collapse

Class Method Details

.convertSingleColPdf(file_name) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/paper/pdf/parser/P3.rb', line 54

# Runs the external +k2pdfopt+ tool on a PDF and returns the path found on
# the output line that contains "written".
#
# The tool is driven through a pseudo-terminal: once its interactive menu
# prompt shows up (or 10 seconds pass) an empty line is sent, then the tool's
# output is echoed to stdout line by line until a line containing "written"
# appears.
#
# @param file_name [String] path of the PDF to convert
# @return [String, nil] the text extracted from the "written" line, or nil
#   when the output ends without such a line
def self.convertSingleColPdf(file_name)
  # Argument-vector form: no shell parses the command, so spaces or shell
  # metacharacters in file_name can neither split nor extend it.
  PTY.spawn('k2pdfopt', '-dev', 'kpw', file_name.to_s) do |i, o|
    o.sync = true
    i.expect(/\S.*Enter option above \(h=help, q=quit\):/, 10) do
      o.puts "\n"
      o.flush
    end
    begin
      until i.eof?
        res = i.gets
        print res
        next unless res.include?('written')

        # NOTE(review): inside the character class `\a-z` appears to be read
        # as a range starting at the bell character, and the `.` before
        # `pdf` is unescaped. The pattern is left untouched because the
        # result is used as a file path afterwards -- confirm against real
        # k2pdfopt output before tightening it.
        return res.match(/\/[\a-zA-Z0-9_]+.pdf/).to_s
      end
    rescue Errno::EIO
      # Reading the pty after the child has exited raises EIO on some
      # platforms (e.g. Linux); treat it as the end of the output.
      nil
    end
  end
end

.fetchFromPdfPath(path, work_dir = true, use_dir = true) ⇒ Object Also known as: parse



106
107
108
109
110
111
112
113
114
115
# File 'lib/paper/pdf/parser/P3.rb', line 106

# Converts the PDF at +path+ with convertSingleColPdf, extracts its reference
# entries with fetchReference, and finally removes the working files of the
# current job (@job_id).
#
# Cleanup runs even when conversion or extraction raises, so a failed job
# does not leave its files behind. It is skipped when no job id has been set:
# with a nil id the directory path would collapse to "#{work_dir}/" and
# removeDir would delete the entire working directory.
#
# @param path [String] path of the PDF to parse
# @param work_dir directory that holds the job's working files
# @param use_dir [Boolean, nil] true or nil: the job has its own directory,
#   removed with removeDir; false: the job's files are removed with removeFile
# @return [Array<String>] the entries returned by fetchReference
def self.fetchFromPdfPath(path, work_dir=true, use_dir=true)
  executed_pdf = convertSingleColPdf(path)
  fetchReference(executed_pdf)
ensure
  unless @job_id.nil?
    if use_dir || use_dir.nil?
      removeDir(@job_id, work_dir)
    else
      removeFile(@job_id, work_dir)
    end
  end
end

.fetchFromPdfUrl(pdfUrl, work_dir = true, use_dir = true) ⇒ Object



117
118
119
120
121
122
123
# File 'lib/paper/pdf/parser/P3.rb', line 117

# Downloads the PDF at +pdfUrl+ into a freshly named working file and parses
# it.
#
# A new job id is generated and stored in @job_id, the job directory is
# created when +use_dir+ is true or nil, the PDF is fetched with
# fetchPdfFile, and the local file is handed to +parse+.
#
# @param pdfUrl location of the PDF to download
# @param work_dir directory under which the job's files are placed
# @param use_dir [Boolean, nil] true or nil: give the job its own directory
# @return whatever +parse+ returns for the downloaded file
def self.fetchFromPdfUrl(pdfUrl, work_dir=true, use_dir=true)
  @job_id = makeId
  needs_dir = use_dir.nil? || use_dir
  makeDir(@job_id, work_dir) if needs_dir

  local_pdf = makeFile(@job_id, work_dir, use_dir)
  fetchPdfFile(pdfUrl, local_pdf)
  self.parse(local_pdf, work_dir, use_dir)
end

.fetchPdfFile(pdfUrl, file_name) ⇒ Object



46
47
48
49
50
51
52
# File 'lib/paper/pdf/parser/P3.rb', line 46

# Copies the resource at +pdfUrl+ into the local file +file_name+ in binary
# mode.
#
# URI.open (from open-uri) is used instead of Kernel#open: since Ruby 3.0
# Kernel#open no longer opens URLs. File.open is used for the destination so
# that a name starting with "|" is never run as a command. The data is
# streamed with IO.copy_stream rather than read into memory in one piece.
#
# NOTE(review): this assumes the file already loads open-uri, which the
# previous Kernel#open call on a URL needed as well -- confirm.
#
# @param pdfUrl [String] URL of the PDF; per the open-uri documentation a
#   string that is not a scheme:// URL is passed on to Kernel#open
# @param file_name [String] path of the file to write
# @return [Integer] number of bytes written
def self.fetchPdfFile(pdfUrl, file_name)
  File.open(file_name, 'wb') do |out|
    URI.open(pdfUrl) do |data|
      IO.copy_stream(data, out)
    end
  end
end

.fetchReference(file_name) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/paper/pdf/parser/P3.rb', line 72

# Extracts the reference entries from a PDF.
#
# Steps:
# 1. find the lowest-numbered page whose text matches REFERENCE_START_REGEXP;
# 2. take the text of that page and of every later page, collapsing blank
#    lines and runs of spaces and removing "-\n " sequences;
# 3. drop the first of those pages (the one containing the heading);
# 4. put a newline in front of every REFERENCE_REGEXP match, split the text
#    on blank lines, strip remaining newlines and keep only entries longer
#    than 15 characters.
#
# The file name is printed to stdout before parsing.
#
# @param file_name [String] path of the PDF to read
# @return [Array<String>] the extracted entries; empty when no page matches
#   REFERENCE_START_REGEXP (previously this case raised ArgumentError from
#   comparing a page number with nil)
def self.fetchReference(file_name)
  puts file_name
  pages = PDF::Reader.new(file_name).pages

  first_ref_page = pages.
      select { |page| page.text.index(REFERENCE_START_REGEXP) }.
      map(&:number).
      min
  return [] if first_ref_page.nil?

  page_texts = pages.
      select { |page| page.number >= first_ref_page }.
      map { |page| page.text.gsub(/\n\n+/, "\n").gsub(/ +/, ' ').gsub(/-\n +/, '') }

  # The page that carries the heading itself is skipped; only the pages
  # after it contribute entries.
  marked = page_texts.
      drop(1).
      join(' ').
      gsub(REFERENCE_REGEXP, "\n\\1")

  marked.
      split(/\n *\n/).
      map { |entry| entry.gsub("\n", '') }.
      select { |entry| entry.length > 15 }
end

.getK2Pdf(id, work_dir, use_dir) ⇒ Object



33
34
35
36
37
38
39
# File 'lib/paper/pdf/parser/P3.rb', line 33

# Builds the path of the k2pdfopt output file for a job.
#
# A nil +use_dir+ is treated like true, matching makeFile, fetchFromPdfPath
# and fetchFromPdfUrl; previously nil selected the flat layout here while the
# input file was placed in the directory layout.
#
# @param id job id
# @param work_dir directory that holds the job's files
# @param use_dir [Boolean, nil] true or nil: "<work_dir>/<id>/output_k2opt.pdf";
#   false: "<work_dir>/<id>-output_k2opt.pdf"
# @return [String] the output file path
def self.getK2Pdf(id, work_dir, use_dir)
  if use_dir || use_dir.nil?
    "#{work_dir}/#{id}/output_k2opt.pdf"
  else
    "#{work_dir}/#{id}-output_k2opt.pdf"
  end
end

.makeDir(id, work_dir) ⇒ Object



17
18
19
# File 'lib/paper/pdf/parser/P3.rb', line 17

# Creates the directory "<work_dir>/<id>" for a job.
#
# @param id job id, used as the directory name
# @param work_dir parent directory
# @return [Integer] the result of Dir.mkdir
def self.makeDir(id, work_dir)
  job_dir = "#{work_dir}/#{id}"
  Dir.mkdir(job_dir)
end

.makeFile(id, work_dir, use_dir) ⇒ Object



25
26
27
28
29
30
31
# File 'lib/paper/pdf/parser/P3.rb', line 25

# Builds the path of the downloaded PDF for a job. Only the path string is
# produced; nothing is created on disk.
#
# @param id job id
# @param work_dir directory that holds the job's files
# @param use_dir [Boolean, nil] true or nil: "<work_dir>/<id>/output.pdf";
#   false: "<work_dir>/<id>-output.pdf"
# @return [String] the file path
def self.makeFile(id, work_dir, use_dir)
  separator = (use_dir.nil? || use_dir) ? '/' : '-'
  "#{work_dir}/#{id}#{separator}output.pdf"
end

.makeId ⇒ Object



13
14
15
# File 'lib/paper/pdf/parser/P3.rb', line 13

# Generates an identifier for a job: the SHA-256 hex digest of a seed.
#
# The seed combines the current time down to nanoseconds, the process id and
# a random number. A timestamp with one-second resolution alone gave the same
# id to every call made within the same second.
#
# @return [String] 64 lowercase hexadecimal characters
def self.makeId
  seed = "#{Time.now.strftime('%F %H:%M:%S.%N')}-#{Process.pid}-#{rand}"
  Digest::SHA256.hexdigest(seed)
end

.removeDir(id, work_dir) ⇒ Object



21
22
23
# File 'lib/paper/pdf/parser/P3.rb', line 21

# Recursively removes the job directory "<work_dir>/<id>".
#
# A nil or empty +id+ is ignored: the path would otherwise collapse to
# "<work_dir>/" and the whole working directory would be deleted.
#
# @param id job id, used as the directory name
# @param work_dir parent directory
# @return [Array] the result of FileUtils.rm_rf, or an empty array when the
#   call is skipped
def self.removeDir(id, work_dir)
  return [] if id.to_s.empty?

  FileUtils.rm_rf("#{work_dir}/#{id}")
end

.removeFile(id, work_dir) ⇒ Object



41
42
43
44
# File 'lib/paper/pdf/parser/P3.rb', line 41

# Deletes the two flat-layout files of a job: "<work_dir>/<id>-output.pdf"
# and then "<work_dir>/<id>-output_k2opt.pdf".
#
# The deletions are not guarded: File.delete raises when a file cannot be
# removed, and a failure on the first file leaves the second untouched.
#
# @param id job id
# @param work_dir directory that holds the job's files
# @return [Integer] the result of the second File.delete
def self.removeFile(id, work_dir)
  base = "#{work_dir}/#{id}-output"
  File.delete("#{base}.pdf")
  File.delete("#{base}_k2opt.pdf")
end