Module: PDF

Extended by:
PDF
Includes:
Utils
Included in:
PDF
Defined in:
lib/pdf.rb

Instance Method Summary collapse

Methods included from Utils

#breaklines, #clean_text, #detect_sections_from_html, #detect_utf8, #end_mark?, #escape_html, #extract_keywords_from_path, #fixed_page_break, #guess_content_line_length, #line_closed?, #make_destination_dir, #merge_para_part, #scan_file_from_dir, #source_exists?, #text_similarity, #text_to_array, #to_utf8, #walk_dir, #wrapper_html, #write_file

Instance Method Details

#extract_illustrations(filename, options = {}) ⇒ Object

extract_illustrations

parameters:

+filename+   pdf


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/pdf.rb', line 86

# extract_illustrations
#
# Dumps the images embedded in a PDF into a directory with +pdfimages+,
# asks +mogrify+ to convert the dumped .ppm files to PNG, and lists the
# resulting PNG files.
#
# parameters:
#
# +filename+   path of the pdf
# +options+    :dir - output directory; defaults to the pdf's basename
#              without the '.pdf' extension (relative to the current directory)
#
# Returns the PNG file names found in the output directory. The names are
# relative to that directory, not full paths.
#
# NOTE(review): with an output prefix of "<dir>/" and the -p flag the files
# are presumably named "-<page>-<n>.png"; extract_page_illustrations relies on
# that shape - confirm against the installed pdfimages version.
def extract_illustrations(filename,options={})
  tmp_dir = options[:dir] || File.basename(filename,'.pdf')
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
  Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

  # Argument-list form of system: no shell is involved, so file names that
  # contain quotes, spaces or shell metacharacters are passed through verbatim.
  system('pdfimages', '-p', filename, "#{tmp_dir}/")
  # The glob is handed to mogrify unexpanded, exactly as the quoted shell
  # string did before.
  system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")

  # Block form restores the previous working directory even if globbing raises.
  Dir.chdir(tmp_dir) { Dir.glob('*.png') }
end

#extract_page_illustrations(illustrations, index) ⇒ Object



116
117
118
119
120
121
122
123
124
# File 'lib/pdf.rb', line 116

# extract_page_illustrations
#
# Picks the illustrations that belong to one page.
#
# parameters:
#
# +illustrations+  image file names; the text between the first and second
#                  "-" is read as the page number (a missing or non-numeric
#                  part counts as 0)
# +index+          page number to match
#
# Returns a new array with the matching names, in their original order.
def extract_page_illustrations(illustrations,index)
  illustrations.select { |image_path| image_path.split("-")[1].to_i == index }
end

#extract_pdf_meta(filename) ⇒ Object

extract_pdf_meta

parameters:

+filename+   pdf


58
59
60
61
62
63
64
# File 'lib/pdf.rb', line 58

# extract_pdf_meta
#
# Reads the author and title of a PDF through Poppler.
#
# parameters:
#
# +filename+   pdf
#
# Returns a hash with the keys :author and :title.
def extract_pdf_meta(filename)
  document = Poppler::Document.new(filename)
  {
    :author => document.author,
    :title  => document.title
  }
end

#extract_pdf_pages_text(filename) ⇒ Object

extract_pdf_pages_text

parameters:

+filename+   pdf


25
26
27
28
29
30
31
32
33
# File 'lib/pdf.rb', line 25

# extract_pdf_pages_text
#
# Opens the PDF with PDF::Reader and collects the text of every page.
#
# parameters:
#
# +filename+   pdf
#
# Returns an array with one text string per page, in page order.
def extract_pdf_pages_text(filename)
  reader = PDF::Reader.new(filename)
  reader.pages.map { |page| page.text }
end

#extract_sections(filename) ⇒ Object

extract_sections

parameters:

+filename+   pdf


70
71
72
73
74
75
76
77
78
# File 'lib/pdf.rb', line 70

# extract_sections
#
# Builds the section list of a PDF by walking its Poppler index iterator
# with walk_index.
#
# parameters:
#
# +filename+   pdf
#
# Returns the array of section hashes filled in by walk_index. This is
# best-effort: any StandardError raised while opening the document or walking
# the index is swallowed, and whatever was collected up to that point
# (possibly an empty array) is returned.
def extract_sections(filename)
  sections = []
  begin
    document = Poppler::Document.new(filename)
    walk_index(Poppler::IndexIter.new(document), sections)
  rescue
    # Deliberately ignored: fall through and return what we have so far.
  end
  sections
end

#fixed_break_of_cross_page(pages, length = 80) ⇒ Object



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/pdf.rb', line 175

# fixed_break_of_cross_page
#
# Repairs paragraphs that were cut in two by a page boundary. For each pair
# of neighbouring pages, when the last line of the first page is not closed
# according to Utils.end_mark?, the first line of the next page is merged
# into it with Utils.merge_para_part and removed from the next page.
#
# parameters:
#
# +pages+    array of page texts; it is modified in place
# +length+   accepted for compatibility, not used by this method
#
# Returns the same +pages+ array.
def fixed_break_of_cross_page(pages,length=80)
  (0...(pages.count - 1)).each do |page_no|
    upper_lines = pages[page_no].split("\n")
    lower_lines = pages[page_no + 1].split("\n")
    # Nothing to stitch when either side of the boundary has no lines.
    next if upper_lines.empty? || lower_lines.empty?

    dangling = upper_lines.last
    next if Utils.end_mark?(dangling)

    # Move the continuation from the top of the next page onto this page.
    upper_lines[-1] = Utils.merge_para_part(dangling, lower_lines.shift)
    pages[page_no] = upper_lines.join("\n")
    pages[page_no + 1] = lower_lines.join("\n")
  end
  pages
end

#fixed_break_with_pages_text(pages_text) ⇒ Object



102
103
104
105
106
# File 'lib/pdf.rb', line 102

# fixed_break_with_pages_text
#
# Re-flows the line breaks of every page and then stitches paragraphs that
# span a page boundary.
#
# The reference line length is half of the largest value that
# Utils.guess_content_line_length reports for any page. Each page is passed
# through Utils.fixed_page_break with that length, and the result is handed
# to fixed_break_of_cross_page.
#
# parameters:
#
# +pages_text+   array of page texts
#
# Returns the array of repaired page texts. When +pages_text+ is empty, or no
# page yields a line-length guess, there is nothing to calibrate against and
# a copy of the input is returned untouched (this used to raise
# NoMethodError on nil).
def fixed_break_with_pages_text(pages_text)
  longest = pages_text.map { |text| Utils.guess_content_line_length(text) }.compact.max
  return pages_text.dup if longest.nil?

  line_length = longest * 0.5
  rebroken = pages_text.map { |page_text| Utils.fixed_page_break(page_text, :length => line_length) }
  fixed_break_of_cross_page(rebroken, line_length)
end

#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/pdf.rb', line 135

# gen_html_from_page_text
#
# Renders one page as an HTML fragment.
#
# Every non-blank line becomes either an <h2> (when
# HeaderDetect.guess_header? accepts it) or a <p class='division'>. The line
# text is passed through Utils.clean_text and Utils.escape_html first. The
# page's illustrations are appended as <img> paragraphs after the text.
#
# parameters:
#
# +page_text+      text of the page, lines separated by "\n"
# +illustrations+  image paths to embed; they are inserted into the src
#                  attribute as given, without escaping
# +options+        :index - page index, used for the div name and h2 ids
#
# Returns the HTML string for the page, wrapped in <div class='page'>.
def gen_html_from_page_text(page_text,illustrations,options={})
  page_index = options[:index]
  render = lambda { |raw| Utils.escape_html(Utils.clean_text(raw)) }

  text_parts = []
  page_text.split("\n").each_with_index do |line, line_no|
    next unless line.present?

    text_parts <<
      if HeaderDetect.guess_header?(line)
        # Heading ids are "<page index>_<line index within the page>".
        "<h2 id='#{page_index}_#{line_no}'>#{render.call(line)}</h2>"
      else
        "<p class='division'>#{render.call(line)}</p>"
      end
  end

  image_parts = illustrations.map do |image_path|
    "<p class='division'><img src='#{image_path}' /></p>"
  end

  "<div class='page' name='#{page_index}' >#{text_parts.join("")}#{image_parts.join("")}</div>"
end

#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object



126
127
128
129
130
131
132
133
# File 'lib/pdf.rb', line 126

# gen_html_from_page_texts
#
# Renders every page with gen_html_from_page_text and concatenates the
# results.
#
# parameters:
#
# +page_texts+     array of page texts
# +illustrations+  all illustration names; extract_page_illustrations picks
#                  the ones for each page, using the page's zero-based
#                  position in +page_texts+
# +options+        forwarded to gen_html_from_page_text with :index set to
#                  the page position
#
# Returns the concatenated HTML string ("" when there are no pages).
def gen_html_from_page_texts(page_texts,illustrations,options={})
  rendered = page_texts.each_with_index.map do |page_text, page_no|
    gen_html_from_page_text(
      page_text,
      extract_page_illustrations(illustrations, page_no),
      options.merge(:index => page_no)
    )
  end
  rendered.join("")
end

#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object



108
109
110
111
112
113
114
# File 'lib/pdf.rb', line 108

# gen_html_from_sections_and_page_texts
#
# Renders the pages as HTML via gen_html_from_page_texts.
#
# parameters:
#
# +sections+       section list of the document; currently ignored
# +page_texts+     array of page texts
# +illustrations+  illustration names for the whole document
#
# Returns the HTML string produced by gen_html_from_page_texts.
#
# TODO: the page numbers stored in +sections+ are not accurate, so the
# sections are not used for now. The former empty/non-empty branches were
# identical and have been collapsed into this single call.
def gen_html_from_sections_and_page_texts(sections,page_texts,illustrations)
  gen_html_from_page_texts(page_texts,illustrations)
end


213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/pdf.rb', line 213

# guess_footer_row_count
#
# Guesses how many trailing lines of every page are footer lines.
#
# Starting from the last line of each page and moving upwards, the lines at
# the same distance from the bottom are collected across all pages and
# checked with guess_footer_line?. Counting stops at the first row that is
# not recognised as a footer.
#
# parameters:
#
# +pages_text+   array of page texts
#
# Returns 0, 1 or 2. More than two footer-looking rows is treated as a
# failed guess and reported as 0.
def guess_footer_row_count(pages_text)
  max_rows = 2
  pages_lines = pages_text.map { |page_text| page_text.split("\n") }

  # Probe at most max_rows + 1 rows. Any larger count is discarded below, so
  # probing further cannot change the result, and the bound guarantees the
  # loop ends even if rows made only of missing lines keep being accepted.
  footer_rows = (0..max_rows).take_while do |row|
    guess_footer_line?(pages_lines.map { |page_lines| page_lines[-row - 1] })
  end

  footer_rows.count > max_rows ? 0 : footer_rows.count
end

#guess_header_line?(lines) ⇒ Boolean Also known as: guess_footer_line?

猜测是否是页眉/页脚行。猜测规则:

1. 


231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/pdf.rb', line 231

# guess_header_line?
#
# Guesses whether a row of lines, one taken from the same position on each
# page, is a page header (or footer) row.
#
# The row is accepted as soon as one of these rules holds for more than half
# of the lines:
#
# 1. neighbouring pages carry similar text (Utils.text_similarity > 0.7)
# 2. pages two apart carry similar text - presumably for headers that
#    alternate between odd and even pages
# 3. the line starts with a positive number - presumably a page number
#
# parameters:
#
# +lines+   array of lines; blank or missing entries are treated as nil,
#           the others are stripped before comparison
#
# Returns true or false; an empty +lines+ array gives false.
def guess_header_line?(lines)
  return false if lines.empty?

  stripped = lines.map { |line| line.strip if line.present? }
  total = stripped.count.to_f

  # Rules 1 and 2: compare each line with the one +gap+ pages further on.
  # The lookup past the end of the array yields nil and is still compared.
  [1, 2].each do |gap|
    matched = []
    stripped.each_with_index do |line, pos|
      matched.push(pos, pos + gap) if Utils.text_similarity(line, stripped[pos + gap]) > 0.7
    end
    return true if matched.uniq.count.to_f / total > 0.5
  end

  # Rule 3: lines whose integer value is positive.
  numbered = stripped.count { |line| line.to_i > 0 }
  numbered.to_f / total > 0.5
end

#guess_header_row_count(pages_text) ⇒ Object

猜测页眉/页脚的行数。页眉页脚有一定的规律:

1. 


201
202
203
204
205
206
207
208
209
210
211
# File 'lib/pdf.rb', line 201

# guess_header_row_count
#
# Guesses how many leading lines of every page are header lines.
#
# Starting from the first line of each page and moving downwards, the lines
# at the same position are collected across all pages and checked with
# guess_header_line?. Counting stops at the first row that is not recognised
# as a header.
#
# parameters:
#
# +pages_text+   array of page texts
#
# Returns 0, 1 or 2. More than two header-looking rows is treated as a
# failed guess and reported as 0.
def guess_header_row_count(pages_text)
  max_rows = 2
  pages_lines = pages_text.map { |page_text| page_text.split("\n") }

  # Probe at most max_rows + 1 rows. Any larger count is discarded below, so
  # probing further cannot change the result, and the bound guarantees the
  # loop ends even if rows made only of missing lines keep being accepted.
  header_rows = (0..max_rows).take_while do |row|
    guess_header_line?(pages_lines.map { |page_lines| page_lines[row] })
  end

  header_rows.count > max_rows ? 0 : header_rows.count
end

sanitize_page_header_and_footer

parameters:

+pdf_pages_text+  pdf


42
43
44
45
46
47
48
49
50
51
52
# File 'lib/pdf.rb', line 42

# sanitize_page_header_and_footer
#
# Strips the header and footer lines from every page.
#
# parameters:
#
# +pdf_pages_text+  array of page texts
# +options+         :header_rows_count - number of leading lines to drop;
#                   guessed with guess_header_row_count when not given
#                   :footer_rows_count - number of trailing lines to drop;
#                   guessed with guess_footer_row_count when not given
#
# Returns a new array of page texts. A page with too few lines to hold the
# header becomes an empty string.
def sanitize_page_header_and_footer(pdf_pages_text,options={})
  skip_top = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
  skip_bottom = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)
  body_range = skip_top..(-skip_bottom - 1)

  pdf_pages_text.map do |page_text|
    # Slicing past the end of the page yields nil; treat that as "no body".
    kept_lines = page_text.split("\n")[body_range] || []
    kept_lines.join("\n")
  end
end

#scan_pdf?(filename) ⇒ Boolean

scan_pdf?

parameters:

+filename+   pdf


12
13
14
15
16
17
18
19
# File 'lib/pdf.rb', line 12

# scan_pdf?
#
# Tells whether a PDF looks like a scan, i.e. it carries almost no text.
# The text of all pages is concatenated and stripped; fewer than 1000
# characters counts as a scan.
#
# parameters:
#
# +filename+   pdf
#
# Returns true or false for files with a .pdf extension (case-insensitive)
# and nil for any other file name.
def scan_pdf?(filename)
  return unless File.extname(filename).downcase == '.pdf'

  min_text_chars = 1000
  document = Poppler::Document.new(filename)
  extracted = document.map(&:get_text).join('')
  extracted.strip.length < min_text_chars
end

#walk_index(indexer, sections) ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
# File 'lib/pdf.rb', line 151

# walk_index
#
# Fills +sections+ from the top level of a document index.
#
# For each entry a hash with :title (cleaned with Utils.clean_text) and
# :page_num (taken from the entry's action destination) is stored at the
# entry's position. When the entry has a child iterator, its entries are
# collected with work_index and stored under :sub_sections.
#
# parameters:
#
# +indexer+    index iterator; must respond to each_with_index
# +sections+   array to fill; it is modified in place
def walk_index(indexer,sections)
  indexer.each_with_index do |entry, position|
    sections[position] = {
      :title => Utils.clean_text(entry.action.title),
      :page_num => entry.action.dest.page_num
    }

    nested = entry.child
    next if nested.nil?

    # Collect the children first, attach them only once that succeeded.
    sections[position][:sub_sections] = [].tap { |collected| work_index(nested, collected) }
  end
end

#work_index(child, sections) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
# File 'lib/pdf.rb', line 163

# work_index
#
# Fills +sections+ from a nested level of a document index, descending into
# further levels recursively.
#
# For each entry a hash with :title (cleaned with Utils.clean_text) and
# :page_num (taken from the entry's action destination) is stored at the
# entry's position. When the entry has a child iterator of its own, its
# entries are collected by a recursive call and stored under :sub_sections.
#
# parameters:
#
# +child+      index iterator of the nested level; must respond to
#              each_with_index
# +sections+   array to fill; it is modified in place
def work_index(child,sections)
  child.each_with_index do |entry, position|
    sections[position] = {
      :title => Utils.clean_text(entry.action.title),
      :page_num => entry.action.dest.page_num
    }

    deeper = entry.child
    next if deeper.nil?

    # Collect the children first, attach them only once that succeeded.
    sections[position][:sub_sections] = [].tap { |collected| work_index(deeper, collected) }
  end
end