Module: PDF

Extended by:
PDF
Includes:
Utils
Included in:
PDF
Defined in:
lib/pdf.rb

Instance Method Summary collapse

Methods included from Utils

#breaklines, #clean_text, #detect_sections_from_html, #detect_utf8, #end_mark?, #escape_html, #extract_keywords_from_path, #extract_text_from_file, #fixed_page_break, #guess_content_line_length, #line_closed?, #make_destination_dir, #merge_para_part, #scan_file_from_dir, #source_exists?, #text_similarity, #text_to_array, #timer, #to_utf8, #walk_dir, #wrapper_html, #write_file

Instance Method Details

#extract_illustrations(filename, options = {}) ⇒ Object

extract_illustrations

提取pdf文件中的插图

parameters:

+filename+   pdf文件
+options+    可选参数
   +dir+       插图存放的目录,默认存放在当前目录下与filename同名的子目录下。


86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/pdf.rb', line 86

# Extracts the illustrations embedded in a PDF file and converts them to PNG.
#
# Runs the external `pdfimages` and `mogrify` commands; their exit status is
# not checked, so a missing tool simply yields no new images.
#
# +filename+  path of the PDF file
# +options+   optional settings
#    +:dir+   directory that receives the images; defaults to a directory
#             named after +filename+ (without ".pdf") under the current
#             working directory
#
# Returns the names (relative to the target directory) of all "*.png" files
# found in that directory after the conversion.
def extract_illustrations(filename,options={})
  tmp_dir = options[:dir] || File.basename(filename,'.pdf')
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on old and new Rubies.
  Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

  # Argument-list form of system: no shell is involved, so quotes, spaces or
  # shell metacharacters in the paths cannot break or hijack the command.
  system('pdfimages', '-p', filename, "#{tmp_dir}/")
  # The wildcard is handed over literally, just like the single-quoted shell
  # string did before; expansion is presumably done by mogrify itself.
  system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")

  # Block form restores the previous working directory even if globbing raises.
  Dir.chdir(tmp_dir) { Dir.glob('*.png') }
end

#extract_page_illustrations(illustrations, index) ⇒ Object



116
117
118
119
120
121
122
123
124
# File 'lib/pdf.rb', line 116

# Picks the illustrations that belong to one page.
#
# The page number is read from the text between the first and second "-" of
# each name (e.g. "-003-001.png" -> 3). A name without any "-" yields 0.
#
# +illustrations+  collection of image names
# +index+          page number to match
#
# Returns a new array with the matching names, in their original order.
def extract_page_illustrations(illustrations,index)
  illustrations.select do |image_path|
    page_field = image_path.split("-")[1]
    page_field.to_i == index
  end
end

#extract_pdf_meta(filename) ⇒ Object

extract_pdf_meta

提取pdf元数据

parameters:

+filename+   pdf文件


58
59
60
61
62
63
64
# File 'lib/pdf.rb', line 58

# Extracts metadata from a PDF file.
#
# +filename+  path of the PDF file, opened with Poppler::Document
#
# Returns a hash with the keys :author and :title.
def extract_pdf_meta(filename)
  document = Poppler::Document.new(filename)
  {
    :author => document.author,
    :title => document.title
  }
end

#extract_pdf_pages_text(filename) ⇒ Object

extract_pdf_pages_text

提取pdf中页文本内容

parameters:

+filename+   pdf文件


25
26
27
28
29
30
31
32
33
# File 'lib/pdf.rb', line 25

# Extracts the text of every page of a PDF file.
#
# +filename+  path of the PDF file, opened with PDF::Reader
#
# Returns an array holding the text of each page, in page order.
def extract_pdf_pages_text(filename)
  reader = PDF::Reader.new(filename)
  reader.pages.map { |page| page.text }
end

#extract_sections(filename) ⇒ Object

extract_sections

提取pdf文件的大纲

parameters:

+filename+   pdf文件


70
71
72
73
74
75
76
77
78
# File 'lib/pdf.rb', line 70

# Extracts the outline (table of contents) of a PDF file.
#
# +filename+  path of the PDF file
#
# Returns the array filled by walk_index. This is best effort: if opening the
# document or walking its index raises a StandardError, whatever was collected
# up to that point (possibly nothing) is returned instead of raising.
def extract_sections(filename)
  outline = []
  begin
    document = Poppler::Document.new(filename)
    walk_index(Poppler::IndexIter.new(document), outline)
  rescue
    # Deliberately swallowed: the partial outline below is the result.
  end
  outline
end

#fixed_break_of_cross_page(pages, length = 80) ⇒ Object



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/pdf.rb', line 175

# Re-joins paragraphs that were cut in two by a page boundary.
#
# For every pair of consecutive pages: when the last line of the earlier page
# is not closed according to Utils.end_mark?, the first line of the later page
# is merged into it with Utils.merge_para_part and removed from the later
# page. Pairs where either page has no lines are left alone.
#
# +pages+   array of page texts; it is modified in place
# +length+  accepted for interface compatibility; not used by this method
#
# Returns +pages+ (the same array object).
def fixed_break_of_cross_page(pages,length=80)
  (pages.count - 1).times do |page_no|
    head_lines = pages[page_no].split("\n")
    tail_lines = pages[page_no + 1].split("\n")
    next if head_lines.empty? || tail_lines.empty?

    dangling = head_lines.last
    next if Utils.end_mark?(dangling)

    # Pages are rewritten immediately so the next pair sees the shortened page.
    head_lines[-1] = Utils.merge_para_part(dangling, tail_lines.first)
    tail_lines.shift
    pages[page_no] = head_lines.join("\n")
    pages[page_no + 1] = tail_lines.join("\n")
  end
  pages
end

#fixed_break_with_pages_text(pages_text) ⇒ Object



102
103
104
105
106
# File 'lib/pdf.rb', line 102

# Repairs line breaks inside each page and across page boundaries.
#
# The reference line length is half of the largest value that
# Utils.guess_content_line_length reports for any page. Each page is passed
# through Utils.fixed_page_break with that length, and the result is handed to
# fixed_break_of_cross_page.
#
# +pages_text+  array of page texts
#
# Returns the array of repaired page texts. When no line length can be guessed
# (no pages, or every guess is nil) there is nothing to base the repair on, so
# +pages_text+ is returned unchanged instead of failing on nil arithmetic.
def fixed_break_with_pages_text(pages_text)
  longest = pages_text.map { |text| Utils.guess_content_line_length(text) }.compact.max
  return pages_text if longest.nil?

  line_length = longest * 0.5
  reflowed = pages_text.map { |page_text| Utils.fixed_page_break(page_text, :length => line_length) }
  fixed_break_of_cross_page(reflowed, line_length)
end

#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/pdf.rb', line 135

# Renders one page of text plus its illustrations as an HTML fragment.
#
# Every non-blank line becomes either an <h2> (when HeaderDetect.guess_header?
# accepts the raw line) or a <p class='division'>; the visible text is the line
# after Utils.clean_text and Utils.escape_html. Heading ids are
# "<page index>_<line index>", where the line index counts blank lines too.
# Illustrations are appended after the text, one <img> per entry.
#
# +page_text+      text of the page, lines separated by "\n"
# +illustrations+  image paths to embed; used verbatim as the src attribute
# +options+        +:index+ is the page index used in the ids and the name
#                  attribute
#
# Returns the HTML string for the page, wrapped in <div class='page'>.
def gen_html_from_page_text(page_text,illustrations,options={})
  page_index = options[:index]
  fragments = []

  page_text.split("\n").each_with_index do |line, line_no|
    next unless line.present?

    fragments << if HeaderDetect.guess_header?(line)
      "<h2 id='#{page_index}_#{line_no}'>#{Utils.escape_html(Utils.clean_text(line))}</h2>"
    else
      "<p class='division'>#{Utils.escape_html(Utils.clean_text(line))}</p>"
    end
  end

  figures = illustrations.map { |image_path| "<p class='division'><img src='#{image_path}' /></p>" }
  "<div class='page' name='#{page_index}' >#{fragments.join("")}#{figures.join("")}</div>"
end

#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object



126
127
128
129
130
131
132
133
# File 'lib/pdf.rb', line 126

# Renders all pages as one HTML string.
#
# Each page is rendered by gen_html_from_page_text with the illustrations that
# extract_page_illustrations selects for the page's zero-based position, and
# with that position added to the options as :index.
# NOTE(review): positions start at 0 here; confirm that the page numbers
# embedded in the illustration names use the same base.
#
# +page_texts+     array of page texts
# +illustrations+  all illustration names of the document
# +options+        extra options forwarded to gen_html_from_page_text
#
# Returns the concatenated HTML of all pages.
def gen_html_from_page_texts(page_texts,illustrations,options={})
  page_htmls = page_texts.each_with_index.map do |page_text, page_no|
    figures = extract_page_illustrations(illustrations, page_no)
    gen_html_from_page_text(page_text, figures, options.merge(:index => page_no))
  end
  page_htmls.join("")
end

#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object



108
109
110
111
112
113
114
# File 'lib/pdf.rb', line 108

# Renders the document as HTML from its outline, page texts and illustrations.
#
# +sections+       outline of the document; currently ignored (see below)
# +page_texts+     array of page texts
# +illustrations+  all illustration names of the document
#
# Returns the HTML produced by gen_html_from_page_texts.
def gen_html_from_sections_and_page_texts(sections,page_texts,illustrations)
  # The page numbers in +sections+ are not accurate, so the outline is not
  # used for now; both the "empty" and "non-empty" cases render the same way.
  gen_html_from_page_texts(page_texts,illustrations)
end


213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/pdf.rb', line 213

# Guesses how many trailing lines of every page form the page footer.
#
# Rows are probed from the bottom up: the last line of every page, then the
# second to last, and so on, for as long as guess_footer_line? accepts the
# row. A page that is too short contributes nil for that row.
#
# +pages_text+  array of page texts
#
# Returns 0, 1 or 2. More than two footer-like rows is treated as a
# misdetection and reported as 0.
def guess_footer_row_count(pages_text)
  max_rows = 2
  rows = 0
  # Any count above max_rows is reported as 0, so probing stops as soon as
  # that is known. This also guarantees termination; the probe no longer
  # relies on guess_footer_line? eventually rejecting a row.
  while rows <= max_rows
    candidates = pages_text.map { |page_text| page_text.split("\n")[-rows - 1] }
    break unless guess_footer_line?(candidates)
    rows += 1
  end
  rows > max_rows ? 0 : rows
end

#guess_header_line?(lines) ⇒ Boolean Also known as: guess_footer_line?

猜测是否是页眉/页脚行 猜测规则:

1. 相邻页的行匹配相似度,一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行
2. 隔页的行匹配相似度,一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行
3. 页码猜测,页的行是数值则加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行

Returns:

  • (Boolean)


231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/pdf.rb', line 231

# Guesses whether a row of lines (one line per page) is a page header/footer.
#
# Three checks are tried in order; the row is accepted as soon as one of them
# marks more than 50% of the lines:
#
# 1. Adjacent pages: a line and the line of the next page are both marked
#    when Utils.text_similarity rates the pair above 0.7.
# 2. Alternating pages: the same test against the line two pages further on.
# 3. Page numbers: a line is marked when its integer value is greater than 0.
#
# Blank or nil lines are treated as nil; all other lines are stripped first.
# NOTE(review): near the end of the list the comparison partner is nil (out of
# range); the outcome then depends on how Utils.text_similarity handles nil.
#
# +lines+  the line taken from the same position of every page
#
# Returns true or false; always false for an empty list.
def guess_header_line?(lines)
  return false if lines.empty?

  stripped = lines.map { |line| line.present? ? line.strip : nil }
  total = stripped.count.to_f

  # Offset 1 compares neighbouring pages, offset 2 every other page.
  [1, 2].each do |step|
    matched = []
    stripped.each_with_index do |line, pos|
      matched.push(pos, pos + step) if Utils.text_similarity(line, stripped[pos + step]) > 0.7
    end
    return true if matched.uniq.count.to_f / total > 0.5
  end

  numeric = stripped.count { |line| line.to_i > 0 }
  numeric.to_f / total > 0.5
end

#guess_header_row_count(pages_text) ⇒ Object

猜测页眉/页脚的行数 页眉页脚有一定的规律:

1. 页眉和页脚一般都在每页的固定位置出现或者对称出现(相邻两页左右位置对称)
2. 呈现的内容一般是书名、章节名、页码等。
3. 呈现的顺序一般有两种形式:逐页式,即每一页的页眉页脚大致相似;隔页式,即相隔一页的页眉页脚大致相似。


201
202
203
204
205
206
207
208
209
210
211
# File 'lib/pdf.rb', line 201

# Guesses how many leading lines of every page form the page header.
#
# Rows are probed from the top down: the first line of every page, then the
# second, and so on, for as long as guess_header_line? accepts the row. A page
# that is too short contributes nil for that row.
#
# +pages_text+  array of page texts
#
# Returns 0, 1 or 2. More than two header-like rows is treated as a
# misdetection and reported as 0.
def guess_header_row_count(pages_text)
  max_rows = 2
  rows = 0
  # Any count above max_rows is reported as 0, so probing stops as soon as
  # that is known. This also guarantees termination; the probe no longer
  # relies on guess_header_line? eventually rejecting a row.
  while rows <= max_rows
    candidates = pages_text.map { |page_text| page_text.split("\n")[rows] }
    break unless guess_header_line?(candidates)
    rows += 1
  end
  rows > max_rows ? 0 : rows
end

sanitize_page_header_and_footer

清洗页眉页脚

parameters:

+pdf_pages_text+  pdf文件页文本内容集合
+options+    可选参数
 :header_rows_count 指定页眉行数
 :footer_rows_count 指定页脚行数


42
43
44
45
46
47
48
49
50
51
52
# File 'lib/pdf.rb', line 42

# Strips page headers and footers from the text of every page.
#
# +pdf_pages_text+  array of page texts
# +options+         optional settings
#    +:header_rows_count+  number of header lines to drop from the top of each
#                          page; guessed with guess_header_row_count if absent
#    +:footer_rows_count+  number of footer lines to drop from the bottom of
#                          each page; guessed with guess_footer_row_count if
#                          absent
#
# Returns a new array of page texts. A page with fewer lines than the header
# count becomes an empty string.
def sanitize_page_header_and_footer(pdf_pages_text,options={})
  skip_top = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
  skip_bottom = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)

  pdf_pages_text.map do |page_text|
    # The slice is nil when the page is shorter than the header.
    kept = page_text.split("\n")[skip_top..(-skip_bottom - 1)]
    (kept || []).join("\n")
  end
end

#scan_pdf?(filename) ⇒ Boolean

scan_pdf?

检查指定的文件是否为扫描版pdf

parameters:

+filename+   pdf文件

Returns:

  • (Boolean)


12
13
14
15
16
17
18
19
# File 'lib/pdf.rb', line 12

# Checks whether a file looks like a scanned (image-only) PDF.
#
# The text of all pages is concatenated and stripped; the file counts as
# scanned when fewer than 1000 characters remain.
#
# +filename+  path of the file to check
#
# Returns true or false for files with a ".pdf" extension (case-insensitive),
# and nil for any other file.
def scan_pdf?(filename)
  return unless File.extname(filename).downcase == '.pdf'

  threshold = 1000
  document = Poppler::Document.new(filename)
  text = document.map { |page| page.get_text }.join('')
  text.strip.length < threshold
end

#walk_index(indexer, sections) ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
# File 'lib/pdf.rb', line 151

# Converts the top level of a PDF index into nested section hashes.
#
# For every entry, sections[position] is set to a hash with :title (the
# action title after Utils.clean_text) and :page_num (taken from the action's
# destination). Entries that have a child get a :sub_sections array, filled by
# work_index.
#
# +indexer+   enumerable index iterator (NOTE(review): presumably a
#             Poppler::IndexIter, as passed by extract_sections - confirm)
# +sections+  array that receives the section hashes; modified in place
#
# Returns whatever indexer.each_with_index returns.
def walk_index(indexer,sections)
  indexer.each_with_index do |entry, position|
    # Stored before descending, so the entry survives a failure further down.
    sections[position] = {
      :title => Utils.clean_text(entry.action.title),
      :page_num => entry.action.dest.page_num
    }

    children = entry.child
    next if children.nil?

    nested = []
    work_index(children, nested)
    sections[position][:sub_sections] = nested
  end
end

#work_index(child, sections) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
# File 'lib/pdf.rb', line 163

# Recursively converts one level of a PDF index into nested section hashes.
#
# For every entry, sections[position] is set to a hash with :title (the
# action title after Utils.clean_text) and :page_num (taken from the action's
# destination). Entries that have a child get a :sub_sections array, built by
# calling this method again on that child.
#
# +child+     enumerable collection of index entries for this level
# +sections+  array that receives the section hashes; modified in place
#
# Returns whatever child.each_with_index returns.
def work_index(child,sections)
  child.each_with_index do |entry, position|
    # Stored before descending, so the entry survives a failure further down.
    sections[position] = {
      :title => Utils.clean_text(entry.action.title),
      :page_num => entry.action.dest.page_num
    }

    deeper = entry.child
    next if deeper.nil?

    nested = []
    work_index(deeper, nested)
    sections[position][:sub_sections] = nested
  end
end