Module: Utils

Extended by:
Utils
Included in:
EPUB, PDF, Utils
Defined in:
lib/utils.rb

Instance Method Summary collapse

Instance Method Details

#breaklines(text, options = {}) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/utils.rb', line 51

# Collects the non-empty lines of +text+ that line_closed? does not consider
# finished, i.e. the lines that look as if they were cut off mid-paragraph.
#
# @param text [String] raw text; split into stripped lines by text_to_array
# @param options [Hash] +:length+ overrides the line length that would
#   otherwise come from guess_content_line_length
# @return [Array<String>] the unclosed lines; empty when the length is <= 0
def breaklines(text, options = {})
  candidates = text_to_array(text)
  length = options[:length] || guess_content_line_length(text)
  return [] if length <= 0

  candidates.select do |line|
    !line.empty? && !line_closed?(line, length)
  end
end

#clean_text(text) ⇒ Object

clean_text

获得干净的文本,去除两边的空格和回车,主要在txt标题转换成html时使用


146
147
148
149
150
151
152
# File 'lib/utils.rb', line 146

# Returns a cleaned copy of +text+: surrounding ASCII whitespace is trimmed,
# every newline is removed, and leading spaces (ASCII or full-width U+3000)
# are dropped. Intended for heading text that is about to be turned into HTML.
#
# NOTE(review): the pattern contains a non-ASCII character, so +text+ is
# assumed to be UTF-8 (or ASCII-only) - confirm against callers.
#
# @param text [String, nil] heading text
# @return [String, nil] the cleaned text, or nil when +text+ is nil
def clean_text(text)
  return text if text.nil?

  # String#strip only trims ASCII whitespace, so ideographic (full-width)
  # spaces at the start have to be removed explicitly.
  text.strip.delete("\n").sub(/\A[ \u3000]+/, '')
end

#detect_sections_from_html(html_file) ⇒ Object



250
251
252
253
254
255
256
257
# File 'lib/utils.rb', line 250

# Parses an HTML file and returns one entry per <h2> element.
#
# @param html_file [String] path of the HTML file to read
# @return [Array<Hash>] hashes with +:title+ (the heading text) and
#   +:page_num+ (the value of the heading's +id+ attribute)
def detect_sections_from_html(html_file)
  # File.read closes the handle itself; File.open(...).read left it open
  # until garbage collection.
  document = Nokogiri::HTML.parse(File.read(html_file))
  document.search('h2').map do |node|
    { :title => node.text, :page_num => node['id'] }
  end
end

#detect_utf8(content) ⇒ Object



182
183
184
185
186
187
# File 'lib/utils.rb', line 182

# Reports whether every line of +content+ can be stripped without an error.
#
# NOTE(review): this presumably relies on String#strip raising for invalid
# byte sequences, which would make it a validity check against the string's
# own encoding tag rather than UTF-8 specifically - confirm on the target Ruby.
#
# @param content [#each_line] text to probe
# @return [Boolean] true when no StandardError was raised, false otherwise
#   (including when +content+ does not respond to each_line)
def detect_utf8(content)
  content.each_line(&:strip)
  true
rescue
  false
end

#end_mark?(text) ⇒ Boolean

Returns:

  • (Boolean)


105
106
107
108
# File 'lib/utils.rb', line 105

# Tells whether +text+ ends with a character that closes a sentence or tag.
#
# Recognised closers are the ASCII characters . " ! ? > plus their CJK
# counterparts (ideographic full stop, full-width ! and ?, right double
# quotation mark). The previous list carried empty-string entries, which can
# never equal a single trailing character; the CJK closers take their place.
#
# @param text [String, nil] one line of text
# @return [Boolean] true when the last character is a closer; false for nil
#   or empty input
def end_mark?(text)
  return false if text.nil? || text.empty?

  closers = [
    '.', '"', '!', '?', '>',
    "\u3002", # ideographic full stop
    "\uFF01", # full-width exclamation mark
    "\uFF1F", # full-width question mark
    "\u201D"  # right double quotation mark
  ]
  closers.include?(text[-1])
end

#escape_html(text) ⇒ Object

escape_html 文本转义,在txt文本转html时需要使用



156
157
158
# File 'lib/utils.rb', line 156

# Escapes HTML special characters in +text+ by delegating to CGI.escapeHTML.
# Used when plain text is converted to HTML.
#
# @param text [String] raw text
# @return [String] the escaped text
def escape_html(text)
  CGI.escapeHTML(text)
end

#extract_keywords_from_path(path) ⇒ Object

根据路径提取关键词



228
229
230
# File 'lib/utils.rb', line 228

# Splits +path+ on forward and backward slashes and returns the segments
# that are not blank, to be used as keywords.
#
# @param path [String] a file-system path
# @return [Array<String>] the non-blank path segments, in order
def extract_keywords_from_path(path)
  segments = path.split(%r{[\\/]})
  segments.reject { |segment| segment.strip == '' }
end

#extract_text_from_file(filename, format) ⇒ Object



259
260
261
262
263
264
265
266
267
268
269
270
271
272
# File 'lib/utils.rb', line 259

# Converts +filename+ to plain text with the external +ebook-convert+ tool
# and returns the resulting text.
#
# The intermediate file is named after the basename of +filename+ (with
# +format+ removed) plus ".txt", is created relative to the current working
# directory, and is deleted again before returning.
#
# @param filename [String] path of the e-book to convert
# @param format [String] suffix to strip from the basename, e.g. ".pdf"
# @return [String] content of the generated text file
# @raise [Errno::ENOENT] when the text file was not produced
def extract_text_from_file(filename, format)
  txt_path = "#{File.basename(filename, format)}.txt"
  begin
    # The argument-list form never goes through a shell, so quotes, spaces or
    # metacharacters in the file name cannot break or alter the command.
    # Standard output is discarded, as it was with the backtick call.
    system('ebook-convert', filename, txt_path, :out => File::NULL)
    File.read(txt_path)
  ensure
    # Clean up even when the conversion or the read failed.
    FileUtils.remove_file(txt_path, true)
  end
end

#fixed_page_break(page_text, options = {}) ⇒ Object

fixed_page_break

修复文本中的异常中断

parameters:

+page_text+   文本内容


25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/utils.rb', line 25

# Repairs paragraphs that were broken across lines.
#
# Blank lines are dropped. Whenever a line is not closed according to
# line_closed?, the next non-empty line is merged into the same output line
# via merge_para_part.
#
# @param page_text [String] the text to repair
# @param options [Hash] +:length+ overrides the line length that would
#   otherwise come from guess_content_line_length
# @return [String] the repaired text joined with "\n", or +page_text+
#   unchanged when the length is > 80 or <= 0
def fixed_page_break(page_text, options = {})
  length = options[:length] || guess_content_line_length(page_text)

  # Lines longer than 80 characters are assumed not to need repairing.
  return page_text if length > 80 || length <= 0

  merged = []
  continuing = false
  text_to_array(page_text).each do |line|
    next if line.empty?

    if continuing
      merged[-1] = merge_para_part(merged[-1], line)
    else
      merged.push(line)
    end
    # The decision is made on the raw line, not on the merged paragraph.
    continuing = !line_closed?(line, length)
  end
  merged.join("\n")
end

#guess_content_line_length(content) ⇒ Object

猜测内容长度,用于修复PDF导出时出现断句的问题

PDF导出文本中的断句特点:
  * 文本长度小于80
  * 相同长度的句子一定高于某个比例

返回值:

如果识别长度则返回识别的长度,否则返回0


124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# File 'lib/utils.rb', line 124

# Guesses the wrap width of hard-wrapped text (e.g. text exported from PDF).
#
# Line lengths are counted (each_line keeps the line terminator, so it is
# part of the length) and ranked by frequency. The first ranked length that
# accounts for more than 10% of all lines and is below 80 wins; 80% of it,
# truncated to an integer, is returned.
#
# @param content [String] the text to analyse
# @return [Integer] the guessed length, or 0 when +content+ is blank or no
#   length qualifies
def guess_content_line_length(content)
  return 0 if content.blank?

  lengths = content.each_line.map { |line| line.length }
  total = lengths.count.to_f
  frequencies = lengths.group_by { |len| len }.map { |len, hits| [len, hits.count] }
  ranked = frequencies.sort_by { |pair| pair[1] }.reverse

  ranked.each do |len, hits|
    return (len * 0.8).to_i if (hits.to_f / total) > 0.1 && len < 80
  end
  0
end

#line_closed?(text, length = 60) ⇒ Boolean

line_closed?

判断是否为一行的结束。如何算一行结束?
 * 以句子结束符结尾的
 * 猜测是一种标题
 * 非结束符结束,但长度小于猜测的行长度的

parameters:

+text+  一行的文本内容

Returns:

  • (Boolean)


96
97
98
99
100
101
102
103
# File 'lib/utils.rb', line 96

# Decides whether +text+ is a complete line. A line counts as closed when
#  * it ends with an end mark (end_mark?),
#  * its condensed form is longer than 80 characters,
#  * HeaderDetect.guess_header? accepts the condensed form, or
#  * the condensed form is shorter than +length+.
# The condensed form is +text+ with dots, hyphens, em dashes and spaces removed.
#
# @param text [String] one line of text
# @param length [Integer] the expected (guessed) line length
# @return [Boolean]
def line_closed?(text, length = 60)
  return true if end_mark?(text)

  condensed = text.gsub(/[\.\-—. ]/, '')
  size = condensed.length
  return true if size > 80
  return true if HeaderDetect.guess_header?(condensed)

  size < length
end

#make_destination_dir(destination) ⇒ Object



222
223
224
225
# File 'lib/utils.rb', line 222

# Ensures that the parent directory of +destination+ exists, creating any
# missing intermediate directories.
#
# @param destination [String] path of a file that is about to be written
# @return [Object, nil] the FileUtils.mkdir_p result when the directory was
#   created, nil when it already existed
def make_destination_dir(destination)
  parent = File.dirname(destination)
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? works on every version.
  FileUtils.mkdir_p(parent) unless Dir.exist?(parent)
end

#merge_para_part(part1, part2) ⇒ Object



110
111
112
113
114
115
116
# File 'lib/utils.rb', line 110

# Joins two fragments of one paragraph. When +part2+ contains a Han
# character the fragments are concatenated directly; otherwise a single
# space is placed between them.
#
# @param part1 [String] the text collected so far
# @param part2 [String] the continuation line
# @return [String] the merged text
def merge_para_part(part1, part2)
  glue = part2 =~ /\p{Han}/ ? '' : ' '
  [part1, part2].join(glue)
end

#scan_file_from_dir(dir, options = {}) ⇒ Object

scan_file_from_dir 遍历目录下的文件。

parameters:

+dir+       需遍历的目录
+options+   可选参数
   :format     指定需要遍历的文件后缀名,例如要遍历所有pdf文件,通过:format=>'.pdf'指定


195
196
197
198
199
200
201
# File 'lib/utils.rb', line 195

# Collects the real paths of the files that walk_dir yields for +dir+.
#
# @param dir [String] directory to scan
# @param options [Hash] passed through to walk_dir (e.g. +:format+ => '.pdf')
# @return [Array<String>] real path of each yielded file
def scan_file_from_dir(dir, options = {})
  [].tap do |found|
    walk_dir(dir, options) { |entry| found.push(entry.realpath.to_s) }
  end
end

#source_exists?(source, dir_flag = nil) ⇒ Boolean

source_exists? checks whether the source file or directory exists.

parameters:

+source+   file or directory
+dir_flag+  directory flag, default nil.

Returns:

  • (Boolean)


214
215
216
217
218
219
220
# File 'lib/utils.rb', line 214

# Checks whether +source+ exists.
#
# @param source [String] path of a file or directory
# @param dir_flag [Object, nil] when truthy, +source+ must be a directory;
#   otherwise any existing file-system entry is accepted
# @return [Boolean]
def source_exists?(source, dir_flag = nil)
  return File.directory?(source) if dir_flag

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  File.exist?(source)
end

#text_similarity(text1, text2) ⇒ Object

计算文本相似度



80
81
82
83
84
85
86
87
# File 'lib/utils.rb', line 80

# Computes a similarity score for two strings from their Levenshtein
# distance: (longer length - distance) / longer length.
#
# @param text1 [String]
# @param text2 [String]
# @return [Float, Integer] the score; 0 when either text is blank or when
#   any StandardError is raised during the computation
def text_similarity(text1, text2)
  return 0 if text1.blank? || text2.blank?

  distance = Levenshtein.distance(text1, text2)
  longest = [text1.length, text2.length].max
  (longest - distance) / longest.to_f
rescue
  0
end

#text_to_array(text) ⇒ Object



68
69
70
71
72
73
74
75
76
77
# File 'lib/utils.rb', line 68

# Splits +text+ into lines, removing line terminators and surrounding
# whitespace from each one. Blank lines are kept as empty strings.
#
# @param text [String] the text to split
# @return [Array<String>] one stripped string per input line
def text_to_array(text)
  text.each_line.map do |raw|
    raw.gsub("\r\n", "").gsub("\n", "").strip
  end
end

#timer ⇒ Object



275
276
277
278
279
280
281
282
# File 'lib/utils.rb', line 275

# Runs the given block, prints the elapsed wall-clock time to standard
# output as "<seconds> seconds", and returns the block's value.
#
# @yield the work to measure
# @return [Object] whatever the block returned
def timer
  started_at = Time.now
  outcome = yield
  puts "#{Time.now - started_at} seconds"
  outcome
end

#to_utf8(text, encoding = 'GB2312') ⇒ Object



174
175
176
177
178
179
180
# File 'lib/utils.rb', line 174

# Converts +text+ from +encoding+ to UTF-8, dropping bytes that are invalid
# in the source encoding or have no UTF-8 equivalent.
#
# The bytes of +text+ are interpreted as +encoding+ regardless of the
# string's current encoding tag.
#
# @param text [String] the bytes to convert
# @param encoding [String] name of the source encoding
# @return [String, nil] the UTF-8 text, or nil when the conversion raises
#   (unknown encoding name, non-String input, ...)
def to_utf8(text, encoding = 'GB2312')
  # Iconv was removed from the standard library in Ruby 2.0, which made the
  # previous implementation fall into the rescue and return nil for every
  # input. String#encode with :replace => '' mirrors Iconv's //IGNORE.
  text.encode('UTF-8', encoding, :invalid => :replace, :undef => :replace, :replace => '')
rescue
  nil
end

#walk_dir(path_str, options = {}) ⇒ Object



232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/utils.rb', line 232

# Recursively walks a directory and yields every regular file as a Pathname.
#
# @param path_str [String, Pathname] directory to walk
# @param options [Hash] +:format+ restricts the result to files whose
#   extension (including the dot, e.g. '.pdf') equals the given value
# @yieldparam entry [Pathname] each matching file
# @return [Array<Pathname>] the direct children of +path_str+
def walk_dir(path_str, options = {}, &block)
  format = options[:format]
  Pathname.new(path_str).children.each do |entry|
    if entry.directory?
      # Pass the options down; the recursive call used to drop them, so the
      # :format filter was ignored for everything below the top level.
      walk_dir(entry, options, &block)
    elsif entry.file?
      yield entry if !format || entry.extname == format
    end
  end
end

#wrapper_html(content, options = {}) ⇒ Object



160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/utils.rb', line 160

# Wraps +content+ in a minimal HTML document skeleton.
#
# Neither the title nor the content is escaped here.
# NOTE(review): callers are presumably expected to escape them beforehand
# (see escape_html) - confirm.
#
# @param content [String] markup placed inside the body element
# @param options [Hash] +:title+ is interpolated into the TITLE element
# @return [String] the complete document; every line keeps the heredoc's
#   two-space indentation
def wrapper_html(content, options = {})
  # The root element is now closed; the previous template ended after
  # </body> and left <HTML> open.
  <<-EOS
  <!DOCTYPE html>
  <HTML xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
  <HEAD>
  <TITLE>#{options[:title]}</TITLE>
  <META http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
  </HEAD>
  <body>
  #{content}
  </body>
  </HTML>
  EOS
end

#write_file(text, filename) ⇒ Object



203
204
205
206
207
# File 'lib/utils.rb', line 203

# Writes +text+ to +filename+ in binary mode, replacing any existing content.
#
# @param text [String] the data to write
# @param filename [String] path of the file to create or overwrite
# @return [Integer] the number of bytes written
def write_file(text, filename)
  File.open(filename, 'wb') { |out| out.write(text) }
end