Module: HeaderDetect

Extended by:
HeaderDetect
Included in:
HeaderDetect, TxtBook
Defined in:
lib/header_detect.rb

Overview

HeaderDetect

HeaderDetect

文档结构信息分析

 

有效的标题信息应该符合以下规则:

1. 

Constant Summary collapse

HEAD_TYPES =
[:volume,:part,:chapter,:section,:preface,:appendix,:index,:glossary]

Instance Method Summary collapse

Instance Method Details

#guess_appendix?(text) ⇒ Boolean

Returns:



110
111
112
113
114
115
116
117
# File 'lib/header_detect.rb', line 110

# Reports whether +text+ reads like an appendix heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# Otherwise it matches when it is the word "附录" (whitespace allowed between
# the two characters) or "appendix" (any letter case), either alone on the
# line or followed by a digit, a Roman-numeral character or an ASCII letter.
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_appendix?(text)
  return false if valid_title?(text)

  # Native-script forms are tested against the text exactly as given.
  case text
  when /^附\s*录$/,
       /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
    return true
  end

  # English forms are tested against a lower-cased copy.
  case text.downcase
  when /^appendix$/,
       /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
    true
  end
end

#guess_chapter?(text) ⇒ Boolean

Returns:



76
77
78
79
80
81
# File 'lib/header_detect.rb', line 76

# Reports whether +text+ reads like a chapter heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# It matches either the "第…章/回/则/讲" form (one to four characters between
# 第 and the unit) or "chapter" in any letter case followed by a digit or a
# Roman-numeral character.
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_chapter?(text)
  return false if valid_title?(text)

  cjk_hit = text =~ /^第.{1,4}[章回则讲]/
  return true if cjk_hit

  # The English keyword is compared case-insensitively via a lower-cased copy.
  english_hit = text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  true if english_hit
end

#guess_digital_header?(text) ⇒ Boolean

Returns:



138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# File 'lib/header_detect.rb', line 138

# Classifies a line that starts with dotted numbering followed by whitespace
# and a title ("3 Basics", "3.2 Basics", "3.2.1 Basics", ...).
#
# A single numeric component maps to :chapter; every additional component
# adds one section level (:sect1, :sect2, ...). Components after the first
# may have several digits, so "1.10 Title" is recognised as a second-level
# number rather than failing to match.
#
# NOTE(review): valid_title? runs first and applies its own numbering-prefix
# pattern; whether a multi-digit line such as "1.10 Title" gets past that
# guard depends on that helper — confirm it accepts the same numbering.
#
# @param text [String] the candidate line
# @return [Symbol, false, nil] the level symbol on success; false when
#   valid_title? fires, when no title follows the number, or when the leading
#   number exceeds 99; nil when the line has no numbering prefix
def guess_digital_header?(text)
  return false if valid_title?(text)

  matcher = text.match(/(^\d+(\.\d+)*\s)(.*)/)
  return nil unless matcher

  # A number with nothing after it is not a heading.
  return false if matcher[3].empty?

  levels = matcher[1].split('.')
  # Leading numbers above 99 are not accepted as heading numbers.
  return false if levels.first.to_i > 99

  levels.size == 1 ? :chapter : :"sect#{levels.size - 1}"
end

#guess_digital_section?(text) ⇒ Boolean

Returns:



128
129
130
131
132
133
134
135
136
# File 'lib/header_detect.rb', line 128

# Classifies a line that starts with dotted numbering of at least two
# components ("1.2 Scope", "1.2.3 Scope", ...).
#
# The section level is the number of dots in the numbering itself, so
# "1.2" gives :sect1 and "1.2.3" gives :sect2. Every component may have
# several digits; the whole of "1.10" is consumed as numbering instead of
# leaving the trailing digit in the title. Counting dots in the numbering
# only (not in the title) keeps the level independent of the title text.
#
# NOTE(review): valid_title? runs first and applies its own numbering-prefix
# pattern; whether a multi-digit line such as "1.10 Scope" gets past that
# guard depends on that helper — confirm it accepts the same numbering.
#
# @param text [String] the candidate line
# @return [Symbol, false, nil] :sectN on success; false when valid_title?
#   fires or when no title follows the numbering; nil when the line does not
#   start with dotted numbering
def guess_digital_section?(text)
  return false if valid_title?(text)

  matcher = text.match(/^((?:\d+\.)+\d+)\s*(.*)/)
  return nil unless matcher

  numbering = matcher[1]
  title = matcher[2]
  # Numbering with nothing after it is not a heading.
  return false if title.empty?

  :"sect#{numbering.count('.')}"
end

#guess_glossary?(text) ⇒ Boolean

Returns:



119
120
121
122
123
124
125
126
# File 'lib/header_detect.rb', line 119

# Reports whether +text+ reads like a glossary heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# It matches "术语" (whitespace allowed between the characters) or
# "glossary" (any letter case), either alone on the line or followed by a
# digit or a Roman-numeral character.
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_glossary?(text)
  return false if valid_title?(text)

  native_forms = [
    /^术\s*语$/,
    /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  return true if native_forms.any? { |pattern| text =~ pattern }

  # English forms are compared against a lower-cased copy of the line.
  lowered = text.downcase
  english_forms = [
    /^glossary$/,
    /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  true if english_forms.any? { |pattern| lowered =~ pattern }
end

#guess_header?(text) ⇒ Boolean

Returns:



154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/header_detect.rb', line 154

# Determines which kind of heading +text+ is, if any.
#
# The keyword-based probes are tried in a fixed order and the first one that
# answers truthy decides the result. When none of them matches, the result
# of guess_digital_section? is used if it is truthy.
#
# @param text [String] the candidate line
# @return [Symbol, nil] :volume, :part, :chapter, :sect1, :preface,
#   :appendix, :index, :glossary, or whatever truthy value
#   guess_digital_section? returns; nil when nothing matched
def guess_header?(text)
  # Insertion order of the hash is the probing order.
  probes = {
    volume: :guess_volume?,
    part: :guess_part?,
    chapter: :guess_chapter?,
    sect1: :guess_section?,
    preface: :guess_preface?,
    appendix: :guess_appendix?,
    index: :guess_index?,
    glossary: :guess_glossary?
  }
  probes.each do |kind, probe|
    return kind if send(probe, text)
  end

  # A falsy answer (false or nil) from the numeric probe is reported as nil.
  guess_digital_section?(text) || nil
end

#guess_index?(text) ⇒ Boolean

Returns:



101
102
103
104
105
106
107
108
# File 'lib/header_detect.rb', line 101

# Reports whether +text+ reads like an index heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# It matches "索引" (whitespace allowed between the characters) or "index"
# (any letter case), either alone on the line or followed by a digit or a
# Roman-numeral character.
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_index?(text)
  return false if valid_title?(text)
  return true if text =~ /^索\s*引$/ ||
                 text =~ /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/

  # English forms are compared against a lower-cased copy of the line.
  lowered = text.downcase
  english_hit = lowered =~ /^index$/ ||
                lowered =~ /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  english_hit ? true : nil
end

#guess_part?(text, options = {}) ⇒ Boolean

Returns:



69
70
71
72
73
74
# File 'lib/header_detect.rb', line 69

# Reports whether +text+ reads like a part heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# It matches either the "第…部/篇" form (one to three characters between 第
# and the unit) or "part" in any letter case followed by a digit or a
# Roman-numeral character.
#
# @param text [String] the candidate line
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_part?(text, options = {})
  return false if valid_title?(text)

  if text =~ /^第.{1,3}[部篇]/
    true
  elsif text.downcase =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#guess_preface?(text) ⇒ Boolean

Returns:



88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/header_detect.rb', line 88

# Reports whether +text+ reads like a preface or foreword heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# Recognised forms: "前言", "序言" (whitespace allowed between the
# characters), "序" alone, "序言" followed by a digit or Roman-numeral
# character, and "preface" / "foreword" in any letter case, alone or
# followed by a digit or Roman-numeral character.
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_preface?(text)
  return false if valid_title?(text)

  case text
  when /^前\s*言$/ then return true
  when /^序\s*言$/ then return true
  when /^序$/ then return true
  when /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ then return true
  end

  # English keywords are compared against a lower-cased copy of the line.
  case text.downcase
  when /^preface$/ then true
  when /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ then true
  when /^foreword$/ then true
  when /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ then true
  end
end

#guess_section?(text) ⇒ Boolean

Returns:



83
84
85
86
# File 'lib/header_detect.rb', line 83

# Reports whether +text+ reads like a "第…节" section heading (one to three
# characters between 第 and 节).
#
# @param text [String] the candidate line
# @return [Boolean, nil] false when valid_title? returns a truthy value,
#   true on a match, nil when the pattern does not match
def guess_section?(text)
  if valid_title?(text)
    false
  elsif text =~ /^第.{1,3}[节]/
    true
  end
end

#guess_volume?(text, options = {}) ⇒ Boolean

Returns:



62
63
64
65
66
67
# File 'lib/header_detect.rb', line 62

# Reports whether +text+ reads like a volume heading.
#
# The line is rejected immediately when valid_title? returns a truthy value.
# Recognised forms: "第…卷" (one to three characters between 第 and 卷),
# "卷" followed by a digit or Roman-numeral character, and "volume" in any
# letter case followed by a digit or Roman-numeral character.
#
# @param text [String] the candidate line
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Boolean, nil] false when valid_title? fires, true on a match,
#   nil when nothing matched
def guess_volume?(text, options = {})
  return false if valid_title?(text)

  native_forms = [
    /^第.{1,3}卷/,
    /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  return true if native_forms.any? { |pattern| text =~ pattern }

  # The English keyword is compared against a lower-cased copy of the line.
  true if text.downcase =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
end

#hav_complete_sentence?(text) ⇒ Boolean

判断包含完整的句子。

Returns:



52
53
54
55
# File 'lib/header_detect.rb', line 52

# Tests whether +text+ contains sentence-ending punctuation, i.e. looks like
# a complete sentence.
#
# A leading dotted-number prefix followed by whitespace ("3 ", "1.2 ",
# "1.10 ") is removed first so that the dots inside the numbering are not
# mistaken for full stops. Every component of the prefix may have several
# digits.
#
# The punctuation set covers the ASCII marks . ! ? and the CJK marks 。！？.
# NOTE(review): the previous character class listed "!" and "?" twice; the
# full-width forms are assumed to be what was intended — confirm.
#
# @param text [String] the line to inspect
# @return [Integer, nil] index of the first terminator in the stripped text
#   (truthy), or nil when there is none
def hav_complete_sentence?(text)
  body = text.gsub(/^\d+(\.\d+)*\s/, '')
  body =~ /[.。!?！？]/
end

#valid_title?(text) ⇒ Boolean

Returns:



57
58
59
60
# File 'lib/header_detect.rb', line 57

# Tests whether +text+, once a leading dotted-number prefix is removed,
# still contains a full stop ("." or "。").
#
# Despite the name, a truthy result is what makes the guess_* methods in
# this module answer false: a line that contains a full stop is not accepted
# as a heading.
#
# The prefix is digits, optionally followed by ".digits" groups, then one
# whitespace character ("3 ", "1.2 ", "1.10 "). Every component may have
# several digits, so the dot inside "1.10 Title" is treated as numbering and
# not as a full stop. Because ^ anchors at the start of each line, such a
# prefix is removed from every line of a multi-line string.
#
# @param text [String] the line to inspect
# @return [Integer, nil] index of the first full stop in the stripped text
#   (truthy), or nil when there is none
def valid_title?(text)
  body = text.gsub(/^\d+(\.\d+)*\s/, '')
  body =~ /[.。]/
end