Module: HeaderDetect
Overview
HeaderDetect
HeaderDetect
文档结构信息分析
有效的标题信息应该符合以下规则:
1.
Constant Summary collapse
- HEAD_TYPES =
[:volume,:part,:chapter,:section,:preface,:appendix,:index,:glossary]
Instance Method Summary collapse
- #guess_appendix?(text) ⇒ Boolean
- #guess_chapter?(text) ⇒ Boolean
- #guess_digital_header?(text) ⇒ Boolean
- #guess_digital_section?(text) ⇒ Boolean
- #guess_glossary?(text) ⇒ Boolean
- #guess_header?(text) ⇒ Boolean
- #guess_index?(text) ⇒ Boolean
- #guess_part?(text, options = {}) ⇒ Boolean
- #guess_preface?(text) ⇒ Boolean
- #guess_section?(text) ⇒ Boolean
- #guess_volume?(text, options = {}) ⇒ Boolean
- #hav_complete_sentence?(text) ⇒ Boolean
  判断包含完整的句子。
- #valid_title?(text) ⇒ Boolean
Instance Method Details
#guess_appendix?(text) ⇒ Boolean
110 111 112 113 114 115 116 117 |
# File 'lib/header_detect.rb', line 110
#
# Tells whether +text+ looks like an appendix heading.
#
# Recognised forms are the Chinese word for "appendix" (optionally with
# whitespace between its two characters) either alone on a line or followed
# by a digit, Roman numeral or Latin letter, and the English word
# "appendix" (case-insensitive) under the same two conditions.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_appendix?(text)
  return false if valid_title?(text)

  chinese_hit = text =~ /^附\s*录$/ ||
                text =~ /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
  return true if chinese_hit

  # English keywords are compared against a lower-cased copy.
  lowered = text.downcase
  if lowered =~ /^appendix$/ ||
     lowered =~ /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
    true
  end
end
#guess_chapter?(text) ⇒ Boolean
76 77 78 79 80 81 |
# File 'lib/header_detect.rb', line 76
#
# Tells whether +text+ looks like a chapter heading.
#
# Matches a Chinese ordinal heading (the ordinal prefix, one to four
# arbitrary characters, then one of the chapter-like unit characters) or the
# English word "chapter" (case-insensitive) followed by a digit or Roman
# numeral.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_chapter?(text)
  if valid_title?(text)
    false
  elsif text =~ /^第.{1,4}[章回则讲]/
    true
  elsif text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
#guess_digital_header?(text) ⇒ Boolean
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/header_detect.rb', line 138
#
# Classifies a heading that starts with dotted decimal numbering
# ("1 Title", "2.3 Title", ...).
#
# The numbering pattern accepts any number of digits in every component
# (the previous pattern allowed a single digit after each dot, so a prefix
# such as "1.10 " could never be parsed). A leading number above 99 is
# rejected.
#
# Despite the trailing "?", a successful match returns a Symbol, not +true+.
#
# @param text [String] candidate heading text
# @return [Symbol, false, nil] +:chapter+ for a single-level number,
#   +:sectN+ where N is the number of dots in the numbering; +false+ when
#   valid_title? is truthy, the title part is empty, or the first number
#   exceeds 99; +nil+ when +text+ has no numeric prefix
def guess_digital_header?(text)
  return false if valid_title?(text)

  # Group 1: numbering plus the single separating whitespace character.
  # Group 3: everything after it (the title proper).
  matcher = text.match(/(^\d+(\.\d+)*\s)(.*)/)
  return nil unless matcher
  return false if matcher[3].empty?

  levels = matcher[1].split(".")
  return false if levels[0].to_i > 99

  levels.count == 1 ? :chapter : :"sect#{levels.count - 1}"
end
#guess_digital_section?(text) ⇒ Boolean
128 129 130 131 132 133 134 135 136 |
# File 'lib/header_detect.rb', line 128
#
# Classifies a heading that starts with multi-level dotted numbering
# ("1.2 Title", "1.2.3 Title", ...).
#
# The section depth is derived from the numeric prefix only, so it cannot be
# skewed by the title text, and the last component may have several digits
# (previously only its first digit was consumed and the remaining digits
# were treated as the start of the title).
#
# Despite the trailing "?", a successful match returns a Symbol, not +true+.
#
# @param text [String] candidate heading text
# @return [Symbol, false, nil] +:sectN+ where N is the number of dots in the
#   numbering; +false+ when valid_title? is truthy or no title follows the
#   numbering; +nil+ when +text+ does not start with dotted numbering
def guess_digital_section?(text)
  return false if valid_title?(text)

  # Group 1: the dotted number itself. Group 2: the title after optional
  # whitespace.
  matcher = text.match(/^((?:\d+\.)+\d+)\s*(.*)/)
  return nil unless matcher
  return false if matcher[2].empty?

  depth = matcher[1].split(".").count - 1
  :"sect#{depth}"
end
#guess_glossary?(text) ⇒ Boolean
119 120 121 122 123 124 125 126 |
# File 'lib/header_detect.rb', line 119
#
# Tells whether +text+ looks like a glossary heading.
#
# Recognised forms are the Chinese word for "glossary" (optionally with
# whitespace between its two characters) or the English word "glossary"
# (case-insensitive), each either alone on a line or followed by a digit or
# Roman numeral.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_glossary?(text)
  return false if valid_title?(text)

  if text =~ /^术\s*语$/ ||
     text =~ /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    return true
  end

  # English keywords are compared against a lower-cased copy.
  lowered = text.downcase
  return true if lowered =~ /^glossary$/

  true if lowered =~ /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
end
#guess_header?(text) ⇒ Boolean
154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/header_detect.rb', line 154
#
# Determines which kind of heading +text+ is, if any.
#
# The individual detectors are tried in a fixed order (volume, part,
# chapter, section, preface, appendix, index, glossary) and the first one
# that answers truthy decides the result. When none of them matches, the
# result of guess_digital_section? is used if it is truthy.
#
# Despite the trailing "?", the result is a Symbol rather than +true+.
#
# @param text [String] candidate heading text
# @return [Symbol, nil] the detected heading type, or +nil+ when no detector
#   matched
def guess_header?(text)
  if guess_volume?(text)
    :volume
  elsif guess_part?(text)
    :part
  elsif guess_chapter?(text)
    :chapter
  elsif guess_section?(text)
    :sect1
  elsif guess_preface?(text)
    :preface
  elsif guess_appendix?(text)
    :appendix
  elsif guess_index?(text)
    :index
  elsif guess_glossary?(text)
    :glossary
  else
    numbered = guess_digital_section?(text)
    # A falsy answer (false or nil) is reported as nil.
    numbered ? numbered : nil
  end
end
#guess_index?(text) ⇒ Boolean
101 102 103 104 105 106 107 108 |
# File 'lib/header_detect.rb', line 101
#
# Tells whether +text+ looks like an index heading.
#
# Recognised forms are the Chinese word for "index" (optionally with
# whitespace between its two characters) or the English word "index"
# (case-insensitive), each either alone on a line or followed by a digit or
# Roman numeral.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_index?(text)
  return false if valid_title?(text)

  chinese_hit = text =~ /^索\s*引$/ ||
                text =~ /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  return true if chinese_hit

  # English keywords are compared against a lower-cased copy.
  lowered = text.downcase
  if lowered =~ /^index$/ ||
     lowered =~ /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
#guess_part?(text, options = {}) ⇒ Boolean
69 70 71 72 73 74 |
# File 'lib/header_detect.rb', line 69
#
# Tells whether +text+ looks like a "part" heading.
#
# Matches a Chinese ordinal heading (the ordinal prefix, one to three
# arbitrary characters, then one of the part-like unit characters) or the
# English word "part" (case-insensitive) followed by a digit or Roman
# numeral.
#
# The parameter list previously read "(text,={})", which is not valid Ruby;
# the +options+ name is restored here.
#
# @param text [String] candidate heading text
# @param options [Hash] accepted for interface compatibility; not read by
#   this method
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_part?(text, options = {})
  return false if valid_title?(text)
  return true if text =~ /^第.{1,3}[部篇]/

  # English keyword is compared against a lower-cased copy.
  lowered = text.downcase
  return true if lowered =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
end
#guess_preface?(text) ⇒ Boolean
88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/header_detect.rb', line 88
#
# Tells whether +text+ looks like a preface / foreword heading.
#
# Recognised forms are three bare Chinese preface words (two of them may
# carry whitespace between their characters), one Chinese preface word
# followed by a digit or Roman numeral, and the English words "preface" and
# "foreword" (case-insensitive), each either alone on a line or followed by
# a digit or Roman numeral.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_preface?(text)
  return false if valid_title?(text)

  bare_chinese = text =~ /^前\s*言$/ || text =~ /^序\s*言$/ || text =~ /^序$/
  return true if bare_chinese
  return true if text =~ /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/

  # English keywords are compared against a lower-cased copy.
  lowered = text.downcase
  if lowered =~ /^preface$/ ||
     lowered =~ /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  elsif lowered =~ /^foreword$/ ||
        lowered =~ /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end
#guess_section?(text) ⇒ Boolean
83 84 85 86 |
# File 'lib/header_detect.rb', line 83
#
# Tells whether +text+ looks like a Chinese ordinal section heading: the
# ordinal prefix, one to three arbitrary characters, then the section unit
# character.
#
# @param text [String] candidate heading text
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_section?(text)
  if valid_title?(text)
    false
  elsif text =~ /^第.{1,3}[节]/
    true
  end
end
#guess_volume?(text, options = {}) ⇒ Boolean
62 63 64 65 66 67 |
# File 'lib/header_detect.rb', line 62
#
# Tells whether +text+ looks like a "volume" heading.
#
# Matches a Chinese ordinal heading (the ordinal prefix, one to three
# arbitrary characters, then the volume unit character), the volume unit
# character followed by a digit or Roman numeral, or the English word
# "volume" (case-insensitive) followed by a digit or Roman numeral.
#
# The parameter list previously read "(text,={})", which is not valid Ruby;
# the +options+ name is restored here.
#
# @param text [String] candidate heading text
# @param options [Hash] accepted for interface compatibility; not read by
#   this method
# @return [Boolean, nil] +false+ when valid_title? is truthy for +text+,
#   +true+ on a match, +nil+ when nothing matched
def guess_volume?(text, options = {})
  return false if valid_title?(text)
  return true if text =~ /^第.{1,3}卷/
  return true if text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/

  # English keyword is compared against a lower-cased copy.
  lowered = text.downcase
  return true if lowered =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
end
#hav_complete_sentence?(text) ⇒ Boolean
判断包含完整的句子。
52 53 54 55 |
# File 'lib/header_detect.rb', line 52
#
# Checks whether +text+ contains sentence punctuation, ignoring a leading
# dotted-number prefix such as "1.2 ".
#
# The prefix pattern accepts multi-digit components ("1.10 "); previously
# only one digit was allowed after each dot, so the dot inside such a
# numbering was mistaken for sentence punctuation. The punctuation set
# covers the ASCII and ideographic full stops, ASCII "!" and "?", and their
# full-width counterparts.
#
# @param text [String] text to inspect
# @return [Integer, nil] index of the first punctuation mark in the
#   prefix-stripped text (truthy), or +nil+ when there is none
def hav_complete_sentence?(text)
  without_numbering = text.gsub(/^\d+(\.\d+)*\s/, '')
  without_numbering =~ /[.。!?！？]/
end
#valid_title?(text) ⇒ Boolean
57 58 59 60 |
# File 'lib/header_detect.rb', line 57
#
# Checks whether +text+ contains a full stop (ASCII "." or ideographic "。")
# once a leading dotted-number prefix such as "1.2 " has been removed.
#
# NOTE(review): the name reads inverted relative to its use — the guess_*
# detectors visible alongside this method return +false+ whenever this
# method is truthy, i.e. a truthy result means "contains a period, so not a
# heading".
#
# The prefix pattern accepts multi-digit components ("1.10 "); previously
# only one digit was allowed after each dot, so the dot inside such a
# numbering was left in place and counted as a full stop.
#
# @param text [String] text to inspect
# @return [Integer, nil] index of the first full stop in the prefix-stripped
#   text (truthy), or +nil+ when there is none
def valid_title?(text)
  without_numbering = text.gsub(/^\d+(\.\d+)*\s/, '')
  without_numbering =~ /[.。]/
end