Class: Anystyle::Parser::Normalizer
- Inherits:
-
Object
- Object
- Anystyle::Parser::Normalizer
- Includes:
- Singleton
- Defined in:
- lib/anystyle/parser/normalizer.rb
Constant Summary collapse
# Lazily maps a string to a month number (1-12) by searching it for a
# month-name fragment. Patterns are tried in calendar order and the first
# match wins. The result, or nil when no pattern matches, is stored in the
# hash under the looked-up key.
MONTH =
  Hash.new do |h, k|
    # Position in this list + 1 is the month number. /ma[yi]/ and /o[ck]t/
    # accept two spellings each (e.g. "May"/"Mai", "Oct"/"Okt").
    patterns = [
      /jan/i, /feb/i, /mar/i, /apr/i, /ma[yi]/i, /jun/i,
      /jul/i, /aug/i, /sep/i, /o[ck]t/i, /nov/i, /dec/i
    ]

    # Regexp#=== is what `case/when` uses; it is false for nil and other
    # non-string keys, so those end up mapped to nil.
    index = patterns.index { |pattern| pattern === k }
    h[k] = index && index + 1
  end
Instance Method Summary collapse
- #extract_edition(token, hash) ⇒ Object
- #method_missing(name, *arguments, &block) ⇒ Object
-
#normalize(key, hash) ⇒ Object
Default normalizer.
- #normalize_author(hash) ⇒ Object
- #normalize_booktitle(hash) ⇒ Object
- #normalize_container(hash) ⇒ Object
- #normalize_date(hash) ⇒ Object
- #normalize_editor(hash) ⇒ Object
- #normalize_names(names) ⇒ Object
- #normalize_pages(hash) ⇒ Object
- #normalize_title(hash) ⇒ Object
- #normalize_translator(hash) ⇒ Object
- #normalize_volume(hash) ⇒ Object
- #tokenize_names(names) ⇒ Object
Dynamic Method Handling
This class handles dynamic methods through its method_missing method: calls to undefined normalize_<key> methods are forwarded to #normalize with <key> as the key.
#method_missing(name, *arguments, &block) ⇒ Object
41 42 43 44 45 46 47 48 |
# File 'lib/anystyle/parser/normalizer.rb', line 41

# Forwards calls to undefined +normalize_<key>+ methods to #normalize,
# passing +<key>+ (as a Symbol) followed by the original arguments and block.
# Any other unknown method name is handed to +super+.
def method_missing(name, *arguments, &block)
  case name.to_s
  when /^normalize_(.+)$/
    normalize($1.to_sym, *arguments, &block)
  else
    super
  end
end

# Keeps +respond_to?+ consistent with #method_missing: reports true for the
# same +normalize_<key>+ names that #method_missing dispatches.
def respond_to_missing?(name, include_private = false)
  name.to_s =~ /^normalize_(.+)$/ ? true : super
end
Instance Method Details
#extract_edition(token, hash) ⇒ Object
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
# File 'lib/anystyle/parser/normalizer.rb', line 184

# Removes edition information from +token+ (modified in place) and records
# it in +hash[:edition]+.
#
# Recognized forms are a numeric edition ("2nd ed.", "3rd edition") anywhere
# in the token, and the trailing keywords Expanded, Illustrated, Revised and
# Reprint (optionally preceded by "and"). Everything found is appended to
# any existing +hash[:edition]+ value and stored joined by ", ".
#
# Returns the joined edition string, or nil when nothing was found and no
# edition was present before.
def extract_edition(token, hash)
  edition = [hash[:edition]].flatten.compact

  [
    # The alternation after "ed" requires the abbreviation to end there
    # ("ed", "ed.", "edition"), so words that merely start with "ed"
    # ("5th education", "1999 Edinburgh") are left alone.
    /\W*(\d+)(?:st|nd|rd|th)?\s*ed(?:ition\b|\.|\b)\W*/i,
    /(?:\band)?\W*([Ee]xpanded)\W*$/,
    /(?:\band)?\W*([Ii]llustrated)\W*$/,
    /(?:\band)?\W*([Rr]evised)\W*$/,
    /(?:\band)?\W*([Rr]eprint)\W*$/
  ].each do |pattern|
    # gsub! returns nil when nothing was replaced; $1 is the capture of the
    # last successful match.
    edition << $1 if token.gsub!(pattern, '')
  end

  hash[:edition] = edition.join(', ') unless edition.empty?
end
#normalize(key, hash) ⇒ Object
Default normalizer. Strips leading and trailing non-word characters from the value stored under the given key.
51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/anystyle/parser/normalizer.rb', line 51

# Default normalizer: strips leading and trailing non-word characters from
# the value stored under +key+.
#
# When +hash[key]+ is a list, the first element is used as the token and the
# remaining elements are passed to +unmatched+. The token is modified in
# place and stored back under +key+.
#
# Returns +hash+. Any StandardError raised along the way (for example when
# +hash[key]+ is nil) is reported with +warn+ and +hash+ is still returned.
def normalize(key, hash)
  token, *dangling = hash[key]
  unmatched(key, hash, dangling) unless dangling.empty?

  token.gsub!(/^\W+|\W+$/, '')
  hash[key] = token
  hash
rescue => e
  warn e.message
  hash
end
#normalize_author(hash) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/anystyle/parser/normalizer.rb', line 63

# Normalizes +hash[:author]+.
#
# When +hash[:author]+ is a list, the first element is the author string and
# the rest is passed to +unmatched+. If the author string ends in an editor
# marker ("Ed", "Eds", "Editors", optionally wrapped in punctuation) and the
# hash has no +:editor+ yet, the value is moved to +:editor+ and handled by
# #normalize_editor instead. Otherwise a trailing "et al." is removed (and
# flagged as +hash['more-authors']+), surrounding non-word characters are
# stripped, and the result of #normalize_names is stored.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_author(hash)
  authors, *dangling = hash[:author]
  unmatched(:author, hash, dangling) unless dangling.empty?

  # \b ensures the marker is a word of its own, so surnames that merely end
  # in "ed" (Reed, Mohammed) are not mistaken for editor markers.
  if authors =~ /\b[Ee]d(s|itors)?\W*$/ && !hash.has_key?(:editor)
    hash[:editor] = hash.delete(:author)
    normalize_editor(hash)
  else
    hash['more-authors'] = true if !!authors.sub!(/\bet\.?\s*al.*$/i, '')
    authors.gsub!(/^\W+|\W+$/, '')
    hash[:author] = normalize_names(authors)
  end

  hash
rescue => e
  warn e.message
  hash
end
#normalize_booktitle(hash) ⇒ Object
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
# File 'lib/anystyle/parser/normalizer.rb', line 210

# Normalizes +hash[:booktitle]+.
#
# When the value is a list, the first element is the title and the rest is
# passed to +unmatched+. A leading "In" is dropped, edition information is
# moved out via #extract_edition, and trailing punctuation/whitespace is
# removed. The title string is modified in place.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_booktitle(hash)
  booktitle, *dangling = hash[:booktitle]
  unmatched(:booktitle, hash, dangling) unless dangling.empty?

  # \b keeps titles that merely start with the letters "in"
  # ("Information ...", "Introduction ...") intact.
  booktitle.gsub!(/^in\b\s*/i, '')

  extract_edition(booktitle, hash)

  booktitle.gsub!(/[\.,:;\s]+$/, '')
  hash[:booktitle] = booktitle

  hash
rescue => e
  warn e.message
  hash
end
#normalize_container(hash) ⇒ Object
227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
# File 'lib/anystyle/parser/normalizer.rb', line 227

# Normalizes +hash[:container]+.
#
# When the value is a list, the first element is the container and the rest
# is passed to +unmatched+. For containers mentioning "Dissertation
# Abstracts", a trailing "Section X: <name>..." part is cut off, the section
# name is stored as +hash[:category]+, and +hash[:type]+ is set to
# +:phdthesis+. The container is always written back to +hash[:container]+.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_container(hash)
  container, *dangling = hash[:container]
  unmatched(:container, hash, dangling) unless dangling.empty?

  case container
  when /dissertation abstracts/i
    container.gsub!(/\s*section \w: ([\w\s]+).*$/i, '')
    # $1 is nil when the section pattern did not match.
    hash[:category] = $1 unless $1.nil?
    hash[:type] = :phdthesis
  end

  hash[:container] = container
  hash
rescue => e
  warn e.message
  hash
end
#normalize_date(hash) ⇒ Object
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
# File 'lib/anystyle/parser/normalizer.rb', line 245

# Extracts month and year from +hash[:date]+.
#
# When the value is a list, the first element is the date and the rest is
# passed to +unmatched+. A month number found through the MONTH lookup is
# stored as +hash[:month]+. If the date contains four consecutive digits,
# they are stored as the Integer +hash[:year]+ and +:date+ is removed;
# otherwise +:date+ is left in place.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_date(hash)
  date, *dangling = hash[:date]
  unmatched(:date, hash, dangling) unless dangling.empty?

  month = MONTH[date]
  hash[:month] = month unless month.nil?

  if date =~ /(\d{4})/
    hash[:year] = $1.to_i
    hash.delete(:date)
  end

  hash
rescue => e
  warn e.message
  hash
end
#normalize_editor(hash) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/anystyle/parser/normalizer.rb', line 82

# Normalizes +hash[:editor]+.
#
# When the value is a list, the first element is the editor string and the
# second is treated as edition text: its first run of digits is stored as
# the Integer +hash[:edition]+. From the editor string, a trailing "et al."
# is removed (flagged as +hash['more-editors']+), surrounding non-word
# characters and a leading "in " are stripped, and editor markers, the word
# "by" and "trans"/"translated" markers are removed. The remainder goes
# through #normalize_names. If a translation marker was present, the result
# is also stored as +hash[:translator]+.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_editor(hash)
  editors, edition = hash[:editor]

  hash[:edition] = $1.to_i if !edition.nil? && edition =~ /(\d+)/

  hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')

  editors.gsub!(/^\W+|\W+$/, '')
  editors.gsub!(/^in\s+/i, '')

  # Word boundaries on both sides restrict this to standalone markers
  # (Ed, Eds, Editor, Editors, Edited); without them "Edward Reed" would be
  # reduced to "ward Re".
  editors.gsub!(/\W*\b[Ee]d(s|itors?|ited)?\b/, '')
  editors.gsub!(/\bby\b/i, '')

  is_trans = !!editors.gsub!(/\W*trans(lated)?\W*/i, '')

  hash[:editor] = normalize_names(editors)
  hash[:translator] = hash[:editor] if is_trans

  hash
rescue => e
  warn e.message
  hash
end
#normalize_names(names) ⇒ Object
123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/anystyle/parser/normalizer.rb', line 123

# Splits +names+ with #tokenize_names, trims each name, makes sure every
# single upper-case initial is followed by a period, and joins the names
# with " and ".
#
# Returns the joined String. If a StandardError is raised, it is reported
# with +warn+ and the original +names+ argument is returned.
def normalize_names(names)
  normalized = tokenize_names(names).map do |name|
    name.strip!
    # A lone capital followed by a non-word character or the end of the
    # string is an initial; insert "." unless one is already there.
    name.gsub!(/\b([[:upper:]])(\W|$)/) { [$1, $2 == '.' ? nil : '.', $2].compact.join }
    name
  end

  normalized.join(' and ')
rescue => e
  warn e.message
  # There is no local named `hash` in this method; returning it would call
  # Object#hash and hand an Integer back to the caller.
  names
end
#normalize_pages(hash) ⇒ Object
289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
# File 'lib/anystyle/parser/normalizer.rb', line 289

# Normalizes +hash[:pages]+.
#
# When the value is a list, the first element is the pages token and the
# rest is passed to +unmatched+. A token of the form "volume.issue(year):pp"
# (issue and year optional) is split into the Integers +hash[:volume]+,
# +hash[:number]+ and +hash[:year]+. The page part is then reduced to
# either "first--last" or a single number. If the token contains no digits,
# +hash[:pages]+ is left untouched.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_pages(hash)
  pages, *dangling = hash[:pages]
  unmatched(:pages, hash, dangling) unless dangling.empty?

  # "volume.issue(year):pp"
  case pages
  when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
    hash[:volume] = $1.to_i
    hash[:number] = $2.to_i unless $2.nil?
    hash[:year] = $3.to_i unless $3.nil?
    pages = $4
  end

  # Match the extracted token rather than hash[:pages]: the latter is still
  # a list when dangling tokens were present and would never match.
  case pages
  when /(\d+)\D+(\d+)/
    hash[:pages] = [$1, $2].join('--')
  when /(\d+)/
    hash[:pages] = $1
  end

  hash
rescue => e
  warn e.message
  hash
end
#normalize_title(hash) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# File 'lib/anystyle/parser/normalizer.rb', line 163

# Normalizes +hash[:title]+.
#
# When the value is a list, the first element is the title and the second
# is stored as +hash[:container]+ and run through #normalize. Edition
# information is moved out of the title via #extract_edition, then trailing
# punctuation/whitespace and one enclosing quote character on each side are
# removed. The title string is modified in place.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_title(hash)
  title, container = hash[:title]

  unless container.nil?
    hash[:container] = container
    normalize(:container, hash)
  end

  extract_edition(title, hash)

  trailing_punctuation = /[\.,:;\s]+$/

  title.gsub!(trailing_punctuation, '')
  title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')
  # Punctuation placed inside the closing quote (“Title.”) only becomes
  # trailing once the quote is gone, so strip once more.
  title.gsub!(trailing_punctuation, '')

  hash[:title] = title

  hash
rescue => e
  warn e.message
  hash
end
#normalize_translator(hash) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/anystyle/parser/normalizer.rb', line 109

# Normalizes +hash[:translator]+.
#
# When the value is a list, the first element is the translator string and
# the rest is passed to +unmatched+, as the other normalizers in this class
# do. Surrounding non-word characters, "trans"/"translated" markers and the
# word "by" are removed from the string (in place) and the remainder goes
# through #normalize_names.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_translator(hash)
  translators, *dangling = hash[:translator]
  unmatched(:translator, hash, dangling) unless dangling.empty?

  translators.gsub!(/^\W+|\W+$/, '')
  translators.gsub!(/\W*trans(lated)?\W*/i, '')
  translators.gsub!(/\bby\b/i, '')

  hash[:translator] = normalize_names(translators)
  hash
rescue => e
  warn e.message
  hash
end
#normalize_volume(hash) ⇒ Object
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 |
# File 'lib/anystyle/parser/normalizer.rb', line 264

# Splits +hash[:volume]+ into a volume and, where present, an issue number.
#
# When the value is a list, the first element is the volume token and the
# rest is passed to +unmatched+. Patterns are tried in order:
#
# 1. volume followed by a number range ("7 (1-2)"): the range is kept as a
#    String in +hash[:number]+
# 2. optional volume plus "no." and two numbers: number kept as a String
# 3. optional volume plus "no." and one number: number stored as an Integer
# 4. two separate numbers ("12(3)"): both stored as Integers
# 5. a single number: stored as +hash[:volume]+
#
# +hash[:volume]+ is always stored as an Integer when set. If nothing
# matches, the hash is left unchanged.
#
# Returns +hash+; StandardErrors are reported with +warn+.
def normalize_volume(hash)
  volume, *dangling = hash[:volume]
  unmatched(:volume, hash, dangling) unless dangling.empty?

  case volume
  when /\D*(\d+)\D+(\d+[\s&-]+\d+)/
    hash[:volume], hash[:number] = $1.to_i, $2
  when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
    hash[:volume] = $1.to_i unless $1.nil?
    hash[:number] = $2
  when /(\d+)?\D+no\.\s*(\d+)/
    hash[:volume] = $1.to_i unless $1.nil?
    hash[:number] = $2.to_i
  when /\D*(\d+)\D+(\d+)/
    hash[:volume], hash[:number] = $1.to_i, $2.to_i
  when /(\d+)/
    hash[:volume] = $1.to_i
  end

  hash
rescue => e
  warn e.message
  hash
end
#tokenize_names(names) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/anystyle/parser/normalizer.rb', line 135

# Splits a string holding several personal names into one string per name.
#
# The input is scanned left to right. "and" / "&" always end the current
# name. A comma ends the current name when the name already contains a
# comma, or when it already looks like "word word" and is not followed by a
# single word ending in ".", "," or the end of input; otherwise the comma is
# kept as part of the name. Suffixes such as Jr, Sr, PhD, MD and Esq
# (optionally preceded by a comma) are kept with the name they follow.
#
# Returns an Array of name Strings. Whitespace runs are collapsed to one
# space but names are not trimmed.
def tokenize_names(names)
  scanner = StringScanner.new(names)
  current = ''
  tokens = []
  commas = 0 # commas kept inside the current name so far

  until scanner.eos?
    case
    when scanner.scan(/,?\s*(and\b|&)/)
      tokens << current
      current, commas = '', 0
    when scanner.scan(/\s+/)
      current << ' '
    # \b after the suffix makes sure it is a whole word, so a name that only
    # starts with those letters (", Sridhar") is not taken for ", Sr".
    when scanner.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\b\.?/i)
      current << scanner.matched
    when scanner.scan(/,/)
      if commas > 0 || (current =~ /\S{2,}\s+\S{2,}/ && scanner.rest !~ /^\s*\w+(\.|,|$)/)
        tokens << current
        current, commas = '', 0
      else
        current << scanner.matched
        commas += 1
      end
    when scanner.scan(/\w+/)
      current << scanner.matched
    when scanner.scan(/./)
      current << scanner.matched
    end
  end

  tokens << current
end