Class: Anystyle::Parser::Normalizer

Inherits:
Object
  • Object
show all
Includes:
Singleton
Defined in:
lib/anystyle/parser/normalizer.rb

Constant Summary collapse

# Month-number lookup keyed by arbitrary date text.
#
# Looking up a key the table has not seen runs the block, which tests the key
# against the month patterns in calendar order (first hit wins) and stores
# the answer -- 1..12, or nil when nothing matches -- under that key, so later
# lookups of the same text are plain hash reads. Matching is case-insensitive
# and unanchored; the "mai" and "okt" spellings are accepted as well.
MONTH =
Hash.new do |cache, key|
  cache[key] =
    case key
    when /jan/i    then 1
    when /feb/i    then 2
    when /mar/i    then 3
    when /apr/i    then 4
    when /ma[yi]/i then 5
    when /jun/i    then 6
    when /jul/i    then 7
    when /aug/i    then 8
    when /sep/i    then 9
    when /o[ck]t/i then 10
    when /nov/i    then 11
    when /dec/i    then 12
    end
end

Instance Method Summary collapse

Dynamic Method Handling

This class handles dynamic methods through the method_missing method: a call to an undefined method named normalize_<key> is forwarded to normalize(:<key>, ...).

#method_missing(name, *arguments, &block) ⇒ Object



41
42
43
44
45
46
47
48
# File 'lib/anystyle/parser/normalizer.rb', line 41

# Forwards calls of the form +normalize_<key>+ to #normalize, passing the
# part after the prefix as a Symbol key. Every other unknown method is handed
# to the default handler via +super+.
#
# @param name [Symbol] name of the method that was not found
# @param arguments [Array] arguments of the original call
# @param block [Proc, nil] block of the original call
# @return [Object] whatever #normalize returns
# @raise [NoMethodError] when +name+ does not start with +normalize_+
def method_missing(name, *arguments, &block)
  case name.to_s
  when /^normalize_(.+)$/
    normalize($1.to_sym, *arguments, &block)
  else
    super
  end
end

# Keeps +respond_to?+ and +method+ in agreement with #method_missing: any
# name that #method_missing would dispatch is reported as supported.
#
# @param name [Symbol] the method name being queried
# @param include_private [Boolean] passed through to +super+
# @return [Boolean]
def respond_to_missing?(name, include_private = false)
  /^normalize_(.+)$/ === name.to_s || super
end

Instance Method Details

#extract_edition(token, hash) ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# File 'lib/anystyle/parser/normalizer.rb', line 184

# Removes edition information from +token+ (the String is modified in place)
# and records it under +hash[:edition]+.
#
# Recognised are a numbered edition ("2nd ed.", "3 edition", "4th edn.")
# anywhere in the token, followed by the trailing qualifiers "expanded",
# "illustrated", "revised" and "reprint" (optionally preceded by "and"),
# which are peeled off the end of the token in that order. Anything found is
# appended to an edition already present in +hash+ and joined with ", ".
#
# @param token [String] text to scan; mutated
# @param hash [Hash] receives the +:edition+ entry
# @return [String, nil] the stored edition string, or nil if none was found
def extract_edition(token, hash)
  edition = [hash[:edition]].flatten.compact

  # (?!\w) makes sure "ed" is a complete abbreviation or word; without it
  # "1920 Education" would be read as edition 1920 and the title mangled.
  if token.gsub!(/\W*(\d+)(?:st|nd|rd|th)?\s*ed(?:ition|n\.?|\.)?(?!\w)\W*/i, '')
    edition << $1
  end

  # Order matters: each pattern is anchored to the end of what the previous
  # ones left behind.
  [
    /(?:\band)?\W*([Ee]xpanded)\W*$/,
    /(?:\band)?\W*([Ii]llustrated)\W*$/,
    /(?:\band)?\W*([Rr]evised)\W*$/,
    /(?:\band)?\W*([Rr]eprint)\W*$/
  ].each do |qualifier|
    edition << $1 if token.gsub!(qualifier, '')
  end

  hash[:edition] = edition.join(', ') unless edition.empty?
end

#normalize(key, hash) ⇒ Object

Default normalizer. Strips leading and trailing non-word characters from the value stored under the given key.



51
52
53
54
55
56
57
58
59
60
61
# File 'lib/anystyle/parser/normalizer.rb', line 51

# Default normalizer: trims non-word characters from the start and end of
# the value stored under +key+ (the String is modified in place) and writes
# it back. If the value is an Array, only its first element is kept under
# +key+; the remaining elements are passed to +unmatched+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param key [Symbol] the field to clean up
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize(key, hash)
  begin
    value, *leftovers = hash[key]
    unmatched(key, hash, leftovers) if leftovers.size > 0

    hash[key] = value.tap { |text| text.gsub!(/^\W+|\W+$/, '') }
  rescue StandardError => error
    warn error.message
  end

  hash
end

#normalize_author(hash) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/anystyle/parser/normalizer.rb', line 63

# Normalizes the +:author+ field.
#
# If the text ends in an editor marker ("Ed", "Eds", "editor", "editors",
# optionally wrapped in punctuation) and the record has no +:editor+ yet,
# the value is moved to +:editor+ and handled by #normalize_editor.
# Otherwise a trailing "et al." is removed (setting +hash['more-authors']+),
# surrounding non-word characters are trimmed and the names are passed
# through #normalize_names.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_author(hash)
  authors, *dangling = hash[:author]
  unmatched(:author, hash, dangling) unless dangling.empty?

  # \b requires the marker to be a word of its own; without it every name
  # that merely ends in "ed" ("Reed", "Ahmed") was taken for an editor.
  if authors =~ /\b[Ee]d(s|itors?)?\W*$/ && !hash.key?(:editor)
    hash[:editor] = hash.delete(:author)
    normalize_editor(hash)
  else
    hash['more-authors'] = true if authors.sub!(/\bet\.?\s*al.*$/i, '')
    authors.gsub!(/^\W+|\W+$/, '')
    hash[:author] = normalize_names(authors)
  end

  hash
rescue => e
  warn e.message
  hash
end

#normalize_booktitle(hash) ⇒ Object



210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/anystyle/parser/normalizer.rb', line 210

# Normalizes the +:booktitle+ field: drops a leading "In", moves edition
# information to +:edition+ (see #extract_edition) and trims trailing
# punctuation and whitespace. If the value is an Array, only its first
# element is kept; the rest is passed to +unmatched+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_booktitle(hash)
  booktitle, *dangling = hash[:booktitle]
  unmatched(:booktitle, hash, dangling) unless dangling.empty?

  # \b: only the standalone word "In" is a lead-in; without the boundary
  # "International ..." was truncated to "ternational ...".
  booktitle.gsub!(/^in\b\s*/i, '')

  extract_edition(booktitle, hash)

  booktitle.gsub!(/[\.,:;\s]+$/, '')
  hash[:booktitle] = booktitle

  hash
rescue => e
  warn e.message
  hash
end

#normalize_container(hash) ⇒ Object



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# File 'lib/anystyle/parser/normalizer.rb', line 227

# Normalizes the +:container+ field. If the value is an Array, only its
# first element is kept; the rest is passed to +unmatched+.
#
# Containers mentioning "dissertation abstracts" get special treatment: a
# "section X: <name>" part and everything after it on that line is removed
# from the text, the section name (when present) is stored as +:category+,
# and +:type+ is set to +:phdthesis+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_container(hash)
  begin
    name, *extras = hash[:container]
    unmatched(:container, hash, extras) unless extras.empty?

    if /dissertation abstracts/i === name
      name.gsub!(/\s*section \w: ([\w\s]+).*$/i, '')
      # The capture is nil when the substitution above found nothing.
      section = Regexp.last_match(1)
      hash[:category] = section unless section.nil?
      hash[:type] = :phdthesis
    end

    hash[:container] = name
  rescue StandardError => error
    warn error.message
  end

  hash
end

#normalize_date(hash) ⇒ Object



245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/anystyle/parser/normalizer.rb', line 245

# Splits the +:date+ field into +:month+ and +:year+.
#
# The month number comes from the MONTH lookup and is stored only when one
# is found. The first run of four digits becomes the Integer +:year+, and in
# that case the +:date+ entry is removed; without a year +:date+ is kept.
# If the value is an Array, only its first element is examined; the rest is
# passed to +unmatched+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_date(hash)
  begin
    text, *extras = hash[:date]
    unmatched(:date, hash, extras) unless extras.empty?

    month_number = MONTH[text]
    hash[:month] = month_number unless month_number.nil?

    if text =~ /(\d{4})/
      hash[:year] = Regexp.last_match(1).to_i
      hash.delete(:date)
    end
  rescue StandardError => error
    warn error.message
  end

  hash
end

#normalize_editor(hash) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/anystyle/parser/normalizer.rb', line 82

def normalize_editor(hash)
  editors, edition = hash[:editor]
	
				unless edition.nil?
if edition =~ /(\d+)/
	hash[:edition] = $1.to_i
end
				end
	
  hash['more-editors'] = true if !!editors.sub!(/\bet\.?\s*al.*$/i, '')
	
				editors.gsub!(/^\W+|\W+$/, '')
				editors.gsub!(/^in\s+/i, '')
				editors.gsub!(/\W*[Ee]d(s|itors|ited)?\W*?/, '')
				editors.gsub!(/\bby\b/i, '')

				is_trans = !!editors.gsub!(/\W*trans(lated)?\W*/i, '')

 	hash[:editor] = normalize_names(editors)
				hash[:translator] = hash[:editor] if is_trans
				
  hash
			rescue => e
				warn e.message
				hash
end

#normalize_names(names) ⇒ Object



123
124
125
126
127
128
129
130
131
132
133
# File 'lib/anystyle/parser/normalizer.rb', line 123

# Splits +names+ into individual names (see #tokenize_names), strips each
# one, makes sure every single-capital-letter initial is followed by a
# period, and joins the names with " and ".
#
# If anything raises a StandardError the message is reported through +warn+
# and the original +names+ argument is returned unchanged.
#
# @param names [String] one or more personal names
# @return [String] the normalized names
def normalize_names(names)
  normalized = tokenize_names(names).map do |name|
    name.strip!
    # "J" / "J " -> "J." / "J. "; an initial already followed by "." is kept.
    name.gsub!(/\b([[:upper:]])(\W|$)/) { [$1, $2 == ?. ? nil : ?., $2].compact.join }
    name
  end
  normalized.join(' and ')
rescue => e
  warn e.message
  # There is no local called +hash+ in this method; the bare word used to
  # resolve to Object#hash and handed callers an Integer instead of names.
  names
end

#normalize_pages(hash) ⇒ Object



289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/anystyle/parser/normalizer.rb', line 289

# Normalizes the +:pages+ field.
#
# A value of the form "volume.issue(year):pages" (issue and year optional)
# is taken apart into the Integer fields +:volume+, +:number+ and +:year+
# plus the remaining page text. The page text is then reduced to
# "first--last" when it contains two numbers, or to the single number when
# it contains one; text without digits is left as it is.
#
# If the value is an Array, only its first element is used; the rest is
# passed to +unmatched+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_pages(hash)
  pages, *dangling = hash[:pages]
  unless dangling.empty?
    unmatched(:pages, hash, dangling)
    # Write the first token back (as the other normalizers do) so that the
    # second +case+ below inspects a String rather than the whole Array.
    hash[:pages] = pages
  end

  # "volume.issue(year):pp"
  case pages
  when /(\d+) (?: \.(\d+))? (?: \( (\d{4}) \))? : (\d.*)/x
    hash[:volume] = $1.to_i
    hash[:number] = $2.to_i unless $2.nil?
    hash[:year] = $3.to_i unless $3.nil?
    hash[:pages] = $4
  end

  case hash[:pages]
  when /(\d+)\D+(\d+)/
    hash[:pages] = [$1, $2].join('--')
  when /(\d+)/
    hash[:pages] = $1
  end

  hash
rescue => e
  warn e.message
  hash
end

#normalize_title(hash) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/anystyle/parser/normalizer.rb', line 163

# Normalizes the +:title+ field.
#
# The value may be an Array of [title, container]; a container found this
# way is stored under +:container+ and cleaned with #normalize. Edition
# information is moved from the title to +:edition+ (see #extract_edition),
# trailing punctuation and whitespace are trimmed, and one quotation mark
# is removed from the start and from the end of each line.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_title(hash)
  title, container = hash[:title]

  unless container.nil?
    hash[:container] = container
    normalize(:container, hash)
  end

  extract_edition(title, hash)

  title.gsub!(/[\.,:;\s]+$/, '')
  title.gsub!(/^["'”’´‘“`]|["'”’´‘“`]$/, '')
  # Second pass: punctuation placed inside the closing quote (“Title.”) only
  # becomes trailing once the quote itself has been removed.
  title.gsub!(/[\.,:;\s]+$/, '')

  hash[:title] = title

  hash
rescue => e
  warn e.message
  hash
end

#normalize_translator(hash) ⇒ Object



109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/anystyle/parser/normalizer.rb', line 109

# Normalizes the +:translator+ field: trims surrounding non-word characters,
# removes translator markers ("trans", "translated") and the word "by", and
# passes what is left through #normalize_names. The stored String is
# modified in place before being replaced.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_translator(hash)
  translators = hash[:translator]

  translators.gsub!(/^\W+|\W+$/, '')
  # Leading \b: "trans" is a marker only at the start of a word; without the
  # boundary a name such as "Stransky" was cut down to "Sky".
  translators.gsub!(/\W*\btrans(lated)?\W*/i, '')
  translators.gsub!(/\bby\b/i, '')

  hash[:translator] = normalize_names(translators)
  hash
rescue => e
  warn e.message
  hash
end

#normalize_volume(hash) ⇒ Object



264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
# File 'lib/anystyle/parser/normalizer.rb', line 264

# Splits the +:volume+ field into +:volume+ and +:number+.
#
# The text is tried against these shapes, first match wins:
# 1. volume followed by an issue range ("12 (3-4)") -- Integer volume, range
#    kept as text;
# 2. optional volume, then "no." and an issue range -- range kept as text;
# 3. optional volume, then "no." and a single issue -- Integer issue;
# 4. two separate numbers ("12(3)") -- Integer volume and issue;
# 5. a single number -- Integer volume.
# Text matching none of them is left untouched. If the value is an Array,
# only its first element is examined; the rest is passed to +unmatched+.
#
# Any StandardError is reported through +warn+ and otherwise ignored.
#
# @param hash [Hash] the record being normalized
# @return [Hash] +hash+ itself
def normalize_volume(hash)
  begin
    text, *extras = hash[:volume]
    unmatched(:volume, hash, extras) unless extras.empty?

    case text
    when /\D*(\d+)\D+(\d+[\s&-]+\d+)/
      hash[:volume] = Regexp.last_match(1).to_i
      hash[:number] = Regexp.last_match(2)
    when /(\d+)?\D+no\.\s*(\d+\D+\d+)/
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i unless vol.nil?
      hash[:number] = issue
    when /(\d+)?\D+no\.\s*(\d+)/
      vol, issue = Regexp.last_match.captures
      hash[:volume] = vol.to_i unless vol.nil?
      hash[:number] = issue.to_i
    when /\D*(\d+)\D+(\d+)/
      hash[:volume] = Regexp.last_match(1).to_i
      hash[:number] = Regexp.last_match(2).to_i
    when /(\d+)/
      hash[:volume] = Regexp.last_match(1).to_i
    end
  rescue StandardError => error
    warn error.message
  end

  hash
end

#tokenize_names(names) ⇒ Object



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/anystyle/parser/normalizer.rb', line 135

# Splits a String holding several personal names into one String per name.
#
# The text is scanned left to right. "and" / "&" (optionally preceded by a
# comma) always ends the current name. A comma ends the current name when
# the name already contains a comma, or when it already looks like
# "Given Family" (two chunks of two or more non-space characters) and the
# comma is not followed by a single word that ends in ".", "," or the end of
# the line. Otherwise the comma is kept as part of the name. Name suffixes
# (Jr, Sr, PhD, MD, Esq) stay attached to the name together with their
# leading comma. Runs of whitespace are collapsed to one space; the pieces
# are not stripped.
#
# @param names [String] the text to split
# @return [Array<String>] the individual names, in order of appearance
def tokenize_names(names)
  # ''.dup yields buffers that stay appendable under frozen string literals.
  # n: name being built, ns: finished names, cc: commas seen in n.
  s, n, ns, cc = StringScanner.new(names), ''.dup, [], 0
  until s.eos?
    case
    when s.scan(/,?\s*(and\b|&)/)
      ns << n
      n, cc = ''.dup, 0
    when s.scan(/\s+/)
      n << ' '
    # \b after the suffix: ", Srinivasan" must not be taken for ", Sr",
    # which would bypass the comma handling below and glue two names together.
    when s.scan(/,?\s*(jr|sr|ph\.?d|m\.?d|esq)\b\.?/i)
      n << s.matched
    when s.scan(/,/)
      if cc > 0 || (n =~ /\S{2,}\s+\S{2,}/ && s.rest !~ /^\s*\w+(\.|,|$)/)
        ns << n
        n, cc = ''.dup, 0
      else
        n << s.matched
        cc += 1
      end
    when s.scan(/\w+/)
      n << s.matched
    when s.scan(/./)
      n << s.matched
    end
  end
  ns << n
end