Class: Scalpel
- Inherits:
-
Object
- Object
- Scalpel
- Defined in:
- lib/scalpel.rb
Overview
Sentence segmentation based on a set of predefined rules that handle a large number of use cases of sentence enders. The idea is to first mask every occurrence of ".", "!" or "?" that serves a purpose other than marking the end of a sentence, and then naively segment the text on the sentence enders that remain.
Constant Summary collapse
- VERSION =
Current version.
'0.2.1'
Class Method Summary collapse
-
.cut(text) ⇒ Object
Segment a text using the Scalpel algorithm.
Class Method Details
.cut(text) ⇒ Object
Segment a text using the Scalpel algorithm. This will eventually be ported to a gem.
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/scalpel.rb', line 15

# Segment a text into sentences using the Scalpel algorithm.
#
# Every '.', '!' or '?' that is recognised as NOT ending a sentence
# (abbreviations, titles, decimal numbers, suspension points, enders stuck
# to a closing quote) is first swapped for a placeholder marker of the form
# "&x&". The text is then split on the remaining sentence enders, and the
# markers are turned back into the original characters in each sentence.
#
# @param text [#to_s] the text to segment; it is never modified.
# @return [Array<String>] the sentences, in order, stripped of surrounding
#   whitespace; whitespace-only pieces are dropped.
def self.cut(text)
  # Work on a private copy. String#to_s returns the receiver itself, so
  # without #dup the gsub! calls below would edit the caller's string in
  # place (and raise FrozenError on frozen input such as nil.to_s).
  text = text.to_s.dup

  # Mask composite abbreviations.
  text.gsub!('et al.', '&&&')
  # Mask suspension points; the last dot is kept as the sentence ender.
  text.gsub!('...', '&;&.')
  # Mask the decimal point of floating point numbers.
  text.gsub!(/([0-9]+)\.([0-9]+)/) { $1 + '&@&' + $2 }
  # Mask floats without a leading zero. The whitespace character in front
  # of the dot is replaced by a single space.
  text.gsub!(/\s\.([0-9]+)/) { ' &#&' + $1 }
  # Mask dotted abbreviations made of two or more "letter + dot" groups.
  text.gsub!(/(?:[A-Za-z]\.){2,}/) { |abbr| abbr.gsub('.', '&-&') }
  # Mask titles: a capital letter, one or two lowercase letters, then a dot.
  text.gsub!(/[A-Z][a-z]{1,2}\./) { |title| title.gsub('.', '&*&') }
  # Unstick sentences from each other by inserting the missing space.
  text.gsub!(/([^.?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }

  # Move sentence enders that sit next to quotes behind a marker, so the
  # closing quote stays attached to its sentence. The '<ender>” case uses
  # its own marker (&~&): sharing &*& with titles made the title repair
  # below rewrite it to '.', losing both quote characters.
  text.gsub!(/'([.?!])\s?"/) { '&^&' + $1 }
  text.gsub!(/'([.?!])\s?”/) { '&~&' + $1 }
  text.gsub!(/([.?!])\s?”/) { '&=&' + $1 }
  text.gsub!(/([.?!])\s?'"/) { '&,&' + $1 }
  text.gsub!(/([.?!])\s?'/) { '&%&' + $1 }
  text.gsub!(/([.?!])\s?"/) { '&$&' + $1 }

  # Split on any sentence ender. The capture group keeps the enders in the
  # result, so each pair (text, ender) is glued back into one sentence.
  pieces = text.split(/([.!?])/).each_slice(2).map(&:join)

  # Repair the damage we've done.
  pieces.each_with_object([]) do |sentence, results|
    # Skip whitespace zones.
    next if sentence.strip.empty?

    # Repair composite abbreviations.
    sentence.gsub!('&&&', 'et al.')
    # Repair abbreviations.
    sentence.gsub!('&-&', '.')
    # Repair titles.
    sentence.gsub!('&*&', '.')
    # Repair suspension points.
    sentence.gsub!('&;&.', '...')
    # Repair floats.
    sentence.gsub!(/([0-9]+)&@&([0-9]+)/) { $1 + '.' + $2 }
    # Repair quotes with sentence enders.
    sentence.gsub!(/&=&([.!?])/) { $1 + '”' }
    sentence.gsub!(/&,&([.!?])/) { $1 + "'\"" }
    sentence.gsub!(/&%&([.!?])/) { $1 + "'" }
    sentence.gsub!(/&\^&([.?!])/) { "'" + $1 + '"' }
    sentence.gsub!(/&~&([.?!])/) { "'" + $1 + '”' }
    sentence.gsub!(/&\$&([.!?])/) { $1 + '"' }
    # Repair floats without leading zeros.
    sentence.gsub!(/&#&([0-9]+)/) { '.' + $1 }

    results << sentence.strip
  end
end