Class: Lingua::IT::Sentence

Inherits:
Object
  • Object
show all
Defined in:
lib/lingua/it/sentence.rb

Constant Summary collapse

TITLES =

Common abbreviations

%w(Sig Sigg Dott Preg Prof Mr Jr Amn Avv Co Stim Dr Egr Geom Ing Mons On Rag Rev Soc Spett Card Ill Gent Cav)
MISC =
%w(P V Femm Dim Ecc Etc Corr Cc Bcc All Es Fatt G Gg Id Int Lett Ogg Pag Pagg Cap Pp Tel Ind V N Num Min Sec Ms Abbr Agg Art Aus)
MONTHS =
%w(Gen Feb Mar Apr Mag Giu Lug Ago Set Sett Ott Nov Dic)
DAYS =
%w(Lun Mar Mer Gio Ven Sab Dom)
STD =

Standard delimiters

%w(. ? !)

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.abbr_regexObject (readonly)

Returns the value of attribute abbr_regex.



10
11
12
# File 'lib/lingua/it/sentence.rb', line 10

# Read-only class-level attribute: returns the current value of @abbr_regex.
# The value is interpolated into the abbreviation patterns used by
# +sentences+.
def abbr_regex
  @abbr_regex
end

.abbreviationsObject (readonly)

Returns the value of attribute abbreviations.



9
10
11
# File 'lib/lingua/it/sentence.rb', line 9

# Read-only class-level attribute: returns the current value of
# @abbreviations, the list that +abbreviation+ extends.
def abbreviations
  @abbreviations
end

.delim_regexObject (readonly)

Returns the value of attribute delim_regex.



12
13
14
# File 'lib/lingua/it/sentence.rb', line 12

# Read-only class-level attribute: returns the current value of @delim_regex.
# The value is interpolated into the sentence-boundary pattern used by
# +sentences+.
def delim_regex
  @delim_regex
end

.delimitersObject (readonly)

Returns the value of attribute delimiters.



11
12
13
# File 'lib/lingua/it/sentence.rb', line 11

# Read-only class-level attribute: returns the current value of @delimiters,
# the list that +delimiter+ extends and +reset_delimiter!+ restores.
def delimiters
  @delimiters
end

Class Method Details

.abbreviation(*abbreviations) ⇒ Object

Add customized abbreviations to standard set



38
39
40
41
42
43
# File 'lib/lingua/it/sentence.rb', line 38

# Registers extra abbreviations on top of the ones already known.
#
# The stored list is replaced by a new, de-duplicated array (the previous
# array object is left untouched), then set_abbr_regex! is invoked —
# presumably to rebuild the abbreviation regex from the new list.
#
# @param abbreviations [Array<String>] abbreviations to add
# @return [Array<String>] the updated abbreviation list
def self.abbreviation(*abbreviations)
  @abbreviations = (@abbreviations + abbreviations).uniq
  set_abbr_regex!
  @abbreviations
end

.delimiter(*delimiters) ⇒ Object

Add symbols to the sentence delimiters



46
47
48
49
50
51
# File 'lib/lingua/it/sentence.rb', line 46

# Registers extra symbols as sentence delimiters.
#
# The stored list is replaced by a new, de-duplicated array (the previous
# array object is left untouched), then set_delim_regex! is invoked —
# presumably to rebuild the delimiter regex from the new list.
#
# @param delimiters [Array<String>] delimiter symbols to add
# @return [Array<String>] the updated delimiter list
def self.delimiter(*delimiters)
  @delimiters = (@delimiters + delimiters).uniq
  set_delim_regex!
  @delimiters
end

.reset_delimiter!Object



53
54
55
56
57
# File 'lib/lingua/it/sentence.rb', line 53

# Restores the sentence delimiters to the standard set (STD).
#
# A copy of STD is stored rather than the constant itself, so that a caller
# mutating the array obtained from the +delimiters+ reader (or from this
# method's return value) cannot alter STD and thereby break later resets.
# set_delim_regex! is then invoked — presumably to rebuild the delimiter
# regex from the restored list.
#
# @return [Array<String>] the restored delimiter list
def self.reset_delimiter!
  @delimiters = STD.dup
  set_delim_regex!
  @delimiters
end

.sentences(text) ⇒ Object

Split the text into sentences, using 0002 as a temporary end mark for the abbreviations found, even though the regex should be enough to distinguish real stop points from abbreviation ones. A sentence should definitely end with only a . or a ? or a !



29
30
31
32
33
34
35
# File 'lib/lingua/it/sentence.rb', line 29

# Splits +text+ into an array of whitespace-stripped sentence strings.
#
# All substitutions run on a copy of +text+, so the caller's string is not
# modified. The patterns are built by interpolating @abbr_regex and
# @delim_regex.
#
# @param text [String] the text to split
# @return [Array<String>] the pieces obtained after splitting, each stripped
def self.sentences(text)
  txt = text.dup
  # Mask the period that follows a known abbreviation (case-insensitive) with
  # a temporary marker so it is not mistaken for a sentence end.
  # NOTE(review): the marker appears here as the literal text "0002"; it was
  # presumably a control character in the original file — confirm.
  txt.gsub!(/\b(#{@abbr_regex})(\.)\B/i, '\10002')
  # Tag each sentence-like span with an end-of-sentence marker. A span starts
  # with an optional quote and an ASCII uppercase letter ([A-Z], so accented
  # capitals do not start a match) and ends with a run of delimiter/quote
  # characters; the negative lookahead stops it before a delimiter that is
  # followed by whitespace and another capitalised start.
  # NOTE(review): \Q...\E is Perl-style quoting — confirm Ruby's regexp engine
  # honours it. The replacement also references group 2, which this pattern
  # does not define — confirm the intended replacement string.
  txt.gsub!(/["']?[A-Z][^\Q#{@delim_regex}\E]+((?![\Q#{@delim_regex}\E]['"]?\s["']?[A-Z][^\Q#{@delim_regex}\E]).)+[\Q#{@delim_regex}\E'"]+/, '\2\001')
  # Put the period back after each masked abbreviation.
  txt.gsub!(/\b(#{@abbr_regex})(0002)/i, '\1.')
  # Cut at the end-of-sentence marker and trim surrounding whitespace.
  txt.split(/01/).map { |sentence| sentence.strip }
end