Module: RubyMarkovify::Splitters

Included in:
Text
Defined in:
lib/ruby_markovify/splitters.rb

Constant Summary collapse

ASCII_LOWERCASE =
'abcdefghijklmnopqrstuvwxyz'
ASCII_UPPERCASE =
ASCII_LOWERCASE.upcase
STATES =

States with thanks to github.com/unitedstates/python-us. Titles with thanks to github.com/nytimes/emphasis and @donohoe.

%w{
  ala ariz ark calif colo conn del fla ga ill ind kan ky la md mass mich minn miss mo mont neb nev okla
  ore pa tenn vt va wash wis wyo
}
UNITED_STATES =
%w{u.s}
TITLES =
%w{mr ms mrs msr dr gov pres sen sens rep reps prof gen messrs col sr jf sgt mgr fr rev jr snr atty supt}
STREETS =
%w{ave blvd st rd hwy}
MONTHS =
%w{jan feb mar apr jun jul aug sep sept oct nov dec}
INITIALS =
ASCII_LOWERCASE.chars
ABBR_CAPPED =
STATES + UNITED_STATES + TITLES + STREETS + MONTHS + INITIALS
ABBR_LOWERCASE =
%w{etc v vs viz al pct}
EXCEPTIONS =
%w{U.S. U.N. E.U. F.B.I. C.I.A.}
PUNCTUATION =
%w{? !}
END_PATTERN =

A word that ends with punctuation, followed by optional quotes/parens/etc., followed by whitespace and then a character that is not a lowercase letter or a dash.

/([\w\.'’&\]\)]+[\.\?!])([‘’“”'\"\)\]]*)(\s+(?![a-z\-–—]))/

Instance Method Summary collapse

Instance Method Details

#is_abbreviation(dotted_word) ⇒ Object



24
25
26
27
28
29
30
31
# File 'lib/ruby_markovify/splitters.rb', line 24

# Tells whether a dot-terminated token is a known abbreviation.
#
# The final character (expected to be the trailing dot) is removed and the
# remainder is looked up: capitalised tokens are matched case-insensitively
# against ABBR_CAPPED, everything else is matched verbatim against
# ABBR_LOWERCASE.
#
# @param dotted_word [String] a token whose last character is its terminator,
#   e.g. "Mr." or "etc."
# @return [Boolean] true when the clipped token is a listed abbreviation
def is_abbreviation(dotted_word)
  clipped = dotted_word[0..-2]
  # Nothing is left once the terminator is removed (input was "" or "."):
  # there is no first character to inspect, so it cannot be an abbreviation.
  return false if clipped.nil? || clipped.empty?

  if ASCII_UPPERCASE.include?(clipped[0])
    ABBR_CAPPED.include?(clipped.downcase)
  else
    ABBR_LOWERCASE.include?(clipped)
  end
end

#is_sentence_ender(word) ⇒ Object



33
34
35
36
37
38
39
# File 'lib/ruby_markovify/splitters.rb', line 33

# Decides whether a token that ends in punctuation really closes a sentence.
#
# Checks are applied in order:
# 1. tokens listed in EXCEPTIONS never end a sentence;
# 2. tokens whose last character is in PUNCTUATION always do;
# 3. tokens containing more than one uppercase ASCII letter do;
# 4. tokens ending in "." do unless is_abbreviation recognises them.
#
# @param word [String] the candidate token including its trailing punctuation
# @return [Boolean] true when the token should be treated as a sentence end
def is_sentence_ender(word)
  return false if EXCEPTIONS.include?(word)
  return true if PUNCTUATION.include?(word[-1])
  # Count every uppercase letter in the word. The previous `sub` call removed
  # only the first non-uppercase character, so almost any token of three or
  # more characters passed this test and abbreviations such as "Mr." were
  # wrongly reported as sentence enders.
  return true if word.count('A-Z') > 1

  word[-1] == '.' && !is_abbreviation(word)
end

#split_into_sentences(text) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/ruby_markovify/splitters.rb', line 46

# Splits a block of text into sentences.
#
# Every END_PATTERN match is a candidate boundary; a candidate is kept when
# is_sentence_ender accepts its first capture group (the word with its
# terminating punctuation). A sentence ends just after the word and any
# closing quotes/brackets captured by the second group.
#
# The text following the last accepted boundary is returned as the final
# sentence, and text with no boundary at all is returned as a single sentence.
# Previously that trailing span was discarded. Pieces that are empty after
# stripping are omitted.
#
# @param text [String] the text to split
# @return [Array<String>] the sentences in order, each stripped of
#   surrounding whitespace
def split_into_sentences(text)
  end_indices = []
  text.scan(END_PATTERN) do |word, closers, _gap|
    # Read the match offset before calling any helper that may run its own
    # regular expressions.
    match_start = Regexp.last_match.begin(0)
    next unless is_sentence_ender(word)

    end_indices << (match_start + word.length + closers.length)
  end

  starts = [0] + end_indices
  # The last span runs to the end of the text so the trailing sentence is kept.
  stops = end_indices + [text.length]

  starts.zip(stops)
        .map { |from, to| text[from...to].strip }
        .reject(&:empty?)
end