Class: Keyphrase

Inherits:
Object
  • Object
show all
Defined in:
lib/keyphrase.rb,
lib/keyphrase/version.rb,
lib/keyphrase/stoplist/afr.rb,
lib/keyphrase/stoplist/aka.rb,
lib/keyphrase/stoplist/amh.rb,
lib/keyphrase/stoplist/ara.rb,
lib/keyphrase/stoplist/aze.rb,
lib/keyphrase/stoplist/bel.rb,
lib/keyphrase/stoplist/ben.rb,
lib/keyphrase/stoplist/bul.rb,
lib/keyphrase/stoplist/cat.rb,
lib/keyphrase/stoplist/ces.rb,
lib/keyphrase/stoplist/cmn.rb,
lib/keyphrase/stoplist/dan.rb,
lib/keyphrase/stoplist/deu.rb,
lib/keyphrase/stoplist/ell.rb,
lib/keyphrase/stoplist/eng.rb,
lib/keyphrase/stoplist/epo.rb,
lib/keyphrase/stoplist/est.rb,
lib/keyphrase/stoplist/fin.rb,
lib/keyphrase/stoplist/fra.rb,
lib/keyphrase/stoplist/guj.rb,
lib/keyphrase/stoplist/heb.rb,
lib/keyphrase/stoplist/hin.rb,
lib/keyphrase/stoplist/hrv.rb,
lib/keyphrase/stoplist/hun.rb,
lib/keyphrase/stoplist/hye.rb,
lib/keyphrase/stoplist/ind.rb,
lib/keyphrase/stoplist/ita.rb,
lib/keyphrase/stoplist/jav.rb,
lib/keyphrase/stoplist/jpn.rb,
lib/keyphrase/stoplist/kan.rb,
lib/keyphrase/stoplist/kat.rb,
lib/keyphrase/stoplist/khm.rb,
lib/keyphrase/stoplist/kor.rb,
lib/keyphrase/stoplist/lat.rb,
lib/keyphrase/stoplist/lav.rb,
lib/keyphrase/stoplist/lit.rb,
lib/keyphrase/stoplist/mal.rb,
lib/keyphrase/stoplist/mar.rb,
lib/keyphrase/stoplist/mkd.rb,
lib/keyphrase/stoplist/mya.rb,
lib/keyphrase/stoplist/nep.rb,
lib/keyphrase/stoplist/nld.rb,
lib/keyphrase/stoplist/nob.rb,
lib/keyphrase/stoplist/ori.rb,
lib/keyphrase/stoplist/pan.rb,
lib/keyphrase/stoplist/pes.rb,
lib/keyphrase/stoplist/pol.rb,
lib/keyphrase/stoplist/por.rb,
lib/keyphrase/stoplist/ron.rb,
lib/keyphrase/stoplist/rus.rb,
lib/keyphrase/stoplist/sin.rb,
lib/keyphrase/stoplist/slk.rb,
lib/keyphrase/stoplist/slv.rb,
lib/keyphrase/stoplist/sna.rb,
lib/keyphrase/stoplist/spa.rb,
lib/keyphrase/stoplist/srp.rb,
lib/keyphrase/stoplist/swe.rb,
lib/keyphrase/stoplist/tam.rb,
lib/keyphrase/stoplist/tel.rb,
lib/keyphrase/stoplist/tgl.rb,
lib/keyphrase/stoplist/tha.rb,
lib/keyphrase/stoplist/tuk.rb,
lib/keyphrase/stoplist/tur.rb,
lib/keyphrase/stoplist/ukr.rb,
lib/keyphrase/stoplist/urd.rb,
lib/keyphrase/stoplist/uzb.rb,
lib/keyphrase/stoplist/vie.rb,
lib/keyphrase/stoplist/yid.rb,
lib/keyphrase/stoplist/zul.rb

Defined Under Namespace

Modules: Stoplist

Constant Summary collapse

CLEAN_REGEX =

Don't remove the apostrophe (') here, because it might be part of a stop word.

/([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/
BLACKLIST_REGEX =

Removes words that contain no letters, e.g. 123.23.12. This is also the last chance to remove the apostrophe (') and the hyphen (-).

/(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/
CLEAN_SPACES_REGEX =
/\s+/
SENTENCES_REGEX =
/[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
VERSION =
"0.2.0"

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Keyphrase

Returns a new instance of Keyphrase.



19
20
21
# File 'lib/keyphrase.rb', line 19

# Creates a new analyser whose @cached_regex starts out as an empty Hash.
# NOTE(review): presumably a cache of compiled stopword patterns; the code
# that reads and fills it is not visible here, so confirm before relying on it.
def initialize
  @cached_regex = Hash.new
end

Class Method Details

.analyse(text, options = {}) ⇒ Object



14
15
16
17
# File 'lib/keyphrase.rb', line 14

# Class-level convenience wrapper around the instance method #analyse.
#
# A single Keyphrase instance is created on first use, kept in the
# @@keyphrase class variable, and reused for every later call.
#
# @param text [String] text forwarded to the instance method
# @param options [Hash] options forwarded unchanged to the instance method
# @return [Object] whatever the instance-level #analyse returns
def self.analyse(text, options = {})
  shared_instance = (@@keyphrase ||= Keyphrase.new)
  shared_instance.analyse(text, options)
end

Instance Method Details

#analyse(text, options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/keyphrase.rb', line 23

# Extracts candidate key phrases from +text+ and returns them with scores.
#
# Steps: build a stopword pattern, split the text with +sentences_regex+,
# generate candidate phrases, score individual words, then score each
# candidate phrase.
#
# @param text [String] text to analyse (must respond to +split+)
# @param options [Hash] optional overrides
# @option options [Object] :stopwords passed to buildStopwordRegExPattern
# @option options [Symbol] :lang (:eng) passed to buildStopwordRegExPattern
# @option options [Regexp] :clean (CLEAN_REGEX) passed to
#   generateCandidateKeywords
# @option options [Regexp] :blacklist (BLACKLIST_REGEX) passed to
#   generateCandidateKeywords
# @option options [Regexp] :sentences_regex (SENTENCES_REGEX) separator used
#   to split +text+
# @option options [Regexp] :clean_spaces_regex (CLEAN_SPACES_REGEX) passed to
#   generateCandidateKeywords
# @option options [Boolean] :position_bonus (true) passed to
#   generateCandidateKeywordScores; +false+ is honoured
# @option options [Boolean] :sort (true) order the result by descending
#   score; +false+ is honoured
# @option options [Boolean] :verbose print one "score - phrase" line per
#   candidate to stdout
# @return [Hash] phrase => score when sorted; otherwise the unmodified return
#   value of generateCandidateKeywordScores (presumably also a Hash)
def analyse(text, options = {})
  stopwords          = options[:stopwords]
  lang               = options[:lang] || :eng
  clean_regex        = options[:clean] || CLEAN_REGEX
  blacklist          = options[:blacklist] || BLACKLIST_REGEX
  sentences_regex    = options[:sentences_regex] || SENTENCES_REGEX
  clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX

  # `value || true` is always true, so an explicit `false` used to be ignored.
  # Fall back to the default only when the option is absent or nil.
  position_bonus = options[:position_bonus]
  position_bonus = true if position_bonus.nil?
  sort = options[:sort]
  sort = true if sort.nil?

  pattern    = buildStopwordRegExPattern(lang, stopwords)
  sentences  = text.split(sentences_regex)
  phrases    = generateCandidateKeywords(sentences, pattern, clean_regex, blacklist, clean_spaces_regex)
  wordscores = calculateWordScores(phrases)
  candidates = generateCandidateKeywordScores(phrases, wordscores, position_bonus)

  # Highest score first.
  candidates = candidates.sort_by { |_phrase, score| -score }.to_h if sort

  if options[:verbose]
    candidates.each do |phrase, score|
      puts format('%.2f - %s', score, phrase)
    end
  end

  candidates
end