Class: ActivityMapper::Linguistics::Tagger
- Inherits:
-
Object
- Object
- ActivityMapper::Linguistics::Tagger
- Defined in:
- lib/activity_mapper/linguistics.rb
Constant Summary collapse
- UNINTERESTING_PARTS_OF_SPEECH =
['DT', 'PRP', 'IN', 'CC', 'MD']
Class Method Summary collapse
- .keywords_for_caption(caption) ⇒ Object
Instance Method Summary collapse
-
#initialize ⇒ Tagger
constructor
A new instance of Tagger.
- #part_of_speech_tag(text) ⇒ Object
- #tokenize(words) ⇒ Object
Constructor Details
#initialize ⇒ Tagger
Returns a new instance of Tagger.
17 18 19 20 21 22 23 24 25 |
# File 'lib/activity_mapper/linguistics.rb', line 17
#
# Builds a new Tagger by loading 'lexicon.txt' from the directory that
# contains this source file into @lexicon.
#
# Every line of the lexicon is split on whitespace: the first token becomes
# the hash key (the word) and the remaining tokens are stored, in order, as
# that word's list of tags. Blank lines are skipped.
#
# @raise [SystemCallError] if the lexicon file cannot be opened or read
def initialize
  @lexicon = {}
  lexicon_path = File.join(File.dirname(__FILE__), 'lexicon.txt')

  # File.foreach closes the file itself, even when the block raises, so the
  # handle can no longer leak the way a manual File.new / close pair could.
  File.foreach(lexicon_path) do |line|
    word, *tags = line.split
    next if word.nil? # blank line: nothing to register

    @lexicon[word] = tags
  end
end
Class Method Details
.keywords_for_caption(caption) ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/activity_mapper/linguistics.rb', line 31
#
# Extracts lowercase keywords from a caption.
#
# The caption is tokenized and tagged with a lazily created, shared Tagger
# instance. A token is kept only when its tag is not listed in
# UNINTERESTING_PARTS_OF_SPEECH and the token is longer than four characters.
#
# This is deliberately best-effort: if anything fails while tagging or
# filtering, the unfiltered token list is returned instead (or an empty
# array when tokenizing itself failed).
#
# @param caption [String] the text to extract keywords from
# @return [Array<String>] the selected keywords, downcased
def self.keywords_for_caption(caption)
  @@tagger ||= new
  all_keywords = @@tagger.tokenize(caption)
  tags = @@tagger.part_of_speech_tag(all_keywords)

  keywords = []
  all_keywords.zip(tags) do |keyword, tag|
    next if UNINTERESTING_PARTS_OF_SPEECH.include?(tag)
    next unless keyword.size > 4

    keywords << keyword.downcase
  end
  keywords
rescue StandardError
  # Fall back to the raw tokens; all_keywords is still nil when the failure
  # happened before or during tokenizing.
  all_keywords || []
end
Instance Method Details
#part_of_speech_tag(text) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/activity_mapper/linguistics.rb', line 46
#
# Assigns a part-of-speech tag to every token of +text+.
#
# Each token first receives the first tag stored in @lexicon for its exact
# spelling, then for its downcased spelling, and finally 'NN' when neither
# lookup yields a tag. A fixed sequence of transformational rules is then
# applied to every position, in order.
#
# @param text [String, Array<String>] a string (tokenized with #tokenize
#   first) or an array of tokens
# @return [Array<String>] one tag per token, in token order
# @raise [RuntimeError] if +text+ is neither a String nor an Array
def part_of_speech_tag(text)
  # Strings are tokenized first; after that only arrays are accepted.
  text = tokenize(text) if text.is_a?(String)
  raise RuntimeError, "can't tokenize #{text.class}" unless text.is_a?(Array)

  # Initial tagging. The whole fallback chain is evaluated BEFORE the result
  # is stored; writing it as `ret << a || b || 'NN'` would append only `a`,
  # because `<<` binds tighter than `||`.
  ret = text.map do |w|
    exact = @lexicon[w]
    folded = @lexicon[w.downcase]
    (exact && exact[0]) || (folded && folded[0]) || 'NN'
  end

  ## Now, apply transformational rules:
  text.each_with_index do |word, i|
    # Only ret[i] is rewritten during this iteration, so the previous tag
    # can be read once up front.
    prev_tag = i > 0 ? ret[i - 1] : nil

    ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
    ret[i] = 'NN' if prev_tag == 'DT' && %w[VBD VBP VB].include?(ret[i])

    ## rule 2: convert a noun to a number (CD) if "." appears in the word
    ret[i] = 'CD' if ret[i] =~ /^N/ && word =~ /\./

    ## rule 3: convert a noun to a past participle if the word ends with "ed"
    ret[i] = 'VBN' if ret[i] =~ /^N/ && word =~ /ed$/

    ## rule 4: convert any type to adverb if it ends in "ly"
    ret[i] = 'RB' if word =~ /ly$/

    ## rule 5: convert a common noun (NN or NNS) to an adjective if it ends
    ## with "al"
    ret[i] = 'JJ' if ret[i] =~ /^NN/ && word =~ /al$/

    ## rule 6: convert a noun to a verb if the preceding word is "would"
    ret[i] = 'VB' if i > 0 && ret[i] =~ /^NN/ && text[i - 1].downcase == 'would'

    ## rule 7: if a word has been categorized as a common noun and it ends
    ## with "s", then set its type to plural common noun (NNS)
    ret[i] = 'NNS' if ret[i] == 'NN' && word =~ /s$/

    ## rule 8: convert a common noun to a present participle verb (i.e., a
    ## gerund)
    ret[i] = 'VBG' if ret[i] =~ /^NN/ && word =~ /ing$/

    ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a
    ## verb
    next unless i > 0 && ret[i - 1] =~ /^NN/ && ret[i] =~ /^NN/

    # A word that is missing from the lexicon simply has no alternative
    # tags; without the fallback to [] this lookup would call include? on
    # nil.
    alternatives = @lexicon[word] || @lexicon[word.downcase] || []
    ret[i] = 'VBN' if alternatives.include?('VBN')
    ret[i] = 'VBZ' if alternatives.include?('VBZ') # VBZ wins when both exist
  end

  ret
end
#tokenize(words) ⇒ Object
27 28 29 |
# File 'lib/activity_mapper/linguistics.rb', line 27
#
# Splits +words+ into tokens.
#
# A space, comma, period, colon, semicolon or apostrophe each act as a
# delimiter. Empty tokens, which appear whenever two delimiters are adjacent
# (for example ", ") or the string starts with one, are dropped so that only
# real words are returned.
#
# @param words [String] the text to split
# @return [Array<String>] the non-empty tokens, in order of appearance
def tokenize(words)
  words.split(/[ ,.:;']/).reject(&:empty?)
end