Class: ClauseExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/clause_extractor.rb

Class Method Summary collapse

Class Method Details

.get_clauses(phrase, format = String.new) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/clause_extractor.rb', line 5

# Collects the verb clauses found in +phrase+.
#
# The phrase is lower-cased, stripped of the characters ! . ? ( ) and split
# on whitespace. Every word that has an entry in the conjugation table
# (@con_id) is tried against the regex templates in $tense_regexes whose key
# matches the word's tiempo id; in each template the literal text "search"
# is replaced by the word before it is handed to scan_phrase.
#
# Side effects: stores +format+ in @format, caches the lookup tables in
# class-level instance variables on first use, and prints every surviving
# clause key to standard output.
#
# @param phrase [String] text to analyse (not modified)
# @param format [String] output format; scan_phrase checks it for "audioverb"
# @return [Hash] clause key => word range, limited to the entries whose range
#   is still present in the ranges collected by scan_phrase
def self.get_clauses(phrase, format = String.new)
  @format = format
  phrase = phrase.downcase

  # Lookup tables are loaded once and reused on later calls.
  @verbs     ||= get_verbs
  @tiempos   ||= get_tiempos
  @id_tiempo ||= get_id_tiempos
  @tense_id  ||= get_tenses
  @con_id    ||= get_con_id

  list = {}
  ranges = []
  phrase = phrase.gsub(/[!.?\(\)]/, "")

  phrase.split(/\s+/).each_with_index do |word, position|
    # Only words that are known conjugations are considered.
    next unless @con_id[word]

    $tense_regexes.each do |tiempo_key, tenses|
      next unless tiempo_key.match(/#{@id_tiempo[@tiempos[word]]}/)

      tenses.each do |tense, templates|
        templates.each do |template|
          pattern = template.to_s.gsub("search", "#{word}")
          phrase, list, ranges = scan_phrase(phrase, list, pattern, word, tense, position, ranges)
        end
      end
    end
  end

  # Keep only the clauses whose range survived prioritisation, then echo them.
  list.keep_if { |_clause, span| ranges.include?(span) }
  list.each_key { |clause| print "#{clause}\n" }
  list
end

.get_match_start_index(verb, match, index) ⇒ Object



39
40
41
42
43
44
45
46
# File 'lib/clause_extractor.rb', line 39

# Computes the word-index span that runs from the first word of +match+ up
# to the verb.
#
# The last (case-insensitive) occurrence of +verb+ inside +match+ is located,
# the whitespace-separated words in front of it are counted, and that count
# is subtracted from +index+ to obtain +lo+. +hi+ is +lo+ plus the same
# count.
#
# @param verb [String] verb text to locate; matched literally
# @param match [String] matched clause text, expected to contain +verb+
# @param index [Integer] word position used as the reference point (callers
#   in this file pass the verb's word index within the phrase)
# @return [Array(Integer, Integer)] the pair +lo+, +hi+
# @raise [TypeError] if +verb+ does not occur in +match+
def self.get_match_start_index(verb, match, index)
  # Escape the verb so characters such as "[" or "+" are matched as text
  # instead of being interpreted as regex syntax.
  literal_verb = Regexp.escape(verb)
  # Start position of the last occurrence of the verb in match
  verb_index_in_match = match.index(/#{literal_verb}(?!.*#{literal_verb})/i)
  # Number of words between the start of the match and the verb
  words_before_verb = match[0, verb_index_in_match].split(/\s+/).size
  lo = index - words_before_verb
  hi = lo + words_before_verb
  [lo, hi]
end

.prioritize_ranges(ranges, lo, hi, match) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/clause_extractor.rb', line 62

# Merges the candidate range lo..hi into +ranges+, preferring longer ranges.
#
# For every range already stored:
# * if it starts at +lo+ and is shorter than the candidate, the candidate
#   takes its place;
# * otherwise, if the candidate covers its first or last position and is
#   longer, it is dropped;
# * otherwise it is kept untouched.
# Afterwards the candidate is appended unless some remaining range already
# contains +lo+ or +hi+.
#
# The pass builds a fresh list rather than deleting by index while
# iterating, which previously skipped the element following a deletion and
# then raised NoMethodError on a nil slot.
#
# @param ranges [Array<Range>] ranges collected so far; updated in place
# @param lo [Integer] first position of the candidate range
# @param hi [Integer] last position of the candidate range
# @param match [String] not used here; kept so existing callers keep working
# @return [Array<Range>] the same +ranges+ array, updated
def self.prioritize_ranges(ranges, lo, hi, match)
  range = (lo..hi)

  updated = ranges.each_with_object([]) do |existing, kept|
    if existing.begin == lo && existing.count < range.count
      # Same starting point but the candidate is longer: it supersedes.
      kept << range
    elsif (range.include?(existing.begin) || range.include?(existing.end)) && range.count > existing.count
      # Shorter range overlapped by the candidate: drop it.
      next
    else
      kept << existing
    end
  end

  # Add the candidate only if neither of its end points is already covered.
  updated << range if updated.none? { |r| r.include?(lo) || r.include?(hi) }

  # Preserve the in-place update of the caller's array.
  ranges.replace(updated)
end

.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges) ⇒ Object



48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/clause_extractor.rb', line 48

# Tries one tense regex against +phrase+ and records the clause if it matches.
#
# On a (case-insensitive) match the word span is obtained from
# get_match_start_index, merged into +ranges+ through prioritize_ranges, and
# stored in +list+ under a key whose layout depends on @format:
# * @format contains "audioverb": "<tense id>:<matched text>:<@verbs entry>"
# * otherwise:                    "<tense label>:<matched text>:<lo..hi>"
# When nothing matches, the three collections are returned untouched.
#
# @param phrase [String] phrase being scanned
# @param list [Hash] clause key => word range, filled in here
# @param regex [String] regex source to search for
# @param verb [String] verb the regex was built around
# @param tense_label [Object] label of the tense the regex belongs to
# @param index [Integer] word position handed on to get_match_start_index
# @param ranges [Array<Range>] ranges collected so far
# @return [Array] +phrase+, +list+ and the updated +ranges+
def self.scan_phrase(phrase, list, regex, verb, tense_label, index, ranges)
  found = phrase[/#{regex}/i]
  return [phrase, list, ranges] unless found

  lo, hi = get_match_start_index(verb, found, index)
  ranges = prioritize_ranges(ranges, lo, hi, found)
  span = (lo..hi)

  key =
    if @format.match(/audioverb/)
      "#{@tense_id[tense_label.to_s]}:#{found}:#{@verbs[verb]}"
    else
      "#{tense_label}:#{found}:#{span}"
    end
  list[key] = span

  [phrase, list, ranges]
end