Class: Ai4r::Reinforcement::PolicyIteration

Inherits:
Object
  • Object
show all
Includes:
Data::Parameterizable
Defined in:
lib/ai4r/reinforcement/policy_iteration.rb

Overview

Compute an optimal policy for a known MDP.

Instance Method Summary collapse

Methods included from Data::Parameterizable

#get_parameters, included, #set_parameters

Constructor Details

#initialize ⇒ PolicyIteration

Returns a new instance of PolicyIteration.



19
20
21
# File 'lib/ai4r/reinforcement/policy_iteration.rb', line 19

# Create a solver with @discount preset to 0.9.
# policy_iteration multiplies the expected value of successor states by
# @discount when scoring an action.
def initialize
  @discount = 0.9
end

Instance Method Details

#policy_iteration(states, actions, transition, reward) ⇒ Object

Perform policy iteration.

states

Array of states

actions

Array of actions

transition

Hash[state][action] => { next_state => prob }

reward

Hash[state][action] => reward



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/ai4r/reinforcement/policy_iteration.rb', line 28

# Run policy iteration on a fully specified MDP and return the resulting policy.
#
# Alternates between two phases until an improvement pass changes no action:
# 1. evaluation - in-place sweeps over the states until no value estimate
#    moves by more than 1e-6 within a sweep;
# 2. improvement - each state switches to the action with the highest
#    one-step lookahead value under the current estimates.
#
# @param states [Array] states of the MDP
# @param actions [Array] actions; every action is scored in every state
# @param transition [Hash] transition[state][action] => { next_state => prob }
# @param reward [Hash] reward[state][action] => immediate reward
# @return [Hash] state => selected action
def policy_iteration(states, actions, transition, reward)
  # Every state starts out with the first listed action.
  current = states.each_with_object({}) { |state, map| map[state] = actions.first }
  # Successor states that have no stored estimate are read as 0.0.
  estimate = Hash.new(0.0)

  # One-step lookahead: immediate reward plus the discounted expected
  # estimate of the successor states.
  action_value = lambda do |state, action|
    reward[state][action] +
      @discount * transition[state][action].sum { |succ, prob| prob * estimate[succ] }
  end

  loop do
    # Policy evaluation: estimates are overwritten as the sweep proceeds, so
    # later states in the same sweep already see the refreshed numbers.
    loop do
      largest_change = 0.0
      states.each do |state|
        previous = estimate[state]
        updated = action_value.call(state, current[state])
        estimate[state] = updated
        change = (previous - updated).abs
        largest_change = change if largest_change < change
      end
      break unless largest_change > 1e-6
    end

    # Policy improvement: count how many states ended up with a new action.
    switched = states.count do |state|
      before = current[state]
      current[state] = actions.max_by { |action| action_value.call(state, action) }
      current[state] != before
    end
    break if switched.zero?
  end
  current
end