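$M = \{S, A, p_T, p_0, g\}$
$$\Pr\{S_{t+1} = s' \mid A_t = a, S_t = s, \dots\} = \Pr\{S_{t+1} = s' \mid A_t = a, S_t = s\} =: p_T(s' \mid s, a), \qquad \Pr(S_0 = s) =: p_0(s)$$
$\pi \in \Pi^M$
$$\Pr(A_t = a \mid S_t = s, \dots) = \Pr(A_t = a \mid S_t = s) =: \pi(a \mid s)$$
$V^\pi$
$$V^\pi(s) := \mathbb{E}^\pi[C_0 \mid S_0 = s], \qquad C_t := \sum_{i=0}^{\infty} \gamma^i g(A_{t+i}, S_{t+i}), \qquad \gamma \in [0, 1)$$
$f(\pi)$
$$f(\pi) := \sum_{s \in S} p_0(s) V^\pi(s)$$
$\pi \in \Pi^M$
$f(\pi)$ $M$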
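$$\begin{aligned}
V^\pi(s) &= \mathbb{E}^\pi[C_0 \mid S_0 = s] \\
&= \mathbb{E}^\pi[g(A_0, S_0) + \gamma C_1 \mid S_0 = s] \\
&= \sum_{a \in A} \pi(a \mid s)\, g(a, s) + \gamma \sum_{a \in A} \sum_{s' \in S} \pi(a \mid s)\, p_T(s' \mid s, a)\, \mathbb{E}^\pi[C_1 \mid S_1 = s'] \\
&= \sum_{a \in A} \pi(a \mid s) \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^\pi(s') \Big), \quad \forall s \in S
\end{aligned}$$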
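$V^*$
$$V^*(s) := \max_{(\pi_0, \pi_1, \dots)} V^{(\pi_0, \pi_1, \dots)}(s)$$
$$\begin{aligned}
V^*(s) &= \max_{(\pi_0, \pi_1, \dots)} \mathbb{E}^{(\pi_0, \pi_1, \dots)}[g(A_0, S_0) + \gamma C_1 \mid S_0 = s] \\
&= \max_{\pi_0} \mathbb{E}^{\pi_0}\Big[ g(A_0, S_0) + \gamma \max_{(\pi_1, \pi_2, \dots)} \mathbb{E}^{(\pi_1, \pi_2, \dots)}[C_1 \mid S_1 \sim p_T(\cdot \mid S_0, A_0)] \,\Big|\, S_0 = s \Big] \\
&= \max_{\pi_0} \sum_{a \in A} \pi_0(a \mid s) \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big) \\
&= \max_{a \in A} \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big), \quad \forall s \in S
\end{aligned}$$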
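$$B_\pi(V) := \sum_{a \in A} \pi(a \mid \cdot) \Big( g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big)$$
$$B_*(V) := \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big\}$$
$$V = B(V), \qquad B \in \{B_*, B_\pi\}$$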
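$v, v' : S \to \mathbb{R}$
$$v \le v' \iff v(s) \le v'(s), \quad \forall s \in S$$
$$\|v - v'\| := \max_{s \in S} |v(s) - v'(s)|$$
$$v \le v' \implies B(v) \le B(v')$$
$$B(v + c) = B(v) + \gamma c, \quad \forall c \in \mathbb{R}$$
$$\|B(v) - B(v')\| \le \gamma \|v - v'\|$$
$v^* = B(v^*)$ $v^*$
$$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$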
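$$\begin{aligned}
B_*(v)(s) &= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v(s') \Big\} \\
&\le \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v'(s') \Big\} \\
&= B_*(v')(s), \quad \forall s \in S
\end{aligned}$$
$B_\pi$
$$\begin{aligned}
B_*(v + c)(s) &= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, (v(s') + c) \Big\} \\
&= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v(s') \Big\} + \gamma c \\
&= B_*(v)(s) + \gamma c, \quad \forall s \in S
\end{aligned}$$
$B_\pi$
$$\begin{aligned}
& v' - \|v - v'\| \le v \le v' + \|v - v'\| \\
\implies{} & B(v') - \gamma \|v' - v\| \le B(v) \le B(v') + \gamma \|v' - v\| \\
\implies{} & \|B(v') - B(v)\| \le \gamma \|v - v'\|
\end{aligned}$$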
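$v, v' : S \to \mathbb{R}$
$$v \le v' \iff v(s) \le v'(s), \quad \forall s \in S$$
$$\|v - v'\| := \max_{s \in S} |v(s) - v'(s)|$$
$$v \le v' \implies B(v) \le B(v')$$
$$B(v + c) = B(v) + \gamma c, \quad \forall c \in \mathbb{R}$$
$$\|B(v) - B(v')\| \le \gamma \|v - v'\|$$
$v^* = B(v^*)$ $v^*$
$$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$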
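$$\begin{aligned}
\|v - v'\| &\le \|B(v) - B(v')\| + \|v - B(v)\| + \|v' - B(v')\| \\
&\le \gamma \|v - v'\| + \|v - B(v)\| + \|v' - B(v')\|
\end{aligned}$$
$$\implies \|v - v'\| \le \frac{\|v - B(v)\| + \|v' - B(v')\|}{1 - \gamma}$$
$v_k := B^k(v_0)$
$$\begin{aligned}
\|v_n - v_m\| &\le \frac{\|B^n(v_0) - B^n(v_1)\| + \|B^m(v_0) - B^m(v_1)\|}{1 - \gamma} \\
&\le \frac{\gamma^n \|v_0 - v_1\| + \gamma^m \|v_0 - v_1\|}{1 - \gamma} \\
&= \frac{\gamma^n + \gamma^m}{1 - \gamma} \|v_0 - v_1\|
\end{aligned}$$
$$\lim_{n, m \to \infty} \|v_n - v_m\| = 0$$
$$\|v_n - v^*\| \le \frac{\|B^n(v_0) - B^n(v_1)\|}{1 - \gamma} \le \frac{\gamma^n}{1 - \gamma} \|v_0 - v_1\|$$
$$\lim_{n \to \infty} \|v_n - v^*\| = 0$$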
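$$B_*(V) := \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big\}$$
$\pi^d_*$
$$\pi^d_*(s) := \arg\max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big\}$$
$$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$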
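$M = \{S, A, p_T, p_0, g\}$, $\varepsilon \in (0, \infty)$
$v' : S \to \mathbb{R}$, $\pi^*_{v'} : S \to A$
$v' : S \to \mathbb{R}$
$$v' = \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, v(s') \Big\}$$
$\|v - v'\| < \varepsilon$ $\pi^d_*$
$$\pi^d_{v'}(s) := \arg\max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v'(s') \Big\}$$
$v = v'$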

強化学習勉強会の資料(3回目)

  • 3.
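    $M = \{S, A, p_T, p_0, g\}$
    $$\Pr\{S_{t+1} = s' \mid A_t = a, S_t = s, \dots\} = \Pr\{S_{t+1} = s' \mid A_t = a, S_t = s\} =: p_T(s' \mid s, a), \qquad \Pr(S_0 = s) =: p_0(s)$$
    $\pi \in \Pi^M$
    $$\Pr(A_t = a \mid S_t = s, \dots) = \Pr(A_t = a \mid S_t = s) =: \pi(a \mid s)$$
    $V^\pi$
    $$V^\pi(s) := \mathbb{E}^\pi[C_0 \mid S_0 = s], \qquad C_t := \sum_{i=0}^{\infty} \gamma^i g(A_{t+i}, S_{t+i}), \qquad \gamma \in [0, 1)$$
    $f(\pi)$
    $$f(\pi) := \sum_{s \in S} p_0(s) V^\pi(s)$$
    $\pi \in \Pi^M$
    $f(\pi)$ $M$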
  • 4.
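    $$\begin{aligned}
    V^\pi(s) &= \mathbb{E}^\pi[C_0 \mid S_0 = s] \\
    &= \mathbb{E}^\pi[g(A_0, S_0) + \gamma C_1 \mid S_0 = s] \\
    &= \sum_{a \in A} \pi(a \mid s)\, g(a, s) + \gamma \sum_{a \in A} \sum_{s' \in S} \pi(a \mid s)\, p_T(s' \mid s, a)\, \mathbb{E}^\pi[C_1 \mid S_1 = s'] \\
    &= \sum_{a \in A} \pi(a \mid s) \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^\pi(s') \Big), \quad \forall s \in S
    \end{aligned}$$
    $V^*$
    $$V^*(s) := \max_{(\pi_0, \pi_1, \dots)} V^{(\pi_0, \pi_1, \dots)}(s)$$
    $$\begin{aligned}
    V^*(s) &= \max_{(\pi_0, \pi_1, \dots)} \mathbb{E}^{(\pi_0, \pi_1, \dots)}[g(A_0, S_0) + \gamma C_1 \mid S_0 = s] \\
    &= \max_{\pi_0} \mathbb{E}^{\pi_0}\Big[ g(A_0, S_0) + \gamma \max_{(\pi_1, \pi_2, \dots)} \mathbb{E}^{(\pi_1, \pi_2, \dots)}[C_1 \mid S_1 \sim p_T(\cdot \mid S_0, A_0)] \,\Big|\, S_0 = s \Big] \\
    &= \max_{\pi_0} \sum_{a \in A} \pi_0(a \mid s) \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big) \\
    &= \max_{a \in A} \Big( g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big), \quad \forall s \in S
    \end{aligned}$$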
  • 5.
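    $$B_\pi(V) := \sum_{a \in A} \pi(a \mid \cdot) \Big( g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big)$$
    $$B_*(V) := \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big\}$$
    $$V = B(V), \qquad B \in \{B_*, B_\pi\}$$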
  • 6.
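    $v, v' : S \to \mathbb{R}$
    $$v \le v' \iff v(s) \le v'(s), \quad \forall s \in S$$
    $$\|v - v'\| := \max_{s \in S} |v(s) - v'(s)|$$
    $$v \le v' \implies B(v) \le B(v')$$
    $$B(v + c) = B(v) + \gamma c, \quad \forall c \in \mathbb{R}$$
    $$\|B(v) - B(v')\| \le \gamma \|v - v'\|$$
    $v^* = B(v^*)$ $v^*$
    $$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$
    $$\begin{aligned}
    B_*(v)(s) &= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v(s') \Big\} \\
    &\le \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v'(s') \Big\} \\
    &= B_*(v')(s), \quad \forall s \in S
    \end{aligned}$$
    $B_\pi$
    $$\begin{aligned}
    B_*(v + c)(s) &= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, (v(s') + c) \Big\} \\
    &= \max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v(s') \Big\} + \gamma c \\
    &= B_*(v)(s) + \gamma c, \quad \forall s \in S
    \end{aligned}$$
    $B_\pi$
    $$\begin{aligned}
    & v' - \|v - v'\| \le v \le v' + \|v - v'\| \\
    \implies{} & B(v') - \gamma \|v' - v\| \le B(v) \le B(v') + \gamma \|v' - v\| \\
    \implies{} & \|B(v') - B(v)\| \le \gamma \|v - v'\|
    \end{aligned}$$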
  • 7.
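    $v, v' : S \to \mathbb{R}$
    $$v \le v' \iff v(s) \le v'(s), \quad \forall s \in S$$
    $$\|v - v'\| := \max_{s \in S} |v(s) - v'(s)|$$
    $$v \le v' \implies B(v) \le B(v')$$
    $$B(v + c) = B(v) + \gamma c, \quad \forall c \in \mathbb{R}$$
    $$\|B(v) - B(v')\| \le \gamma \|v - v'\|$$
    $v^* = B(v^*)$ $v^*$
    $$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$
    $$\begin{aligned}
    \|v - v'\| &\le \|B(v) - B(v')\| + \|v - B(v)\| + \|v' - B(v')\| \\
    &\le \gamma \|v - v'\| + \|v - B(v)\| + \|v' - B(v')\|
    \end{aligned}$$
    $$\implies \|v - v'\| \le \frac{\|v - B(v)\| + \|v' - B(v')\|}{1 - \gamma}$$
    $v_k := B^k(v_0)$
    $$\begin{aligned}
    \|v_n - v_m\| &\le \frac{\|B^n(v_0) - B^n(v_1)\| + \|B^m(v_0) - B^m(v_1)\|}{1 - \gamma} \\
    &\le \frac{\gamma^n \|v_0 - v_1\| + \gamma^m \|v_0 - v_1\|}{1 - \gamma} \\
    &= \frac{\gamma^n + \gamma^m}{1 - \gamma} \|v_0 - v_1\|
    \end{aligned}$$
    $$\lim_{n, m \to \infty} \|v_n - v_m\| = 0$$
    $$\|v_n - v^*\| \le \frac{\|B^n(v_0) - B^n(v_1)\|}{1 - \gamma} \le \frac{\gamma^n}{1 - \gamma} \|v_0 - v_1\|$$
    $$\lim_{n \to \infty} \|v_n - v^*\| = 0$$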
  • 8.
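    $$B_*(V) := \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, V(s') \Big\}$$
    $\pi^d_*$
    $$\pi^d_*(s) := \arg\max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, V^*(s') \Big\}$$
    $$\lim_{k \to \infty} B^k(v_0) = v^*, \quad \forall v_0 : S \to \mathbb{R}$$
    $M = \{S, A, p_T, p_0, g\}$, $\varepsilon \in (0, \infty)$
    $v' : S \to \mathbb{R}$, $\pi^*_{v'} : S \to A$
    $v' : S \to \mathbb{R}$
    $$v' = \max_{a \in A} \Big\{ g(a, \cdot) + \gamma \sum_{s' \in S} p_T(s' \mid \cdot, a)\, v(s') \Big\}$$
    $\|v - v'\| < \varepsilon$ $\pi^d_*$
    $$\pi^d_{v'}(s) := \arg\max_{a \in A} \Big\{ g(a, s) + \gamma \sum_{s' \in S} p_T(s' \mid s, a)\, v'(s') \Big\}$$
    $v = v'$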