SlideShare a Scribd company logo
1 of 41
Download to read offline
Learning Robust Rewards With Adversarial Inverse
Reinforcement Learning
Justin Fu, Katie Luo, Sergey Levine
University of California, Berkeley
+ /1 5:
#
8 0 0 6 2
Agenda
• ML
• R NA
G E (,) - (,) ) -
- - -
)
• I - - ( , ) -
• C
2
t
• 2 8 A 2 1 A ( C 2 2 C A
2 8
, A -2A 0 8 C
)
lns u s u s W
• nF
W dg W e c ow
hbi
• f aF
W owJ e cSI L
K
Wv yr R ow m hbi
3
Markov Decision Process
• (", $, %, &, ', ())
• "
• $
• ' ∈ [0,1]
• % " × $ × " ↦ ℝ
• & " ↦ ℝ:
• () " ↦ ℝ:
4
Inverse Reinforcement Learning
• p
R ] R b mx!"L
• rt Ma [ e ) 8 : ( ,+ 0
u #I E i oL u I xm
x
$" # yn a Z
• yn x ou L mx %L
5
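$\max_{\theta} \; \mathbb{E}_{\tau \sim \mathcal{D}}\left[\log p_{\theta}(\tau)\right]$
$p_{\theta}(\tau) = \frac{1}{Z} \exp\left(r_{\theta}(\tau)\right)$
$r_{\theta}(\tau)$: reward function
$Z$: partition function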
[] Inverse Reinforcement Learning
• ) + ) 1 1 + 6 6 ( 66
• , 6 +F a!Fg E R RIG
• R RI "($)Ge n a LM a F i[
C"($)F i[GdmE
n aF i[ o [
R RI F i[ n RM [
6
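$\mathcal{L}_{\mathrm{reward}}(\theta) = \mathbb{E}_{\tau \sim \mathcal{D}}\left[\log p_{\theta}(\tau)\right] = \mathbb{E}_{\tau \sim \mathcal{D}}\left[r_{\theta}(\tau)\right] - \log Z$
$= \mathbb{E}_{\tau \sim \mathcal{D}}\left[r_{\theta}(\tau)\right] - \log \mathbb{E}_{\tau \sim q}\left[\frac{\exp(r_{\theta}(\tau))}{q(\tau)}\right]$
$\mathcal{L}_{\mathrm{sampler}}(q) = \mathbb{E}_{\tau \sim q}\left[r_{\theta}(\tau)\right] - \mathbb{E}_{\tau \sim q}\left[\log q(\tau)\right]$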
Generative Adversarial Nets [Goodfellow+, 14]
• ) ) , ( , , , , ) M
a M
• ) ) G ! M
"#$%$(') M
• , ( , , :D M
M )
) ) :D - )
•
$\min_{G} \max_{D} V(D, G) = \mathbb{E}_{x \sim p_{\mathrm{data}}(x)}\left[\log D(x)\right] + \mathbb{E}_{z \sim p_{z}(z)}\left[\log\left(1 - D(G(z))\right)\right]$
Discriminator true labels
for dataset
Discriminator false labels
for generated data
7
G
• 6
• A ,1 + 1 N
8
$D_{\theta}(\tau) = \dfrac{\frac{1}{Z} \exp\left(r_{\theta}(\tau)\right)}{\frac{1}{Z} \exp\left(r_{\theta}(\tau)\right) + q(\tau)}$
※GAIL[Ho & Ermon, 16]は, 報酬関数を陰に求めつつ方策を学習
[http://rll.berkeley.edu/deeprlcourse/docs/inverserl.pdf]
,1 + 1 DF
Adversarial Inverse Reinforcement Learning
)
• ]!, G # d Dc +4 1 4
D [
• $% d
• a +4 1 4 D d D
4 4 ( , )
9
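$D_{\theta}(s, a) = \dfrac{\exp f_{\theta}(s, a)}{\exp f_{\theta}(s, a) + \pi(a \mid s)}$
$\hat{r}_{\theta}(s, a) = \log D_{\theta}(s, a) - \log\left(1 - D_{\theta}(s, a)\right)$
$= \log \dfrac{\exp f_{\theta}(s, a)}{\exp f_{\theta}(s, a) + \pi(a \mid s)} - \log \dfrac{\pi(a \mid s)}{\exp f_{\theta}(s, a) + \pi(a \mid s)}$
$= f_{\theta}(s, a) - \log \pi(a \mid s)$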
IL
• R rt I IL
IL
a c Ds m
• ID
o rt! rt n
IL Di
10
bc Dli m
e d
d ( ) ) , ) ) ,
• bc bc n a
bc
• ! " + $("′) ! " $("′)D a
"′ " bc
11
"( ")
"* "+
,
• ! " , $ " , % " , &(") ", "′,
•
,
12
$a(s) + b(s') = c(s) + d(s') \quad \text{for all } s, s'$
$\Rightarrow \; a(s) = c(s) + \mathrm{const}$
$\Rightarrow \; b(s) = d(s) + \mathrm{const}$
eB o lg
e fa 9 N
• !(#) e#B o B
• ̂!(s) e#B o
• [ B e m ] n , (
• 9 Φ # ∶ ) ↦ ℝ
• ,-,/
∗
! 1B B m
• , ̂-,/
∗
̂! 1B B m
• + ) m
$Q^{*}_{\hat{r},T}(s, a) = Q^{*}_{r,T}(s, a) - \Phi(s)$
13
$Q^{*}_{r}(s, a) = r(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \, Q^{*}_{r}(s', a')\right]$
a B c
• ) ( ,
• ̂" # = " # + &(s) &(s) D
• ( ,
14
̂" # = " # + const
̂"(#) = " # + &(s) = " # + ./01[Φ(#′)] − Φ(#)
d a c b
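$Q^{*}_{\hat{r}}(s, a) = r(s) + \gamma \, \mathbb{E}_{s'}\left[\Phi(s')\right] - \Phi(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \, Q^{*}_{\hat{r}}(s', a')\right]$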
Adversarial Inverse Reinforcement Learning
• D i
[D i i bd!D he
bdaN , + 9 , - 9
D ] gc - ( )
15
$Q^{*}_{\hat{r},T}(s, a) = Q^{*}_{r,T}(s, a) - \Phi(s)$
Adversarial Inverse Reinforcement Learning
)
• D! , ,
16
$D_{\theta,\phi}(s, a, s') = \dfrac{\exp f_{\theta,\phi}(s, a, s')}{\exp f_{\theta,\phi}(s, a, s') + \pi(a \mid s)}$
$f_{\theta,\phi}(s, a, s') = g_{\theta}(s) + \gamma h_{\phi}(s') - h_{\phi}(s)$
Adversarial Inverse Reinforcement Learning
• D exp $∗ &, (, &′ = +, ( &)
• ,
D
17
$h^{*}(s) = V^{*}(s) + \mathrm{const}$
$g^{*}(s) = r^{*}(s) + \mathrm{const}$
$f^{*}(s, a, s') = r^{*}(s) + \gamma V^{*}(s') - V^{*}(s)$
;(&, () /(&)
$f^{*}(s, a, s') = A^{*}(s, a)$
$g^{*}(s) + \gamma h^{*}(s') - h^{*}(s) = r^{*}(s) + \gamma V^{*}(s') - V^{*}(s)$
18
• ( ) (
19
value
iteration
step
return for the policy
a
• a D 2
• ) ( ( a
20
e
e b a
• P d
D
21
) (
A
b i
) (
) (
d
•
original policy direct policy transfer AIRL
(re-optimal
state only reward)
[https://sites.google.com/view/adversarial-irl]
22
H imitation learning
• H G ]I [
]I E 6
• 1A L6 & , 1A
23
• .
v O n
• .
w n
e
• i
• r p ) ,( -
24
$0 \le \exp f_{\theta,\phi}(s, a, s') \le 1$
• [Goodfellow+, 14] Generative Adversarial Nets, NIPS2014
• [Ziebart+, 08] Maximum Entropy Inverse Reinforcement Learning, AAAI2008
• [Ng+, 99] Policy invariance under reward transformations : Theory and
application to reward shaping, ICML1999
• [Finn+, 16] A Connection Between Generative Adversarial Networks, Inverse
Reinforcement Learning, and Energy-Based Models, NIPS2016
• [Finn+, 16] Guided Cost Learning: Deep Inverse Optimal Control via Policy
Optimization, ICML2016
• https://sites.google.com/view/adversarial-irl : Ant
25
Appendix
26
D A
( ) G A
27
D
N ( ) - A A D
! G
28
A
D !
29
A
30
Decomposability condition
• ! " + $ "% ! " $("′)
• ,
•
• "
) "′ − + "′ "′
31
$f(s) = d(s') - b(s')$
$a(s) + b(s') = c(s) + d(s')$
$a(s) - c(s) = d(s') - b(s')$
$f(s) = a(s) - c(s)$
Decomposability condition
• ! !′
• #(!) = const ,
32
, ! = - ! + const
/ ! = 0 ! + const
!1 !2
!3 !4
# !1 = 0 !2 − / !2 = 0 !3 − / !3 = 0 !4 − / !4
# !3 = 0 !1 − / !1 = 0 !2 − / !2
# !2 = 0 !1 − / !1 = 0 !3 − / !3 = 0 !4 − / !4
#(!4) = 0 !2 − / !2
reward shaping
• !(#) [#
• ̂!(s) [#
• [ N B9 B a , (
• !(#) + ) ]
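$Q^{*}_{\hat{r},T}(s, a) = Q^{*}_{r,T}(s, a) - \Phi(s)$
$Q^{*}_{r}(s, a) = r(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \, Q^{*}_{r}(s', a')\right]$
$Q^{*}_{r}(s, a) - \Phi(s) = r(s) - \Phi(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \, Q^{*}_{r}(s', a')\right]$
$Q^{*}_{r}(s, a) - \Phi(s) = r(s) + \gamma \, \mathbb{E}_{s'}\left[\Phi(s')\right] - \Phi(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \left(Q^{*}_{r}(s', a') - \Phi(s')\right)\right]$
$Q^{*}_{\hat{r}}(s, a) = r(s) + \gamma \, \mathbb{E}_{s'}\left[\Phi(s')\right] - \Phi(s) + \gamma \, \mathbb{E}_{s'}\left[\mathrm{softmax}_{a'} \, Q^{*}_{\hat{r}}(s', a')\right]$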
( Φ # ∶ A ↦ ℝ )
33
, reward shaping
• ̂"($) = ' ̂(
∗
$, + − -./0[softmax90' ̂(
∗
$′, +′ ]
• ̂" $ = " $ + =(s)
• ̂" $ = " $ + const
34
' ̂(
∗
$, + = " $ + -./@[Φ($′)] − Φ($) + -./@[softmax9@' ̂(
∗
$0, +0 ]
̂"($) = " $ + -./@[Φ($′)] − Φ($)
=(s) = -./@[Φ($′)] − Φ($)
reward shaping
• D Φ "# ≠ const D D
• ( ) ) , ) ) ,
•
• "* : D
• ̂, " " D Φ "- = 1 Φ "0 = 2
35
"* "-
"0
2*
2-
345 Φ "# = 6
4#
7 "# ", 2 Φ("′)
2* ∶ 7 "- "*, 2* Φ "- + 7 "0 "*, 2* Φ "0
2* ∶ 7 "- "*, 2- Φ "- + 7 "0 "*, 2- Φ "0
後続状態に依存
=行動に依存
reward shaping
• , Φ "# = const
• , , Φ " = const
•
36
̂+ " = + " + const
AIRL Generator optimization
•
• CG
• , !"($) ,
37
maximize
,
-" ., 0 = 2, !"(., 0) − 2, log7(0|.)
maximize
9
ℒ;<=>?@A B = 29 !"($) − 29 logB($)
AIRL Discriminator optimization
•
•
38
$D_{\theta}(s, a) = \dfrac{\exp f_{\theta}(s, a)}{\exp f_{\theta}(s, a) + \pi(a \mid s)}$
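$\max_{\theta} \; \mathcal{L}(\theta) = \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}\left[\log D_{\theta}(s_t, a_t)\right] + \mathbb{E}_{\pi_t}\left[\log\left(1 - D_{\theta}(s_t, a_t)\right)\right]$
$= \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}\left[\log \dfrac{\exp f_{\theta}(s_t, a_t)}{\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t)}\right] + \mathbb{E}_{\pi_t}\left[\log \dfrac{\pi(a_t \mid s_t)}{\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t)}\right]$
$= \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}\left[f_{\theta}(s_t, a_t)\right] + \mathbb{E}_{\pi_t}\left[\log \pi(a_t \mid s_t)\right] - 2\,\mathbb{E}_{\bar{\mu}_t}\left[\log\left(\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t)\right)\right]$
(where $\bar{\mu}_t$ is the equal mixture of the demonstration distribution $\mathcal{D}$ and $\pi_t$, so that $\mathbb{E}_{\mathcal{D}}[\cdot] + \mathbb{E}_{\pi_t}[\cdot] = 2\,\mathbb{E}_{\bar{\mu}_t}[\cdot]$)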
AIRL Discriminator optimization
• !
39
"
"!
ℒ ! = %
&'(
)
*+
"
"!
,-(/&, 1&) −
"
"!
2*5678
log(exp ,-(/&, 1&) + @ 1& /&))
= %
&'(
)
*+
"
"!
,-(/&, 1&) − *A8
exp ,-(/&, 1&)
1
2
exp ,-(/&, 1&) +
1
2
@ 1& /&)
"
"!
,-(/&, 1&)
AIRL Discriminator optimization
• ) () ! = !#
! !# D ,
• ) ()
$
%
40
exp )∗(,, .) = !#
)∗ ,, . = log !# = 3∗ ,, .
N ]
• N +
Φ " ↦ ℝ
• ,, %(', )) → ',
• % ', ) ≠ %, ', ) ̂/
• /+ '′9 ) [ N
41
$\hat{r}(s, a, s') = r(s, a, s') + \gamma \Phi(s') - \Phi(s)$
$\hat{r}(s, a) = r(s, a) + \gamma \Phi(T(s, a)) - \Phi(s)$

More Related Content

What's hot

強化学習アルゴリズムPPOの解説と実験
強化学習アルゴリズムPPOの解説と実験強化学習アルゴリズムPPOの解説と実験
強化学習アルゴリズムPPOの解説と実験克海 納谷
 
Decision Transformer: Reinforcement Learning via Sequence Modeling
Decision Transformer: Reinforcement Learning via Sequence ModelingDecision Transformer: Reinforcement Learning via Sequence Modeling
Decision Transformer: Reinforcement Learning via Sequence ModelingYasunori Ozaki
 
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learning
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learningゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learning
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement LearningPreferred Networks
 
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用Eiji Uchibe
 
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...Deep Learning JP
 
強化学習の分散アーキテクチャ変遷
強化学習の分散アーキテクチャ変遷強化学習の分散アーキテクチャ変遷
強化学習の分散アーキテクチャ変遷Eiji Sekiya
 
方策勾配型強化学習の基礎と応用
方策勾配型強化学習の基礎と応用方策勾配型強化学習の基礎と応用
方策勾配型強化学習の基礎と応用Ryo Iwaki
 
劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章Hakky St
 
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)Yusuke Nakata
 
Optimizer入門&最新動向
Optimizer入門&最新動向Optimizer入門&最新動向
Optimizer入門&最新動向Motokawa Tetsuya
 
[Dl輪読会]introduction of reinforcement learning
[Dl輪読会]introduction of reinforcement learning[Dl輪読会]introduction of reinforcement learning
[Dl輪読会]introduction of reinforcement learningDeep Learning JP
 
ノンパラメトリックベイズを用いた逆強化学習
ノンパラメトリックベイズを用いた逆強化学習ノンパラメトリックベイズを用いた逆強化学習
ノンパラメトリックベイズを用いた逆強化学習Shota Ishikawa
 
論文紹介 No-Reward Meta Learning (RL architecture勉強会)
論文紹介 No-Reward Meta Learning (RL architecture勉強会)論文紹介 No-Reward Meta Learning (RL architecture勉強会)
論文紹介 No-Reward Meta Learning (RL architecture勉強会)Yusuke Nakata
 
Stochastic Variational Inference
Stochastic Variational InferenceStochastic Variational Inference
Stochastic Variational InferenceKaede Hayashi
 
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展Deep Learning JP
 
最近のDQN
最近のDQN最近のDQN
最近のDQNmooopan
 
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜SSII
 
生成モデルの Deep Learning
生成モデルの Deep Learning生成モデルの Deep Learning
生成モデルの Deep LearningSeiya Tokui
 
確率的推論と行動選択
確率的推論と行動選択確率的推論と行動選択
確率的推論と行動選択Masahiro Suzuki
 
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.Deep Learning JP
 

What's hot (20)

強化学習アルゴリズムPPOの解説と実験
強化学習アルゴリズムPPOの解説と実験強化学習アルゴリズムPPOの解説と実験
強化学習アルゴリズムPPOの解説と実験
 
Decision Transformer: Reinforcement Learning via Sequence Modeling
Decision Transformer: Reinforcement Learning via Sequence ModelingDecision Transformer: Reinforcement Learning via Sequence Modeling
Decision Transformer: Reinforcement Learning via Sequence Modeling
 
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learning
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learningゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learning
ゼロから始める深層強化学習(NLP2018講演資料)/ Introduction of Deep Reinforcement Learning
 
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用
NIPS KANSAI Reading Group #7: 逆強化学習の行動解析への応用
 
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...
[DL輪読会]近年のオフライン強化学習のまとめ —Offline Reinforcement Learning: Tutorial, Review, an...
 
強化学習の分散アーキテクチャ変遷
強化学習の分散アーキテクチャ変遷強化学習の分散アーキテクチャ変遷
強化学習の分散アーキテクチャ変遷
 
方策勾配型強化学習の基礎と応用
方策勾配型強化学習の基礎と応用方策勾配型強化学習の基礎と応用
方策勾配型強化学習の基礎と応用
 
劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章劣モジュラ最適化と機械学習1章
劣モジュラ最適化と機械学習1章
 
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)
Generative Adversarial Imitation Learningの紹介(RLアーキテクチャ勉強会)
 
Optimizer入門&最新動向
Optimizer入門&最新動向Optimizer入門&最新動向
Optimizer入門&最新動向
 
[Dl輪読会]introduction of reinforcement learning
[Dl輪読会]introduction of reinforcement learning[Dl輪読会]introduction of reinforcement learning
[Dl輪読会]introduction of reinforcement learning
 
ノンパラメトリックベイズを用いた逆強化学習
ノンパラメトリックベイズを用いた逆強化学習ノンパラメトリックベイズを用いた逆強化学習
ノンパラメトリックベイズを用いた逆強化学習
 
論文紹介 No-Reward Meta Learning (RL architecture勉強会)
論文紹介 No-Reward Meta Learning (RL architecture勉強会)論文紹介 No-Reward Meta Learning (RL architecture勉強会)
論文紹介 No-Reward Meta Learning (RL architecture勉強会)
 
Stochastic Variational Inference
Stochastic Variational InferenceStochastic Variational Inference
Stochastic Variational Inference
 
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展
【DL輪読会】マルチエージェント強化学習における近年の 協調的方策学習アルゴリズムの発展
 
最近のDQN
最近のDQN最近のDQN
最近のDQN
 
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜
SSII2021 [TS2] 深層強化学習 〜 強化学習の基礎から応用まで 〜
 
生成モデルの Deep Learning
生成モデルの Deep Learning生成モデルの Deep Learning
生成モデルの Deep Learning
 
確率的推論と行動選択
確率的推論と行動選択確率的推論と行動選択
確率的推論と行動選択
 
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.
[DL輪読会]深層強化学習はなぜ難しいのか?Why Deep RL fails? A brief survey of recent works.
 

Similar to 第5回NIPS読み会・関西発表資料

Functional Gradient Boosting based on Residual Network Perception
Functional Gradient Boosting based on Residual Network PerceptionFunctional Gradient Boosting based on Residual Network Perception
Functional Gradient Boosting based on Residual Network PerceptionAtsushi Nitanda
 
Sales lessons for startups
Sales lessons for startupsSales lessons for startups
Sales lessons for startupsMunish Chawla
 
Attention-Based Adaptive Selection of Operations for Image Restoration in the...
Attention-Based Adaptive Selection of Operations for Image Restoration in the...Attention-Based Adaptive Selection of Operations for Image Restoration in the...
Attention-Based Adaptive Selection of Operations for Image Restoration in the...MasanoriSuganuma
 
Programming Contest Hacks
Programming Contest HacksProgramming Contest Hacks
Programming Contest HacksKosei Moriyama
 
Agile estimation 2_complete
Agile estimation 2_completeAgile estimation 2_complete
Agile estimation 2_completexpdaysgermany
 
Python Fundamentals - Basic
Python Fundamentals - BasicPython Fundamentals - Basic
Python Fundamentals - BasicWei-Yuan Chang
 
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...GPFLR
 
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solution
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solutionKaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solution
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solutionKen'ichi Matsui
 
敵対的生成ネットワークによる食事画像の変換に関する研究
敵対的生成ネットワークによる食事画像の変換に関する研究敵対的生成ネットワークによる食事画像の変換に関する研究
敵対的生成ネットワークによる食事画像の変換に関する研究Ryosuke Tanno
 
Training evaluation workshop ver 00
Training evaluation workshop ver 00Training evaluation workshop ver 00
Training evaluation workshop ver 00Reza Seifollahy
 
Google. Мобильная реклама сегодня
Google. Мобильная реклама сегодняGoogle. Мобильная реклама сегодня
Google. Мобильная реклама сегодняTechart Marketing Group
 
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기이 의령
 
katagaitai CTF workshop #10 AESに対する相関電力解析
katagaitai CTF workshop #10 AESに対する相関電力解析katagaitai CTF workshop #10 AESに対する相関電力解析
katagaitai CTF workshop #10 AESに対する相関電力解析trmr
 
Droidcon 2011: Android in Social Networks - Jens Mücke, XING
Droidcon 2011: Android in Social Networks - Jens Mücke, XINGDroidcon 2011: Android in Social Networks - Jens Mücke, XING
Droidcon 2011: Android in Social Networks - Jens Mücke, XINGDroidcon Berlin
 
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...Amazon Web Services Korea
 
SEO - It Works Even if You Don’t Know How or Why
SEO - It Works Even if You Don’t Know How or Why SEO - It Works Even if You Don’t Know How or Why
SEO - It Works Even if You Don’t Know How or Why Wolfgang Weicht
 
メタプログラミング入門
メタプログラミング入門メタプログラミング入門
メタプログラミング入門Kent Ohashi
 
Evergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boardsEvergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boardsOV Consulting
 
직장인을 위한 GTD 시작하기 (How To Start GTD)
직장인을 위한 GTD 시작하기 (How To Start GTD)직장인을 위한 GTD 시작하기 (How To Start GTD)
직장인을 위한 GTD 시작하기 (How To Start GTD)Jinho Jung
 
機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明Satoshi Hara
 

Similar to 第5回NIPS読み会・関西発表資料 (20)

Functional Gradient Boosting based on Residual Network Perception
Functional Gradient Boosting based on Residual Network PerceptionFunctional Gradient Boosting based on Residual Network Perception
Functional Gradient Boosting based on Residual Network Perception
 
Sales lessons for startups
Sales lessons for startupsSales lessons for startups
Sales lessons for startups
 
Attention-Based Adaptive Selection of Operations for Image Restoration in the...
Attention-Based Adaptive Selection of Operations for Image Restoration in the...Attention-Based Adaptive Selection of Operations for Image Restoration in the...
Attention-Based Adaptive Selection of Operations for Image Restoration in the...
 
Programming Contest Hacks
Programming Contest HacksProgramming Contest Hacks
Programming Contest Hacks
 
Agile estimation 2_complete
Agile estimation 2_completeAgile estimation 2_complete
Agile estimation 2_complete
 
Python Fundamentals - Basic
Python Fundamentals - BasicPython Fundamentals - Basic
Python Fundamentals - Basic
 
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...
An Effort to Restore from Imperata Grassland to Secondary Forest in Samboja L...
 
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solution
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solutionKaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solution
Kaggle Google Quest Q&A Labeling 反省会 LT資料 47th place solution
 
敵対的生成ネットワークによる食事画像の変換に関する研究
敵対的生成ネットワークによる食事画像の変換に関する研究敵対的生成ネットワークによる食事画像の変換に関する研究
敵対的生成ネットワークによる食事画像の変換に関する研究
 
Training evaluation workshop ver 00
Training evaluation workshop ver 00Training evaluation workshop ver 00
Training evaluation workshop ver 00
 
Google. Мобильная реклама сегодня
Google. Мобильная реклама сегодняGoogle. Мобильная реклама сегодня
Google. Мобильная реклама сегодня
 
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기
[2017 PYCON 튜토리얼]OpenAI Gym을 이용한 강화학습 에이전트 만들기
 
katagaitai CTF workshop #10 AESに対する相関電力解析
katagaitai CTF workshop #10 AESに対する相関電力解析katagaitai CTF workshop #10 AESに対する相関電力解析
katagaitai CTF workshop #10 AESに対する相関電力解析
 
Droidcon 2011: Android in Social Networks - Jens Mücke, XING
Droidcon 2011: Android in Social Networks - Jens Mücke, XINGDroidcon 2011: Android in Social Networks - Jens Mücke, XING
Droidcon 2011: Android in Social Networks - Jens Mücke, XING
 
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...
[AWS Dev Day] 인공지능 / 기계 학습 | 개발자를 위한 수백만 사용자 대상 기계 학습 서비스 확장 하기 - 윤석찬 AWS 수석테...
 
SEO - It Works Even if You Don’t Know How or Why
SEO - It Works Even if You Don’t Know How or Why SEO - It Works Even if You Don’t Know How or Why
SEO - It Works Even if You Don’t Know How or Why
 
メタプログラミング入門
メタプログラミング入門メタプログラミング入門
メタプログラミング入門
 
Evergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boardsEvergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boards
 
직장인을 위한 GTD 시작하기 (How To Start GTD)
직장인을 위한 GTD 시작하기 (How To Start GTD)직장인을 위한 GTD 시작하기 (How To Start GTD)
직장인을 위한 GTD 시작하기 (How To Start GTD)
 
機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明機械学習モデルの判断根拠の説明
機械学習モデルの判断根拠の説明
 

Recently uploaded

SPICE PARK APR2024 ( 6,793 SPICE Models )
SPICE PARK APR2024 ( 6,793 SPICE Models )SPICE PARK APR2024 ( 6,793 SPICE Models )
SPICE PARK APR2024 ( 6,793 SPICE Models )Tsuyoshi Horigome
 
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptx
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptxthe ladakh protest in leh ladakh 2024 sonam wangchuk.pptx
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptxhumanexperienceaaa
 
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptx
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptxDecoding Kotlin - Your guide to solving the mysterious in Kotlin.pptx
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptxJoão Esperancinha
 
Call Girls Delhi {Jodhpur} 9711199012 high profile service
Call Girls Delhi {Jodhpur} 9711199012 high profile serviceCall Girls Delhi {Jodhpur} 9711199012 high profile service
Call Girls Delhi {Jodhpur} 9711199012 high profile servicerehmti665
 
GDSC ASEB Gen AI study jams presentation
GDSC ASEB Gen AI study jams presentationGDSC ASEB Gen AI study jams presentation
GDSC ASEB Gen AI study jams presentationGDSCAESB
 
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝soniya singh
 
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escorts
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur EscortsHigh Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escorts
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escortsranjana rawat
 
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130Suhani Kapoor
 
Analog to Digital and Digital to Analog Converter
Analog to Digital and Digital to Analog ConverterAnalog to Digital and Digital to Analog Converter
Analog to Digital and Digital to Analog ConverterAbhinavSharma374939
 
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Serviceranjana rawat
 
chaitra-1.pptx fake news detection using machine learning
chaitra-1.pptx  fake news detection using machine learningchaitra-1.pptx  fake news detection using machine learning
chaitra-1.pptx fake news detection using machine learningmisbanausheenparvam
 
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...Soham Mondal
 
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...srsj9000
 
Microscopic Analysis of Ceramic Materials.pptx
Microscopic Analysis of Ceramic Materials.pptxMicroscopic Analysis of Ceramic Materials.pptx
Microscopic Analysis of Ceramic Materials.pptxpurnimasatapathy1234
 
Porous Ceramics seminar and technical writing
Porous Ceramics seminar and technical writingPorous Ceramics seminar and technical writing
Porous Ceramics seminar and technical writingrakeshbaidya232001
 
College Call Girls Nashik Nehal 7001305949 Independent Escort Service Nashik
College Call Girls Nashik Nehal 7001305949 Independent Escort Service NashikCollege Call Girls Nashik Nehal 7001305949 Independent Escort Service Nashik
College Call Girls Nashik Nehal 7001305949 Independent Escort Service NashikCall Girls in Nagpur High Profile
 
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...Dr.Costas Sachpazis
 

Recently uploaded (20)

★ CALL US 9953330565 ( HOT Young Call Girls In Badarpur delhi NCR
★ CALL US 9953330565 ( HOT Young Call Girls In Badarpur delhi NCR★ CALL US 9953330565 ( HOT Young Call Girls In Badarpur delhi NCR
★ CALL US 9953330565 ( HOT Young Call Girls In Badarpur delhi NCR
 
SPICE PARK APR2024 ( 6,793 SPICE Models )
SPICE PARK APR2024 ( 6,793 SPICE Models )SPICE PARK APR2024 ( 6,793 SPICE Models )
SPICE PARK APR2024 ( 6,793 SPICE Models )
 
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptx
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptxthe ladakh protest in leh ladakh 2024 sonam wangchuk.pptx
the ladakh protest in leh ladakh 2024 sonam wangchuk.pptx
 
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptx
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptxDecoding Kotlin - Your guide to solving the mysterious in Kotlin.pptx
Decoding Kotlin - Your guide to solving the mysterious in Kotlin.pptx
 
Call Girls Delhi {Jodhpur} 9711199012 high profile service
Call Girls Delhi {Jodhpur} 9711199012 high profile serviceCall Girls Delhi {Jodhpur} 9711199012 high profile service
Call Girls Delhi {Jodhpur} 9711199012 high profile service
 
GDSC ASEB Gen AI study jams presentation
GDSC ASEB Gen AI study jams presentationGDSC ASEB Gen AI study jams presentation
GDSC ASEB Gen AI study jams presentation
 
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝
Model Call Girl in Narela Delhi reach out to us at 🔝8264348440🔝
 
Exploring_Network_Security_with_JA3_by_Rakesh Seal.pptx
Exploring_Network_Security_with_JA3_by_Rakesh Seal.pptxExploring_Network_Security_with_JA3_by_Rakesh Seal.pptx
Exploring_Network_Security_with_JA3_by_Rakesh Seal.pptx
 
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escorts
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur EscortsHigh Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escorts
High Profile Call Girls Nagpur Isha Call 7001035870 Meet With Nagpur Escorts
 
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130
VIP Call Girls Service Kondapur Hyderabad Call +91-8250192130
 
Analog to Digital and Digital to Analog Converter
Analog to Digital and Digital to Analog ConverterAnalog to Digital and Digital to Analog Converter
Analog to Digital and Digital to Analog Converter
 
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service
(RIA) Call Girls Bhosari ( 7001035870 ) HI-Fi Pune Escorts Service
 
chaitra-1.pptx fake news detection using machine learning
chaitra-1.pptx  fake news detection using machine learningchaitra-1.pptx  fake news detection using machine learning
chaitra-1.pptx fake news detection using machine learning
 
9953056974 Call Girls In South Ex, Escorts (Delhi) NCR.pdf
9953056974 Call Girls In South Ex, Escorts (Delhi) NCR.pdf9953056974 Call Girls In South Ex, Escorts (Delhi) NCR.pdf
9953056974 Call Girls In South Ex, Escorts (Delhi) NCR.pdf
 
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...
OSVC_Meta-Data based Simulation Automation to overcome Verification Challenge...
 
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...
Gfe Mayur Vihar Call Girls Service WhatsApp -> 9999965857 Available 24x7 ^ De...
 
Microscopic Analysis of Ceramic Materials.pptx
Microscopic Analysis of Ceramic Materials.pptxMicroscopic Analysis of Ceramic Materials.pptx
Microscopic Analysis of Ceramic Materials.pptx
 
Porous Ceramics seminar and technical writing
Porous Ceramics seminar and technical writingPorous Ceramics seminar and technical writing
Porous Ceramics seminar and technical writing
 
College Call Girls Nashik Nehal 7001305949 Independent Escort Service Nashik
College Call Girls Nashik Nehal 7001305949 Independent Escort Service NashikCollege Call Girls Nashik Nehal 7001305949 Independent Escort Service Nashik
College Call Girls Nashik Nehal 7001305949 Independent Escort Service Nashik
 
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...
Sheet Pile Wall Design and Construction: A Practical Guide for Civil Engineer...
 

第5回NIPS読み会・関西発表資料

  • 1. Learning Robust Rewards With Adversarial Inverse Reinforcement Learning Justin Fu, Katie Luo, Sergey Levine University of California, Berkeley + /1 5: # 8 0 0 6 2
  • 2. Agenda • ML • R NA G E (,) - (,) ) - - - - ) • I - - ( , ) - • C 2
  • 3. t • 2 8 A 2 1 A ( C 2 2 C A 2 8 , A -2A 0 8 C ) lns u s u s W • nF W dg W e c ow hbi • f aF W owJ e cSI L K Wv yr R ow m hbi 3
  • 4. Markov Decision Process • (", $, %, &, ', ()) • " • $ • ' ∈ [0,1] • % " × $ × " ↦ ℝ • & " ↦ ℝ: • () " ↦ ℝ: 4
  • 5. Inverse Reinforcement Learning • p R ] R b mx!"L • rt Ma [ e ) 8 : ( ,+ 0 u #I E i oL u I xm x $" # yn a Z • yn x ou L mx %L 5 max " )*~, log $" # $" # = 1 2 exp(!"(#)) !"(#):reward function 2:partition function
  • 6. [] Inverse Reinforcement Learning • ) + ) 1 1 + 6 6 ( 66 • , 6 +F a!Fg E R RIG • R RI "($)Ge n a LM a F i[ C"($)F i[GdmE n aF i[ o [ R RI F i[ n RM [ 6 ℒ'()*'+ , = ./~1 log 56 $ = ./~1 76($) − log! = ./~1 76($) − log ./~9 exp(76($) "($) ℒ=*>?@(' " = ./~9 76($) − ./~9[log("($))]
  • 7. Generative Adversarial Nets [Goodfellow+, 14] • ) ) , ( , , , , ) M a M • ) ) G ! M "#$%$(') M • , ( , , :D M M ) ) ) :D - ) • min , max / 0 1, 3 = 56 ~ 8#$%$(6) log 1(<) + 5> ~ 8?(>) log(1 − 1 < ) Discriminator true labels for dataset Discriminator false labels for generated data 7
  • 8. G • 6 • A ,1 + 1 N 8 !" # = 1 & exp *"(#) 1 & exp *"(#) + .(#) ※GAIL[Ho & Ermon, 16]は, 報酬関数を隠に求めつつ方策を学習 [http://rll.berkeley.edu/deeprlcourse/d ocs/inverserl.pdf ] ,1 + 1 DF
  • 9. Adversarial Inverse Reinforcement Learning ) • ]!, G # d Dc +4 1 4 D [ • $% d • a +4 1 4 D d D 4 4 ( , ) 9 &% !, # = exp $%(!, #) exp $%(!, #) + . # !) ̂01 !, # = log &% !, # − log 1 − &% !, # = log exp $%(!, #) exp $%(!, #) + . # !) − log . # !) exp $%(!, #) + . # !) = $% !, # − log .(#|!)
  • 10. IL • R rt I IL IL a c Ds m • ID o rt! rt n IL Di 10
  • 11. bc Dli m e d d ( ) ) , ) ) , • bc bc n a bc • ! " + $("′) ! " $("′)D a "′ " bc 11 "( ") "* "+
  • 12. , • ! " , $ " , % " , &(") ", "′, • , 12 ! " + $ "+ = % " + &("′) ! " = % " + const $ " = & " + const
  • 13. eB o lg e fa 9 N • !(#) e#B o B • ̂!(s) e#B o • [ B e m ] n , ( • 9 Φ # ∶ ) ↦ ℝ • ,-,/ ∗ ! 1B B m • , ̂-,/ ∗ ̂! 1B B m • + ) m , ̂-,/ ∗ #, 2 = ,-,/ ∗ #, 2 − Φ(#) 13 ,- ∗ #, 2 = ! # + 6789[softmaxA9,- ∗ #′, 2′ ]
  • 14. a B c • ) ( , • ̂" # = " # + &(s) &(s) D • ( , 14 ̂" # = " # + const ̂"(#) = " # + &(s) = " # + ./01[Φ(#′)] − Φ(#) d a c b 7 ̂8 ∗ #, ; = " # + ./01[Φ(#′)] − Φ(#) + ./01[softmax@17 ̂8 ∗ #A, ;A ]
  • 15. Adversarial Inverse Reinforcement Learning • D i [D i i bd!D he bdaN , + 9 , - 9 D ] gc - ( ) 15 $Q^{*}_{\hat{r},T}(s, a) = Q^{*}_{r,T}(s, a) - \Phi(s)$
  • 16. Adversarial Inverse Reinforcement Learning ) • D! , , 16 $D_{\theta,\phi}(s, a, s') = \frac{\exp f_{\theta,\phi}(s, a, s')}{\exp f_{\theta,\phi}(s, a, s') + \pi(a \mid s)}$ $f_{\theta,\phi}(s, a, s') = g_{\theta}(s) + \gamma h_{\phi}(s') - h_{\phi}(s)$
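  • 17. Adversarial Inverse Reinforcement Learning • D $\exp f^{*}(s, a, s') = \pi_{E}(a \mid s)$ • , D 17 $h^{*}(s) = V^{*}(s) + \text{const}$ $g^{*}(s) = r^{*}(s) + \text{const}$ $f^{*}(s, a, s') = \underbrace{r^{*}(s) + \gamma V^{*}(s')}_{Q(s, a)} - \underbrace{V^{*}(s)}_{V(s)}$ $f^{*}(s, a, s') = A^{*}(s, a)$ $g^{*}(s) + \gamma h^{*}(s') - h^{*}(s) = r^{*}(s) + \gamma V^{*}(s') - V^{*}(s)$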
  • 18. 18
  • 19. • ( ) ( 19 value iteration step return for the policy
  • 20. a • a D 2 • ) ( ( a 20 e
  • 21. e b a • P d D 21 ) ( A b i ) ( ) ( d
  • 22. • original policy direct policy transfer AIRL (re-optimal state only reward) [https://sites.google.com/view/adversarial-irl] 22
  • 23. H imitation learning • H G ]I [ ]I E 6 • 1A L6 & , 1A 23
  • 24. • . v O n • . w n e • i • r p ) ,( - 24 $0 \le \exp f_{\theta,\phi}(s, a, s') \le 1$
  • 25. • [Goodfellow+, 14] Generative Adversarial Nets, NIPS2014 • [Ziebart+, 08] Maximum Entropy Inverse Reinforcement Learning, AAAI2008 • [Ng+, 99] Policy invariance under reward transformations: Theory and application to reward shaping, ICML1999 • [Finn+, 16] A Connection Between Generative Adversarial Networks, Inverse Reinforcement Learning, and Energy-Based Models, NIPS2016 • [Finn+, 16] Guided Cost Learning: Deep Inverse Optimal Control via Policy Optimization, ICML2016 • https://sites.google.com/view/adversarial-irl : Ant 25
  • 27. D A ( ) G A 27
  • 28. D N ( ) - A A D ! G 28
  • 30. A 30
  • 31. Decomposability condition • ! " + $ "% ! " $("′) • , • • " ) "′ − + "′ "′ 31 !(") = ) "′ − + "′ - " + + "% = . " + )("′) - " − . " = ) "% − + "% !(") = - " − . "
  • 32. Decomposability condition • ! !′ • #(!) = const , 32 , ! = - ! + const / ! = 0 ! + const !1 !2 !3 !4 # !1 = 0 !2 − / !2 = 0 !3 − / !3 = 0 !4 − / !4 # !3 = 0 !1 − / !1 = 0 !2 − / !2 # !2 = 0 !1 − / !1 = 0 !3 − / !3 = 0 !4 − / !4 #(!4) = 0 !2 − / !2
  • 33. reward shaping • !(#) [# • ̂!(s) [# • [ N B9 B a , ( • !(#) + ) ] ' ̂(,* ∗ #, , = '(,* ∗ #, , − Φ(#) '( ∗ #, , = ! # + 1234[softmax<4'( ∗ #′, ,′ ] '( ∗ #, , − Φ(#) = ! # − Φ(#) + 1234[softmax<4'( ∗ #′, ,′ ] '( ∗ #, , − Φ # = ! # + 123?[Φ(#′)] − Φ(#) + 123?[softmax<?'( ∗ #4, ,4 − Φ(#′)] ' ̂( ∗ #, , = ! # + 123?[Φ(#′)] − Φ(#) + 123?[softmax<?' ̂( ∗ #4, ,4 ] ( Φ # ∶ A ↦ ℝ ) 33
  • 34. , reward shaping • ̂"($) = ' ̂( ∗ $, + − -./0[softmax90' ̂( ∗ $′, +′ ] • ̂" $ = " $ + =(s) • ̂" $ = " $ + const 34 ' ̂( ∗ $, + = " $ + -./@[Φ($′)] − Φ($) + -./@[softmax9@' ̂( ∗ $0, +0 ] ̂"($) = " $ + -./@[Φ($′)] − Φ($) =(s) = -./@[Φ($′)] − Φ($)
  • 35. reward shaping • D Φ "# ≠ const D D • ( ) ) , ) ) , • • "* : D • ̂, " " D Φ "- = 1 Φ "0 = 2 35 "* "- "0 2* 2- 345 Φ "# = 6 4# 7 "# ", 2 Φ("′) 2* ∶ 7 "- "*, 2* Φ "- + 7 "0 "*, 2* Φ "0 2* ∶ 7 "- "*, 2- Φ "- + 7 "0 "*, 2- Φ "0 後続状態に依存 =行動に依存
  • 36. reward shaping • , Φ "# = const • , , Φ " = const • 36 ̂+ " = + " + const
  • 37. AIRL Generator optimization • • CG • , !"($) , 37 maximize , -" ., 0 = 2, !"(., 0) − 2, log7(0|.) maximize 9 ℒ;<=>?@A B = 29 !"($) − 29 logB($)
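  • 38. AIRL Discriminator optimization • • 38 $D_{\theta}(s, a) = \frac{\exp f_{\theta}(s, a)}{\exp f_{\theta}(s, a) + \pi(a \mid s)}$ $\max_{\theta} \mathcal{L}(\theta) = \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}[\log D_{\theta}(s_t, a_t)] + \mathbb{E}_{\pi_t}[\log(1 - D_{\theta}(s_t, a_t))]$ $= \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}\left[\log \frac{\exp f_{\theta}(s_t, a_t)}{\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t)}\right] + \mathbb{E}_{\pi_t}\left[\log \frac{\pi(a_t \mid s_t)}{\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t)}\right]$ $= \sum_{t=0}^{T} \mathbb{E}_{\mathcal{D}}[f_{\theta}(s_t, a_t)] + \mathbb{E}_{\pi_t}[\log \pi(a_t \mid s_t)] - 2\,\mathbb{E}_{\bar{\mu}_t}[\log(\exp f_{\theta}(s_t, a_t) + \pi(a_t \mid s_t))]$ (where $\bar{\mu}_t$ is the equal mixture of the expert data distribution and the policy distribution at step $t$)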
  • 39. AIRL Discriminator optimization • ! 39 " "! ℒ ! = % &'( ) *+ " "! ,-(/&, 1&) − " "! 2*5678 log(exp ,-(/&, 1&) + @ 1& /&)) = % &'( ) *+ " "! ,-(/&, 1&) − *A8 exp ,-(/&, 1&) 1 2 exp ,-(/&, 1&) + 1 2 @ 1& /&) " "! ,-(/&, 1&)
  • 40. AIRL Discriminator optimization • ) () ! = !# ! !# D , • ) () $ % 40 $\exp f^{*}(s, a) = \pi_{E}(a \mid s)$ $f^{*}(s, a) = \log \pi_{E}(a \mid s) = A^{*}(s, a)$
  • 41. N ] • N + Φ " ↦ ℝ • ,, %(', )) → ', • % ', ) ≠ %, ', ) ̂/ • /+ '′9 ) [ N 41 $\hat{r}(s, a, s') = r(s, a, s') + \gamma\Phi(s') - \Phi(s)$ $\hat{r}(s, a) = r(s, a) + \gamma\Phi(T(s, a)) - \Phi(s)$