Deep RBF Value Functions for Continuous Control — Kavosh Asadi, Ronald Parr, George Konidaris, Michael Littman [slide 1]
The RL Problem — (diagram) the agent emits an action $a$ and receives a state $s$ and a reward $r$ in return (presumably from the environment). [slide 2]
“The state-space complexity for Go has been estimated at $10^{174}$, which is more than the total number of atoms in the universe.” [slide 3]
[slide 4]
Background
• MDP: $\langle \mathcal{S}, \mathcal{A}, R, T, \gamma \rangle$
• policy: $\pi : \mathcal{S} \to \Pr(\mathcal{A})$
• return: $G_t := r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \dots = \sum_{i=0}^{\infty} \gamma^i r_{t+i}$
• Q function: $Q^{\pi}(s, a) := \mathbb{E}\left[ G_t \mid s_t = s,\ a_t = a,\ \pi \right]$
• optimal Q function: $Q^{*}(s, a) := \max_{\pi} Q^{\pi}(s, a)$
learn from interaction. [slide 5]
Recommend
More recommend