
CS 4803 / 7643: Deep Learning
Topics: Application: PointGoal Navigation, Trust Region Policy Optimization (TRPO), Proximal Policy Optimization (PPO)
Erik Wijmans, Georgia Tech
Who Am I? Research Interests: Computer Vision


  1. Advantage Actor Critic (A2C) • High variance:

  2. Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline:

  3. Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline: • Use value-function as the baseline (A2C):
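
In standard notation (which may differ from the slides' exact symbols), the three bullets refer to:

    % REINFORCE-style policy gradient (high variance):
    \nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\big[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R_t\big]

    % Subtracting a baseline b(s_t) reduces variance without adding bias:
    \nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\big[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\,(R_t - b(s_t))\big]

    % A2C: use the learned value function as the baseline, i.e. the advantage:
    A_t = R_t - V_\phi(s_t), \qquad
    \nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\big[\nabla_\theta \log \pi_\theta(a_t \mid s_t)\, A_t\big]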

  8. Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once!

  9. Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once! Why?

  10. Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once! • No theoretical grounding to reuse a rollout: the gradient is an expectation under the current policy, so data from an older policy is off-policy after an update

  11. Advantage Actor Critic (A2C) • Reusing rollouts also works poorly in practice

  12. Advantage Actor Critic (A2C) • Reusing rollouts also works poorly in practice (Image credit: Alberto Metelli, 2018)
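
A minimal sketch of the resulting on-policy update (generic PyTorch-style code, not the course's implementation; names are illustrative). The actor loss is computed once on a freshly collected rollout, and the rollout is then discarded:

    import torch

    def a2c_actor_loss(log_probs, returns, values):
        """Policy-gradient loss with a value-function baseline (A2C).

        log_probs: log pi_theta(a_t | s_t) for the actions taken in the rollout
        returns:   sampled returns R_t
        values:    critic estimates V(s_t)
        """
        # Advantage A_t = R_t - V(s_t); detach so the actor loss does not
        # backpropagate into the critic (the critic is trained separately,
        # e.g. with an MSE loss against the returns).
        advantages = (returns - values).detach()
        # Negative sign: minimizing this loss maximizes E[log pi * A].
        return -(log_probs * advantages).mean()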

  13. Outline • RL Refresher/Advantage Actor Critic (A2C) • Trust Region Policy Optimization (TRPO) • Proximal Policy Optimization (PPO) • Application: PointGoal Navigation Results

  14. Trust Region Policy Optimization (TRPO) A2C Maximizes:

  15. Trust Region Policy Optimization (TRPO) Given a policy:

  16. Trust Region Policy Optimization (TRPO) Given a policy: Collect experience and calculate advantage

  17. Trust Region Policy Optimization (TRPO) Given a policy: Collect experience and calculate advantage Maximize:

  18. Trust Region Policy Optimization (TRPO) Maximize: Read as: the new policy is better than the old policy if it takes good actions (positive advantage) more often and takes bad actions (negative advantage) less often

  19. Trust Region Policy Optimization (TRPO) Maximize: Why this objective? Read as: the new policy is better than the old policy if it takes good actions (positive advantage) more often and takes bad actions (negative advantage) less often
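
In its standard form (generic notation; the slides' exact symbols may differ), the surrogate objective being maximized is:

    L(\theta) = \mathbb{E}_{s,a \sim \pi_{\theta_\text{old}}}\!\left[\frac{\pi_\theta(a \mid s)}{\pi_{\theta_\text{old}}(a \mid s)}\, A^{\pi_{\theta_\text{old}}}(s, a)\right]

Good actions (A > 0) get their probability ratio pushed up, bad actions (A < 0) pushed down.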

  21. Trust Region Policy Optimization (TRPO) Image credit: Alberto Metelli, 2018

  22. Trust Region Policy Optimization (TRPO) • Use a trust region!

  23. Trust Region Policy Optimization (TRPO) • PS 1 problem 1

  24. Trust Region Policy Optimization (TRPO) • PS 1 problem 1 • In this problem, you showed that the gradient descent update rule can be seen as the minimizer of an affine lower bound of the objective, subject to a trust region:

  25. Trust Region Policy Optimization (TRPO)
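
Stated in generic notation (under the usual smoothness assumptions from the problem set), that result and its TRPO analog read:

    % PS 1: gradient descent as a trust-region step on an affine lower bound
    x_{t+1} = \arg\min_x\; f(x_t) + \nabla f(x_t)^\top (x - x_t) \quad \text{s.t.}\quad \lVert x - x_t \rVert_2 \le \epsilon
    \;\;\Longrightarrow\;\; x_{t+1} = x_t - \eta\, \nabla f(x_t), \quad \eta = \epsilon / \lVert \nabla f(x_t) \rVert_2

    % TRPO analog: penalize deviation from the old policy with a KL term
    \max_\theta\; L(\theta) - \beta\, \mathbb{E}_s\big[D_\mathrm{KL}\big(\pi_{\theta_\text{old}}(\cdot \mid s)\,\|\,\pi_\theta(\cdot \mid s)\big)\big]

Here \beta plays the role of an inverse step size: a larger \beta keeps \pi_\theta closer to \pi_{\theta_\text{old}}.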

  26. Trust Region Policy Optimization (TRPO) • Advantage • Able to perform multiple optimization steps per rollout

  27. Trust Region Policy Optimization (TRPO) • Advantage • Able to perform multiple optimization steps per rollout • Disadvantage • Choosing the correct value for the KL-penalty coefficient β is challenging and problem/network dependent

  28. Outline • RL Refresher/Advantage Actor Critic (A2C) • Trust Region Policy Optimization (TRPO) • Proximal Policy Optimization (PPO) • Application: PointGoal Navigation Results

  29. Proximal Policy Optimization (PPO)

  31. Proximal Policy Optimization (PPO) Given a policy:

  33. Proximal Policy Optimization (PPO) Given a policy: Objective: Maximize
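
In its standard clipped form (generic notation; assuming the slides present the clipped variant of PPO), the objective is:

    r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_\text{old}}(a_t \mid s_t)}, \qquad
    L^\mathrm{CLIP}(\theta) = \mathbb{E}_t\!\left[\min\!\big(r_t(\theta)\, A_t,\; \mathrm{clip}(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\, A_t\big)\right]

Clipping removes the incentive to push the probability ratio r_t(\theta) outside [1 - \epsilon, 1 + \epsilon], which keeps updates close to the rollout policy without an explicit KL constraint.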

  34. Proximal Policy Optimization (PPO)

  35. Proximal Policy Optimization (PPO)
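
A minimal sketch of that clipped objective as a loss (generic PyTorch-style code with illustrative names, not the course's reference implementation):

    import torch

    def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
        """Clipped PPO surrogate, returned as a loss to minimize.

        new_log_probs: log pi_theta(a_t | s_t) under the current policy
        old_log_probs: log pi_theta_old(a_t | s_t) stored with the rollout (no grad)
        advantages:    advantage estimates A_t (no grad)
        """
        # Importance ratio r_t(theta) = pi_theta / pi_theta_old.
        ratio = torch.exp(new_log_probs - old_log_probs)
        unclipped = ratio * advantages
        clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
        # Pessimistic (elementwise min) surrogate; negate so that minimizing
        # the loss maximizes the clipped objective. Because the objective is
        # bounded, several epochs of minibatch updates per rollout are possible.
        return -torch.min(unclipped, clipped).mean()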
