Jake Gunther
2020/2/20
\[ \begin{gather} \mathbf{\theta} \in \mathbb{R}^{d'} \qquad (\text{policy parameter}) \\ \mathbf{w} \in \mathbb{R}^{d} \qquad (\text{value parameter}) \end{gather} \]
\[ \mathbf{\theta}_{t+1} = \mathbf{\theta}_{t} + \alpha \widehat{\nabla J(\mathbf{\theta}_t)} \]
where \(\widehat{\nabla J(\mathbf{\theta}_t)}\) is a stochastic estimate whose expectation approximates the true gradient of the performance measure:
\[ E\{\widehat{\nabla J(\mathbf{\theta}_t)}\} \approx \nabla J(\mathbf{\theta}_t) \]
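To make the ascent scheme concrete, here is a minimal sketch of stochastic gradient ascent on a toy quadratic objective; the objective, step size, and noise level are illustrative assumptions, not from the lecture.

```python
import numpy as np

# Toy objective J(theta) = -||theta - target||^2, so the true gradient is
# 2 * (target - theta). We only observe it corrupted by zero-mean noise,
# mimicking E{grad_hat} ≈ grad J(theta).
rng = np.random.default_rng(0)
target = np.array([1.0, -2.0])   # illustrative optimum
theta = np.zeros(2)
alpha = 0.05                     # step size

for t in range(2000):
    grad_hat = 2 * (target - theta) + rng.normal(scale=0.5, size=2)
    theta = theta + alpha * grad_hat   # theta_{t+1} = theta_t + alpha * grad_hat

print(theta)  # ends up near `target` despite the noisy estimates
```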
\[ \pi(a|s,\mathbf{\theta}) = \frac{e^{h(s,a,\mathbf{\theta})}}{\sum_b e^{h(s,b,\mathbf{\theta})}} \]
“soft-max in action preferences,” where \(h(s,a,\mathbf{\theta})\) is a numerical preference for taking action \(a\) in state \(s\) (e.g., linear in features: \(h(s,a,\mathbf{\theta}) = \mathbf{\theta}^T \mathbf{x}(s,a)\))
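As a concrete illustration, here is a sketch of the soft-max policy assuming per-action parameter rows with shared state features, \(h(s,a,\mathbf{\theta}) = \mathbf{\theta}_a^T \mathbf{x}(s)\); the feature vector and action count are made up for the example.

```python
import numpy as np

def softmax_policy(theta, x_s):
    """pi(.|s, theta) for a preference matrix theta of shape (n_actions, d)."""
    h = theta @ x_s              # h(s, a, theta) for every action a
    h = h - h.max()              # subtract the max for numerical stability
    e = np.exp(h)
    return e / e.sum()           # pi(a|s,theta) = e^{h(s,a)} / sum_b e^{h(s,b)}

theta = np.zeros((3, 4))         # 3 actions, 4 state features (illustrative)
x_s = np.array([1.0, 0.5, -0.2, 0.0])
pi = softmax_policy(theta, x_s)  # uniform while all preferences are equal
a = np.random.default_rng(0).choice(3, p=pi)  # sample A_t ~ pi(.|S_t, theta)
```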
\[ \nabla J(\mathbf{\theta}) = C\cdot \sum_s \mu(s) \sum_a q_\pi(s,a) \nabla \pi(a|s,\mathbf{\theta}) \]
\[ \begin{align} \nabla J(\mathbf{\theta}) &\propto \sum_s \mu(s) \sum_a q_\pi(s,a) \nabla \pi(a|s,\mathbf{\theta})\\ &= E\left[ \sum_a q_\pi(S_t,a)\nabla\pi(a|S_t,\mathbf{\theta})\right] \\ \mathbf{\theta}_{t+1} &= \mathbf{\theta}_t +\alpha \sum_a \hat{q}(S_t,a,\mathbf{w}) \nabla \pi(a|S_t,\mathbf{\theta}) \end{align} \]
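The last line is an "all-actions" update: it uses value estimates for every action in \(S_t\), not just the action taken. For the soft-max/linear policy above the inner sum has a closed form, \(\sum_a \hat{q}(a)\nabla\pi(a|s,\mathbf{\theta}) = \pi(b)\left(\hat{q}(b) - \sum_a \pi(a)\hat{q}(a)\right)\mathbf{x}(s)\) in the row for action \(b\); the sketch below assumes that parameterization and a given vector of estimates \(\hat{q}(S_t,\cdot,\mathbf{w})\).

```python
import numpy as np

def all_actions_update(theta, x_s, q_hat, alpha):
    """One all-actions step; q_hat[a] estimates q(S_t, a, w) for each action."""
    h = theta @ x_s
    pi = np.exp(h - h.max()); pi /= pi.sum()
    coeff = pi * (q_hat - pi @ q_hat)   # pi(b) * (q_hat(b) - E_pi[q_hat]) per row
    return theta + alpha * np.outer(coeff, x_s)
```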
\[ \begin{align} \nabla J(\mathbf{\theta}) &= E\left[ \sum_a q_\pi(S_t,a)\nabla\pi(a|S_t,\mathbf{\theta})\right] \\ &= E\left[ \sum_a \pi(a|S_t,\mathbf{\theta}) q_\pi(S_t,a)\frac{\nabla\pi(a|S_t,\mathbf{\theta})}{\pi(a|S_t,\mathbf{\theta})}\right] \\ &= E\left[ q_\pi(S_t,A_t)\frac{\nabla\pi(A_t|S_t,\mathbf{\theta})}{\pi(A_t|S_t,\mathbf{\theta})}\right] \end{align} \]
\[ \begin{align} \nabla J(\mathbf{\theta}) &= E\left[ q_\pi(S_t,A_t)\frac{\nabla\pi(A_t|S_t,\mathbf{\theta})}{\pi(A_t|S_t,\mathbf{\theta})}\right] \\ &= E\left[ G_t\frac{\nabla\pi(A_t|S_t,\mathbf{\theta})}{\pi(A_t|S_t,\mathbf{\theta})}\right] \end{align} \]
The last step holds because \(E[G_t \mid S_t, A_t] = q_\pi(S_t,A_t)\).
\[ \mathbf{\theta}_{t+1} = \mathbf{\theta}_t +\alpha G_t\frac{\nabla\pi(A_t|S_t,\mathbf{\theta}_t)}{\pi(A_t|S_t,\mathbf{\theta}_t)} \]
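This is the REINFORCE update. A minimal sketch for the soft-max/linear policy, using \(\nabla\pi/\pi = \nabla\ln\pi\), whose row for action \(b\) is \((\mathbb{1}\{b = A_t\} - \pi(b))\,\mathbf{x}(S_t)\); the episode arrays and their construction are assumed given.

```python
import numpy as np

def reinforce_episode(theta, states, actions, returns, alpha):
    """One pass over an episode: states[t] = x(S_t), actions[t] = A_t,
    returns[t] = G_t (computed beforehand from the rewards)."""
    for x_s, a_t, g_t in zip(states, actions, returns):
        h = theta @ x_s
        pi = np.exp(h - h.max()); pi /= pi.sum()
        grad_log_pi = -np.outer(pi, x_s)   # -pi(b) x(S_t) in every row ...
        grad_log_pi[a_t] += x_s            # ... plus x(S_t) in A_t's row
        theta = theta + alpha * g_t * grad_log_pi
    return theta
```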
Policy Gradient Theorem
\[ \nabla J(\mathbf{\theta}) = C\cdot \sum_s \mu(s) \sum_a q_\pi(s,a) \nabla \pi(a|s,\mathbf{\theta}) \]
Policy Gradient with Baseline Theorem
\[ \nabla J(\mathbf{\theta}) = C\cdot \sum_s \mu(s) \sum_a \left(q_\pi(s,a)-b(s)\right) \nabla \pi(a|s,\mathbf{\theta}) \]
\[ \begin{align} \sum_a b(s) \nabla \pi(a|s,\mathbf{\theta}) &= b(s) \nabla \sum_a \pi(a|s,\mathbf{\theta}) \\ & = b(s) \nabla 1 = 0 \end{align} \]
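The identity is easy to check numerically; the sketch below verifies it for the soft-max/linear policy with arbitrary parameters and an arbitrary constant baseline.

```python
import numpy as np

rng = np.random.default_rng(0)
theta = rng.normal(size=(3, 4))  # arbitrary policy parameters
x_s = rng.normal(size=4)         # arbitrary state features
b_s = 5.0                        # arbitrary baseline value for this state

h = theta @ x_s
pi = np.exp(h - h.max()); pi /= pi.sum()

total = np.zeros_like(theta)
for a in range(3):
    # grad of pi(a|s,theta): row b is pi(a) * (1{a==b} - pi(b)) * x(s)
    grad_pi_a = np.outer(pi[a] * ((np.arange(3) == a) - pi), x_s)
    total += b_s * grad_pi_a

print(np.allclose(total, 0.0))   # True: the baseline term sums to zero
```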
Subtracting a baseline leaves the gradient unchanged, but a well-chosen baseline can substantially reduce the variance of the stochastic gradient estimate.
\[ \nabla J(\mathbf{\theta}) = C\cdot \sum_s \mu(s) \sum_a \left(q_\pi(s,a)-b(s)\right) \nabla \pi(a|s,\mathbf{\theta}) \]
\[ \mathbf{\theta}_{t+1} = \mathbf{\theta}_t +\alpha \left(G_t-b(S_t)\right)\frac{\nabla\pi(A_t|S_t,\mathbf{\theta}_t)}{\pi(A_t|S_t,\mathbf{\theta}_t)} \]
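In practice \(b(S_t)\) is usually a learned state-value estimate. A sketch of REINFORCE with a linear baseline \(\hat{v}(s,\mathbf{w}) = \mathbf{w}^T\mathbf{x}(s)\) trained toward the Monte Carlo return; the two step sizes are illustrative.

```python
import numpy as np

def reinforce_with_baseline(theta, w, states, actions, returns,
                            alpha_theta, alpha_w):
    for x_s, a_t, g_t in zip(states, actions, returns):
        delta = g_t - w @ x_s              # G_t - v_hat(S_t, w)
        w = w + alpha_w * delta * x_s      # move the baseline toward G_t
        h = theta @ x_s
        pi = np.exp(h - h.max()); pi /= pi.sum()
        grad_log_pi = -np.outer(pi, x_s)
        grad_log_pi[a_t] += x_s
        theta = theta + alpha_theta * delta * grad_log_pi
    return theta, w
```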
One-step actor-critic: replace the Monte Carlo return \(G_t\) with the one-step return \(G_{t:t+1}\) and use the learned state value \(\hat{v}(S_t,\mathbf{w})\) as the baseline (see the equations below).
\[ \begin{align} \mathbf{\theta}_{t+1} &= \mathbf{\theta}_t + \alpha \left( G_{t:t+1} -\hat{v}(S_t,\mathbf{w}) \right) \frac{\nabla \pi(A_t|S_t,\mathbf{\theta}_t)}{\pi(A_t|S_t,\mathbf{\theta}_t)} \\ &= \mathbf{\theta}_t + \alpha \left( R_{t+1} + \gamma \hat{v}(S_{t+1},\mathbf{w}) -\hat{v}(S_t,\mathbf{w}) \right) \frac{\nabla \pi(A_t|S_t,\mathbf{\theta}_t)}{\pi(A_t|S_t,\mathbf{\theta}_t)} \\ &= \mathbf{\theta}_t + \alpha \delta_t \frac{\nabla \pi(A_t|S_t,\mathbf{\theta}_t)}{\pi(A_t|S_t,\mathbf{\theta}_t)} \end{align} \]
where \(\delta_t = R_{t+1} + \gamma \hat{v}(S_{t+1},\mathbf{w}) - \hat{v}(S_t,\mathbf{w})\) is the one-step TD error.
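A sketch of one such actor-critic step, assuming a linear critic \(\hat{v}(s,\mathbf{w}) = \mathbf{w}^T\mathbf{x}(s)\) and the soft-max/linear actor used earlier; the transition data would come from interacting with the environment.

```python
import numpy as np

def actor_critic_step(theta, w, x_s, a_t, r, x_next, done,
                      alpha_theta, alpha_w, gamma):
    """One online update from the transition (S_t, A_t, R_{t+1}, S_{t+1})."""
    v_next = 0.0 if done else w @ x_next
    delta = r + gamma * v_next - w @ x_s   # TD error delta_t
    w = w + alpha_w * delta * x_s          # critic: semi-gradient TD(0)
    h = theta @ x_s
    pi = np.exp(h - h.max()); pi /= pi.sum()
    grad_log_pi = -np.outer(pi, x_s)
    grad_log_pi[a_t] += x_s
    theta = theta + alpha_theta * delta * grad_log_pi  # actor: delta_t * grad ln pi
    return theta, w
```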
\[ \begin{gather} \pi(a|s,\mathbf{\theta}) = \frac{1}{\sigma(s,\mathbf{\theta})\sqrt{2\pi}} \exp\left(-\frac{(a-\mu(s,\mathbf{\theta}))^2}{2\sigma(s,\mathbf{\theta})^2}\right) \\ \mu(s,\mathbf{\theta}) = \mathbf{\theta}_\mu^T \mathbf{x}_\mu(s) \qquad \sigma(s,\mathbf{\theta}) = \exp\left(\mathbf{\theta}_\sigma^T \mathbf{x}_\sigma(s)\right) \end{gather} \]
The exponential keeps \(\sigma(s,\mathbf{\theta})\) positive.
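A sketch of sampling from this Gaussian policy for continuous actions; the feature maps \(\mathbf{x}_\mu\) and \(\mathbf{x}_\sigma\) are whatever state features one chooses.

```python
import numpy as np

def gaussian_policy_sample(theta_mu, theta_sigma, x_mu, x_sigma, rng):
    mu = theta_mu @ x_mu                   # mu(s, theta) = theta_mu^T x_mu(s)
    sigma = np.exp(theta_sigma @ x_sigma)  # exp keeps sigma(s, theta) > 0
    return rng.normal(mu, sigma)           # A_t ~ N(mu(s,theta), sigma(s,theta)^2)
```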
Comments