# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Monte Carlo integration and helpers.

See the @{$python/contrib.bayesflow.monte_carlo} guide.

@@expectation
@@expectation_importance_sampler
@@expectation_importance_sampler_logspace
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn

__all__ = [
    'expectation',
    'expectation_importance_sampler',
    'expectation_importance_sampler_logspace',
]


def expectation_importance_sampler(f,
                                   log_p,
                                   sampling_dist_q,
                                   z=None,
                                   n=None,
                                   seed=None,
                                   name='expectation_importance_sampler'):
  r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.

  With `p(z) := exp{log_p(z)}`, this `Op` returns

  ```
  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
  \approx E_q[ f(Z) p(Z) / q(Z) ]
  =       E_p[f(Z)]
  ```

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  If `f >= 0`, it is up to 2x more efficient to exponentiate the result of
  `expectation_importance_sampler_logspace` applied to `Log[f]`.

  User supplies either a `Tensor` of samples `z`, or a number of samples to
  draw `n`.

  Args:
    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with shape
      broadcastable to `q.batch_shape`.
      For example, `f` works "just like" `q.log_prob`.
    log_p: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `sampling_dist_q.log_prob`.
    sampling_dist_q: The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z: `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n: Integer `Tensor`. Number of samples to generate if `z` is not provided.
    seed: Python integer to seed the random number generator.
    name: A name to give this `Op`.

  Returns:
    The importance sampling estimate.  `Tensor` with `shape` equal
    to the batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)

    log_p_z = log_p(z)
    q_log_prob_z = q.log_prob(z)

    def _importance_sampler_positive_f(log_f_z):
      # Same as expectation_importance_sampler_logspace, but using Tensors
      # rather than samples and functions.  Allows us to sample once.
      log_values = log_f_z + log_p_z - q_log_prob_z
      return _logspace_mean(log_values)

    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
    #   E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
    #             = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
    # Adding 1 to each term incurs no bias (the 1's cancel) and prevents zeros
    # in logspace.  The logarithm is approximately linear near 1, so this also
    # preserves accuracy for small values of `f(z)`.
    f_z = f(z)
    log_f_plus_z = math_ops.log(nn.relu(f_z) + 1.)
    log_f_minus_z = math_ops.log(nn.relu(-1. * f_z) + 1.)

    log_f_plus_integral = _importance_sampler_positive_f(log_f_plus_z)
    log_f_minus_integral = _importance_sampler_positive_f(log_f_minus_z)

  return math_ops.exp(log_f_plus_integral) - math_ops.exp(log_f_minus_integral)


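# A minimal illustrative sketch of `expectation_importance_sampler`, assuming
# `tf.contrib.distributions.Normal` as the sampling distribution `q`.  It
# targets E_p[Z**2] = 2 for p = Normal(1, 1), drawing samples from the broader
# q = Normal(0, 2).  The names below are hypothetical and the function is not
# executed at import time.
def _example_expectation_importance_sampler():
  from tensorflow.contrib import distributions as ds  # Example-only import.
  p = ds.Normal(loc=1., scale=1.)
  q = ds.Normal(loc=0., scale=2.)
  # E_p[Z**2] = Var_p[Z] + (E_p[Z])**2 = 1 + 1 = 2.
  return expectation_importance_sampler(
      f=lambda z: math_ops.square(z),
      log_p=p.log_prob,
      sampling_dist_q=q,
      n=int(1e5),
      seed=42)

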
def expectation_importance_sampler_logspace(
    log_f,
    log_p,
    sampling_dist_q,
    z=None,
    n=None,
    seed=None,
    name='expectation_importance_sampler_logspace'):
  r"""Importance sampling with a positive function, in log-space.

  With `p(z) := exp{log_p(z)}`, and `f(z) = exp{log_f(z)}`, this `Op`
  returns

  ```
  Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,
  \approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]
  =       Log[E_p[f(Z)]]
  ```

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  In contrast to `expectation_importance_sampler`, this `Op` returns values in
  log-space.

  User supplies either a `Tensor` of samples `z`, or a number of samples to
  draw `n`.

  Args:
    log_f: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_f` works "just like" `sampling_dist_q.log_prob`.
    log_p: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `q.log_prob`.
    sampling_dist_q: The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z: `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n: Integer `Tensor`. Number of samples to generate if `z` is not provided.
    seed: Python integer to seed the random number generator.
    name: A name to give this `Op`.

  Returns:
    Logarithm of the importance sampling estimate.  `Tensor` with `shape` equal
    to the batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)
    log_values = log_f(z) + log_p(z) - q.log_prob(z)
    return _logspace_mean(log_values)


def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values: `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
      `Log[Mean[values]]`.
  """
  # center = Max[Log[values]], with stop-gradient.
  # The center hopefully keeps the exponentiated terms small.  It cancels out
  # of the final result, so applying stop_gradient to it does not change that
  # result; it only eliminates unnecessary gradient computation.
  center = array_ops.stop_gradient(_sample_max(log_values))

  # centered_values = exp{Log[values] - center}
  centered_values = math_ops.exp(log_values - center)

  # log_mean_of_values = Log[ E[centered_values] ] + center
  #                    = Log[ E[exp{log_values - center}] ] + center
  #                    = Log[ E[values] ] - center + center
  #                    = Log[ E[values] ]
  log_mean_of_values = math_ops.log(_sample_mean(centered_values)) + center

  return log_mean_of_values


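# A minimal illustrative sketch of `expectation_importance_sampler_logspace`,
# assuming `tf.contrib.distributions.Normal`.  With p = Normal(0, 1) and the
# positive function f(z) = exp(z) (so log_f(z) = z), the exact answer is
# Log[E_p[exp(Z)]] = 1/2.  The names below are hypothetical and the function
# is not executed at import time.
def _example_expectation_importance_sampler_logspace():
  from tensorflow.contrib import distributions as ds  # Example-only import.
  p = ds.Normal(loc=0., scale=1.)
  q = ds.Normal(loc=0., scale=2.)  # Broader sampling distribution.
  return expectation_importance_sampler_logspace(
      log_f=lambda z: z,
      log_p=p.log_prob,
      sampling_dist_q=q,
      n=int(1e5),
      seed=42)

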
def expectation(f, samples, log_prob=None, use_reparametrization=True,
                axis=0, keep_dims=False, name=None):
  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.

  This function computes the Monte-Carlo approximation of an expectation, i.e.,

  ```none
  E_p[f(X)] approx= m**-1 sum_j^m f(x_j),  x_j ~iid p(X)
  ```

  where:

  - `x_j = samples[j, ...]`,
  - `log(p(samples)) = log_prob(samples)` and
  - `m = prod(shape(samples)[axis])`.

  Tricks: Reparameterization and Score-Gradient

  When p is "reparameterized", i.e., a diffeomorphic transformation of a
  parameterless distribution (e.g.,
  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
  expectation, i.e.,
  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.

  However, if p is not reparameterized, TensorFlow's gradient will be incorrect
  since the chain-rule stops at samples of non-reparameterized distributions.
  (The non-differentiated result, `approx_expectation`, is the same regardless
  of `use_reparametrization`.) In this circumstance using the Score-Gradient
  trick results in an unbiased gradient, i.e.,

  ```none
  grad[ E_p[f(X)] ]
  = grad[ int dx p(x) f(x) ]
  = int dx grad[ p(x) f(x) ]
  = int dx [ p'(x) f(x) + p(x) f'(x) ]
  = int dx p(x) [ p'(x) / p(x) f(x) + f'(x) ]
  = int dx p(x) grad[ f(x) p(x) / stop_grad[p(x)] ]
  = E_p[ grad[ f(x) p(x) / stop_grad[p(x)] ] ]
  ```

  When p is reparameterized, it is usually preferable to set
  `use_reparametrization = True`.

  Warning: users are responsible for verifying `p` is a "reparameterized"
  distribution.

  Example Use:

  ```python
  bf = tf.contrib.bayesflow
  ds = tf.contrib.distributions

  # Monte-Carlo approximation of a reparameterized distribution, e.g., Normal.

  num_draws = int(1e5)
  p = ds.Normal(loc=0., scale=1.)
  q = ds.Normal(loc=1., scale=2.)
  exact_kl_normal_normal = ds.kl_divergence(p, q)
  # ==> 0.44314718
  approx_kl_normal_normal = bf.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == ds.FULLY_REPARAMETERIZED))
  # ==> 0.44632751
  # Relative Error: <1%

  # Monte-Carlo approximation of non-reparameterized distribution, e.g., Gamma.

  num_draws = int(1e5)
  p = ds.Gamma(concentration=1., rate=1.)
  q = ds.Gamma(concentration=2., rate=3.)
  exact_kl_gamma_gamma = ds.kl_divergence(p, q)
  # ==> 0.37999129
  approx_kl_gamma_gamma = bf.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == ds.FULLY_REPARAMETERIZED))
  # ==> 0.37696719
  # Relative Error: <1%

  # For comparing the gradients, see `monte_carlo_test.py`.
  ```

  Note: The above example is for illustration only. To compute approximate
  KL-divergence, the following is preferred:

  ```python
  approx_kl_p_q = bf.monte_carlo_csiszar_f_divergence(
      f=bf.kl_reverse,
      p_log_prob=q.log_prob,
      q=p,
      num_draws=num_draws)
  ```

  Args:
    f: Python callable which can return `f(samples)`.
    samples: `Tensor` of samples used to form the Monte-Carlo approximation of
      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
    log_prob: Python callable which can return `log_prob(samples)`. Must
      correspond to the natural-logarithm of the pdf/pmf of each sample. Only
      required/used if `use_reparametrization=False`.
      Default value: `None`.
    use_reparametrization: Python `bool` indicating that the approximation
      should use the fact that the gradient of samples is unbiased. Whether
      `True` or `False`, this arg only affects the gradient of the resulting
      `approx_expectation`.
      Default value: `True`.
    axis: The dimensions to average. If `None`, averages all
      dimensions.
      Default value: `0` (the left-most dimension).
    keep_dims: If True, retains averaged dimensions using size `1`.
      Default value: `False`.
    name: A `name_scope` for operations created by this function.
      Default value: `None` (which implies "expectation").

  Returns:
    approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
      of `E_p[f(X)]`.

  Raises:
    ValueError: if `f` is not a Python `callable`.
    ValueError: if `use_reparametrization=False` and `log_prob` is not a Python
      `callable`.
  """

  with ops.name_scope(name, 'expectation', [samples]):
    if not callable(f):
      raise ValueError('`f` must be a callable function.')
    if use_reparametrization:
      return math_ops.reduce_mean(f(samples), axis=axis, keep_dims=keep_dims)
    else:
      if not callable(log_prob):
        raise ValueError('`log_prob` must be a callable function.')
      stop = array_ops.stop_gradient  # For readability.
      x = stop(samples)
      logpx = log_prob(x)
      fx = f(x)  # Call `f` once in case it has side-effects.
      # We now rewrite f(x) so that:
      #   `grad[f(x)] := grad[f(x)] + f(x) * grad[logpx]`.
      # To achieve this, we use a trick that
      #   `h(x) - stop(h(x)) == zeros_like(h(x))`
      # but its gradient is grad[h(x)].
      # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
      # this trick loses no precision.  For more discussion regarding the
      # relevant portions of the IEEE754 standard, see the StackOverflow
      # question,
      # "Is there a floating point value of x, for which x-x == 0 is false?"
      # http://stackoverflow.com/q/2686644
      fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
      return math_ops.reduce_mean(fx, axis=axis, keep_dims=keep_dims)


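# A hedged sketch of the score-gradient path of `expectation`, assuming
# `tf.contrib.distributions.Gamma` (non-reparameterized in this contrib-era
# API) and graph-mode gradients via `gradients_impl.gradients`.  For
# p = Gamma(alpha, 1), E_p[X] = alpha, so the estimated gradient with respect
# to `alpha` should be close to 1.  The names below are hypothetical and the
# function is not executed at import time.
def _example_expectation_score_gradient():
  from tensorflow.contrib import distributions as ds  # Example-only import.
  from tensorflow.python.ops import gradients_impl  # Example-only import.
  alpha = ops.convert_to_tensor(2., name='alpha')
  p = ds.Gamma(concentration=alpha, rate=1.)
  approx_mean = expectation(
      f=lambda x: x,
      samples=p.sample(int(1e5), seed=42),
      log_prob=p.log_prob,
      use_reparametrization=False)
  # grad_alpha ~= d/d(alpha) E_p[X] = 1.
  grad_alpha = gradients_impl.gradients(approx_mean, [alpha])[0]
  return approx_mean, grad_alpha

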
def _sample_mean(values):
  """Mean over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_mean(values, reduction_indices=[0])


def _sample_max(values):
  """Max over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_max(values, reduction_indices=[0])


def _get_samples(dist, z, n, seed):
  """Check args and return samples."""
  with ops.name_scope('get_samples', values=[z, n]):
    if (n is None) == (z is None):
      raise ValueError(
          'Must specify exactly one of arguments "n" and "z".  Found: '
          'n = %s, z = %s' % (n, z))
    if n is not None:
      return dist.sample(n, seed=seed)
    else:
      return ops.convert_to_tensor(z, name='z')


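# A hedged sketch of the efficiency note in `expectation_importance_sampler`'s
# docstring: for `f >= 0`, exponentiating the log-space estimator applied to
# `Log[f]` yields the same estimate with roughly half the work.  Assumes
# `tf.contrib.distributions.Normal`; the names below are hypothetical and the
# function is not executed at import time.
def _example_positive_f_equivalence():
  from tensorflow.contrib import distributions as ds  # Example-only import.
  p = ds.Normal(loc=0., scale=1.)
  q = ds.Normal(loc=0., scale=2.)
  z = q.sample(int(1e5), seed=42)
  # f(z) = z**2 >= 0, so both estimators target E_p[Z**2] = 1.
  direct = expectation_importance_sampler(
      f=lambda x: math_ops.square(x),
      log_p=p.log_prob,
      sampling_dist_q=q,
      z=z)
  via_logspace = math_ops.exp(expectation_importance_sampler_logspace(
      log_f=lambda x: 2. * math_ops.log(math_ops.abs(x)),
      log_p=p.log_prob,
      sampling_dist_q=q,
      z=z))
  return direct, via_logspace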