1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Vector Student's t distribution classes."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21from tensorflow.contrib.distributions.python.ops import bijectors
22from tensorflow.contrib.distributions.python.ops import distribution_util
23from tensorflow.python.framework import constant_op
24from tensorflow.python.framework import dtypes
25from tensorflow.python.framework import ops
26from tensorflow.python.ops import array_ops
27from tensorflow.python.ops.distributions import student_t
28from tensorflow.python.ops.distributions import transformed_distribution
29
30
class _VectorStudentT(transformed_distribution.TransformedDistribution):
  """A vector version of Student's t-distribution on `R^k`.

  #### Mathematical details

  The probability density function (pdf) is,

  ```none
  pdf(x; df, mu, Sigma) = prod_i (1 + y[i]**2 / df)**(-0.5 (df + 1)) / Z
  where,
  y = inv(Sigma) (x - mu)
  Z = abs(det(Sigma)) ( sqrt(df pi) Gamma(0.5 df) / Gamma(0.5 (df + 1)) )**k
  ```

  where:
  * `loc = mu`; a vector in `R^k`,
  * `scale = Sigma`; a lower-triangular matrix in `R^{k x k}`,
  * `y[i]` denotes the `i`-th coordinate of `y`, `i = 1, ..., k`,
  * `Z` denotes the normalization constant, and,
  * `Gamma` is the [gamma function](
    https://en.wikipedia.org/wiki/Gamma_function), and,
  * `||y||**2` (used in the Multivariate pdf below) denotes the
    [squared Euclidean norm](
  https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm) of `y`.

  The VectorStudentT distribution is a member of the [location-scale family](
  https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be
  constructed as,

  ```none
  X ~ StudentT(df, loc=0, scale=1)
  Y = loc + scale * X
  ```

  Notice that the `scale` matrix has semantics closer to std. deviation than
  covariance (but it is not std. deviation).

  This distribution is an Affine transformation of iid
  [Student's t-distributions](
  https://en.wikipedia.org/wiki/Student%27s_t-distribution)
  and should not be confused with the [Multivariate Student's t-distribution](
  https://en.wikipedia.org/wiki/Multivariate_t-distribution). The
  traditional Multivariate Student's t-distribution is a type of
  [elliptical distribution](
  https://en.wikipedia.org/wiki/Elliptical_distribution); it has PDF:

  ```none
  pdf(x; df, mu, Sigma) = (1 + ||y||**2 / df)**(-0.5 (df + k)) / Z
  where,
  y = inv(Sigma) (x - mu)
  Z = abs(det(Sigma)) sqrt(df pi)**k Gamma(0.5 df) / Gamma(0.5 (df + k))
  ```

  Notice that the Multivariate Student's t-distribution uses `k` in the
  exponent where the Vector Student's t-distribution has a `1`, and couples the
  coordinates through `||y||**2` whereas the Vector version is a product over
  coordinates. Conversely the Vector version has a broader application of the
  power-`k` in the normalization constant.

  #### Examples

  A single instance of a "Vector Student's t-distribution" is defined by its
  degrees of freedom `df`, a location vector of length `k` and a scale matrix
  of shape `k x k`.

  Extra leading dimensions, if provided, allow for batches.

  ```python
  tfd = tf.contrib.distributions

  # Initialize a single 3-variate vector Student's t-distribution.
  mu = [1., 2, 3]
  chol = [[1., 0, 0.],
          [1, 3, 0],
          [1, 2, 3]]
  vt = tfd.VectorStudentT(df=2, loc=mu, scale_tril=chol)

  # Evaluate this on an observation in R^3, returning a scalar.
  vt.prob([-1., 0, 1])

  # Initialize a batch of two 3-variate vector Student's t-distributions.
  mu = [[1., 2, 3],
        [11, 22, 33]]
  chol = ...  # shape 2 x 3 x 3, lower triangular, positive diagonal.
  vt = tfd.VectorStudentT(df=2, loc=mu, scale_tril=chol)

  # Evaluate this on two observations, each in R^3, returning a length two
  # tensor.
  x = [[-1, 0, 1],
       [-11, 0, 11]]
  vt.prob(x)
  ```

  For more examples of how to construct the `scale` matrix, see the
  `tf.contrib.distributions.bijectors.Affine` docstring.

  """

  def __init__(self,
               df,
               loc=None,
               scale_identity_multiplier=None,
               scale_diag=None,
               scale_tril=None,
               scale_perturb_factor=None,
               scale_perturb_diag=None,
               validate_args=False,
               allow_nan_stats=True,
               name="VectorStudentT"):
    """Instantiates the vector Student's t-distributions on `R^k`.

    The `batch_shape` is the broadcast between `df.batch_shape` and
    `Affine.batch_shape` where `Affine` is constructed from `loc` and
    `scale_*` arguments.

    The `event_shape` is the event shape of `Affine.event_shape`.

    Args:
      df: Floating-point `Tensor`. The degrees of freedom of the
        distribution(s). `df` must contain only positive values. Must be
        scalar if `loc`, `scale_*` imply non-scalar batch_shape or must have the
        same `batch_shape` implied by `loc`, `scale_*`.
      loc: Floating-point `Tensor`. If this is set to `None`, no `loc` is
        applied.
      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
        scaling done to the identity matrix. When `scale_identity_multiplier =
        scale_diag = scale_tril = None` then `scale += IdentityMatrix`.
        Otherwise no scaled-identity-matrix is added to `scale`.
      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
        `scale_diag` has shape [N1, N2, ..., k], which represents a k x k
        diagonal matrix. When `None` no diagonal term is added to `scale`.
      scale_tril: Floating-point `Tensor` representing the lower triangular
        matrix. `scale_tril` has shape [N1, N2, ..., k, k], which represents a
        k x k lower triangular matrix. When `None` no `scale_tril` term is
        added to `scale`. The upper triangular elements above the diagonal are
        ignored.
      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
        update is added to `scale`.
      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
        matrix. `scale_perturb_diag` has shape [N1, N2, ..., r], which
        represents an r x r Diagonal matrix. When `None` low rank updates will
        take the form `scale_perturb_factor * scale_perturb_factor.T`.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.
    """
    # Must be the first statement so that only the constructor arguments are
    # captured. The explicit `dict(...)` copy detaches the record from the
    # frame's locals mapping, which on CPython versions before 3.13 can be
    # refreshed later (e.g., by a tracer/debugger or another `locals()` call)
    # and would then also contain the helper variables defined below.
    parameters = dict(locals())
    graph_parents = [df, loc, scale_identity_multiplier, scale_diag,
                     scale_tril, scale_perturb_factor, scale_perturb_diag]
    with ops.name_scope(name):
      with ops.name_scope("init", values=graph_parents):
        # The shape of the _VectorStudentT distribution is governed by the
        # relationship between df.batch_shape and affine.batch_shape. In
        # pseudocode the basic procedure is:
        #   if df.batch_shape is scalar:
        #     if affine.batch_shape is not scalar:
        #       # broadcast distribution.sample so
        #       # it has affine.batch_shape.
        #     self.batch_shape = affine.batch_shape
        #   else:
        #     if affine.batch_shape is scalar:
        #       # let affine broadcasting do its thing.
        #     self.batch_shape = df.batch_shape
        # All of the above magic is actually handled by TransformedDistribution.
        # Here we really only need to collect the affine.batch_shape and decide
        # what we're going to pass in to TransformedDistribution's
        # (override) batch_shape arg.
        affine = bijectors.Affine(
            shift=loc,
            scale_identity_multiplier=scale_identity_multiplier,
            scale_diag=scale_diag,
            scale_tril=scale_tril,
            scale_perturb_factor=scale_perturb_factor,
            scale_perturb_diag=scale_perturb_diag,
            validate_args=validate_args)
        # Base distribution is the standard (loc=0, scale=1) scalar Student's
        # t; the location and scale are applied by the `affine` bijector.
        distribution = student_t.StudentT(
            df=df,
            loc=array_ops.zeros([], dtype=affine.dtype),
            scale=array_ops.ones([], dtype=affine.dtype))
        # Batch and event shapes implied by the affine parameters.
        batch_shape, override_event_shape = (
            distribution_util.shapes_from_loc_and_scale(
                affine.shift, affine.scale))
        # Only override the batch shape when `df` is a scalar batch; otherwise
        # pass an empty vector, i.e., request no batch-shape override.
        override_batch_shape = distribution_util.pick_vector(
            distribution.is_scalar_batch(),
            batch_shape,
            constant_op.constant([], dtype=dtypes.int32))
        super(_VectorStudentT, self).__init__(
            distribution=distribution,
            bijector=affine,
            batch_shape=override_batch_shape,
            event_shape=override_event_shape,
            validate_args=validate_args,
            name=name)
        # Replace whatever the base class recorded with this constructor's own
        # arguments.
        self._parameters = parameters

  @property
  def df(self):
    """Degrees of freedom in these Student's t distribution(s)."""
    return self.distribution.df

  @property
  def loc(self):
    """Locations of these Student's t distribution(s)."""
    return self.bijector.shift

  @property
  def scale(self):
    """Scale of the `Affine` bijector (`bijector.scale`); not a covariance."""
    return self.bijector.scale
241