| import math |
| from typing import Any |
| |
| import torch |
| from torch import Tensor |
| from torch.nn.parameter import Parameter, UninitializedParameter |
| from .. import functional as F |
| from .. import init |
| from .module import Module |
| from .lazy import LazyModuleMixin |
| |
| |
| __all__ = [ |
| 'Bilinear', |
| 'Identity', |
| 'LazyLinear', |
| 'Linear', |
| ] |
| |
| |
| class Identity(Module): |
| r"""A placeholder identity operator that is argument-insensitive. |
| |
| Args: |
| args: any argument (unused) |
| kwargs: any keyword argument (unused) |
| |
| Shape: |
| - Input: :math:`(*)`, where :math:`*` means any number of dimensions. |
| - Output: :math:`(*)`, same shape as the input. |
| |
| Examples:: |
| |
| >>> m = nn.Identity(54, unused_argument1=0.1, unused_argument2=False) |
| >>> input = torch.randn(128, 20) |
| >>> output = m(input) |
| >>> print(output.size()) |
| torch.Size([128, 20]) |
| |
| """ |
| def __init__(self, *args: Any, **kwargs: Any) -> None: |
        super().__init__()
| |
| def forward(self, input: Tensor) -> Tensor: |
| return input |
| |
| |
| class Linear(Module): |
| r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b` |
| |
| This module supports :ref:`TensorFloat32<tf32_on_ampere>`. |
| |
| On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward. |
| |
| Args: |
| in_features: size of each input sample |
| out_features: size of each output sample |
| bias: If set to ``False``, the layer will not learn an additive bias. |
| Default: ``True`` |
| |
| Shape: |
| - Input: :math:`(*, H_{in})` where :math:`*` means any number of |
| dimensions including none and :math:`H_{in} = \text{in\_features}`. |
| - Output: :math:`(*, H_{out})` where all but the last dimension |
| are the same shape as the input and :math:`H_{out} = \text{out\_features}`. |
| |
| Attributes: |
| weight: the learnable weights of the module of shape |
| :math:`(\text{out\_features}, \text{in\_features})`. The values are |
| initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where |
| :math:`k = \frac{1}{\text{in\_features}}` |
| bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. |
| If :attr:`bias` is ``True``, the values are initialized from |
| :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where |
| :math:`k = \frac{1}{\text{in\_features}}` |
| |
| Examples:: |
| |
| >>> m = nn.Linear(20, 30) |
| >>> input = torch.randn(128, 20) |
| >>> output = m(input) |
| >>> print(output.size()) |
| torch.Size([128, 30]) |
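        >>> # The transformation acts on the last dimension, so inputs with
        >>> # additional leading dimensions (see the Shape section) work as well:
        >>> input = torch.randn(4, 128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([4, 128, 30])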
| """ |
| __constants__ = ['in_features', 'out_features'] |
| in_features: int |
| out_features: int |
| weight: Tensor |
| |
| def __init__(self, in_features: int, out_features: int, bias: bool = True, |
| device=None, dtype=None) -> None: |
| factory_kwargs = {'device': device, 'dtype': dtype} |
        super().__init__()
| self.in_features = in_features |
| self.out_features = out_features |
| self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs)) |
| if bias: |
| self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) |
| else: |
| self.register_parameter('bias', None) |
| self.reset_parameters() |
| |
| def reset_parameters(self) -> None: |
| # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with |
| # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see |
| # https://github.com/pytorch/pytorch/issues/57109 |
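        # Concretely, kaiming_uniform_ samples from U(-bound, bound) with
        # bound = gain * sqrt(3 / fan_in) and gain = sqrt(2 / (1 + a^2)); with
        # a = sqrt(5) this reduces to bound = 1 / sqrt(fan_in) = 1 / sqrt(in_features).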
| init.kaiming_uniform_(self.weight, a=math.sqrt(5)) |
| if self.bias is not None: |
| fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) |
| bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 |
| init.uniform_(self.bias, -bound, bound) |
| |
| def forward(self, input: Tensor) -> Tensor: |
| return F.linear(input, self.weight, self.bias) |
| |
| def extra_repr(self) -> str: |
        return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
| |
| |
| # This class exists solely to avoid triggering an obscure error when scripting |
| # an improperly quantized attention layer. See this issue for details: |
| # https://github.com/pytorch/pytorch/issues/58969 |
| # TODO: fail fast on quantization API usage error, then remove this class |
| # and replace uses of it with plain Linear |
| class NonDynamicallyQuantizableLinear(Linear): |
| def __init__(self, in_features: int, out_features: int, bias: bool = True, |
| device=None, dtype=None) -> None: |
| super().__init__(in_features, out_features, bias=bias, |
| device=device, dtype=dtype) |
| |
| |
| class Bilinear(Module): |
| r"""Applies a bilinear transformation to the incoming data: |
| :math:`y = x_1^T A x_2 + b` |
| |
| Args: |
| in1_features: size of each first input sample |
| in2_features: size of each second input sample |
| out_features: size of each output sample |
        bias: If set to ``False``, the layer will not learn an additive bias.
| Default: ``True`` |
| |
| Shape: |
| - Input1: :math:`(*, H_{in1})` where :math:`H_{in1}=\text{in1\_features}` and |
| :math:`*` means any number of additional dimensions including none. All but the last dimension |
| of the inputs should be the same. |
| - Input2: :math:`(*, H_{in2})` where :math:`H_{in2}=\text{in2\_features}`. |
| - Output: :math:`(*, H_{out})` where :math:`H_{out}=\text{out\_features}` |
| and all but the last dimension are the same shape as the input. |
| |
| Attributes: |
| weight: the learnable weights of the module of shape |
| :math:`(\text{out\_features}, \text{in1\_features}, \text{in2\_features})`. |
| The values are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where |
| :math:`k = \frac{1}{\text{in1\_features}}` |
| bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. |
| If :attr:`bias` is ``True``, the values are initialized from |
| :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where |
| :math:`k = \frac{1}{\text{in1\_features}}` |
| |
| Examples:: |
| |
| >>> m = nn.Bilinear(20, 30, 40) |
| >>> input1 = torch.randn(128, 20) |
| >>> input2 = torch.randn(128, 30) |
| >>> output = m(input1, input2) |
| >>> print(output.size()) |
| torch.Size([128, 40]) |
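        >>> # A sketch of the equivalent computation written out with einsum,
        >>> # matching :math:`y = x_1^T A x_2 + b` from above:
        >>> manual = torch.einsum('bi,oij,bj->bo', input1, m.weight, input2) + m.bias
        >>> print(torch.allclose(output, manual, atol=1e-5))
        True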
| """ |
| __constants__ = ['in1_features', 'in2_features', 'out_features'] |
| in1_features: int |
| in2_features: int |
| out_features: int |
| weight: Tensor |
| |
| def __init__(self, in1_features: int, in2_features: int, out_features: int, bias: bool = True, |
| device=None, dtype=None) -> None: |
| factory_kwargs = {'device': device, 'dtype': dtype} |
        super().__init__()
| self.in1_features = in1_features |
| self.in2_features = in2_features |
| self.out_features = out_features |
| self.weight = Parameter(torch.empty((out_features, in1_features, in2_features), **factory_kwargs)) |
| |
| if bias: |
| self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) |
| else: |
| self.register_parameter('bias', None) |
| self.reset_parameters() |
| |
| def reset_parameters(self) -> None: |
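        # weight.size(1) == in1_features, so this samples from
        # U(-sqrt(k), sqrt(k)) with k = 1 / in1_features, matching the
        # initialization described in the class docstring.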
| bound = 1 / math.sqrt(self.weight.size(1)) |
| init.uniform_(self.weight, -bound, bound) |
| if self.bias is not None: |
| init.uniform_(self.bias, -bound, bound) |
| |
| def forward(self, input1: Tensor, input2: Tensor) -> Tensor: |
| return F.bilinear(input1, input2, self.weight, self.bias) |
| |
| def extra_repr(self) -> str: |
        return (f'in1_features={self.in1_features}, in2_features={self.in2_features}, '
                f'out_features={self.out_features}, bias={self.bias is not None}')
| |
| |
| class LazyLinear(LazyModuleMixin, Linear): |
| r"""A :class:`torch.nn.Linear` module where `in_features` is inferred. |
| |
    In this module, ``weight`` and ``bias`` are instances of
    :class:`torch.nn.UninitializedParameter`. They are initialized after the first call to
    ``forward``, at which point the module becomes a regular :class:`torch.nn.Linear`
    module. The ``in_features`` argument of the :class:`Linear` is inferred from
    ``input.shape[-1]``.
| |
| Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation |
| on lazy modules and their limitations. |
| |
| Args: |
| out_features: size of each output sample |
| bias: If set to ``False``, the layer will not learn an additive bias. |
| Default: ``True`` |
| |
| Attributes: |
| weight: the learnable weights of the module of shape |
| :math:`(\text{out\_features}, \text{in\_features})`. The values are |
| initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where |
| :math:`k = \frac{1}{\text{in\_features}}` |
| bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. |
| If :attr:`bias` is ``True``, the values are initialized from |
| :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where |
| :math:`k = \frac{1}{\text{in\_features}}` |
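
    Examples::

        >>> # An illustrative sketch: ``in_features`` is inferred from the last
        >>> # dimension of the first input passed to ``forward``.
        >>> m = nn.LazyLinear(30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
        >>> # After this call the module behaves like a regular nn.Linear
        >>> # with in_features=20.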
| |
| |
| """ |
| |
| cls_to_become = Linear # type: ignore[assignment] |
| weight: UninitializedParameter |
| bias: UninitializedParameter # type: ignore[assignment] |
| |
| def __init__(self, out_features: int, bias: bool = True, |
| device=None, dtype=None) -> None: |
| factory_kwargs = {'device': device, 'dtype': dtype} |
| # bias is hardcoded to False to avoid creating tensor |
| # that will soon be overwritten. |
| super().__init__(0, 0, False) |
| self.weight = UninitializedParameter(**factory_kwargs) |
| self.out_features = out_features |
| if bias: |
| self.bias = UninitializedParameter(**factory_kwargs) |
| |
| def reset_parameters(self) -> None: |
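        # Only initialize once the parameters have been materialized;
        # in_features == 0 is the placeholder value set in __init__.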
| if not self.has_uninitialized_params() and self.in_features != 0: |
| super().reset_parameters() |
| |
| def initialize_parameters(self, input) -> None: # type: ignore[override] |
| if self.has_uninitialized_params(): |
| with torch.no_grad(): |
| self.in_features = input.shape[-1] |
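                # Materialize the uninitialized parameters with their final
                # shapes, then run the standard Linear initialization.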
| self.weight.materialize((self.out_features, self.in_features)) |
| if self.bias is not None: |
| self.bias.materialize((self.out_features,)) |
| self.reset_parameters() |
| # TODO: PartialLinear - maybe in sparse? |