Place where things are developed before moving out to the relevant modules

class CharLMConfig[source]

CharLMConfig(vocab_sz=256, d_model=512, n_layers=6, n_heads=8, d_ff=4096, attn_dropout=0.1, ff_dropout=0.1, emb_dropout=0.1, tie_weights=True, causal=True, pos_enc='absolute', max_seq_len=512, axial_shape=None, axial_emb_dims=None, pad_idx=None, prenorm=False, attn_bias=False, shared_qk=False) :: ConfigBase

Config for quick char-level LM experiments
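
A minimal usage sketch (the field values here are arbitrary); a config can also be fed to a model constructor via from_config, as in the commented profiling example near the end of this notebook:

# small config for quick experiments; any field can be overridden
config = CharLMConfig(d_model=256, n_layers=4, max_seq_len=512)
assert config.d_model == 256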

FixUp init

Transformer w/o LayerNorm

class TransformerEncoderBlockNLN[source]

TransformerEncoderBlockNLN(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block (no LayerNorm). Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderNLN[source]

TransformerEncoderNLN(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks

x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerLMNLN[source]

TransformerLMNLN(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling (no LayerNorm)

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))

init function

fixup_init[source]

fixup_init(model)

Applies FixUp initialization to the LM (prototype version)
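
A quick sketch of applying it to the no-LayerNorm LM defined above (sizes are arbitrary):

model = TransformerLMNLN(vocab_sz=256, d_model=64, n_layers=2)
fixup_init(model)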

Scales and Shifts

class Scale[source]

Scale(scale=1.0) :: Module

Scales the input by a single coefficient

class Shift[source]

Shift() :: Module

Shifts the input by a scalar
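
For reference, a minimal sketch of what Scale and Shift are assumed to do (one learnable scalar multiplied into, or added to, the input); the library classes may differ in detail:

import torch
import torch.nn as nn

class ScaleSketch(nn.Module):
    "Assumed behaviour of Scale: multiply the input by one learnable scalar"
    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(scale))
    def forward(self, x): return x * self.scale

class ShiftSketch(nn.Module):
    "Assumed behaviour of Shift: add one learnable scalar to the input"
    def __init__(self):
        super().__init__()
        self.shift = nn.Parameter(torch.zeros(1))
    def forward(self, x): return x + self.shift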

class ShiftScale[source]

ShiftScale(sublayer, scale=1.0) :: Module

Wraps sublayer with learnable shift and scale for FixUp-style training

class FeedForwardFixup(Module):
    """
    FeedForward with shifts and scale for FixUp
    """
    def __init__(self, d_model:int, d_ff:int=None, dropout:float=0.):
        # fastai-style Module: no super().__init__() needed.
        # `default` (library util) falls back to 4*d_model when d_ff is None.
        d_ff = default(d_ff, 4 * d_model)
        layers = OrderedDict(
            [('shift1',   Shift()),
             ('fc1',      nn.Linear(d_model, d_ff)),
             ('shift2',   Shift()),
             ('act',      nn.GELU()),
             ('dropout1', nn.Dropout(dropout)),
             ('shift3',   Shift()),
             ('fc2',      nn.Linear(d_ff, d_model)),
             ('dropout2', nn.Dropout(dropout)),
             ('scale',    Scale())])
        self.net = nn.Sequential(layers)
        self._init()

    def forward(self, x):
        return self.net(x)

    def _init(self):
        # Xavier init for weight matrices only; biases keep their defaults
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)
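
A quick shape check, in the same spirit as the other blocks in this notebook:

x = torch.randn(bs, sl, d)
ff = FeedForwardFixup(d)
out = ff(x)
assert (out.size() == (bs, sl, d))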

class TransformerEncoderBlockNLN2[source]

TransformerEncoderBlockNLN2(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block (no LayerNorm). Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN2(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderNLN2[source]

TransformerEncoderNLN2(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks

x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN2(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerLMNLN2[source]

TransformerLMNLN2(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling (no LayerNorm)

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN2(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))

fixup_init2[source]

fixup_init2(model)

Applies FixUp initialization to the LM (prototype version)
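
Analogous sketch for the second prototype:

model = TransformerLMNLN2(vocab_sz=256, d_model=64, n_layers=2)
fixup_init2(model)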

ADMIN init

setup

class AdminResidual[source]

AdminResidual(sublayer, d_model) :: Module

Residual connection that wraps sublayer and rescales the shortcut with a per-dimension weight (ADMIN-style)
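
A minimal sketch of the ADMIN residual idea (not the library class): the shortcut is rescaled by a per-dimension weight omega, which is presumably what admin_init sets from the profiled output statistics further down:

import torch
import torch.nn as nn

class AdminResidualSketch(nn.Module):
    "Assumed behaviour: out = x * omega + sublayer(x)"
    def __init__(self, sublayer, d_model):
        super().__init__()
        self.sublayer = sublayer
        self.omega = nn.Parameter(torch.ones(d_model))  # rescales the shortcut
    def forward(self, x, **kwargs):
        return x * self.omega + self.sublayer(x, **kwargs)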

class TransformerEncoderBlockAdmin[source]

TransformerEncoderBlockAdmin(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block. Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockAdmin(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderAdmin[source]

TransformerEncoderAdmin(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks
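
Shape check, mirroring the encoders above:

x = torch.randn(bs, sl, d)
m = TransformerEncoderAdmin(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))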

class TransformerLMAdmin[source]

TransformerLMAdmin(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
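
Usage sketch, mirroring the LMs above:

bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMAdmin(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))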

profiling

class BreakFitCallback[source]

BreakFitCallback(after_create=None, before_fit=None, before_epoch=None, before_train=None, before_batch=None, after_pred=None, after_loss=None, before_backward=None, before_step=None, after_cancel_step=None, after_step=None, after_cancel_batch=None, after_batch=None, after_cancel_train=None, after_train=None, before_validate=None, after_cancel_validate=None, after_validate=None, after_cancel_epoch=None, after_epoch=None, after_cancel_fit=None, after_fit=None) :: Callback

Basic class handling tweaks of the training loop by changing a Learner in various events

res_submodules[source]

res_submodules(model)

res_modules[source]

res_modules(model)

#...
# config = CharLMConfig(d_model=512, n_layers=6, max_seq_len=512,
#                       pad_idx=pad_id)

# learn = Learner(dls, TransformerLMAdmin.from_config(config),
#                 loss_func=CrossEntropyLossFlat(ignore_index=pad_id),
#                 cbs = [GradientClip(1.0),
#                        SaveModelCallback(with_opt=True)],
#                 metrics=[accuracy, perplexity, bpc]).to_fp16()
# learn.add_cb(ActivationStats(modules=res_submodules(learn.model)))
# len(learn.activation_stats.modules)
# learn.fit(1, 1e-3)

variances[source]

variances(learn)

initialization

admin_init[source]

admin_init(model, scales)
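
Assumed end-to-end workflow (a sketch; variances is assumed to produce the scales that admin_init consumes, following the commented profiling example above):

# scales = variances(learn)        # per-residual output statistics from the profiling fit
# admin_init(learn.model, scales)  # re-initialise the AdminResidual scales from them
# learn.fit(n_epochs, lr)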