Place where things are developed before moving out to the relevant modules

class CharLMConfig[source]

CharLMConfig(vocab_sz=256, d_model=512, n_layers=6, n_heads=8, d_ff=4096, attn_dropout=0.1, ff_dropout=0.1, emb_dropout=0.1, tie_weights=True, causal=True, pos_enc='absolute', max_seq_len=512, axial_shape=None, axial_emb_dims=None, pad_idx=None, prenorm=False, attn_bias=False, shared_qk=False) :: ConfigBase

Config for quick char-level LM experiments
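
A minimal usage sketch (the field values here are arbitrary); a config can also be fed to a model constructor via from_config, as in the commented profiling example near the end of this notebook:

# small config for quick experiments; any field can be overridden
config = CharLMConfig(d_model=256, n_layers=4, max_seq_len=512)
assert config.d_model == 256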

FixUp init

Transformer w/o LayerNorm

class TransformerEncoderBlockNLN[source]

TransformerEncoderBlockNLN(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block (no LayerNorm). Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderNLN[source]

TransformerEncoderNLN(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks

x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerLMNLN[source]

TransformerLMNLN(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling (no LayerNorm)

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))

init function

fixup_init[source]

fixup_init(model)

Applies FixUp initialization to the LM (prototype version)
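
A quick sketch of applying it to the no-LayerNorm LM defined above (sizes are arbitrary):

model = TransformerLMNLN(vocab_sz=256, d_model=64, n_layers=2)
fixup_init(model)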

Scales and Shifts

class Scale[source]

Scale(scale=1.0) :: Module

Scales the input by a single coefficient

class Shift[source]

Shift() :: Module

Shifts the input by a scalar
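
For reference, a minimal sketch of what Scale and Shift are assumed to do (one learnable scalar multiplied into, or added to, the input); the library classes may differ in detail:

import torch
import torch.nn as nn

class ScaleSketch(nn.Module):
    "Assumed behaviour of Scale: multiply the input by one learnable scalar"
    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(scale))
    def forward(self, x): return x * self.scale

class ShiftSketch(nn.Module):
    "Assumed behaviour of Shift: add one learnable scalar to the input"
    def __init__(self):
        super().__init__()
        self.shift = nn.Parameter(torch.zeros(1))
    def forward(self, x): return x + self.shift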

class ShiftScale[source]

ShiftScale(sublayer, scale=1.0) :: Module

Wraps sublayer with learnable shift and scale for FixUp-style training

class FeedForwardFixup(Module):
    """
    FeedForward with shifts and scale for FixUp
    """
    def __init__(self, d_model:int, d_ff:int=None, dropout:float=0.):
        # fastai-style Module: no super().__init__() needed.
        # `default` (library util) falls back to 4*d_model when d_ff is None.
        d_ff = default(d_ff, 4 * d_model)
        layers = OrderedDict(
            [('shift1',   Shift()),
             ('fc1',      nn.Linear(d_model, d_ff)),
             ('shift2',   Shift()),
             ('act',      nn.GELU()),
             ('dropout1', nn.Dropout(dropout)),
             ('shift3',   Shift()),
             ('fc2',      nn.Linear(d_ff, d_model)),
             ('dropout2', nn.Dropout(dropout)),
             ('scale',    Scale())])
        self.net = nn.Sequential(layers)
        self._init()

    def forward(self, x):
        return self.net(x)

    def _init(self):
        # Xavier init for weight matrices only; biases keep their defaults
        for p in self.parameters():
            if p.dim() > 1: nn.init.xavier_uniform_(p)
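
A quick shape check, in the same spirit as the other blocks in this notebook:

x = torch.randn(bs, sl, d)
ff = FeedForwardFixup(d)
out = ff(x)
assert (out.size() == (bs, sl, d))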

class TransformerEncoderBlockNLN2[source]

TransformerEncoderBlockNLN2(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block (no LayerNorm). Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockNLN2(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderNLN2[source]

TransformerEncoderNLN2(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks

x = torch.randn(bs, sl, d)
m = TransformerEncoderNLN2(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerLMNLN2[source]

TransformerLMNLN2(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling (no LayerNorm)

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMNLN2(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))

fixup_init2[source]

fixup_init2(model)

Applies FixUp initialization to the LM (prototype version)
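
Analogous sketch for the second prototype:

model = TransformerLMNLN2(vocab_sz=256, d_model=64, n_layers=2)
fixup_init2(model)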

ADMIN init

setup

class AdminResidual[source]

AdminResidual(sublayer, d_model) :: Module

Residual connection that wraps sublayer and rescales the shortcut with a per-dimension weight (ADMIN-style)
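
A minimal sketch of the ADMIN residual idea (not the library class): the shortcut is rescaled by a per-dimension weight omega, which is presumably what admin_init sets from the profiled output statistics further down:

import torch
import torch.nn as nn

class AdminResidualSketch(nn.Module):
    "Assumed behaviour: out = x * omega + sublayer(x)"
    def __init__(self, sublayer, d_model):
        super().__init__()
        self.sublayer = sublayer
        self.omega = nn.Parameter(torch.ones(d_model))  # rescales the shortcut
    def forward(self, x, **kwargs):
        return x * self.omega + self.sublayer(x, **kwargs)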

class TransformerEncoderBlockAdmin[source]

TransformerEncoderBlockAdmin(d_model:int, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, causal:bool=False, attn_bias:bool=False, prenorm:bool=False, shared_qk:bool=False) :: Module

Basic transformer encoder block. Consists of multi-head attention and position-wise feedforward layers

bs = 4
sl = 128
d = 64
x = torch.randn(bs, sl, d)
m = TransformerEncoderBlockAdmin(d)
out = m(x)
assert (out.size() == (bs, sl, d))
out.shape
torch.Size([4, 128, 64])

class TransformerEncoderAdmin[source]

TransformerEncoderAdmin(d_model, n_layers=6, n_heads=8, d_ff=None, ff_dropout=0.1, attn_dropout=0.1, attn_bias=False, causal=False, prenorm=False, shared_qk:bool=False, final_norm=None) :: Module

Stack of TransformerEncoderBlocks
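
Shape check, mirroring the encoders above:

x = torch.randn(bs, sl, d)
m = TransformerEncoderAdmin(d, n_layers=2)
out = m(x)
assert (out.size() == (bs, sl, d))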

class TransformerLMAdmin[source]

TransformerLMAdmin(vocab_sz:int, d_model:int, n_layers:int=6, n_heads:int=8, d_ff:int=None, attn_dropout:float=0.1, ff_dropout:float=0.1, emb_dropout:float=0.1, tie_weights:bool=True, causal:bool=True, pos_enc:str='absolute', max_seq_len:int=512, axial_shape:tuple=None, axial_emb_dims:tuple=None, pad_idx:int=None, prenorm:bool=False, attn_bias:bool=False, shared_qk:bool=False) :: Module

Basic Transformer for language modelling

Parameters:

* vocab_sz: int
* d_model: int - inner dimension of the model
* n_layers: int (default: 6)
* n_heads: int (default: 8)
* d_ff: int - inner dimension of the pointwise FeedForward net, if None defaults to 4*d_model
* attn_dropout: float - attention dropout
* ff_dropout: float - feed-forward dropout
* emb_dropout: float - embedding dropout
* causal: bool (default: True) - if True does causal masking automatically
* max_seq_len: int (default: 512)
* tie_weights: bool - if True the target embedding weights are used for the output projection
* prenorm: bool - whether to use PreNorm or PostNorm
* attn_bias: bool - whether to allow biases in attention projection layers
* pad_idx: int - padding token id, required for autogeneration of padding mask
* pos_enc: str from {'absolute', 'fixed', 'axial'} - type of positional encoding to use
* axial_shape: tuple - [optional] should be factors of max_seq_len
* axial_emb_dims: tuple - [optional] axial embedding components, should sum to d_model

Inputs:

* x - input ids, shape [bs, sl]
* mask - optional boolean mask, shape [bs, sl]

Returns:

* logits - target token logits, shape [bs, sl, vocab_sz]
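
Usage sketch, mirroring the LMs above:

bs = 4
sl = 128
d = 64
vocab_sz = 256
x = torch.randint(vocab_sz, (bs, sl))
model = TransformerLMAdmin(vocab_sz, d, n_layers=2, causal=True)
out = model(x)
assert (out.size() == (bs, sl, vocab_sz))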

profiling

class BreakFitCallback[source]

BreakFitCallback(after_create=None, before_fit=None, before_epoch=None, before_train=None, before_batch=None, after_pred=None, after_loss=None, before_backward=None, before_step=None, after_cancel_step=None, after_step=None, after_cancel_batch=None, after_batch=None, after_cancel_train=None, after_train=None, before_validate=None, after_cancel_validate=None, after_validate=None, after_cancel_epoch=None, after_epoch=None, after_cancel_fit=None, after_fit=None) :: Callback

Basic class handling tweaks of the training loop by changing a Learner in various events

res_submodules[source]

res_submodules(model)

res_modules[source]

res_modules(model)

#...
# config = CharLMConfig(d_model=512, n_layers=6, max_seq_len=512,
#                       pad_idx=pad_id)

# learn = Learner(dls, TransformerLMAdmin.from_config(config),
#                 loss_func=CrossEntropyLossFlat(ignore_index=pad_id),
#                 cbs = [GradientClip(1.0),
#                        SaveModelCallback(with_opt=True)],
#                 metrics=[accuracy, perplexity, bpc]).to_fp16()
# learn.add_cb(ActivationStats(modules=res_submodules(learn.model)))
# len(learn.activation_stats.modules)
# learn.fit(1, 1e-3)

variances[source]

variances(learn)

initialization

admin_init[source]

admin_init(model, scales)
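
Assumed end-to-end workflow (a sketch; variances is assumed to produce the scales that admin_init consumes, following the commented profiling example above):

# scales = variances(learn)        # per-residual output statistics from the profiling fit
# admin_init(learn.model, scales)  # re-initialise the AdminResidual scales from them
# learn.fit(n_epochs, lr)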