#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format

public EncConvLayer(int channel, int kernelSize, double dropoutRate,
    string activationFn, double activationDropoutRate)
    : base(nameof(EncConvLayer))
{
    Conv1 = torch.nn.Sequential(
        ("conv", new ConvSeparable(channel, channel, kernelSize, kernelSize / 2, dropoutRate)),
        ("activation", new ActivationFunction(activationFn)),
        ("dropout", torch.nn.Dropout(activationDropoutRate))
    );
    LayerNorm1 = torch.nn.LayerNorm(new long[] { channel });

    Conv2 = torch.nn.Sequential(
        ("conv", new ConvSeparable(channel, channel, kernelSize, kernelSize / 2, dropoutRate)),
        ("activation", new ActivationFunction(activationFn)),
        ("dropout", torch.nn.Dropout(activationDropoutRate))
    );
    LayerNorm2 = torch.nn.LayerNorm(new long[] { channel });

    RegisterComponents();
}
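
// Illustrative sketch (an assumption, not the repo's ConvSeparable implementation):
// a depthwise-separable 1-D convolution is conventionally a depthwise Conv1d with
// groups == channels followed by a pointwise 1x1 Conv1d. The sizes and the plain
// TorchSharp wiring below are illustrative only.
using TorchSharp;
using static TorchSharp.torch;

long channels = 256, kernel = 3;                      // illustrative sizes
var depthwise = nn.Conv1d(channels, channels, kernel, padding: kernel / 2, groups: channels);
var pointwise = nn.Conv1d(channels, channels, 1);     // 1x1 conv mixes channels

var input = randn(2, channels, 128);                  // (batch, channels, time)
var output = pointwise.forward(depthwise.forward(input)); // same (batch, channels, time) shape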
#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format

public FeedForwardLayer(
    int embeddingDim = 768,
    int ffnEmbeddingDim = 3072,
    double dropoutRate = 0.1,
    double activationDropoutRate = 0.1,
    string activationFn = "relu",
    bool dynamicDropout = false)
    : base(nameof(FeedForwardLayer))
{
    // Rescale the dropout rates when the search space allows variable hidden sizes.
    if (dynamicDropout)
    {
        dropoutRate = CalculateDropout(dropoutRate, embeddingDim,
            SearchSpace.HiddenSizeChoices[SearchSpace.HiddenSizeChoices.Length - 1]);
        activationDropoutRate = CalculateDropout(activationDropoutRate, embeddingDim,
            SearchSpace.HiddenSizeChoices[SearchSpace.HiddenSizeChoices.Length - 1]);
    }

    var fullConnected1 = torch.nn.Linear(embeddingDim, ffnEmbeddingDim);
    var activation = new ActivationFunction(activationFn);
    var activationDropoutLayer = torch.nn.Dropout(activationDropoutRate);
    var fullConnected2 = torch.nn.Linear(ffnEmbeddingDim, embeddingDim);
    var dropoutLayer = torch.nn.Dropout(dropoutRate);

    ModelUtils.InitNormal(fullConnected1.weight, mean: 0.0, std: 0.02);
    ModelUtils.InitZeros(fullConnected1.bias);
    ModelUtils.InitNormal(fullConnected2.weight, mean: 0.0, std: 0.02);
    ModelUtils.InitZeros(fullConnected2.bias);

    FullConnects = torch.nn.Sequential(
        ("fc1", fullConnected1),
        ("activation", activation),
        ("dropout1", activationDropoutLayer),
        ("fc2", fullConnected2),
        ("dropout2", dropoutLayer)
    );

    // Layer norm associated with the position-wise feed-forward NN
    FinalLayerNorm = torch.nn.LayerNorm(new long[] { embeddingDim });

    RegisterComponents();
}
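
// Runnable sketch of the pipeline FullConnects assembles, using plain TorchSharp
// modules; nn.GELU stands in for the repo's internal ActivationFunction, and the
// final residual + LayerNorm is the conventional post-norm FFN sublayer pattern,
// assumed here rather than read from this class's forward.
using TorchSharp;
using static TorchSharp.torch;

var ffn = nn.Sequential(
    ("fc1", nn.Linear(768, 3072)),
    ("activation", nn.GELU()),
    ("dropout1", nn.Dropout(0.1)),
    ("fc2", nn.Linear(3072, 768)),
    ("dropout2", nn.Dropout(0.1)));
var finalNorm = nn.LayerNorm(new long[] { 768 });

var x = randn(2, 16, 768);                      // (batch, sequence, embedding)
var y = finalNorm.forward(x + ffn.forward(x));  // post-norm residual connection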
#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format

public SelfAttentionLayer(
    int embeddingDim = 768,
    int numAttentionHeads = 8,
    double dropoutRate = 0.1,
    double attentionDropoutRate = 0.1,
    bool addBiasKv = false,
    bool addZeroAttention = false)
    : base(nameof(SelfAttentionLayer))
{
    SelfAttention = new MultiHeadAttention(
        embeddingDim,
        numAttentionHeads,
        dropout: attentionDropoutRate,
        addBiasKv: addBiasKv,
        addZeroAttention: addZeroAttention,
        selfAttention: true);
    DropoutLayer = torch.nn.Dropout(dropoutRate);

    // Layer norm associated with the self-attention layer
    LayerNorm = torch.nn.LayerNorm(new long[] { embeddingDim });

    RegisterComponents();
}
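
// Sketch of the self-attention sublayer wiring, substituting TorchSharp's built-in
// nn.MultiheadAttention for the repo's internal MultiHeadAttention; the residual +
// LayerNorm ordering is the standard post-norm pattern, assumed rather than taken
// from this class's forward. nn.MultiheadAttention expects (sequence, batch, embed).
using TorchSharp;
using static TorchSharp.torch;

var attention = nn.MultiheadAttention(768, 8, 0.1);  // embed dim, heads, dropout
var dropout = nn.Dropout(0.1);
var norm = nn.LayerNorm(new long[] { 768 });

var x = randn(16, 2, 768);                           // (sequence, batch, embedding)
var (attnOut, _) = attention.forward(x, x, x);       // self-attention: q = k = v = x
var y = norm.forward(x + dropout.forward(attnOut));  // post-norm residual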
public TransformerEncoder(
    int paddingIdx,
    int vocabSize,
    double dropout = 0.1,
    double attentionDropout = 0.1,
    double activationDropout = 0.1,
    string activationFn = "relu",
    bool dynamicDropout = false,
    bool addBiasKv = false,
    bool addZeroAttention = false,
    int maxSeqLen = 256,
    bool learnedPositionEmbedding = true,
    int embedSize = -1,
    int? embedScale = null,
    IList<int> arches = null,
    bool usePositionEmbedding = true,
    bool offsetPositionsByPadding = true,
    int numSegments = 2,
    bool encoderNormalizeBefore = false,
    int numEncoderLayers = 6,
    bool applyBertInit = false,
    bool freezeEmbeddings = false,
    bool freezeLayers = false,
    bool freezeTransfer = false,
    int nTransLayersToFreeze = 0)
    : base(nameof(TransformerEncoder))
{
    Contracts.AssertValue(arches);
    Contracts.AssertNonEmpty(arches);

    PaddingIdx = paddingIdx;
    DiscreteArches = arches.ToList();
    DistillBlocks = 4;

    // Embedding modules
    EmbedScale = embedScale;
    TokenEmbedding = torch.nn.Embedding(vocabSize, embedSize, paddingIdx);
    PositionalEmbedding = usePositionEmbedding
        ? PositionalEmbedding.GetPositionalEmbedding(maxSeqLen, embedSize, paddingIdx, learnedPositionEmbedding)
        : null;
    SegmentEmbedding = numSegments > 0
        ? torch.nn.Embedding(numSegments, embedSize)
        : null;
    EmbeddingLayerNorm = encoderNormalizeBefore
        ? torch.nn.LayerNorm(new long[] { embedSize })
        : null;
    DropoutLayer = torch.nn.Dropout(dropout);

    ModelUtils.InitNormal(TokenEmbedding.weight, mean: 0.0, std: 0.02);
    ModelUtils.InitZeros(TokenEmbedding.weight[paddingIdx]);
    if (SegmentEmbedding != null)
    {
        ModelUtils.InitNormal(SegmentEmbedding.weight, mean: 0.0, std: 0.02);
    }

    // Encoder layers
    var layers = Enumerable.Range(0, numEncoderLayers)
        .Select(i => new TransformerCellDiscrete(
            arches[i],
            dropout,
            attentionDropout,
            activationDropout,
            activationFn,
            addBiasKv,
            addZeroAttention,
            dynamicDropout) as torch.nn.Module)
        .ToArray();
    Layers = new ModuleList(layers);

    var blockPerLayer = numEncoderLayers / DistillBlocks;
    HiddenSizePerBlock = CheckBlockHiddenSize(blockPerLayer);

    EmbedTransfer = new EmbedTransferDiscrete(embedSize, HiddenSizePerBlock[0]);
    var hiddenSizePerBlockExtend = HiddenSizePerBlock
        .Append(HiddenSizePerBlock[HiddenSizePerBlock.Count - 1]).ToList();
    var hiddenTransferList = Enumerable.Range(0, HiddenSizePerBlock.Count)
        .Select(i => new HiddenTransferDiscrete(
            hiddenSizePerBlockExtend[i],
            hiddenSizePerBlockExtend[i + 1]) as torch.nn.Module)
        .ToArray();
    HiddenTransferList = new ModuleList(hiddenTransferList);

    if (freezeEmbeddings)
    {
        ModelUtils.FreezeModuleParams(TokenEmbedding);
        ModelUtils.FreezeModuleParams(PositionalEmbedding);
        ModelUtils.FreezeModuleParams(SegmentEmbedding);
        ModelUtils.FreezeModuleParams(EmbeddingLayerNorm);
    }

    if (freezeLayers)
    {
        ModelUtils.FreezeModuleParams(Layers);
        ModelUtils.FreezeModuleParams(HiddenTransferList);
    }

    if (freezeTransfer)
    {
        ModelUtils.FreezeModuleParams(HiddenTransferList);
    }

    for (var i = 0; i < nTransLayersToFreeze; ++i)
    {
        ModelUtils.FreezeModuleParams(Layers[i]);
    }

    RegisterComponents();
}
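
// Sketch of the embedding stage this constructor assembles: token + segment lookup,
// LayerNorm, then dropout. Positional embeddings, the embed scale, and the encoder
// stack itself are elided; all sizes below are illustrative assumptions, not values
// taken from the repo.
using TorchSharp;
using static TorchSharp.torch;

long vocabSize = 30522, embedSize = 768, paddingIdx = 1, numSegments = 2;
var tokenEmbedding = nn.Embedding(vocabSize, embedSize, paddingIdx);
var segmentEmbedding = nn.Embedding(numSegments, embedSize);
var embedNorm = nn.LayerNorm(new long[] { embedSize });
var dropout = nn.Dropout(0.1);

var tokens = randint(0, vocabSize, new long[] { 2, 16 }); // (batch, sequence) token ids
var segments = zeros_like(tokens);                        // all tokens in segment 0
var x = tokenEmbedding.forward(tokens) + segmentEmbedding.forward(segments);
x = dropout.forward(embedNorm.forward(x));                // (batch, sequence, embed)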