public override torch.Tensor forward(torch.Tensor features)
{
    // TODO: try whitening-like techniques
    // Take the <s> token (equivalent to [CLS]).
    using var x = features[torch.TensorIndex.Colon, torch.TensorIndex.Single(0), torch.TensorIndex.Colon];
    return Classifier.forward(x);
}
/// <summary>
/// Copy <paramref name="src"/> tensor to <paramref name="dst"/> tensor.
/// If <paramref name="moveEosToBeginning"/> is true, an EOS token will be added to the beginning
/// of <paramref name="dst"/> tensor, and the last token of <paramref name="src"/> will be dropped.
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
/// <param name="moveEosToBeginning"></param>
/// <param name="eosIndex"></param>
/// <exception cref="ArgumentException"></exception>
private static void CopyTensor(torch.Tensor src, torch.Tensor dst, bool moveEosToBeginning = false, int? eosIndex = null)
{
    if (src.numel() != dst.numel())
    {
        throw new ArgumentException(
            $"Inconsistent capacity when copying tensor, got {src.numel()} and {dst.numel()}.");
    }
    if (moveEosToBeginning && (eosIndex == null || eosIndex < 0))
    {
        throw new ArgumentException(
            $"{nameof(eosIndex)} must not be null or negative when {nameof(moveEosToBeginning)} is true.");
    }

    if (moveEosToBeginning && src[-1][0].ToInt32() == eosIndex)
    {
        dst[0] = torch.tensor((int)eosIndex);
        dst[torch.TensorIndex.Slice(start: 1)] = src[torch.TensorIndex.Slice(stop: -1)];
    }
    else
    {
        dst.copy_(src);
    }
}
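// Illustration only (not part of the library): a minimal sketch of what the
// moveEosToBeginning path of CopyTensor is expected to produce, written with plain
// TorchSharp calls. The token values and eosIndex == 2 are made up for the example.
private static void CopyTensorEosExample()
{
    // src = [5, 7, 9, 2]  (2 is the EOS index, sitting in the last position)
    var src = torch.tensor(new long[] { 5, 7, 9, 2 });
    var dst = torch.zeros_like(src);

    // Equivalent of CopyTensor(src, dst, moveEosToBeginning: true, eosIndex: 2):
    dst[0] = torch.tensor(2L);
    dst[torch.TensorIndex.Slice(start: 1)] = src[torch.TensorIndex.Slice(stop: -1)];

    // dst is now [2, 5, 7, 9]: EOS rotated to the front, last source token dropped.
}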
private torch.Tensor ForwardEmbedding(torch.Tensor tokens, torch.Tensor segmentLabels, torch.Tensor positions)
{
    using var disposeScope = torch.NewDisposeScope();
    var x = TokenEmbedding.forward(tokens);
    if (EmbedScale != null)
    {
        x.mul_(EmbedScale);
    }

    if (PositionalEmbedding != null)
    {
        var positionalEmbedding = PositionalEmbedding.forward(tokens,
            new Dictionary<string, object> { { PositionalEmbedding.PositionKey, positions } });
        x.add_(positionalEmbedding);
    }

    if (SegmentEmbedding != null && segmentLabels.IsNotNull())
    {
        var segmentEmbedding = SegmentEmbedding.forward(segmentLabels);
        x.add_(segmentEmbedding);
    }

    if (EmbeddingLayerNorm != null)
    {
        x = EmbeddingLayerNorm.forward(x);
    }

    x = EmbedTransfer.forward(x, (int)x.size()[x.size().Length - 1]);
    x = DropoutLayer.forward(x);
    return x.MoveToOuterDisposeScope();
}
/// <summary>
/// Convert a tensor of token indices to a string.
/// Can optionally remove BPE symbols or escape &lt;unk&gt; words.
/// </summary>
public string Tensor2String(torch.Tensor tensor, string bpeSymbol = null, bool escapeUnk = false)
{
    if (tensor.IsNull())
    {
        return string.Empty;
    }

    bpeSymbol ??= "";
    List<string> subStrings;
    if (tensor.dim() == 2)
    {
        subStrings = Enumerable.Range(0, (int)tensor.shape[0])
            .Select(i => Tensor2String(tensor[i], bpeSymbol, escapeUnk))
            .ToList();
        return string.Join("\n", subStrings);
    }

    // Look up the symbol for each token index stored in the tensor.
    subStrings = Enumerable.Range(0, (int)tensor.shape[0])
        .Select(i => _symbols[tensor[i].ToInt32()])
        .ToList();
    var sentence = string.Join(" ", subStrings);
    return ProcessBpeSymbol(sentence, bpeSymbol);
}
public TensorAccessor(torch.Tensor tensor)
{
    if (tensor.device_type != DeviceType.CPU)
    {
        throw new InvalidOperationException("Reading data from non-CPU memory is not supported. Move or copy the tensor to the cpu before reading.");
    }

    var strides = tensor.stride();
    for (var i = 0; i < strides.Length; i++)
    {
        if (strides[i] < 0)
        {
            throw new NotImplementedException($"Negative tensor strides are not currently supported. tensor.strides({i}) == {strides[i]}");
        }
    }

    // Get the data from native code.
    unsafe
    {
        var res = torch.Tensor.THSTensor_data(tensor.Handle);
        if (res == IntPtr.Zero)
        {
            torch.CheckForErrors();
        }
        // NOTE: there is no safety here.
        _tensor_data_ptr = res;
    }

    _tensor = tensor; // Keep the tensor alive now that everything is alright.
}
public override torch.Tensor forward(torch.Tensor x)
{
    x = this.squeeze_activation.forward(this.squeeze.forward(x));
    return torch.cat(new torch.Tensor[] {
        this.expand1x1_activation.forward(this.expand1x1.forward(x)),
        this.expand3x3_activation.forward(this.expand3x3.forward(x))
    }, 1);
}
public override torch.Tensor forward(torch.Tensor srcTokens, torch.Tensor tokenMask = null)
{
    using var disposeScope = torch.NewDisposeScope();
    var x = ExtractFeatures(srcTokens);
    x = _predictionHead.forward(x);
    return x.MoveToOuterDisposeScope();
}
public override torch.Tensor forward(torch.Tensor x)
{
    using var disposeScope = torch.NewDisposeScope();

    // x1 = sqrt(2/Pi) * (x + 0.044715 * x^3)
    var x1 = torch.pow(x, 3).mul_(_beta).add_(x).mul_(_alpha);
    var y = torch.nn.functional.tanh(x1).add_(1.0).mul_(0.5).mul_(x);
    return y.MoveToOuterDisposeScope();
}
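// For reference: assuming _alpha == Math.Sqrt(2.0 / Math.PI) and _beta == 0.044715
// (their initialization is not shown here), the forward above is the tanh approximation
// of GELU:
//     y = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
// A scalar-op sketch of the same computation, for comparison only:
//     var inner = (x + 0.044715 * torch.pow(x, 3)) * Math.Sqrt(2.0 / Math.PI);
//     var y = 0.5 * x * (1.0 + torch.tanh(inner));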
/// <summary>
/// Replace non-padding symbols with their position numbers.
/// Position numbers begin at PadPositionIndex + 1; padding symbols all receive PadPositionIndex.
/// </summary>
/// <param name="tensor">Cannot be null.</param>
/// <param name="padTokenIndex"></param>
protected static torch.Tensor MakePositions(torch.Tensor tensor, int padTokenIndex)
{
    using var disposeScope = torch.NewDisposeScope();
    var mask = tensor.ne(padTokenIndex).@long();
    var positions = torch.cumsum(mask, dimension: 1).mul_(mask);
    positions.add_(PadPositionIndex);
    return positions.MoveToOuterDisposeScope();
}
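// Illustration only (not part of the library): expected MakePositions behavior for one
// sequence, assuming padTokenIndex == 1 and PadPositionIndex == 1 (made-up values).
private static void MakePositionsExample()
{
    // tokens = [[5, 7, 9, 1, 1]]  (the trailing 1s are padding)
    var tokens = torch.tensor(new long[] { 5, 7, 9, 1, 1 }).reshape(1, 5);

    var mask = tokens.ne(1).@long();                               // [[1, 1, 1, 0, 0]]
    var positions = torch.cumsum(mask, dimension: 1).mul_(mask);   // [[1, 2, 3, 0, 0]]
    positions.add_(1);                                             // [[2, 3, 4, 1, 1]]

    // Non-padding tokens receive consecutive positions starting at PadPositionIndex + 1,
    // while every padding token receives PadPositionIndex.
}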
public override torch.Tensor forward(torch.Tensor x, torch.Tensor selfAttentionMask, torch.Tensor selfAttentionPaddingMask,
    int arch = 0, bool layerNormTraining = false)
{
    return (Operations[arch] as Layer)!.forward(x, new Dictionary<string, object>
    {
        { Layer.AttentionMaskKey, selfAttentionMask },
        { Layer.PaddingMaskKey, selfAttentionPaddingMask },
    });
}
public override torch.Tensor forward(torch.Tensor x, Dictionary<string, object> param = null)
{
    if (!ParseArguments(param, out var selfAttentionPaddingMask))
    {
        throw new ArgumentException($"Invalid arguments: {param}.");
    }

    using var x1 = ForwardOneLayer(x, selfAttentionPaddingMask, Conv1, LayerNorm1);
    return ForwardOneLayer(x1, selfAttentionPaddingMask, Conv2, LayerNorm2);
}
public torch.Tensor forward(torch.Tensor x)
{
    if ((x.shape[1] != 3) || (x.shape[2] != 224) || (x.shape[3] != 224))
    {
        throw new ArgumentException("Unsupported image size: should be bx3x224x224.");
    }

    x = this.features.forward(x) as torch.Tensor;
    x = this.classifier.forward(x) as torch.Tensor;
    return torch.flatten(x, 1);
}
public override torch.Tensor forward(torch.Tensor x, int hiddenSize)
{
    if (hiddenSize == SearchSpace.HiddenSizeChoices[SearchSpace.HiddenSizeChoices.Length - 1])
    {
        return x;
    }

    var index = SearchSpace.HiddenSizeChoices.ToList().IndexOf(hiddenSize);
    return index == -1 ? x.alias() : HiddenTransfer[index].forward(x);
}
private static torch.Tensor PadMask(torch.Tensor tensor)
{
    if (tensor.IsNull())
    {
        return null;
    }

    using var zeros = tensor.new_zeros(tensor.size(0), 1);
    return torch.cat(new List<torch.Tensor> { tensor, zeros }, dimension: 1);
}
private static bool ParseArguments(IReadOnlyDictionary<string, object> param,
    out torch.Tensor selfAttentionMask, out torch.Tensor selfAttentionPaddingMask)
{
    selfAttentionMask = selfAttentionPaddingMask = null;
    if (!(param.ContainsKey(AttentionMaskKey) && param.ContainsKey(PaddingMaskKey)))
    {
        return false;
    }

    selfAttentionMask = (torch.Tensor)param[AttentionMaskKey];
    selfAttentionPaddingMask = (torch.Tensor)param[PaddingMaskKey];
    return true;
}
private static torch.Tensor ForwardOneLayer(torch.Tensor input, torch.Tensor selfAttentionPaddingMask,
    torch.nn.Module convLayer, torch.nn.Module layerNorm)
{
    using var disposeScope = torch.NewDisposeScope();
    torch.Tensor x = selfAttentionPaddingMask.IsNull()
        ? input.alias()
        : input.masked_fill(selfAttentionPaddingMask.T.unsqueeze(-1), 0);
    var conv = convLayer.forward(x);
    conv.add_(input);
    var norm = layerNorm.forward(conv);
    return norm.MoveToOuterDisposeScope();
}
public override torch.Tensor forward(torch.Tensor input, Dictionary<string, object> param = null)
{
    using var disposeScope = torch.NewDisposeScope();
    ParseArguments(param, out var incrementalState, out var positions);
    if (positions.IsNull())
    {
        positions = incrementalState
            ? torch.tensor(PadPositionIndex + input.size(1))
            : MakePositions(input, PadTokenIndex);
    }

    var embedding = Embedding.forward(positions);
    return embedding.MoveToOuterDisposeScope();
}
public override torch.Tensor forward(torch.Tensor input, Dictionary<string, object> param = null)
{
    using var disposeScope = torch.NewDisposeScope();
    ParseArguments(param, out var incrementalState, out var timeStep);

    var bszInt = (int)input.shape[0];
    var seqLenInt = (int)input.shape[1];
    var maxPosition = (int)(PadPositionIndex + 1 + input.shape[1]);

    // Recompute/expand the embedding table if needed.
    if (Weight is null || maxPosition > Weight.size(0))
    {
        Weight?.Dispose();
        Weight = GetEmbedding(maxPosition, EmbeddingDim);
        Weight = (Parameter)Weight.MoveToOuterDisposeScope();
    }

    // Move Weight to the device where the _floatTensor buffer lives.
    foreach (var (bufferName, buffer) in named_buffers())
    {
        if (bufferName == nameof(_floatTensor))
        {
            Weight = (Parameter)Weight.to(buffer);
            Weight = (Parameter)Weight.MoveToOuterDisposeScope();
            break;
        }
    }

    // Positions are the same for every token when decoding a single step.
    if (incrementalState)
    {
        var pos = timeStep is null ? seqLenInt : timeStep.item<int>() + 1;
        var slice = Weight[torch.TensorIndex.Single(PadPositionIndex + pos), torch.TensorIndex.Colon];
        return slice.expand(bszInt, 1, 1).MoveToOuterDisposeScope();
    }

    var positions = MakePositions(input, PadTokenIndex).view(-1);
    var weightsSelected = Weight.index_select(0, positions).view(bszInt, seqLenInt, -1);
    return weightsSelected.detach().MoveToOuterDisposeScope();
}
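// Context note (assumption; GetEmbedding is not shown here): this appears to build the usual
// fixed sinusoidal table from "Attention Is All You Need", i.e. for position p and channel i,
//     PE(p, 2i)   = sin(p / 10000^(2i / EmbeddingDim))
//     PE(p, 2i+1) = cos(p / 10000^(2i / EmbeddingDim)),
// which is why the table can simply be regrown to maxPosition rows on demand and returned
// detached: the weights are deterministic and never trained.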
private static void ParseArguments(IReadOnlyDictionary<string, object> param, out bool incrementalState, out torch.Tensor positions)
{
    incrementalState = false;
    positions = null;
    if (param == null)
    {
        return;
    }

    if (param.ContainsKey(IncrementalStateKey))
    {
        incrementalState = (bool)param[IncrementalStateKey];
    }

    if (param.ContainsKey(PositionKey))
    {
        positions = (torch.Tensor)param[PositionKey];
    }
}
public override torch.Tensor forward(torch.Tensor x, Dictionary<string, object> param)
{
    using var disposeScope = torch.NewDisposeScope();
    if (!ParseArguments(param, out var selfAttentionMask, out var selfAttentionPaddingMask))
    {
        throw new ArgumentException("Invalid arguments.");
    }

    var attention = SelfAttention.forward(query: x, key: x, value: x, out _,
        keyPaddingMask: selfAttentionPaddingMask, needWeights: false, attentionMask: selfAttentionMask);
    var dropout = DropoutLayer.forward(attention);
    dropout.add_(x);
    var norm = LayerNorm.forward(dropout);
    return norm.MoveToOuterDisposeScope();
}
public torch.Tensor forward(
    torch.Tensor tokens,
    torch.Tensor segmentLabels = null,
    torch.Tensor positions = null)
{
    using var disposeScope = torch.NewDisposeScope();
    var x = ForwardEmbedding(tokens, segmentLabels, positions);

    // Compute padding mask. This is needed for multi-head attention.
    var paddingMask = tokens.eq(PaddingIdx);
    var usePaddingMask = paddingMask.any().ToBoolean();

    // Account for padding while computing the representation.
    if (usePaddingMask)
    {
        var xValidPart = paddingMask.logical_not().unsqueeze(-1).type_as(x);
        x.mul_(xValidPart);
    }

    // B x T x C -> T x B x C
    x.transpose_(0, 1);

    // Forward through the layers.
    var blockPerLayer = Layers.Count / DistillBlocks;
    var blockIndex = 0;
    for (var i = 0; i < Layers.Count; ++i)
    {
        x = ForwardOneLayer(x, usePaddingMask ? paddingMask : null, i, blockPerLayer, ref blockIndex);
    }

    // T x B x C -> B x T x C
    x.transpose_(0, 1);

    // var sentenceRepresentation = x[torch.TensorIndex.Colon, torch.TensorIndex.Single(0), torch.TensorIndex.Colon];
    return x.MoveToOuterDisposeScope();
}
private torch.Tensor ForwardOneLayer(torch.Tensor input, torch.Tensor paddingMask, int i, int blockPerLayer, ref int blockIndex)
{
    using var disposeScope = torch.NewDisposeScope();
    var x = input.alias();  // Avoid scope mess.
    var layer = Layers[i];
    if (i % blockPerLayer == 0)
    {
        x = (HiddenTransferList[blockIndex] as HiddenTransfer).forward(x, HiddenSizePerBlock[blockIndex], true);
    }

    x = (layer as TransformerCell).forward(x, null, paddingMask);

    if ((i + 1) % blockPerLayer == 0)
    {
        x = (HiddenTransferList[blockIndex] as HiddenTransfer).forward(x, HiddenSizePerBlock[blockIndex], false);
        ++blockIndex;
    }

    return x.MoveToOuterDisposeScope();
}
private (torch.Tensor, torch.Tensor, torch.Tensor) QkvProjection(
    torch.Tensor query, torch.Tensor key, torch.Tensor value)
{
    using var disposeScope = torch.NewDisposeScope();
    torch.Tensor q = null;
    torch.Tensor k = null;
    torch.Tensor v = null;
    if (_selfAttention)
    {
        q = QProjection.forward(query);
        k = KProjection.forward(query);
        v = VProjection.forward(query);
    }
    else if (_encoderDecoderAttention)
    {
        q = QProjection.forward(query);
        if (key.IsNull())
        {
            k = v = null;
        }
        else
        {
            k = KProjection.forward(key);
            v = VProjection.forward(key);
        }
    }
    else
    {
        q = QProjection.forward(query);
        k = KProjection.forward(key);
        v = VProjection.forward(value);
    }

    // k and v may legitimately be null in encoder-decoder attention when the key is absent.
    return (q.MoveToOuterDisposeScope(), k?.MoveToOuterDisposeScope(), v?.MoveToOuterDisposeScope());
}
public torch.Tensor forward(
    torch.Tensor query,
    torch.Tensor key,
    torch.Tensor value,
    out torch.Tensor outAttentionWeights,
    torch.Tensor keyPaddingMask = null,
    Dictionary<string, Dictionary<string, torch.Tensor>> incrementalState = null,
    bool needWeights = true,
    bool staticKv = false,
    torch.Tensor attentionMask = null)
{
    outAttentionWeights = null;
    if (query.IsNull() || query.size().Length != 3 || query.size(2) != _embeddingDim)
    {
        throw new ArgumentException("query must NOT be null and must be 3D in multi-head attention; " +
            "the last dimension should be the same as embedding dimension.");
    }

    using var disposeScope = torch.NewDisposeScope();

    var qSize = query.size();
    var tgtLen = qSize[0];
    var batchSize = qSize[1];
    var embedDim = qSize[2];

    // Get saved state from incrementalState.
    Dictionary<string, torch.Tensor> savedState = null;
    if (incrementalState != null)
    {
        savedState = GetInputBuffer(incrementalState);

        // Previous time steps are cached - no need to recompute key and value if they are static.
        if (savedState.ContainsKey(PrevKeyKey) && savedState.ContainsKey(PrevValueKey) && staticKv)
        {
            if (_selfAttention || !_encoderDecoderAttention)
            {
                throw new ArgumentException(
                    "prevKey and prevValue are only valid in encoder-decoder attention.");
            }
            key = value = null;
        }
    }

    // Calculate current qkv projection.
    var (q, k, v) = QkvProjection(query, key, value);

    // Simulate using-statement by try-finally.
    torch.Tensor attentionMaskPad = attentionMask?.alias();
    torch.Tensor keyPaddingMaskPad = keyPaddingMask?.alias();

    q.mul_(_scaling);
    if (_addBiasKv)
    {
        var kRepeat = KBias.repeat(1, batchSize, 1);
        var vRepeat = VBias.repeat(1, batchSize, 1);
        k = torch.cat(new List<torch.Tensor> { k, kRepeat }, dimension: 0);
        v = torch.cat(new List<torch.Tensor> { v, vRepeat }, dimension: 0);
        attentionMaskPad = PadMask(attentionMaskPad);
        keyPaddingMaskPad = PadMask(keyPaddingMaskPad);
    }

    q = q.view(tgtLen, batchSize * _numHeads, _headDim).transpose_(0, 1);
    k = k?.view(-1, batchSize * _numHeads, _headDim).transpose_(0, 1);
    v = v?.view(-1, batchSize * _numHeads, _headDim).transpose_(0, 1);

    if (savedState != null)
    {
        // Saved states are stored with shape (batchSize, NumHeads, seqLen, HeadDim).
        if (savedState.ContainsKey(PrevKeyKey))
        {
            var prevKey = savedState[PrevKeyKey].view(batchSize * _numHeads, -1, _headDim);
            k = staticKv ? prevKey : torch.cat(new List<torch.Tensor> { prevKey, k }, dimension: 1);
        }
        if (savedState.ContainsKey(PrevValueKey))
        {
            var prevValue = savedState[PrevValueKey].view(batchSize * _numHeads, -1, _headDim);
            v = staticKv ? prevValue : torch.cat(new List<torch.Tensor> { prevValue, v }, dimension: 1);
        }

        savedState[PrevKeyKey].Dispose();
        savedState[PrevKeyKey] = k?.view(batchSize, _numHeads, -1, _headDim);
        savedState[PrevValueKey].Dispose();
        savedState[PrevValueKey] = v?.view(batchSize, _numHeads, -1, _headDim);
        SetInputBuffer(incrementalState, savedState);
    }

    Debug.Assert(k.IsNotNull() && v.IsNotNull());
    var srcLen = k!.size(1);

    // This is part of a workaround to get around fork/join parallelism not supporting Optional types.
    if (keyPaddingMaskPad?.shape.Length == 0)
    {
        keyPaddingMaskPad = null;
    }

    Debug.Assert(keyPaddingMaskPad.IsNull() ||
        (keyPaddingMaskPad.size(0) == batchSize && keyPaddingMaskPad.size(1) == srcLen));

    if (_addZeroAttention)
    {
        srcLen += 1;
        var zeroPadSize = k.size();
        zeroPadSize[1] = 1;
        var kZeros = k.new_zeros(zeroPadSize);
        var vZeros = v!.new_zeros(zeroPadSize);
        k = torch.cat(new List<torch.Tensor> { k, kZeros }, dimension: 1);
        v = torch.cat(new List<torch.Tensor> { v, vZeros }, dimension: 1);
        attentionMaskPad = PadMask(attentionMaskPad);
        keyPaddingMaskPad = PadMask(keyPaddingMaskPad);
    }

    var attentionWeights = torch.matmul(q, k.transpose(1, 2));
    Debug.Assert(attentionWeights.size().SequenceEqual(new[] { batchSize * _numHeads, tgtLen, srcLen }));

    if (attentionMaskPad.IsNotNull())
    {
        attentionWeights.add_(attentionMaskPad.unsqueeze(0));
    }

    if (keyPaddingMaskPad.IsNotNull())
    {
        // Don't attend to pad symbols.
        keyPaddingMaskPad = keyPaddingMaskPad.unsqueeze(1).unsqueeze(2);
        attentionWeights = attentionWeights
            .view(batchSize, _numHeads, tgtLen, srcLen)
            .masked_fill(keyPaddingMaskPad, float.NegativeInfinity)
            .view(batchSize * _numHeads, tgtLen, srcLen);
    }

    attentionWeights = torch.nn.functional.softmax(attentionWeights, dim: -1);
    attentionWeights = DropoutLayer.forward(attentionWeights);

    if (needWeights)
    {
        // Average attention weights over heads.
        var weightsView = attentionWeights.view(batchSize, _numHeads, tgtLen, srcLen);
        outAttentionWeights = weightsView.sum(dim: 1).div_(_numHeads);
    }

    var attention = torch.matmul(attentionWeights, v);
    Debug.Assert(attention.size().SequenceEqual(new[] { batchSize * _numHeads, tgtLen, _headDim }));
    attention = attention.transpose(0, 1).contiguous().view(tgtLen, batchSize, embedDim);
    var attentionOutput = OutProjLinear.forward(attention);

    outAttentionWeights?.MoveToOuterDisposeScope();
    return attentionOutput.MoveToOuterDisposeScope();
}
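// For reference (inferred from the code above; _scaling is expected to be 1 / sqrt(_headDim)):
// each of the _numHeads heads computes standard scaled dot-product attention,
//     Attention(Q, K, V) = softmax(Q K^T / sqrt(headDim) + mask) V,
// the per-head results are then reassembled into (tgtLen, batchSize, embedDim) and passed
// through OutProjLinear as the final output projection.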
public virtual torch.Tensor forward(torch.Tensor input, Dictionary<string, object> param = null)
{
    return input.alias();
}
public override torch.Tensor forward(torch.Tensor x)
{
    using var x1 = x.permute(1, 2, 0);
    using var conv = Conv.forward(x1);
    return conv.permute(2, 0, 1);
}
private static void ParseArguments(IReadOnlyDictionary<string, object> param, out bool incrementalState, out torch.Tensor timeStep)
{
    incrementalState = false;
    timeStep = null;
    if (param == null)
    {
        return;
    }

    if (param.ContainsKey(IncrementalStateKey))
    {
        incrementalState = (bool)param[IncrementalStateKey];
    }

    if (param.ContainsKey(TimeStepKey))
    {
        timeStep = (torch.Tensor)param[TimeStepKey];
    }
}
public override torch.Tensor forward(torch.Tensor x)
{
    return _function.forward(x);
}
public abstract torch.Tensor forward(torch.Tensor x, int hiddenSize);
public override torch.Tensor forward(torch.Tensor x, Dictionary<string, object> param = null)
{
    return x.alias();
}