Example #1
 public override torch.Tensor forward(torch.Tensor features)
 {
     // TODO: try whitening-like techniques
     // take <s> token (equiv. to [CLS])
     using var x = features[torch.TensorIndex.Colon, torch.TensorIndex.Single(0), torch.TensorIndex.Colon];
     return(Classifier.forward(x));
 }
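A minimal sketch of the pooling done above (the tensor and its shape are illustrative): indexing [:, 0, :] keeps only the representation of the first (<s>/[CLS]) token of each sequence before it is passed to the classifier head.

 // Hypothetical sketch: B x T x C features with made-up sizes.
 using var features = torch.randn(8, 128, 768);
 using var pooled = features[torch.TensorIndex.Colon, torch.TensorIndex.Single(0), torch.TensorIndex.Colon];
 // pooled has shape 8 x 768: one <s> vector per sequence.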
Example #2
        /// <summary>
        /// Copy <paramref name="src"/> tensor to <paramref name="dst"/> tensor.
        /// If <paramref name="moveEosToBeginning"/> is true, an EOS token will be added to the beginning
        /// of <paramref name="dst"/> tensor, and the last token of <paramref name="src"/> will be dropped.
        /// </summary>
        /// <param name="src"></param>
        /// <param name="dst"></param>
        /// <param name="moveEosToBeginning"></param>
        /// <param name="eosIndex"></param>
        /// <exception cref="ArgumentException"></exception>
        private static void CopyTensor(torch.Tensor src, torch.Tensor dst,
                                       bool moveEosToBeginning = false, int? eosIndex = null)
        {
            if (src.numel() != dst.numel())
            {
                throw new ArgumentException(
                          $"Inconsistent capacity when copying tensor, got {src.numel()} and {dst.numel()}.");
            }

            if (moveEosToBeginning && (eosIndex == null || eosIndex < 0))
            {
                throw new ArgumentException(
                          $"{nameof(eosIndex)} must not be null or negative when {nameof(moveEosToBeginning)} is true.");
            }

            if (moveEosToBeginning && src[-1][0].ToInt32() == eosIndex)
            {
                dst[0] = torch.tensor((int)eosIndex);
                dst[torch.TensorIndex.Slice(start: 1)] = src[torch.TensorIndex.Slice(stop: -1)];
            }
            else
            {
                dst.copy_(src);
            }
        }
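A hedged usage sketch of the EOS-moving path (called from inside the same class; the token ids, the seqLen x 1 shape, and the eosIndex value are made up): when src ends with the EOS token, dst receives EOS first followed by all of src except its last element; otherwise src is copied as-is.

            // Hypothetical sketch: a 4 x 1 int tensor ending in the assumed EOS id 2.
            using var src = torch.tensor(new int[] { 5, 6, 7, 2 }).reshape(4, 1);
            using var dst = torch.zeros_like(src);
            CopyTensor(src, dst, moveEosToBeginning: true, eosIndex: 2);
            // dst is now [[2], [5], [6], [7]]: EOS moved to the front, last src token dropped.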
        private torch.Tensor ForwardEmbedding(torch.Tensor tokens, torch.Tensor segmentLabels, torch.Tensor positions)
        {
            using var disposeScope = torch.NewDisposeScope();

            var x = TokenEmbedding.forward(tokens);

            if (EmbedScale != null)
            {
                x.mul_(EmbedScale);
            }
            if (PositionalEmbedding != null)
            {
                var positionalEmbedding = PositionalEmbedding.forward(tokens,
                                                                      new Dictionary <string, object> {
                    { PositionalEmbedding.PositionKey, positions }
                });
                x.add_(positionalEmbedding);
            }
            if (SegmentEmbedding != null && segmentLabels.IsNotNull())
            {
                var segmentEmbedding = SegmentEmbedding.forward(segmentLabels);
                x.add_(segmentEmbedding);
            }
            if (EmbeddingLayerNorm != null)
            {
                x = EmbeddingLayerNorm.forward(x);
            }
            x = EmbedTransfer.forward(x, (int)x.size()[x.size().Length - 1]);
            x = DropoutLayer.forward(x);

            return(x.MoveToOuterDisposeScope());
        }
Example #4
        /// <summary>
        /// Convert a tensor of token indices to a string.
        /// Can optionally remove BPE symbols or escape "&lt;unk&gt;" words.
        /// </summary>
        public string Tensor2String(torch.Tensor tensor, string bpeSymbol = null, bool escapeUnk = false)
        {
            if (tensor.IsNull())
            {
                return(string.Empty);
            }
            bpeSymbol ??= "";

            List <string> subStrings;

            if (tensor.dim() == 2)
            {
                subStrings = Enumerable.Range(0, (int)tensor.shape[0])
                             .Select(i => Tensor2String(tensor[i], bpeSymbol, escapeUnk))
                             .ToList();
                return(string.Join("\n", subStrings));
            }

            subStrings = Enumerable.Range(0, (int)tensor.shape[0])
                         .Select(i => _symbols[tensor[i].ToInt32()])   // look up each token id, not the position index
                         .ToList();
            var sentence = string.Join(" ", subStrings);

            return(ProcessBpeSymbol(sentence, bpeSymbol));
        }
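A hedged usage sketch (the vocabulary instance, token ids and BPE symbol are illustrative): a 1-D tensor is decoded into a single sentence, while a 2-D tensor is decoded row by row and the rows are joined with newlines.

            // Hypothetical usage from code holding an instance (`vocab`) of the class above.
            using var ids = torch.tensor(new long[] { 4, 17, 9 });
            var sentence = vocab.Tensor2String(ids, bpeSymbol: "@@ ", escapeUnk: false);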
Example #5
        public TensorAccessor(torch.Tensor tensor)
        {
            if (tensor.device_type != DeviceType.CPU)
            {
                throw new InvalidOperationException("Reading data from non-CPU memory is not supported. Move or copy the tensor to the cpu before reading.");
            }

            var strides = tensor.stride();

            for (var i = 0; i < strides.Length; i++)
            {
                if (strides[i] < 0)
                {
                    throw new NotImplementedException($"Negative tensor strides are not currently supported. tensor.strides({i}) == {strides[i]}");
                }
            }

            // Get the data from native code.

            unsafe {
                var res = torch.Tensor.THSTensor_data(tensor.Handle);
                if (res == IntPtr.Zero)
                {
                    torch.CheckForErrors();
                }
                // NOTE: there is no safety here.
                _tensor_data_ptr = res;
            }

            _tensor = tensor; // Keep the tensor alive now that everything is alright.
        }
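A sketch of the precondition above (the tensor variable is illustrative): data can only be read from CPU memory, so a device tensor has to be brought to the CPU before it is wrapped.

            // Hypothetical sketch: `someTensor` may live on any device.
            var cpuTensor = someTensor.device_type == DeviceType.CPU ? someTensor : someTensor.cpu();
            var accessor = new TensorAccessor(cpuTensor);   // constructing from a non-CPU tensor would throw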
Example #6
 public override torch.Tensor forward(torch.Tensor x)
 {
     x = this.squeeze_activation.forward(this.squeeze.forward(x));
     return torch.cat(new torch.Tensor[]{
                          this.expand1x1_activation.forward(this.expand1x1.forward(x)),
                          this.expand3x3_activation.forward(this.expand3x3.forward(x))
                      }, 1);
 }
        public override torch.Tensor forward(torch.Tensor srcTokens, torch.Tensor tokenMask = null)
        {
            using var disposeScope = torch.NewDisposeScope();
            var x = ExtractFeatures(srcTokens);

            x = _predictionHead.forward(x);
            return(x.MoveToOuterDisposeScope());
        }
Example #8
        public override torch.Tensor forward(torch.Tensor x)
        {
            using var disposeScope = torch.NewDisposeScope();
            var x1 = torch.pow(x, 3).mul_(_beta).add_(x).mul_(_alpha);  // sqrt(2/Pi) * (x + 0.044715 x^3)
            var y  = torch.nn.functional.tanh(x1).add_(1.0).mul_(0.5).mul_(x);

            return(y.MoveToOuterDisposeScope());
        }
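For reference, the forward above is the tanh approximation of GELU, gelu(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715 * x^3))). A standalone sketch with the constants written out, assuming _alpha = sqrt(2/Pi) and _beta = 0.044715 as the in-line comment implies:

        // Sketch with the assumed constant values spelled out (not the class's actual fields).
        static torch.Tensor GeluTanhApprox(torch.Tensor x)
        {
            var alpha = Math.Sqrt(2.0 / Math.PI);   // assumed value of _alpha
            const double beta = 0.044715;           // assumed value of _beta
            using var inner = torch.pow(x, 3).mul_(beta).add_(x).mul_(alpha);
            return torch.nn.functional.tanh(inner).add_(1.0).mul_(0.5).mul_(x);
        }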
        /// <summary>
        /// Replace non-padding symbols with their position numbers.
        /// Position numbers begin at PadPositionIndex + 1; padding symbols are assigned PadPositionIndex.
        /// </summary>
        /// <param name="tensor">Cannot be null.</param>
        /// <param name="padTokenIndex"></param>
        protected static torch.Tensor MakePositions(torch.Tensor tensor, int padTokenIndex)
        {
            using var disposeScope = torch.NewDisposeScope();
            var mask      = tensor.ne(padTokenIndex).@long();
            var positions = torch.cumsum(mask, dimension: 1).mul_(mask);

            positions.add_(PadPositionIndex);
            return(positions.MoveToOuterDisposeScope());
        }
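A worked sketch of MakePositions (the token ids, padTokenIndex and the PadPositionIndex value are illustrative): with tokens [1, 5, 7, 9] and padTokenIndex = 1, the mask is [0, 1, 1, 1], the masked cumulative sum is [0, 1, 2, 3], and adding PadPositionIndex (say 1) gives [1, 2, 3, 4], so padding keeps the pad position and real tokens get consecutive positions after it.

        // Hypothetical call (MakePositions is protected static, so this would run in a derived class).
        using var tokens = torch.tensor(new long[] { 1, 5, 7, 9 }).reshape(1, 4);
        using var positions = MakePositions(tokens, padTokenIndex: 1);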
Example #10
 public override torch.Tensor forward(torch.Tensor x, torch.Tensor selfAttentionMask,
                                      torch.Tensor selfAttentionPaddingMask, int arch = 0, bool layerNormTraining = false)
 {
     return((Operations[arch] as Layer)!.forward(x, new Dictionary <string, object>
     {
         { Layer.AttentionMaskKey, selfAttentionMask },
         { Layer.PaddingMaskKey, selfAttentionPaddingMask },
     }));
 }
Example #11
        public override torch.Tensor forward(torch.Tensor x, Dictionary <string, object> param = null)
        {
            if (!ParseArguments(param, out var selfAttentionPaddingMask))
            {
                throw new ArgumentException($"Invalid arguments: {param}.");
            }

            using var x1 = ForwardOneLayer(x, selfAttentionPaddingMask, Conv1, LayerNorm1);
            return(ForwardOneLayer(x1, selfAttentionPaddingMask, Conv2, LayerNorm2));
        }
Example #12
 public torch.Tensor forward(torch.Tensor x)
 {
     if ((x.shape[1] != 3) || (x.shape[2] != 224) || (x.shape[3] != 224))
     {
         throw new ArgumentException("Unsupported image size: should be bx3x224x224.");
     }
     x = this.features.forward(x) as torch.Tensor;
     x = this.classifier.forward(x) as torch.Tensor;
     return torch.flatten(x, 1);
 }
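A hedged call sketch (the module instance and batch are illustrative): the forward above only accepts batches shaped B x 3 x 224 x 224, so inputs have to be resized and batched to that shape first.

 // Hypothetical usage: `model` is an instance of the module defining forward above.
 using var batch = torch.randn(4, 3, 224, 224);
 using var logits = model.forward(batch);
 // A 4 x 3 x 256 x 256 batch, for example, would hit the ArgumentException above.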
Example #13
        public override torch.Tensor forward(torch.Tensor x, int hiddenSize)
        {
            if (hiddenSize == SearchSpace.HiddenSizeChoices[SearchSpace.HiddenSizeChoices.Length - 1])
            {
                return(x);
            }
            var index = SearchSpace.HiddenSizeChoices.ToList().IndexOf(hiddenSize);

            return(index == -1
                ? x.alias()
                : HiddenTransfer[index].forward(x));
        }
        private static torch.Tensor PadMask(torch.Tensor tensor)
        {
            if (tensor.IsNull())
            {
                return(null);
            }

            using var zeros = tensor.new_zeros(tensor.size(0), 1);
            return(torch.cat(new List <torch.Tensor> {
                tensor, zeros
            }, dimension: 1));
        }
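A small worked sketch of PadMask (the mask values are made up): one zero column is appended, matching the extra key/value slot added by the bias-k/v and zero-attention paths in the attention forward further down.

        // Hypothetical 1 x 3 mask -> 1 x 4 mask with a trailing zero column.
        using var mask = torch.tensor(new long[] { 1, 0, 1 }).reshape(1, 3);
        using var padded = PadMask(mask);   // [[1, 0, 1, 0]]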
Example #15
        private static bool ParseArguments(IReadOnlyDictionary <string, object> param,
                                           out torch.Tensor selfAttentionMask, out torch.Tensor selfAttentionPaddingMask)
        {
            selfAttentionMask = selfAttentionPaddingMask = null;
            if (!(param.ContainsKey(AttentionMaskKey) && param.ContainsKey(PaddingMaskKey)))
            {
                return(false);
            }

            selfAttentionMask        = (torch.Tensor)param[AttentionMaskKey];
            selfAttentionPaddingMask = (torch.Tensor)param[PaddingMaskKey];
            return(true);
        }
Example #16
        private static torch.Tensor ForwardOneLayer(torch.Tensor input, torch.Tensor selfAttentionPaddingMask,
                                                    torch.nn.Module convLayer, torch.nn.Module layerNorm)
        {
            using var disposeScope = torch.NewDisposeScope();

            torch.Tensor x = selfAttentionPaddingMask.IsNull()
                ? input.alias()
                : input.masked_fill(selfAttentionPaddingMask.T.unsqueeze(-1), 0);

            var conv = convLayer.forward(x);

            conv.add_(input);
            var norm = layerNorm.forward(conv);

            return(norm.MoveToOuterDisposeScope());
        }
        public override torch.Tensor forward(torch.Tensor input, Dictionary <string, object> param = null)
        {
            using var disposeScope = torch.NewDisposeScope();

            ParseArguments(param, out var incrementalState, out var positions);

            if (positions.IsNull())
            {
                positions = incrementalState
                    ? torch.tensor(PadPositionIndex + input.size(1))
                    : MakePositions(input, PadTokenIndex);
            }

            var embedding = Embedding.forward(positions);

            return(embedding.MoveToOuterDisposeScope());
        }
Example #18
        public override torch.Tensor forward(torch.Tensor input, Dictionary <string, object> param = null)
        {
            using var disposeScope = torch.NewDisposeScope();

            ParseArguments(param, out var incrementalState, out var timeStep);

            var bszInt      = (int)input.shape[0];
            var seqLenInt   = (int)input.shape[1];
            var maxPosition = (int)(PadPositionIndex + 1 + input.shape[1]);

            // recompute/expand embeddings if needed
            if (Weight is null || maxPosition > Weight.size(0))
            {
                Weight?.Dispose();
                Weight = GetEmbedding(maxPosition, EmbeddingDim);
                Weight = (Parameter)Weight.MoveToOuterDisposeScope();
            }

            // move Weight to the device where _float_tensor is
            foreach (var(bufferName, buffer) in named_buffers())
            {
                if (bufferName == nameof(_floatTensor))
                {
                    Weight = (Parameter)Weight.to(buffer);
                    Weight = (Parameter)Weight.MoveToOuterDisposeScope();
                    break;
                }
            }

            // positions is the same for every token when decoding a single step
            if (incrementalState)
            {
                var pos = timeStep is null
                    ? seqLenInt
                    : timeStep.item <int>() + 1;
                var slice = Weight[torch.TensorIndex.Single(PadPositionIndex + pos), torch.TensorIndex.Colon];
                // expand the 1-D embedding slice to (batch, 1, embeddingDim); -1 keeps the embedding dimension
                return(slice.expand(bszInt, 1, -1).MoveToOuterDisposeScope());
            }

            var positions       = MakePositions(input, PadTokenIndex).view(-1);
            var weightsSelected = Weight.index_select(0, positions).view(bszInt, seqLenInt, -1);

            return(weightsSelected.detach().MoveToOuterDisposeScope());
        }
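A hedged call sketch for single-step decoding with the sinusoidal embedding above (the module instance and tensors are illustrative, and the sketch assumes the key constants are reachable from the call site): with IncrementalStateKey set, every sequence in the batch receives the embedding of the single position derived from TimeStepKey.

            // Hypothetical usage; `sinusoidalEmbedding` and `tokens` are made up.
            var param = new Dictionary<string, object>
            {
                { IncrementalStateKey, true },
                { TimeStepKey, torch.tensor(5) },   // current decoding step
            };
            using var stepEmbedding = sinusoidalEmbedding.forward(tokens, param);   // B x 1 x EmbeddingDim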
        private static void ParseArguments(IReadOnlyDictionary <string, object> param, out bool incrementalState,
                                           out torch.Tensor positions)
        {
            incrementalState = false;
            positions        = null;
            if (param == null)
            {
                return;
            }

            if (param.ContainsKey(IncrementalStateKey))
            {
                incrementalState = (bool)param[IncrementalStateKey];
            }
            if (param.ContainsKey(PositionKey))
            {
                positions = (torch.Tensor)param[PositionKey];
            }
        }
Example #20
        public override torch.Tensor forward(torch.Tensor x, Dictionary <string, object> param)
        {
            using var disposeScope = torch.NewDisposeScope();

            if (!ParseArguments(param, out var selfAttentionMask, out var selfAttentionPaddingMask))
            {
                throw new ArgumentException("Invalid arguments.");
            }

            var attention = SelfAttention.forward(query: x, key: x, value: x,
                                                  out _,
                                                  keyPaddingMask: selfAttentionPaddingMask,
                                                  needWeights: false,
                                                  attentionMask: selfAttentionMask);
            var dropout = DropoutLayer.forward(attention);

            dropout.add_(x);
            var norm = LayerNorm.forward(dropout);

            return(norm.MoveToOuterDisposeScope());
        }
        public torch.Tensor forward(
            torch.Tensor tokens,
            torch.Tensor segmentLabels = null,
            torch.Tensor positions     = null)
        {
            using var disposeScope = torch.NewDisposeScope();

            var x = ForwardEmbedding(tokens, segmentLabels, positions);

            // Compute padding mask. This is needed for multi-head attention
            var paddingMask    = tokens.eq(PaddingIdx);
            var usePaddingMask = paddingMask.any().ToBoolean();

            // Account for padding while computing the representation
            if (usePaddingMask)
            {
                var xValidPart = paddingMask.logical_not().unsqueeze(-1).type_as(x);
                x.mul_(xValidPart);
            }

            // B x T x C -> T x B x C
            x.transpose_(0, 1);

            // forward Layers
            var blockPerLayer = Layers.Count / DistillBlocks;
            var blockIndex    = 0;

            for (var i = 0; i < Layers.Count; ++i)
            {
                x = ForwardOneLayer(x, usePaddingMask ? paddingMask : null, i, blockPerLayer, ref blockIndex);
            }

            // T x B x C -> B x T x C
            x.transpose_(0, 1);

            // var sentenceRepresentation = x[torch.TensorIndex.Colon, torch.TensorIndex.Single(0), torch.TensorIndex.Colon];
            return(x.MoveToOuterDisposeScope());
        }
        private torch.Tensor ForwardOneLayer(torch.Tensor input, torch.Tensor paddingMask,
                                             int i, int blockPerLayer, ref int blockIndex)
        {
            using var disposeScope = torch.NewDisposeScope();

            var x     = input.alias(); // avoid scope mess
            var layer = Layers[i];

            if (i % blockPerLayer == 0)
            {
                x = (HiddenTransferList[blockIndex] as HiddenTransfer).forward(x, HiddenSizePerBlock[blockIndex], true);
            }

            x = (layer as TransformerCell).forward(x, null, paddingMask);

            if ((i + 1) % blockPerLayer == 0)
            {
                x = (HiddenTransferList[blockIndex] as HiddenTransfer).forward(x, HiddenSizePerBlock[blockIndex], false);
                ++blockIndex;
            }

            return(x.MoveToOuterDisposeScope());
        }
        private (torch.Tensor, torch.Tensor, torch.Tensor) QkvProjection(
            torch.Tensor query, torch.Tensor key, torch.Tensor value)
        {
            using var disposeScope = torch.NewDisposeScope();

            torch.Tensor q = null;
            torch.Tensor k = null;
            torch.Tensor v = null;
            if (_selfAttention)
            {
                q = QProjection.forward(query);
                k = KProjection.forward(query);
                v = VProjection.forward(query);
            }
            else if (_encoderDecoderAttention)
            {
                q = QProjection.forward(query);
                if (key.IsNull())
                {
                    k = v = null;
                }
                else
                {
                    k = KProjection.forward(key);
                    v = VProjection.forward(key);
                }
            }
            else
            {
                q = QProjection.forward(query);
                k = KProjection.forward(key);
                v = VProjection.forward(value);
            }

            return(q.MoveToOuterDisposeScope(), k.MoveToOuterDisposeScope(), v.MoveToOuterDisposeScope());
        }
        public torch.Tensor forward(
            torch.Tensor query,
            torch.Tensor key,
            torch.Tensor value,
            out torch.Tensor outAttentionWeights,
            torch.Tensor keyPaddingMask = null,
            Dictionary <string, Dictionary <string, torch.Tensor> > incrementalState = null,
            bool needWeights           = true,
            bool staticKv              = false,
            torch.Tensor attentionMask = null)
        {
            outAttentionWeights = null;

            if (query.IsNull() || query.size().Length != 3 || query.size(2) != _embeddingDim)
            {
                throw new ArgumentException("query must NOT be null and must be 3D in multi-head attention;" +
                                            "the last dimension should be the same as embedding dimension.");
            }

            using var disposeScope = torch.NewDisposeScope();

            var qSize     = query.size();
            var tgtLen    = qSize[0];
            var batchSize = qSize[1];
            var embedDim  = qSize[2];

            // Get saved state from incrementalState
            Dictionary <string, torch.Tensor> savedState = null;

            if (incrementalState != null)
            {
                savedState = GetInputBuffer(incrementalState);

                // previous time steps are cached - no need to recompute key and value if they are static.
                if (savedState.ContainsKey(PrevKeyKey) && savedState.ContainsKey(PrevValueKey) && staticKv)
                {
                    if (_selfAttention || !_encoderDecoderAttention)
                    {
                        throw new ArgumentException(
                                  "prevKey and prevValue are only valid in encoder-decoder attention.");
                    }

                    key = value = null;
                }
            }

            // Calculate current qkv projection
            var(q, k, v) = QkvProjection(query, key, value);

            // Work on local aliases of the masks so the padded variants can be built without touching the inputs.
            torch.Tensor attentionMaskPad  = attentionMask?.alias();
            torch.Tensor keyPaddingMaskPad = keyPaddingMask?.alias();
            q.mul_(_scaling);

            if (_addBiasKv)
            {
                var kRepeat = KBias.repeat(1, batchSize, 1);
                var vRepeat = VBias.repeat(1, batchSize, 1);
                k = torch.cat(new List <torch.Tensor> {
                    k, kRepeat
                }, dimension: 0);
                v = torch.cat(new List <torch.Tensor> {
                    v, vRepeat
                }, dimension: 0);
                attentionMaskPad  = PadMask(attentionMaskPad);
                keyPaddingMaskPad = PadMask(keyPaddingMaskPad);
            }

            q = q.view(tgtLen, batchSize * _numHeads, _headDim).transpose_(0, 1);
            k = k?.view(-1, batchSize * _numHeads, _headDim).transpose_(0, 1);
            v = v?.view(-1, batchSize * _numHeads, _headDim).transpose_(0, 1);

            if (savedState != null)
            {
                // saved states are stored with shape (batchSize, NumHeads, seqLen, HeadDim)
                if (savedState.ContainsKey(PrevKeyKey))
                {
                    var prevKey = savedState[PrevKeyKey].view(batchSize * _numHeads, -1, _headDim);
                    k = staticKv
                        ? prevKey
                        : torch.cat(new List <torch.Tensor> {
                        prevKey, k
                    }, dimension: 1);
                }

                if (savedState.ContainsKey(PrevValueKey))
                {
                    var prevValue = savedState[PrevValueKey].view(batchSize * _numHeads, -1, _headDim);
                    v = staticKv
                        ? prevValue
                        : torch.cat(new List <torch.Tensor> {
                        prevValue, v
                    }, dimension: 1);
                }

                savedState[PrevKeyKey].Dispose();
                savedState[PrevKeyKey] = k?.view(batchSize, _numHeads, -1, _headDim);
                savedState[PrevValueKey].Dispose();
                savedState[PrevValueKey] = v?.view(batchSize, _numHeads, -1, _headDim);

                SetInputBuffer(incrementalState, savedState);
            }

            Debug.Assert(k.IsNotNull() && v.IsNotNull());
            var srcLen = k!.size(1);

            // This is part of a workaround to get around fork/join parallelism not supporting Optional types.
            if (keyPaddingMaskPad?.shape.Length == 0)
            {
                keyPaddingMaskPad = null;
            }
            Debug.Assert(keyPaddingMaskPad.IsNull() ||
                         (keyPaddingMaskPad.size(0) == batchSize && keyPaddingMaskPad.size(1) == srcLen));

            if (_addZeroAttention)
            {
                srcLen += 1;
                var zeroPadSize = k.size();
                zeroPadSize[1] = 1;
                var kZeros = k.new_zeros(zeroPadSize);
                var vZeros = v!.new_zeros(zeroPadSize);
                k = torch.cat(new List <torch.Tensor> {
                    k, kZeros
                }, dimension: 1);
                v = torch.cat(new List <torch.Tensor> {
                    v, vZeros
                }, dimension: 1);
                attentionMaskPad  = PadMask(attentionMaskPad);
                keyPaddingMaskPad = PadMask(keyPaddingMaskPad);
            }

            var attentionWeights = torch.matmul(q, k.transpose(1, 2));

            Debug.Assert(attentionWeights.size().SequenceEqual(new[] { batchSize * _numHeads, tgtLen, srcLen }));

            if (attentionMaskPad.IsNotNull())
            {
                attentionWeights.add_(attentionMaskPad.unsqueeze(0));
            }

            if (keyPaddingMaskPad.IsNotNull())
            {
                // Don't attend to pad symbols
                keyPaddingMaskPad = keyPaddingMaskPad.unsqueeze(1).unsqueeze(2);

                attentionWeights = attentionWeights
                                   .view(batchSize, _numHeads, tgtLen, srcLen)
                                   .masked_fill(keyPaddingMaskPad, float.NegativeInfinity)
                                   .view(batchSize * _numHeads, tgtLen, srcLen);
            }

            attentionWeights = torch.nn.functional.softmax(attentionWeights, dim: -1);
            attentionWeights = DropoutLayer.forward(attentionWeights);

            if (needWeights)
            {
                // Average attention weights over heads
                var weightsView = attentionWeights.view(batchSize, _numHeads, tgtLen, srcLen);
                outAttentionWeights = weightsView.sum(dim: 1).div_(_numHeads);
            }

            var attention = torch.matmul(attentionWeights, v);

            Debug.Assert(attention.size().SequenceEqual(new[] { batchSize * _numHeads, tgtLen, _headDim }));
            attention = attention.transpose(0, 1).contiguous().view(tgtLen, batchSize, embedDim);
            var attentionOutput = OutProjLinear.forward(attention);

            outAttentionWeights?.MoveToOuterDisposeScope();
            return(attentionOutput.MoveToOuterDisposeScope());
        }
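A hedged call sketch of the multi-head attention forward (all variable names and shapes are illustrative): inputs are time-major T x B x C, self-attention passes the same tensor for query, key and value, and the averaged per-head weights are only materialized when needWeights is true.

            // Hypothetical self-attention call; seqLen/batchSize/embeddingDim/paddingMask are made up.
            using var x = torch.randn(seqLen, batchSize, embeddingDim);
            var output = attention.forward(
                query: x, key: x, value: x,
                out var weights,
                keyPaddingMask: paddingMask,   // B x T bool mask, or null
                needWeights: true);
            // output: T x B x C; weights: B x T x T, averaged over heads.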
 public virtual torch.Tensor forward(torch.Tensor input, Dictionary <string, object> param = null)
 {
     return(input.alias());
 }
Example #26
 public override torch.Tensor forward(torch.Tensor x)
 {
     using var x1   = x.permute(1, 2, 0);
     using var conv = Conv.forward(x1);
     return(conv.permute(2, 0, 1));
 }
Example #27
        private static void ParseArguments(IReadOnlyDictionary <string, object> param, out bool incrementalState, out torch.Tensor timeStep)
        {
            incrementalState = false;
            timeStep         = null;
            if (param == null)
            {
                return;
            }

            if (param.ContainsKey(IncrementalStateKey))
            {
                incrementalState = (bool)param[IncrementalStateKey];
            }
            if (param.ContainsKey(TimeStepKey))
            {
                timeStep = (torch.Tensor)param[TimeStepKey];
            }
        }
Example #28
 public override torch.Tensor forward(torch.Tensor x)
 {
     return(_function.forward(x));
 }
Example #29
 public abstract torch.Tensor forward(torch.Tensor x, int hiddenSize);
Example #30
 public override torch.Tensor forward(torch.Tensor x, Dictionary <string, object> param = null)
 {
     return(x.alias());
 }