//public IWeightTensor Perform(IWeightTensor inputQ, IWeightTensor keyMask, int batchSize, IComputeGraph graph)
//{
//    if (m_sharedQKV == false)
//    {
//        throw new ArgumentException($"Layer '{m_name}' is not in shared QKV mode, please call another Perform function with three separate input tensors.");
//    }

//    using (IComputeGraph g = graph.CreateSubGraph($"{m_name}_MultiHeadAttention_SharedQKV"))
//    {
//        int seqLenQ = inputQ.Rows / batchSize;
//        IWeightTensor inputQNorm = layerNormQ.Norm(inputQ, g);

//        //Input projections
//        float scale = 1.0f; // / (float)(m_inputDim);
//        IWeightTensor mulQ, mulK, mulV;

//        using (IWeightTensor inputQNormView = g.View(inputQNorm, dims: new long[] { 1, inputQ.Rows, inputQ.Columns }))
//        {
//            using (IWeightTensor inputQNormViewExp = g.Expand(inputQNormView, dims: new long[] { 3, inputQ.Rows, inputQ.Columns }))
//            {
//                using (IWeightTensor mulQKV = g.MulBatch(inputQNormViewExp, QKV, null, 3, scale))
//                {
//                    mulQ = g.Select(mulQKV, 0, 0);
//                    mulK = g.Select(mulQKV, 0, 1);
//                    mulV = g.Select(mulQKV, 0, 2);
//                }
//            }
//        }

//        IWeightTensor allQ = g.View(mulQ, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
//        IWeightTensor allK = g.View(mulK, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });
//        IWeightTensor allV = g.View(mulV, dims: new long[] { batchSize, seqLenQ, m_multiHeadNum, m_d });

//        //Multi-head attentions
//        IWeightTensor Qs = g.View(g.Permute(allQ, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });
//        IWeightTensor Ks = g.View(g.Permute(allK, 2, 0, 3, 1), dims: new long[] { m_multiHeadNum * batchSize, m_d, seqLenQ });
//        IWeightTensor Vs = g.View(g.Permute(allV, 2, 0, 1, 3), dims: new long[] { m_multiHeadNum * batchSize, seqLenQ, m_d });

//        // Scaled softmax
//        scale = 1.0f / (float)(Math.Sqrt(m_d));
//        IWeightTensor attn = g.MulBatch(Qs, Ks, null, m_multiHeadNum * batchSize, scale);
//        IWeightTensor softmax = g.Softmax(attn, keyMask, inPlace: true);

//        IWeightTensor o = g.View(g.MulBatch(softmax, Vs, null, m_multiHeadNum * batchSize), dims: new long[] { m_multiHeadNum, batchSize, seqLenQ, m_d });
//        IWeightTensor W = g.View(g.Permute(o, 1, 2, 0, 3), dims: new long[] { batchSize * seqLenQ, m_multiHeadNum * m_d });

//        // Output projection
//        IWeightTensor finalAttResults = g.Dropout(g.Affine(W, W0, b0), batchSize, m_dropoutRatio, inPlace: true);

//        return graph.Add(finalAttResults, inputQ);
//    }
//}

public virtual List<IWeightTensor> getParams()
{
    // Return every trainable tensor of this attention layer: the output
    // projection, the separate Q/K/V projections, and the query layer norm.
    // The shared-QKV branch stays commented out, matching the disabled
    // Perform variant above.
    List<IWeightTensor> response = new List<IWeightTensor>
    {
        W0,
        b0
    };

    //if (m_sharedQKV)
    //{
    //    response.Add(QKV);
    //}
    //else
    //{
    response.Add(Q);
    response.Add(Qb);

    response.Add(K);
    response.Add(Kb);

    response.Add(V);
    response.Add(Vb);
    //}

    response.AddRange(layerNormQ.getParams());

    return response;
}
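// Usage sketch (illustrative, not from the original source): the getParams()
// methods exist so a training loop can enumerate every trainable tensor in one
// flat list. A minimal, hedged example: counting a layer's parameters.
// "CountParams" is a hypothetical helper; Rows and Columns are the same 2-D
// shape properties the commented-out Perform code above already relies on.
public static long CountParams(IEnumerable<IWeightTensor> parameters)
{
    long total = 0;
    foreach (IWeightTensor w in parameters)
    {
        total += (long)w.Rows * w.Columns; // each tensor is Rows x Columns
    }
    return total;
}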
public virtual List<IWeightTensor> getParams()
{
    // Collect the trainable tensors of this block: its layer normalization
    // followed by the two inner feed-forward layers.
    List<IWeightTensor> response = new List<IWeightTensor>();

    response.AddRange(layerNorm2.getParams());
    response.AddRange(feedForwardLayer1.GetParams());
    response.AddRange(feedForwardLayer2.GetParams());

    return response;
}
public List<IWeightTensor> getParams()
{
    // Gather this cell's trainable tensors: the combined weight matrix,
    // its bias, and both layer-normalization layers.
    List<IWeightTensor> response = new List<IWeightTensor>();

    response.Add(m_Wxhc);
    response.Add(m_b);

    response.AddRange(m_layerNorm1.getParams());
    response.AddRange(m_layerNorm2.getParams());

    return response;
}
public List<IWeightMatrix> getParams()
{
    List<IWeightMatrix> response = new List<IWeightMatrix>();

    response.Add(Wxhc);
    response.Add(b);

    response.AddRange(layerNorm1.getParams());
    response.AddRange(layerNorm2.getParams());

    return response;
}
public virtual List<IWeightTensor> getParams()
{
    List<IWeightTensor> response = new List<IWeightTensor>
    {
        m_Wxh,
        m_b
    };

    response.AddRange(m_layerNorm1.getParams());
    response.AddRange(m_layerNorm2.getParams());

    return response;
}
public List<IWeightTensor> GetParams()
{
    // Aggregate the trainable tensors of the whole encoder stack: every
    // multi-head attention layer, every position-wise feed-forward layer,
    // and the final layer normalization.
    List<IWeightTensor> response = new List<IWeightTensor>();

    foreach (MultiHeadAttention item in m_encoders)
    {
        response.AddRange(item.getParams());
    }

    foreach (var item in m_posFFNs)
    {
        response.AddRange(item.getParams());
    }

    response.AddRange(layerNorm.getParams());

    return response;
}
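// Usage sketch (illustrative): the encoder-level GetParams() exists so a
// single call can hand every trainable tensor to an optimizer step after
// back-propagation. "ISimpleOptimizer" and "ApplyGradients" are hypothetical
// stand-ins for this sketch, not the project's real optimizer API.
public interface ISimpleOptimizer
{
    // Applies one gradient update to each tensor in-place.
    void ApplyGradients(List<IWeightTensor> parameters, float learningRate);
}

public static void OptimizerStep(ISimpleOptimizer optimizer,
                                 List<IWeightTensor> encoderParams,
                                 float learningRate)
{
    // encoderParams would typically come from the encoder's GetParams() above.
    optimizer.ApplyGradients(encoderParams, learningRate);
}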
public virtual List<IWeightTensor> getParams()
{
    List<IWeightTensor> response = new List<IWeightTensor>
    {
        W0,
        b0
    };

    response.Add(Q);
    response.Add(Qb);

    response.Add(K);
    response.Add(Kb);

    response.Add(V);
    response.Add(Vb);

    response.AddRange(layerNormQ.getParams());

    return response;
}
public virtual List<IWeightTensor> getParams()
{
    List<IWeightTensor> response = new List<IWeightTensor>
    {
        Q, Qb,
        K, Kb,
        V, Vb,
        W0, b0
    };

    response.AddRange(layerNorm1.getParams());

    return response;
}