public LSTMAttentionDecoderCell(string name, int batchSize, int hdim, int dim, int contextSize, int deviceId)
{
    m_name = name;
    m_hdim = hdim;
    m_dim = dim;
    m_deviceId = deviceId;
    m_batchSize = batchSize;

    m_Wxhc = new WeightTensor(new long[2] { dim + hdim + contextSize, hdim * 4 }, deviceId, normal: true, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: true);
    m_b = new WeightTensor(new long[2] { 1, hdim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: true);

    Hidden = new WeightTensor(new long[2] { batchSize, hdim }, 0, deviceId, name: $"{name}.{nameof(Hidden)}", isTrainable: true);
    Cell = new WeightTensor(new long[2] { batchSize, hdim }, 0, deviceId, name: $"{name}.{nameof(Cell)}", isTrainable: true);

    layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hdim * 4, deviceId);
    layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hdim, deviceId);
}
public TransformerEncoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable)
{
    Logger.WriteLine($"Creating transformer encoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

    m_name = name;
    m_multiHeadNum = multiHeadNum;
    m_hiddenDim = hiddenDim;
    m_inputDim = inputDim;
    m_depth = depth;
    m_dropoutRatio = dropoutRatio;
    m_deviceId = deviceId;
    m_isTrainable = isTrainable;

    if (hiddenDim != inputDim)
    {
        throw new ArgumentException("hiddenDim is not equal to inputDim in TransformerEncoder.");
    }

    m_encoders.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    for (int i = 1; i < depth; i++)
    {
        m_encoders.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable));
    }

    for (int i = 0; i < depth; i++)
    {
        m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable));
    }

    layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable);
}
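// A minimal usage sketch (the hyper-parameter values below are illustrative
// assumptions, not taken from the source):
// var encoder = new TransformerEncoder("Encoder", multiHeadNum: 8, hiddenDim: 512,
//     inputDim: 512, depth: 6, dropoutRatio: 0.1f, deviceId: 0, isTrainable: true);
// The first self-attention layer is built from inputDim and the remaining depth - 1
// layers from hiddenDim; since the guard above requires the two to be equal, the
// distinction only matters if that restriction is ever relaxed.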
public LSTMAttentionDecoderCell(int batchSize, int hdim, int dim, ArchTypeEnums archType, int deviceId)
{
    int contextSize = hdim * 2;
    this.hdim = hdim;
    this.dim = dim;
    m_deviceId = deviceId;
    m_batchSize = batchSize;

    if (archType == ArchTypeEnums.GPU_CUDA)
    {
        Wxhc = new WeightTensor(dim + hdim + contextSize, hdim * 4, deviceId, true);
        b = new WeightTensor(1, hdim * 4, 0, deviceId);

        this.ht = new WeightTensor(batchSize, hdim, 0, deviceId);
        this.ct = new WeightTensor(batchSize, hdim, 0, deviceId);
    }
    else
    {
        Wxhc = new WeightMatrix(dim + hdim + contextSize, hdim * 4, true);
        b = new WeightMatrix(1, hdim * 4, 0);

        this.ht = new WeightMatrix(batchSize, hdim, 0);
        this.ct = new WeightMatrix(batchSize, hdim, 0);
    }

    layerNorm1 = new LayerNormalization(hdim * 4, archType, deviceId);
    layerNorm2 = new LayerNormalization(hdim, archType, deviceId);
}
public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
{
    m_name = name;
    m_dropoutRatio = dropoutRatio;

    layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
    feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
    feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor);
}
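// The two FeedForwardLayer instances give the usual transformer FFN shape: project
// hiddenDim up to hiddenDim * 4, then back down to hiddenDim. A sketch of the
// forward pass this wiring implies (the Process/Norm method names and the graph
// variable g are assumptions for illustration, not confirmed source API):
// var inner = feedForwardLayer1.Process(layerNorm2.Norm(input, g), batchSize, g);
// var output = feedForwardLayer2.Process(inner, batchSize, g);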
public PositionwiseFeedForward(string name, int hiddenDim, float dropoutRatio, int deviceId, bool isTrainable)
{
    this.m_name = name;
    this.m_hiddenDim = hiddenDim;
    this.m_dropoutRatio = dropoutRatio;

    this.layerNorm2 = new LayerNormalization($"{name}.{nameof(this.layerNorm2)}", hiddenDim, deviceId, isTrainable);
    this.feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer1)}", hiddenDim, hiddenDim * 4, this.m_dropoutRatio, deviceId, isTrainable);
    this.feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(this.feedForwardLayer2)}", hiddenDim * 4, hiddenDim, this.m_dropoutRatio, deviceId, isTrainable);
}
public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable, bool sharedQKV = false, float learningRateFactor = 1.0f)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_multiHeadNum = multiHeadNum;
    m_d = m_hiddenDim / m_multiHeadNum;
    m_dropoutRatio = dropoutRatio;
    m_sharedQKV = sharedQKV;

    W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
    b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

    if (m_sharedQKV == false)
    {
        Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
        Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);

        K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
        Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);

        V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
        Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);
    }
    else
    {
        // The packed projection produces Q, K and V in a single matmul; name the
        // weights after QKV/QKVb so they do not collide with the unshared ones.
        QKV = new WeightTensor(new long[2] { inputDim, hiddenDim * 3 }, deviceId, name: $"{name}.{nameof(QKV)}", isTrainable: isTrainable, normType: NormType.Uniform, learningRateFactor: learningRateFactor);
        QKVb = new WeightTensor(new long[2] { 1, hiddenDim * 3 }, 0, deviceId, name: $"{name}.{nameof(QKVb)}", isTrainable: isTrainable, learningRateFactor: learningRateFactor);
    }

    layerNormQ = new LayerNormalization($"{name}.{nameof(layerNormQ)}", m_hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
}
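// m_d is the per-head width: with hiddenDim = 512 and multiHeadNum = 8, each head
// attends in a 64-dimensional subspace. The integer division truncates silently,
// so hiddenDim should be a multiple of multiHeadNum; a guard along these lines
// (an assumption, not present in the source) would make the failure mode explicit:
// if (hiddenDim % multiHeadNum != 0)
// {
//     throw new ArgumentException($"hiddenDim ({hiddenDim}) must be divisible by multiHeadNum ({multiHeadNum}).");
// }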
// private readonly bool m_sharedQKV = false;

public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_inputDim = inputDim;
    m_multiHeadNum = multiHeadNum;
    m_d = m_hiddenDim / m_multiHeadNum;
    m_dropoutRatio = dropoutRatio;
    // m_sharedQKV = sharedQKV;

    W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normal: NormType.Uniform);
    b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

    //if (m_sharedQKV == false)
    //{
    Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable, normal: NormType.Uniform);
    Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable);

    K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable, normal: NormType.Uniform);
    Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable);

    V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable, normal: NormType.Uniform);
    Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable);
    //}
    //else
    //{
    //    QKV = new WeightTensor(new long[] { 3, inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(QKV)}", isTrainable: isTrainable, normal: NormType.Uniform);
    //}

    layerNormQ = new LayerNormalization($"{name}.{nameof(layerNormQ)}", m_hiddenDim, deviceId, isTrainable);
}
public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable, bool sharedQKV = false)
{
    this.m_name = name;
    this.m_hiddenDim = hiddenDim;
    this.m_inputDim = inputDim;
    this.m_multiHeadNum = multiHeadNum;
    this.m_d = this.m_hiddenDim / this.m_multiHeadNum;
    this.m_dropoutRatio = dropoutRatio;
    this.m_sharedQKV = sharedQKV;

    this.W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, $"{name}.{nameof(this.W0)}", isTrainable, NormType.Uniform);
    this.b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, $"{name}.{nameof(this.b0)}", isTrainable);

    if (this.m_sharedQKV == false)
    {
        this.Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, $"{name}.{nameof(this.Q)}", isTrainable, NormType.Uniform);
        this.Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, $"{name}.{nameof(this.Qb)}", isTrainable);

        this.K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, $"{name}.{nameof(this.K)}", isTrainable, NormType.Uniform);
        this.Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, $"{name}.{nameof(this.Kb)}", isTrainable);

        this.V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, $"{name}.{nameof(this.V)}", isTrainable, NormType.Uniform);
        this.Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, $"{name}.{nameof(this.Vb)}", isTrainable);
    }
    else
    {
        this.QKV = new WeightTensor(new long[] { 3, inputDim, hiddenDim }, deviceId, $"{name}.{nameof(this.QKV)}", isTrainable, NormType.Uniform);
    }

    this.layerNormQ = new LayerNormalization($"{name}.{nameof(this.layerNormQ)}", this.m_hiddenDim, deviceId, isTrainable);
}
public LSTMCell(string name, int hdim, int dim, int deviceId, bool isTrainable)
{
    m_name = name;

    m_Wxh = new WeightTensor(new long[2] { dim + hdim, hdim * 4 }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxh)}", isTrainable: isTrainable);
    m_b = new WeightTensor(new long[2] { 1, hdim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable);

    m_hdim = hdim;
    m_dim = dim;
    m_deviceId = deviceId;

    m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hdim * 4, deviceId, isTrainable: isTrainable);
    m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hdim, deviceId, isTrainable: isTrainable);
}
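// m_Wxh, of shape [dim + hdim, hdim * 4], packs the input and recurrent projections
// for all four LSTM gates into one matmul over the concatenated [input, hidden]
// vector. Assuming a conventional gate order (input, forget, output, cell candidate),
// the combined activation would be split roughly like this after the affine step
// (a sketch only; the graph calls g.Affine/g.Concate/g.Slice are hypothetical
// helper names, not confirmed source API):
// var gates = g.Affine(g.Concate(1, input, hiddenState), m_Wxh, m_b); // [batch, hdim * 4]
// var i = g.Sigmoid(g.Slice(gates, 0, hdim));        // input gate
// var f = g.Sigmoid(g.Slice(gates, hdim, hdim));     // forget gate
// var o = g.Sigmoid(g.Slice(gates, hdim * 2, hdim)); // output gate
// var c = g.Tanh(g.Slice(gates, hdim * 3, hdim));    // cell candidate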
public LSTMAttentionDecoderCell(string name, int hiddenDim, int inputDim, int contextDim, int deviceId, bool isTrainable)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_inputDim = inputDim;
    m_deviceId = deviceId;

    Logger.WriteLine($"Create LSTM attention decoder cell '{name}' HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', ContextDim = '{contextDim}', DeviceId = '{deviceId}'");

    m_Wxhc = new WeightTensor(new long[2] { inputDim + hiddenDim + contextDim, hiddenDim * 4 }, deviceId, normType: NormType.Uniform, name: $"{name}.{nameof(m_Wxhc)}", isTrainable: isTrainable);
    m_b = new WeightTensor(new long[2] { 1, hiddenDim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: isTrainable);

    m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hiddenDim * 4, deviceId, isTrainable);
    m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hiddenDim, deviceId, isTrainable);
}
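// m_Wxhc widens the plain LSTMCell weight by contextDim: each decoding step
// concatenates the token input, the previous hidden state, and the attention
// context before the single gate projection, hence the first dimension of
// inputDim + hiddenDim + contextDim. A usage sketch (the argument values are
// illustrative assumptions):
// var decoderCell = new LSTMAttentionDecoderCell("Decoder.LSTMAttn_0",
//     hiddenDim: 512, inputDim: 512, contextDim: 1024, deviceId: 0, isTrainable: true);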
public LSTMCell(string name, int batchSize, int hdim, int dim, int deviceId)
{
    m_name = name;

    m_Wxh = new WeightTensor(new long[2] { dim + hdim, hdim * 4 }, deviceId, normal: true, name: $"{name}.{nameof(m_Wxh)}", isTrainable: true);
    m_b = new WeightTensor(new long[2] { 1, hdim * 4 }, 0, deviceId, name: $"{name}.{nameof(m_b)}", isTrainable: true);

    m_hdim = hdim;
    m_dim = dim;
    m_batchSize = batchSize;
    m_deviceId = deviceId;

    m_layerNorm1 = new LayerNormalization($"{name}.{nameof(m_layerNorm1)}", hdim * 4, deviceId);
    m_layerNorm2 = new LayerNormalization($"{name}.{nameof(m_layerNorm2)}", hdim, deviceId);
}
public SelfAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_multiHeadNum = multiHeadNum;
    m_d = m_hiddenDim / m_multiHeadNum;
    m_dropoutRatio = dropoutRatio;

    W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: true);
    b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: true);

    Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: true);
    Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: true);

    K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: true);
    Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: true);

    V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: true);
    Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: true);

    layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hiddenDim, deviceId);
    layerNorm2 = new LayerNormalization($"{name}.{nameof(layerNorm2)}", hiddenDim, deviceId);

    feedForwardLayer1 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer1)}", hiddenDim, hiddenDim * 4, m_dropoutRatio, deviceId);
    feedForwardLayer2 = new FeedForwardLayer($"{name}.{nameof(feedForwardLayer2)}", hiddenDim * 4, hiddenDim, m_dropoutRatio, deviceId);
}
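// Note: this earlier SelfAttention variant fuses a whole transformer block into one
// class: the multi-head attention weights (W0/b0 and the Q/K/V projections) plus the
// position-wise FFN (feedForwardLayer1/2) with two layer norms. The later versions
// above split those responsibilities into MultiHeadAttention and
// PositionwiseFeedForward, which is why it has no separate FFN members.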
public TransformerDecoder(string name, int multiHeadNum, int hiddenDim, int inputDim, int depth, float dropoutRatio, int deviceId, bool isTrainable, float learningRateFactor = 1.0f)
{
    Logger.WriteLine($"Creating transformer decoder at device '{deviceId}'. HiddenDim = '{hiddenDim}', InputDim = '{inputDim}', Depth = '{depth}', MultiHeadNum = '{multiHeadNum}'");

    m_name = name;
    m_multiHeadNum = multiHeadNum;
    m_hiddenDim = hiddenDim;
    m_inputDim = inputDim;
    // m_outputDim = outputDim;
    m_depth = depth;
    m_dropoutRatio = dropoutRatio;
    m_deviceId = deviceId;
    m_isTrainable = isTrainable;
    m_learningRateFactor = learningRateFactor;

    if (hiddenDim != inputDim)
    {
        throw new ArgumentException("hiddenDim is not equal to inputDim in TransformerDecoder.");
    }

    m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, sharedQKV: true, learningRateFactor: learningRateFactor));
    for (int i = 1; i < depth; i++)
    {
        m_selfAttns.Add(new MultiHeadAttention($"{name}.SelfAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, sharedQKV: true, learningRateFactor: learningRateFactor));
    }

    m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_0", multiHeadNum, hiddenDim, inputDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, learningRateFactor: learningRateFactor));
    for (int i = 1; i < depth; i++)
    {
        m_encAttns.Add(new MultiHeadAttention($"{name}.EncAttn_{i}", multiHeadNum, hiddenDim, hiddenDim, m_dropoutRatio, deviceId, isTrainable: isTrainable, learningRateFactor: learningRateFactor));
    }

    for (int i = 0; i < depth; i++)
    {
        m_posFFNs.Add(new PositionwiseFeedForward($"{name}.PosFFN_{i}", hiddenDim, m_dropoutRatio, deviceId, isTrainable, learningRateFactor: learningRateFactor));
    }

    layerNorm = new LayerNormalization($"{name}.{nameof(layerNorm)}", hiddenDim, deviceId, isTrainable, learningRateFactor: learningRateFactor);
    // m_decoderFFLayer = new FeedForwardLayer($"{name}.FeedForward", hiddenDim, outputDim, 0.0f, deviceId: deviceId, isTrainable: isTrainable);
}
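// Each decoder depth i therefore owns three sub-layers: a self-attention
// (m_selfAttns[i], built with sharedQKV: true, so Q, K and V come from one packed
// projection over the same decoder states), an encoder-decoder attention
// (m_encAttns[i], unshared, since queries and keys/values come from different
// tensors), and a position-wise FFN (m_posFFNs[i]). A usage sketch with
// illustrative hyper-parameters (the values are assumptions):
// var decoder = new TransformerDecoder("Decoder", multiHeadNum: 8, hiddenDim: 512,
//     inputDim: 512, depth: 6, dropoutRatio: 0.1f, deviceId: 0, isTrainable: true);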
public LSTMCell(int batchSize, int hdim, int dim, ArchTypeEnums archType, int deviceId)
{
    if (archType == ArchTypeEnums.GPU_CUDA)
    {
        Wxh = new WeightTensor(dim + hdim, hdim * 4, deviceId, true);
        b = new WeightTensor(1, hdim * 4, 0, deviceId);
    }
    else
    {
        Wxh = new WeightMatrix(dim + hdim, hdim * 4, true);
        b = new WeightMatrix(1, hdim * 4, 0);
    }

    this.hdim = hdim;
    this.dim = dim;
    this.batchSize = batchSize;
    this.deviceId = deviceId;

    layerNorm1 = new LayerNormalization(hdim * 4, archType, deviceId);
    layerNorm2 = new LayerNormalization(hdim, archType, deviceId);
}
public MultiHeadAttention(string name, int multiHeadNum, int hiddenDim, int inputDim, float dropoutRatio, int deviceId, bool isTrainable)
{
    m_name = name;
    m_hiddenDim = hiddenDim;
    m_multiHeadNum = multiHeadNum;
    m_d = m_hiddenDim / m_multiHeadNum;
    m_dropoutRatio = dropoutRatio;

    W0 = new WeightTensor(new long[2] { hiddenDim, hiddenDim }, deviceId, name: $"{name}.{nameof(W0)}", isTrainable: isTrainable, normal: NormType.Uniform);
    b0 = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(b0)}", isTrainable: isTrainable);

    Q = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(Q)}", isTrainable: isTrainable);
    Qb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Qb)}", isTrainable: isTrainable);

    K = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(K)}", isTrainable: isTrainable);
    Kb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Kb)}", isTrainable: isTrainable);

    V = new WeightTensor(new long[2] { inputDim, hiddenDim }, deviceId, name: $"{name}.{nameof(V)}", isTrainable: isTrainable);
    Vb = new WeightTensor(new long[2] { 1, hiddenDim }, 0, deviceId, name: $"{name}.{nameof(Vb)}", isTrainable: isTrainable);

    layerNorm1 = new LayerNormalization($"{name}.{nameof(layerNorm1)}", hiddenDim, deviceId, isTrainable);
}