// Concatenates a list of matrices row-wise; all inputs must share the same column count.
// Note: `bp` is currently unused here; backprop registration keys off this.needs_backprop.
public virtual IWeightMatrix ConcatRows(List<IWeightMatrix> wl, bool bp = true)
{
    List<WeightMatrix> twl = new List<WeightMatrix>();
    int sx = 0;
    int sy = 0;

    foreach (IWeightMatrix item in wl)
    {
        WeightMatrix m = item as WeightMatrix;
        sx += m.Rows;
        sy = m.Columns;

        twl.Add(m);
    }

    var res = weightMatrixFactory.CreateWeightMatrix(sx, sy);

    int startIdx = 0;
    for (var i = 0; i < twl.Count; i++)
    {
        Array.Copy(twl[i].Weight, 0, res.Weight, startIdx, twl[i].Weight.Length);
        startIdx += twl[i].Weight.Length;
    }

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            // Scatter the result gradient back to each source matrix:
            // SIMD over full vectors, then a scalar loop for the remainder.
            startIdx = 0;
            for (var i = 0; i < twl.Count; i++)
            {
                var k = 0;
                var n = twl[i].Gradient.Length;
                var moreItem = (n % Vector<float>.Count);
                var Gradient = twl[i].Gradient;

                while (k < n - moreItem)
                {
                    var vecResG = new Vector<float>(res.Gradient, startIdx + k);
                    var vecM1G = new Vector<float>(Gradient, k);
                    vecM1G += vecResG;
                    vecM1G.CopyTo(Gradient, k);

                    k += Vector<float>.Count;
                }

                while (k < n)
                {
                    Gradient[k] += res.Gradient[startIdx + k];
                    k++;
                }

                startIdx += n;
            }
        };
        this.backprop.Add(backward);
    }

    return res;
}
// Softmax over the flattened weight buffer, computed with SIMD:
// subtract the max for numeric stability, exponentiate, then normalize by the sum.
public virtual IWeightMatrix Softmax(IWeightMatrix src)
{
    WeightMatrix m = src as WeightMatrix;
    var res = weightMatrixFactory.CreateWeightMatrix(m.Rows, m.Columns); // probability volume

    var maxval = -999999.0f;
    var n = m.Weight.Length;
    var moreItem = (n % Vector<float>.Count);

    // Find the maximum value: vectorized pass, horizontal reduce, then scalar tail.
    var k = 0;
    var vecMaxVal = new Vector<float>(maxval);
    while (k < n - moreItem)
    {
        var vecMW = new Vector<float>(m.Weight, k);
        vecMaxVal = Vector.Max(vecMW, vecMaxVal);

        k += Vector<float>.Count;
    }

    for (int i = 0; i < Vector<float>.Count; i++)
    {
        if (vecMaxVal[i] > maxval)
        {
            maxval = vecMaxVal[i];
        }
    }

    while (k < n)
    {
        if (m.Weight[k] > maxval)
        {
            maxval = m.Weight[k];
        }
        k++;
    }

    // exp(x - max) with a running sum.
    double s = 0.0;
    k = 0;
    vecMaxVal = new Vector<float>(maxval);
    while (k < n - moreItem)
    {
        var vecMW = new Vector<float>(m.Weight, k);
        var vecV = FastExp(vecMW - vecMaxVal);
        vecV.CopyTo(res.Weight, k);

        s += Vector.Dot(vecV, Vector<float>.One);
        k += Vector<float>.Count;
    }

    k = n - moreItem;
    while (k < n)
    {
        float v = FastExp(m.Weight[k] - maxval);
        res.Weight[k] = v;
        s += v;

        k++;
    }

    // Normalize by the sum.
    k = 0;
    var vecS = new Vector<float>((float)s);
    while (k < n - moreItem)
    {
        var vecResW = new Vector<float>(res.Weight, k);
        vecResW = vecResW / vecS;
        vecResW.CopyTo(res.Weight, k);

        k += Vector<float>.Count;
    }

    while (k < n)
    {
        res.Weight[k] = (float)(res.Weight[k] / s);
        k++;
    }

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            double ss = 0.0;
            for (int i = 0; i < n; i++)
            {
                var v = res.Gradient[i] * res.Weight[i];
                m.Gradient[i] += v;
                ss += v;
            }
            for (int i = 0; i < n; i++)
            {
                m.Gradient[i] = (float)(m.Gradient[i] - ss * res.Weight[i]);
            }
        };
        this.backprop.Add(backward);
    }

    return res;
}
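// For reference, the backward action above implements the softmax Jacobian-vector
// product: for p = softmax(x),
//
//   dL/dx_i = p_i * (dL/dp_i - sum_j p_j * dL/dp_j)
//
// A minimal usage sketch (`g` is an instance of this graph; `logits` is illustrative):
//
//   var probs = g.Softmax(logits);   // same shape as logits, entries sum to 1
//
// Note that the max and sum reductions run over the whole flattened buffer, so a
// multi-row matrix is normalized as a single distribution, not row by row.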
// Concatenates matrices column-wise; all inputs must share the same row count.
public virtual IWeightMatrix ConcatColumns(params IWeightMatrix[] wl)
{
    List<WeightMatrix> twl = new List<WeightMatrix>();
    int sx = 0;
    int sy = 0;

    foreach (IWeightMatrix item in wl)
    {
        WeightMatrix m = item as WeightMatrix;
        sx = m.Rows;
        sy += m.Columns;

        twl.Add(m);
    }

    var res = weightMatrixFactory.CreateWeightMatrix(sx, sy);

    for (var i = 0; i < sx; i++)
    {
        int startIdx = 0;
        for (var j = 0; j < twl.Count; j++)
        {
            Array.Copy(twl[j].Weight, i * twl[j].Columns, res.Weight, i * res.Columns + startIdx, twl[j].Columns);
            startIdx += twl[j].Columns;
        }
    }

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            for (var i = 0; i < sx; i++)
            {
                int startIdx = 0;
                for (var j = 0; j < twl.Count; j++)
                {
                    var k = 0;
                    var tw_j = twl[j];
                    var moreItem = (tw_j.Columns % Vector<float>.Count);
                    var offsetM1 = i * tw_j.Columns;
                    var offsetRes = i * res.Columns + startIdx;

                    while (k < tw_j.Columns - moreItem)
                    {
                        var vecResG = new Vector<float>(res.Gradient, offsetRes + k);
                        var vecM1G = new Vector<float>(tw_j.Gradient, offsetM1 + k);
                        vecM1G += vecResG;
                        vecM1G.CopyTo(tw_j.Gradient, offsetM1 + k);

                        k += Vector<float>.Count;
                    }

                    while (k < tw_j.Columns)
                    {
                        tw_j.Gradient[offsetM1 + k] += res.Gradient[offsetRes + k];
                        k++;
                    }

                    startIdx += tw_j.Columns;
                }
            }
        };
        this.backprop.Add(backward);
    }

    return res;
}
// Layer normalization: each row is normalized to zero mean and unit variance,
// then scaled by alpha and shifted by beta (both indexed per-element here).
public IWeightMatrix LayerNorm(IWeightMatrix src, IWeightMatrix alpha, IWeightMatrix beta, float eps = 1e-09f)
{
    WeightMatrix srcM = src as WeightMatrix;
    WeightMatrix alphaM = alpha as WeightMatrix;
    WeightMatrix betaM = beta as WeightMatrix;

    int rows = srcM.Rows;
    int cols = srcM.Columns;

    var res = weightMatrixFactory.CreateWeightMatrix(rows, cols);

    for (int j = 0; j < rows; j++)
    {
        int baseIdx = j * cols;

        var sum = 0.0f;
        for (int i = 0; i < cols; i++)
        {
            sum += srcM.Weight[baseIdx + i];
        }
        float mean = sum / cols;

        float sqSum = 0.0f;
        for (int i = 0; i < cols; i++)
        {
            float ex = srcM.Weight[baseIdx + i] - mean;
            sqSum += ex * ex;
        }
        float sigma = (float)Math.Sqrt(eps + sqSum / cols);

        for (int i = 0; i < cols; i++)
        {
            float t = alphaM.Weight[baseIdx + i] * ((srcM.Weight[baseIdx + i] - mean) / sigma);
            t += betaM.Weight[baseIdx + i];
            res.Weight[baseIdx + i] = t;
        }
    }

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            for (int j = 0; j < rows; j++)
            {
                float sum_x = 0.0f;
                float sum_adj = 0.0f;
                float sum_adj_x = 0.0f;
                float sum_sqr = 0.0f;

                int baseIdx = j * cols;

                for (int i = 0; i < cols; i++)
                {
                    sum_x += srcM.Weight[baseIdx + i];
                    sum_adj_x += res.Gradient[baseIdx + i] * (res.Weight[baseIdx + i] - betaM.Weight[baseIdx + i]);
                    sum_adj += res.Gradient[baseIdx + i];
                }

                float mean = sum_x / cols;

                for (int i = 0; i < cols; i++)
                {
                    float ex = srcM.Weight[baseIdx + i] - mean;
                    sum_sqr += ex * ex;
                }

                float sigma = (float)Math.Sqrt(eps + sum_sqr / cols);

                for (int i = 0; i < cols; i++)
                {
                    float grad_x = 0.0f;

                    // Recover the normalized value x_hat from the cached output.
                    float x_hat = (res.Weight[baseIdx + i] - betaM.Weight[baseIdx + i]) / alphaM.Weight[baseIdx + i];

                    grad_x += cols * res.Gradient[baseIdx + i];
                    grad_x -= sum_adj;
                    grad_x -= sum_adj_x * x_hat;
                    grad_x /= cols * sigma;

                    srcM.Gradient[baseIdx + i] += alphaM.Weight[baseIdx + i] * grad_x;
                    alphaM.Gradient[baseIdx + i] += res.Gradient[baseIdx + i] * x_hat;
                    betaM.Gradient[baseIdx + i] += res.Gradient[baseIdx + i];
                }
            }
        };
        this.backprop.Add(backward);
    }

    return res;
}
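// For reference, the forward pass above computes, per row x of length C:
//
//   mu    = (1/C) * sum_i x_i
//   sigma = sqrt(eps + (1/C) * sum_i (x_i - mu)^2)
//   y_i   = alpha_i * (x_i - mu) / sigma + beta_i
//
// The backward pass re-derives x_hat = (y - beta) / alpha from the cached output
// rather than storing the normalized activations separately, trading a division
// per element for lower memory use.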
// Same forward computation as Softmax, but registers no backward action:
// it is intended to be paired with a cross-entropy loss whose gradient is
// written directly into the source matrix by the caller.
public virtual IWeightMatrix SoftmaxWithCrossEntropy(IWeightMatrix src)
{
    WeightMatrix m = src as WeightMatrix;
    var res = weightMatrixFactory.CreateWeightMatrix(m.Rows, m.Columns); // probability volume

    var maxval = -999999.0f;
    var n = m.Weight.Length;
    var moreItem = (n % Vector<float>.Count);

    // Find the maximum value: vectorized pass, horizontal reduce, then scalar tail.
    var k = 0;
    var vecMaxVal = new Vector<float>(maxval);
    while (k < n - moreItem)
    {
        var vecMW = new Vector<float>(m.Weight, k);
        vecMaxVal = Vector.Max(vecMW, vecMaxVal);

        k += Vector<float>.Count;
    }

    for (int i = 0; i < Vector<float>.Count; i++)
    {
        if (vecMaxVal[i] > maxval)
        {
            maxval = vecMaxVal[i];
        }
    }

    while (k < n)
    {
        if (m.Weight[k] > maxval)
        {
            maxval = m.Weight[k];
        }
        k++;
    }

    // exp(x - max) with a running sum.
    double s = 0.0;
    k = 0;
    vecMaxVal = new Vector<float>(maxval);
    while (k < n - moreItem)
    {
        var vecMW = new Vector<float>(m.Weight, k);
        var vecV = FastExp(vecMW - vecMaxVal);
        vecV.CopyTo(res.Weight, k);

        s += Vector.Dot(vecV, Vector<float>.One);
        k += Vector<float>.Count;
    }

    k = n - moreItem;
    while (k < n)
    {
        float v = FastExp(m.Weight[k] - maxval);
        res.Weight[k] = v;
        s += v;

        k++;
    }

    // Normalize by the sum.
    k = 0;
    var vecS = new Vector<float>((float)s);
    while (k < n - moreItem)
    {
        var vecResW = new Vector<float>(res.Weight, k);
        vecResW = vecResW / vecS;
        vecResW.CopyTo(res.Weight, k);

        k += Vector<float>.Count;
    }

    while (k < n)
    {
        res.Weight[k] = (float)(res.Weight[k] / s);
        k++;
    }

    // No backward pass is needed here, since the computed probabilities are used
    // outside this method to set gradients directly on m.
    return res;
}
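// For reference, the reason no backward action is registered above: when softmax
// is combined with a cross-entropy loss, the gradient with respect to the logits
// collapses to
//
//   dL/dx_i = p_i - 1{i == target}
//
// so the caller can subtract 1 from the predicted probability at the target index
// and write the result into m.Gradient directly, which is both cheaper and more
// numerically stable than chaining the two backward passes.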
// Computes res = m1 * m2 + m3 (matrix multiply plus element-wise add) with SIMD
// accumulation over the output rows.
public virtual WeightMatrix MulAdd(WeightMatrix m1, WeightMatrix m2, WeightMatrix m3)
{
    var n = m1.Rows;
    var d = m2.Columns;
    var res = weightMatrixFactory.CreateWeightMatrix(n, d);
    var moreItemsD = (d % Vector<float>.Count);

    // Parallel.For(0, m1.Rows, i =>
    for (int i = 0; i < m1.Rows; i++)
    {
        // loop over rows of m1
        var m1BaseIndex = d * i;
        var m1ColBaseIndex = m1.Columns * i;

        // Seed the output row with m3, then accumulate the m1 * m2 products into it.
        Array.Copy(m3.Weight, m1BaseIndex, res.Weight, m1BaseIndex, d);

        for (var k = 0; k < m1.Columns; k++)
        {
            // dot product loop
            var j = 0;
            var m1w = m1.Weight[m1ColBaseIndex + k];
            var m2BaseIndex = m2.Columns * k;

            while (j < d - moreItemsD)
            {
                int offset = m1BaseIndex + j;
                var vecM2W = new Vector<float>(m2.Weight, m2BaseIndex + j);
                var vecResWeight = new Vector<float>(res.Weight, offset);
                vecResWeight += m1w * vecM2W;
                vecResWeight.CopyTo(res.Weight, offset);

                j += Vector<float>.Count;
            }

            while (j < d)
            {
                res.Weight[m1BaseIndex + j] += m1w * m2.Weight[m2BaseIndex + j];
                j++;
            }
        }
    }//);

    if (this.needs_backprop)
    {
        Action backward = () =>
        {
            // Parallel.For(0, m1.Rows, i =>
            for (int i = 0; i < m1.Rows; i++)
            {
                // loop over rows of m1
                var resBaseIndex = d * i;
                var m1BaseIndex = m1.Columns * i;

                // m3 receives the result gradient unchanged.
                var j = 0;
                while (j < d - moreItemsD)
                {
                    int offset = resBaseIndex + j;
                    var vecResG = new Vector<float>(res.Gradient, offset);
                    var vecM3G = new Vector<float>(m3.Gradient, offset);
                    vecM3G += vecResG;
                    vecM3G.CopyTo(m3.Gradient, offset);

                    j += Vector<float>.Count;
                }

                while (j < d)
                {
                    int offset = resBaseIndex + j;
                    m3.Gradient[offset] += res.Gradient[offset];
                    j++;
                }

                // loop over cols of m2
                for (var k = 0; k < m1.Columns; k++)
                {
                    var m1GIndex = m1BaseIndex + k;
                    var m2GBaseIndex = m2.Columns * k;
                    var m1G = 0.0f;
                    var m1W = m1.Weight[m1GIndex];

                    j = 0;
                    while (j < d - moreItemsD)
                    {
                        int m2Index = m2GBaseIndex + j;
                        int offset = resBaseIndex + j;

                        var vecResG = new Vector<float>(res.Gradient, offset);
                        var vecM2W = new Vector<float>(m2.Weight, m2Index);
                        var vecM2G = new Vector<float>(m2.Gradient, m2Index);

                        m1G += Vector.Dot(vecM2W, vecResG);
                        vecM2G += m1W * vecResG;
                        vecM2G.CopyTo(m2.Gradient, m2Index);

                        j += Vector<float>.Count;
                    }

                    while (j < d)
                    {
                        int m2Index = m2GBaseIndex + j;
                        var b = res.Gradient[resBaseIndex + j];

                        m1G += m2.Weight[m2Index] * b;
                        m2.Gradient[m2Index] += m1W * b;

                        j++;
                    }

                    m1.Gradient[m1GIndex] += m1G;
                }
            }//);
        };
        this.backprop.Add(backward);
    }

    return res;
}
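// A minimal usage sketch of MulAdd as a fused affine transform (the names x, W,
// and b are illustrative, not part of this class): for an input batch x (n x k),
// weights W (k x d), and a bias matrix b of the same shape as the output (n x d),
//
//   var y = g.MulAdd(x, W, b);   // y = x * W + b, with gradients for all three
//
// Fusing the add into the multiply avoids allocating an intermediate n x d
// product matrix and making a second pass over the output.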
// Dispatches the optimizer update: the full weight buffer when no sparse-row
// bookkeeping exists, otherwise only the rows recorded in RowToBeUpdated.
private void UpdateWeightsCPU(float step_size, float regc, float clipval, Vector<float> vecMaxClipval, Vector<float> vecMinClipval, WeightMatrix m)
{
    if (m.RowToBeUpdated.Count == 0)
    {
        UpdateWeights(step_size, regc, clipval, m, vecMaxClipval, vecMinClipval, m.Weight.Length, 0);
    }
    else
    {
        foreach (var kv in m.RowToBeUpdated)
        {
            int rowId = kv.Key;
            UpdateWeights(step_size, regc, clipval, m, vecMaxClipval, vecMinClipval, m.Columns, rowId * m.Columns);
        }

        m.RowToBeUpdated.Clear();
    }
}
// RMSProp-style update with gradient clipping and L2 regularization over the
// range [i, n): a vectorized main loop followed by a scalar tail.
private void UpdateWeights(float step_size, float regc, float clipval, WeightMatrix m, Vector<float> vecMaxClipval, Vector<float> vecMinClipval, int n, int i)
{
    var s = m.Cash;
    var l = m.LrW;
    var vecBaseLR = new Vector<float>(step_size);
    var moreItems = (n % Vector<float>.Count);

    while (i < n - moreItems)
    {
        var vecMDWI = new Vector<float>(m.Gradient, i);

        // gradient clip
        vecMDWI = Vector.Min(vecMDWI, vecMaxClipval);
        vecMDWI = Vector.Max(vecMDWI, vecMinClipval);

        var vecS = new Vector<float>(s, i);
        vecS = vecS * vecDecayRate + (Vector<float>.One - vecDecayRate) * vecMDWI * vecMDWI;
        vecS.CopyTo(s, i);

        var vecMDWIDelta = vecMDWI / Vector.SquareRoot(vecS + vecSmoothEPS);
        var vecLRWeight = new Vector<float>(l, i);
        var vecLR = ComputeLearningRate(vecMDWIDelta, ref vecLRWeight, vecBaseLR);
        vecLRWeight.CopyTo(l, i);

        var vecMW = new Vector<float>(m.Weight, i);
        var vecDelta = -vecLR * vecMDWIDelta - regc * vecMW;

        vecMW += vecDelta;
        vecMW.CopyTo(m.Weight, i);

        i += Vector<float>.Count;
    }

    while (i < n)
    {
        // rmsprop adaptive learning rate
        var mdwi = m.Gradient[i];

        // gradient clip
        if (mdwi > clipval)
        {
            mdwi = clipval;
        }
        if (mdwi < -clipval)
        {
            mdwi = -clipval;
        }

        s[i] = (float)(s[i] * decay_rate + (1.0 - decay_rate) * mdwi * mdwi);

        var wDelta = (float)(mdwi / Math.Sqrt(s[i] + smooth_eps));
        var lr = ComputeLearningRate(wDelta, l, i, step_size);

        var delta = (float)(-lr * wDelta - regc * m.Weight[i]);

        // update (and regularize)
        m.Weight[i] += delta;

        i++;
    }
}
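// For reference, both loops above apply the same per-weight RMSProp rule
// (clipval is the gradient clipping threshold, regc the L2 coefficient):
//
//   g      = clamp(gradient, -clipval, +clipval)
//   cache  = decay_rate * cache + (1 - decay_rate) * g^2
//   w     += -lr * g / sqrt(cache + smooth_eps) - regc * w
//
// where lr is further adapted per weight by ComputeLearningRate from the
// accumulated statistics in m.LrW.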
// Copies the weights (not the gradients) from another matrix into this one.
public void CopyWeights(IWeightMatrix src)
{
    WeightMatrix m = src as WeightMatrix;
    Array.Copy(m.Weight, Weight, m.Weight.Length);
}
// One LSTM step with an attention context and optional sparse features:
// the four gate pre-activations are computed in parallel, then combined.
public WeightMatrix Step(SparseWeightMatrix sparseInput, WeightMatrix context, WeightMatrix input, IComputeGraph innerGraph)
{
    var hidden_prev = ht;
    var cell_prev = ct;
    var cell = this;

    WeightMatrix input_gate = null;
    WeightMatrix forget_gate = null;
    WeightMatrix output_gate = null;
    WeightMatrix cell_write = null;

    Parallel.Invoke(
        () =>
        {
            var h0 = innerGraph.mul(input, cell.Wix);
            var h1 = innerGraph.mul(hidden_prev, cell.Wih);
            var h11 = innerGraph.mul(context, cell.WiC);

            if (sdim > 0)
            {
                var h111 = innerGraph.mul(sparseInput, cell.WiS);
                input_gate = innerGraph.addsigmoid(h0, h1, h11, h111, cell.bi);
            }
            else
            {
                input_gate = innerGraph.addsigmoid(h0, h1, h11, cell.bi);
            }
        },
        () =>
        {
            var h2 = innerGraph.mul(input, cell.Wfx);
            var h3 = innerGraph.mul(hidden_prev, cell.Wfh);
            var h33 = innerGraph.mul(context, cell.WfC);

            if (sdim > 0)
            {
                var h333 = innerGraph.mul(sparseInput, cell.WfS);
                forget_gate = innerGraph.addsigmoid(h3, h2, h33, h333, cell.bf);
            }
            else
            {
                forget_gate = innerGraph.addsigmoid(h3, h2, h33, cell.bf);
            }
        },
        () =>
        {
            var h4 = innerGraph.mul(input, cell.Wox);
            var h5 = innerGraph.mul(hidden_prev, cell.Woh);
            var h55 = innerGraph.mul(context, cell.WoC);

            if (sdim > 0)
            {
                var h555 = innerGraph.mul(sparseInput, cell.WoS);
                output_gate = innerGraph.addsigmoid(h5, h4, h55, h555, cell.bo);
            }
            else
            {
                output_gate = innerGraph.addsigmoid(h5, h4, h55, cell.bo);
            }
        },
        () =>
        {
            var h6 = innerGraph.mul(input, cell.Wcx);
            var h7 = innerGraph.mul(hidden_prev, cell.Wch);
            var h77 = innerGraph.mul(context, cell.WcC);

            if (sdim > 0)
            {
                var h777 = innerGraph.mul(sparseInput, cell.WcS);
                cell_write = innerGraph.addtanh(h7, h6, h77, h777, cell.bc);
            }
            else
            {
                cell_write = innerGraph.addtanh(h7, h6, h77, cell.bc);
            }
        });

    // compute new cell activation
    var retain_cell = innerGraph.eltmul(forget_gate, cell_prev); // what do we keep from cell
    var write_cell = innerGraph.eltmul(input_gate, cell_write);  // what do we write to cell
    var cell_d = innerGraph.add(retain_cell, write_cell);        // new cell contents

    // compute hidden state as gated, saturated cell activations
    var hidden_d = innerGraph.eltmul(output_gate, innerGraph.tanh(cell_d));

    this.ht = hidden_d;
    this.ct = cell_d;

    return ht;
}
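// For reference, the step above follows the standard LSTM recurrences, extended
// with the attention context c_t and optional sparse features s_t as extra inputs
// to every gate (⊙ denotes the element-wise product; bracketed terms apply only
// when sdim > 0):
//
//   i_t = sigmoid(x_t·Wix + h_{t-1}·Wih + c_t·WiC [+ s_t·WiS] + bi)
//   f_t = sigmoid(x_t·Wfx + h_{t-1}·Wfh + c_t·WfC [+ s_t·WfS] + bf)
//   o_t = sigmoid(x_t·Wox + h_{t-1}·Woh + c_t·WoC [+ s_t·WoS] + bo)
//   g_t = tanh   (x_t·Wcx + h_{t-1}·Wch + c_t·WcC [+ s_t·WcS] + bc)
//   ct  = f_t ⊙ ct_prev + i_t ⊙ g_t
//   ht  = o_t ⊙ tanh(ct)
//
// The four Parallel.Invoke branches write only to disjoint locals, but each also
// records backprop actions through innerGraph, so this pattern assumes the graph
// implementation tolerates concurrent node creation.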