/// <summary>
/// Multiplies every element of <paramref name="dst"/>, in place, by the first element of
/// <paramref name="scalar"/>.
/// </summary>
/// <param name="scalar">Span whose first element is the multiplier. Must not be empty.</param>
/// <param name="dst">Values to scale in place. May be empty, in which case nothing is written.</param>
/// <exception cref="ArgumentException"><paramref name="scalar"/> is empty.</exception>
private unsafe void MultiplyScalarU(Span<float> scalar, Span<float> dst)
{
    // Pinning an empty span yields a null pointer, which the broadcast below would dereference.
    if (scalar.IsEmpty)
    {
        throw new ArgumentException("The scalar span must contain at least one element.", nameof(scalar));
    }

    fixed (float* pdst = dst)
    fixed (float* psrc = scalar)
    {
        float* pDstEnd = pdst + dst.Length;
        float* pDstCurrent = pdst;

        Vector256<float> scalarVector256 = Avx.BroadcastScalarToVector256(psrc);

        // Main loop: eight floats per iteration.
        while (pDstCurrent + 8 <= pDstEnd)
        {
            Vector256<float> dstVector = Avx.LoadVector256(pDstCurrent);
            dstVector = Avx.Multiply(dstVector, scalarVector256);
            Avx.Store(pDstCurrent, dstVector);

            pDstCurrent += 8;
        }

        // Tail: the remaining (dst.Length % 8) elements do not fill a whole vector,
        // so scale them one at a time instead of leaving them unmodified.
        float scalarValue = *psrc;
        while (pDstCurrent < pDstEnd)
        {
            *pDstCurrent *= scalarValue;
            pDstCurrent++;
        }
    }
}
/// <summary>
/// Calculates the stochastic gradient for one example and updates the model in place using the
/// ADAGRAD rule. This function implements Algorithm 2 in
/// https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
/// </summary>
/// <remarks>
/// Latent vectors are processed eight floats at a time; when <paramref name="latentDim"/> is not a
/// multiple of eight, the trailing elements of each latent vector are neither read nor written.
/// </remarks>
/// <param name="fieldIndices">Field index of each of the <paramref name="count"/> entries.</param>
/// <param name="featureIndices">Feature index of each of the <paramref name="count"/> entries.</param>
/// <param name="featureValues">Feature value of each of the <paramref name="count"/> entries.</param>
/// <param name="latentSum">Per-field-pair latent sums q, read at offset (f' * fieldCount + f) * latentDim.</param>
/// <param name="linearWeights">Linear weights, indexed by feature index; updated in place.</param>
/// <param name="latentWeights">Latent weights, read and written at offset (j * fieldCount + f') * latentDim.</param>
/// <param name="linearAccumulatedSquaredGrads">Running sum of squared linear gradients, indexed by feature index; updated in place.</param>
/// <param name="latentAccumulatedSquaredGrads">Running sum of squared latent gradients, laid out like <paramref name="latentWeights"/>; updated in place.</param>
/// <param name="lambdaLinear">L2 regularization coefficient applied to the linear weights.</param>
/// <param name="lambdaLatent">L2 regularization coefficient applied to the latent weights.</param>
/// <param name="learningRate">Step size used by both ADAGRAD updates.</param>
/// <param name="fieldCount">Number of fields (m).</param>
/// <param name="latentDim">Length of each latent vector (d).</param>
/// <param name="weight">Factor that scales every gradient.</param>
/// <param name="count">Number of entries in the three per-entry arrays.</param>
/// <param name="slope">Factor that scales the loss part of every gradient.</param>
public static unsafe void CalculateGradientAndUpdate(int* fieldIndices, int* featureIndices, float* featureValues, float* latentSum, float* linearWeights,
    float* latentWeights, float* linearAccumulatedSquaredGrads, float* latentAccumulatedSquaredGrads, float lambdaLinear, float lambdaLatent, float learningRate,
    int fieldCount, int latentDim, float weight, int count, float slope)
{
    Contracts.Assert(Avx.IsSupported);

    // Number of floats held by one 256-bit vector.
    const int floatsPerVector = 8;

    Vector256<float> weightVec = Vector256.Create(weight);
    Vector256<float> slopeVec = Vector256.Create(slope);
    Vector256<float> learningRateVec = Vector256.Create(learningRate);
    Vector256<float> lambdaLatentVec = Vector256.Create(lambdaLatent);

    for (int i = 0; i < count; i++)
    {
        int field = fieldIndices[i];
        int feature = featureIndices[i];

        // Gradient of the linear term w_j: regularization plus loss, scaled by the example weight.
        float linearGrad = weight * (lambdaLinear * linearWeights[feature] + slope * featureValues[i]);

        // Accumulate the squared gradient, then take the ADAGRAD step on the linear term.
        linearAccumulatedSquaredGrads[feature] += linearGrad * linearGrad;
        linearWeights[feature] -= learningRate / MathF.Sqrt(linearAccumulatedSquaredGrads[feature]) * linearGrad;

        // Update the latent terms v_j,f' for f' = 1,...,m.
        Vector256<float> x = Avx.BroadcastScalarToVector256(featureValues + i);

        // slope * x does not depend on f', so it is computed once per entry.
        Vector256<float> slopeTimesX = Avx.Multiply(slopeVec, x);

        // Start of this feature's m latent vectors and of their accumulated squared gradients.
        float* featureLatent = latentWeights + feature * fieldCount * latentDim;
        float* featureLatentSquaredGrads = latentAccumulatedSquaredGrads + feature * fieldCount * latentDim;

        for (int fprime = 0; fprime < fieldCount; fprime++)
        {
            float* vjfprime = featureLatent + fprime * latentDim;
            float* hvjfprime = featureLatentSquaredGrads + fprime * latentDim;
            float* qfprimef = latentSum + fprime * fieldCount * latentDim + field * latentDim;
            bool isOwnField = fprime == field;

            for (int k = 0; k + floatsPerVector <= latentDim; k += floatsPerVector)
            {
                Vector256<float> v = Avx.LoadVector256(vjfprime + k);
                Vector256<float> q = Avx.LoadVector256(qfprimef + k);

                // Loss gradient input: q itself, or q combined with v * x when f' is the entry's own
                // field (MultiplyAddNegated presumably computes q - v * x; confirm against the helper).
                Vector256<float> lossInput = isOwnField ? MultiplyAddNegated(v, x, q) : q;

                // L2 regularization gradient plus loss gradient, scaled by the example weight.
                Vector256<float> latentGrad = Avx.Multiply(lambdaLatentVec, v);
                latentGrad = MultiplyAdd(slopeTimesX, lossInput, latentGrad);
                latentGrad = Avx.Multiply(weightVec, latentGrad);

                // Accumulate the squared gradient of the latent vector.
                Vector256<float> h = MultiplyAdd(latentGrad, latentGrad, Avx.LoadVector256(hvjfprime + k));

                // ADAGRAD step on the latent vector, using the approximate reciprocal square root of h.
                v = MultiplyAddNegated(learningRateVec, Avx.Multiply(Avx.ReciprocalSqrt(h), latentGrad), v);
                Avx.Store(vjfprime + k, v);
                Avx.Store(hvjfprime + k, h);
            }
        }
    }
}
/// <summary>
/// Computes the output value of the field-aware factorization machine for one example, as the sum
/// of the linear part and the latent part, and writes it to <paramref name="response"/>.
/// This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf.
/// The linear part is the inner product of linearWeights and featureValues.
/// The latent part is the sum of all intra-field interactions in one field f, for all fields possible,
/// plus the inter-field interactions between every pair of distinct fields.
/// </summary>
/// <remarks>
/// Latent vectors are processed eight floats at a time; when <paramref name="latentDim"/> is not a
/// multiple of eight, the trailing elements of each latent vector do not contribute.
/// </remarks>
/// <param name="fieldIndices">Field index of each of the <paramref name="count"/> entries.</param>
/// <param name="featureIndices">Feature index of each of the <paramref name="count"/> entries.</param>
/// <param name="featureValues">Feature value of each of the <paramref name="count"/> entries.</param>
/// <param name="linearWeights">Linear weights, indexed by feature index.</param>
/// <param name="latentWeights">Latent weights, read at offset (j * fieldCount + f') * latentDim.</param>
/// <param name="latentSum">Output buffer of fieldCount * fieldCount * latentDim floats; zeroed, then filled with the sums q.</param>
/// <param name="response">Receives the computed output value.</param>
/// <param name="fieldCount">Number of all possible fields (m).</param>
/// <param name="latentDim">Length of each latent vector (d).</param>
/// <param name="count">Number of entries in the three per-entry arrays.</param>
public static unsafe void CalculateIntermediateVariables(int* fieldIndices, int* featureIndices, float* featureValues,
    float* linearWeights, float* latentWeights, float* latentSum, float* response, int fieldCount, int latentDim, int count)
{
    Contracts.Assert(Avx.IsSupported);

    // Number of floats held by one 256-bit vector.
    const int floatsPerVector = 8;

    float linearResponse = 0;
    float latentResponse = 0;

    // Start from q = 0 for every (field, field) pair.
    Unsafe.InitBlock(latentSum, 0, (uint)(fieldCount * fieldCount * latentDim * sizeof(float)));

    // interField accumulates the f != f' products; intraField accumulates the terms that are halved at the end.
    Vector256<float> interField = Vector256<float>.Zero;
    Vector256<float> intraField = Vector256<float>.Zero;

    for (int i = 0; i < count; i++)
    {
        int field = fieldIndices[i];
        int feature = featureIndices[i];
        linearResponse += linearWeights[feature] * featureValues[i];

        Vector256<float> x = Avx.BroadcastScalarToVector256(featureValues + i);
        Vector256<float> xSquared = Avx.Multiply(x, x);

        // intraField -= <v_j,f, v_j,f> * x * x
        // v_j,f is the j-th feature's latent vector in the f-th field hidden space.
        float* vjf = latentWeights + (feature * fieldCount * latentDim + field * latentDim);
        for (int k = 0; k + floatsPerVector <= latentDim; k += floatsPerVector)
        {
            Vector256<float> v = Avx.LoadVector256(vjf + k);
            intraField = MultiplyAddNegated(Avx.Multiply(v, v), xSquared, intraField);
        }

        // q_f,f' += v_j,f' * x, for every field f'.
        for (int fprime = 0; fprime < fieldCount; fprime++)
        {
            float* vjfprime = latentWeights + (feature * fieldCount * latentDim + fprime * latentDim);
            float* qffprime = latentSum + (field * fieldCount * latentDim + fprime * latentDim);

            for (int k = 0; k + floatsPerVector <= latentDim; k += floatsPerVector)
            {
                Vector256<float> v = Avx.LoadVector256(vjfprime + k);
                Vector256<float> q = Avx.LoadVector256(qffprime + k);
                Avx.Store(qffprime + k, MultiplyAdd(v, x, q));
            }
        }
    }

    for (int f = 0; f < fieldCount; f++)
    {
        // Intra-field interactions: intraField += <q_f,f, q_f,f>
        float* qff = latentSum + f * fieldCount * latentDim + f * latentDim;
        for (int k = 0; k + floatsPerVector <= latentDim; k += floatsPerVector)
        {
            Vector256<float> q = Avx.LoadVector256(qff + k);
            intraField = MultiplyAdd(q, q, intraField);
        }

        // Inter-field interactions: interField += <q_f,f', q_f',f>.
        // This loop starts at f + 1, so it only visits pairs with f != f', each pair once.
        for (int fprime = f + 1; fprime < fieldCount; fprime++)
        {
            float* qffprime = latentSum + f * fieldCount * latentDim + fprime * latentDim;
            float* qfprimef = latentSum + fprime * fieldCount * latentDim + f * latentDim;

            for (int k = 0; k + floatsPerVector <= latentDim; k += floatsPerVector)
            {
                Vector256<float> left = Avx.LoadVector256(qffprime + k);
                Vector256<float> right = Avx.LoadVector256(qfprimef + k);
                interField = MultiplyAdd(left, right, interField);
            }
        }
    }

    // Combine the two accumulators; _point5 is defined elsewhere (presumably a vector of 0.5f).
    Vector256<float> total = MultiplyAdd(_point5, intraField, interField);

    // Reduce the eight lanes to a single sum: add the two 128-bit halves, then two horizontal adds.
    Vector256<float> folded = Avx.Add(total, Avx.Permute2x128(total, total, 1));
    folded = Avx.HorizontalAdd(folded, folded);
    total = Avx.HorizontalAdd(folded, folded);

    // The lowest slot is the response value.
    Sse.StoreScalar(&latentResponse, total.GetLower());
    *response = linearResponse + latentResponse;
}
/// <summary>
/// Sums the series x^(-1)/1 + x^(-3)/3 + x^(-5)/5 + ... four terms at a time using AVX,
/// stopping once the last term of a batch is smaller in magnitude than
/// <paramref name="stepThreshold"/> or once <paramref name="maxN"/> terms have been consumed.
/// </summary>
/// <param name="x">Argument of the series.</param>
/// <param name="stepThreshold">Magnitude below which the last term of a batch ends the summation.</param>
/// <param name="maxN">Upper bound on the number of terms; checked before each batch of four.</param>
/// <returns>
/// The accumulated sum, or <see cref="double.NaN"/> when AVX is not supported. <c>Status</c> and
/// <c>N</c> (the number of terms added) are updated as a side effect.
/// </returns>
protected override unsafe double CalculateImpl(double x, double stepThreshold, int maxN)
{
    if (!Avx.IsSupported)
    {
        Status = TaylorSeriesStatus.NotSupported;
        return double.NaN;
    }

    // Number of doubles in one 256-bit vector (4).
    const int vectorSize = 256 / 8 / sizeof(double);

    // Added to the denominators after every batch: (8, 8, 8, 8).
    Vector256<double> eights = Vector256.Create(8.0);

    // (x^8, x^8, x^8, x^8), obtained by squaring the broadcast x three times.
    Vector256<double> xPow8 = Vector256.Create(x);
    xPow8 = Avx.Multiply(xPow8, xPow8);
    xPow8 = Avx.Multiply(xPow8, xPow8);
    xPow8 = Avx.Multiply(xPow8, xPow8);

    // Numerators of the first batch: (x^(-1), x^(-3), x^(-5), x^(-7)).
    double xSquared = x * x;
    double power1 = 1 / x;
    double power3 = power1 / xSquared;
    double power5 = power3 / xSquared;
    double power7 = power5 / xSquared;
    Vector256<double> numerators = Vector256.Create(power1, power3, power5, power7);

    // Denominators of the first batch.
    Vector256<double> denominators = Vector256.Create(1.0, 3.0, 5.0, 7.0);

    // Four independent partial sums, one per lane.
    Vector256<double> partialSums = Vector256<double>.Zero;

    N = 0;
    while (N < maxN)
    {
        Vector256<double> terms = Avx.Divide(numerators, denominators);
        partialSums = Avx.Add(partialSums, terms);
        N += vectorSize;

        // Only the last term of the batch is compared against the threshold.
        if (Math.Abs(terms.GetElement(vectorSize - 1)) < stepThreshold)
        {
            break;
        }

        // Advance to the next four terms: numerators shrink by x^8, denominators grow by 8.
        numerators = Avx.Divide(numerators, xPow8);
        denominators = Avx.Add(denominators, eights);
    }

    // NOTE(review): a run that meets the threshold on the same batch that brings N up to maxN
    // is still reported as TooManyIterations - confirm this is the intended contract.
    Status = N >= maxN ? TaylorSeriesStatus.TooManyIterations : TaylorSeriesStatus.Success;

    // Add the four lanes in index order.
    return partialSums.GetElement(0) + partialSums.GetElement(1) + partialSums.GetElement(2) + partialSums.GetElement(3);
}
}