/// <summary>
/// Forward pass of the convolution layer via an im2col transform followed by a
/// hand-rolled parallel matrix multiply.
/// Steps: (1) build the linear input indexes touched by every filter placement,
/// (2) gather them into the im2col matrix <c>allInOne</c> of shape
/// (outputH*outputW, filterH*filterW*channels), (3) multiply by the flattened
/// weights and add per-filter biases into <c>nextLayer.inputTensor</c>.
/// Side effects: allocates the output tensor, writes nextLayer.inputTensor,
/// disposes all temporaries and this layer's input tensor.
/// </summary>
public unsafe override void feedNext()
{
    outputTensorMemAlloc();

    int inputHeight = InputTensorDims[0];
    int inputWidth = InputTensorDims[1];
    int filterHeight = weights.Dimensions[0];
    int filterWidth = weights.Dimensions[1];
    int channelCount = weights.Dimensions[2];
    int filterCount = weights.Dimensions[3];

    // Number of valid filter placements per axis ("valid" convolution, no padding).
    int mCountH = inputHeight - filterHeight + 1;
    int mCountW = inputWidth - filterWidth + 1;

    // Starting row/column of each stride-spaced filter placement.
    Tensor possibleH = new Tensor(new int[] { outputDims[0], 1 });
    int j = 0;
    for (int i = 0; i < mCountH; i += stride[0]) { possibleH.memPtr[j++] = i; }
    Tensor possibleW = new Tensor(new int[] { 1, outputDims[1] });
    j = 0;
    for (int i = 0; i < mCountW; i += stride[1]) { possibleW.memPtr[j++] = i; }
    // Broadcasted (H,1)+(1,W): linear start index (row * inputWidth + col) of every placement.
    Tensor startingIndexes = possibleW + possibleH * inputWidth;
    possibleH.Dispose();
    possibleW.Dispose();

    // Linear offsets of every cell inside one filter window (same row*width+col scheme).
    possibleH = new Tensor(new int[] { filterHeight, 1 });
    for (int i = 0; i < filterHeight; i++) { possibleH.memPtr[i] = i; }
    possibleW = new Tensor(new int[] { 1, filterWidth });
    for (int i = 0; i < filterWidth; i++) { possibleW.memPtr[i] = i; }
    Tensor offsets = possibleW + possibleH * inputWidth;
    possibleH.Dispose();
    possibleW.Dispose();

    // Outer sum (starts as column) + (offsets as row): one row of input indexes
    // per output position, one column per in-window cell.
    startingIndexes.reshape(new int[] { startingIndexes.TotalLength, 1 });
    offsets.reshape(new int[] { 1, offsets.TotalLength });
    Tensor allIndexes = startingIndexes + offsets;
    startingIndexes.Dispose();
    offsets.Dispose();

    // im2col matrix: (outputH*outputW) x (filterH*filterW*channelCount).
    int outputH_W = outputDims[0] * outputDims[1];
    Tensor allInOne = new Tensor(new int[] { outputH_W, filterHeight * filterWidth * channelCount });
    int h_W = inputHeight * inputWidth;
    int fH_fW = filterHeight * filterWidth;
    int[] aiInd = new int[] { 0, 0 };
    int[] aioInd = new int[] { 0, 0 };
    int tmp;
    for (int ch = 0; ch < channelCount; ch++)
    {
        for (int k = 0; k < outputH_W; k++)
        {
            aioInd[0] = k;
            aiInd[0] = k;
            for (int m = 0; m < fH_fW; m++)
            {
                aioInd[1] = ch * fH_fW + m;
                aiInd[1] = m;
                // Shift by h_W * ch to address the current channel's plane of the input.
                tmp = (int)allIndexes[aiInd] + h_W * ch;
                allInOne[aioInd] = inputTensor.memPtr[tmp];
            }
        }
    }
    allIndexes.Dispose();

    // Matrix multiply: output(z, y) = allInOne(z, x) * weights(x, y) + biases,
    // where x = im2col row length, y = filterCount, z = output positions.
    nextLayer.inputTensor.reshape(new int[] { allInOne.Dimensions[0], filterCount });
    int x = allInOne.Dimensions[1];
    int y = filterCount;
    int z = allInOne.Dimensions[0];
    ParallelOptions po = new ParallelOptions();
    po.MaxDegreeOfParallelism = Environment.ProcessorCount;
    // Parallelize over filters: each f writes a disjoint slice of the output, so no locking needed.
    Parallel.For(0, y, po, f =>
    {
        int aioInd_;
        int outputInd_;
        int weightsInd_;
        for (int g = 0; g < z; g++)
        {
            float sum = 0;
            for (int h = 0; h < x; h++)
            {
                // NOTE(review): flat indexing here (element (g,h) at h*z+g) assumes
                // Tensor memory is column-major — confirm against the Tensor class.
                aioInd_ = h * z + g;
                weightsInd_ = f * x + h;
                sum += weights.memPtr[weightsInd_] * allInOne.memPtr[aioInd_];
            }
            outputInd_ = f * z + g;
            nextLayer.inputTensor.memPtr[outputInd_] = sum + biases.memPtr[f];
        }
    });
    nextLayer.inputTensor.reshape(outputDims);
    allInOne.Dispose();
    disposeInputTensor();
}
/// <summary>
/// Forward pass of the convolution layer via an im2col transform, then either a
/// BLAS GeMM (when <c>useCBLAS</c>) or a hand-rolled parallel matrix multiply.
/// Steps: (1) build the linear input indexes touched by every filter placement,
/// (2) gather them into the im2col matrix <c>allInOne</c> of shape
/// (outputH*outputW, filterH*filterW*channels), (3) multiply by the flattened
/// weights and add per-filter biases into <c>nextLayer.inputTensor</c>.
/// Side effects: allocates the output tensor, writes nextLayer.inputTensor,
/// temporarily reshapes weights in the CBLAS path (restored before returning),
/// disposes all temporaries and this layer's input tensor.
/// </summary>
public unsafe override void feedNext()
{
    outputTensorMemAlloc();

    int inputHeight = InputTensorDims[0];
    int inputWidth = InputTensorDims[1];
    int filterHeight = weights.Dimensions[0];
    int filterWidth = weights.Dimensions[1];
    int channelCount = weights.Dimensions[2];
    int filterCount = weights.Dimensions[3];

    // Number of valid filter placements per axis ("valid" convolution, no padding).
    int mCountH = inputHeight - filterHeight + 1;
    int mCountW = inputWidth - filterWidth + 1;

    // Starting row/column of each stride-spaced filter placement.
    Tensor possibleH = new Tensor(new int[] { outputDims[0], 1 });
    int j = 0;
    for (int i = 0; i < mCountH; i += stride[0]) { possibleH.memPtr[j++] = i; }
    Tensor possibleW = new Tensor(new int[] { 1, outputDims[1] });
    j = 0;
    for (int i = 0; i < mCountW; i += stride[1]) { possibleW.memPtr[j++] = i; }
    // Broadcasted (H,1)+(1,W): linear start index (row * inputWidth + col) of every placement.
    Tensor startingIndexes = possibleW + possibleH * inputWidth;
    possibleH.Dispose();
    possibleW.Dispose();

    // Linear offsets of every cell inside one filter window (same row*width+col scheme).
    possibleH = new Tensor(new int[] { filterHeight, 1 });
    for (int i = 0; i < filterHeight; i++) { possibleH.memPtr[i] = i; }
    possibleW = new Tensor(new int[] { 1, filterWidth });
    for (int i = 0; i < filterWidth; i++) { possibleW.memPtr[i] = i; }
    Tensor offsets = possibleW + possibleH * inputWidth;
    possibleH.Dispose();
    possibleW.Dispose();

    // Outer sum (starts as column) + (offsets as row): one row of input indexes
    // per output position, one column per in-window cell.
    startingIndexes.reshape(new int[] { startingIndexes.TotalLength, 1 });
    offsets.reshape(new int[] { 1, offsets.TotalLength });
    Tensor allIndexes = startingIndexes + offsets;
    startingIndexes.Dispose();
    offsets.Dispose();

    // im2col matrix: (outputH*outputW) x (filterH*filterW*channelCount).
    int outputH_W = outputDims[0] * outputDims[1];
    Tensor allInOne = new Tensor(new int[] { outputH_W, filterHeight * filterWidth * channelCount });
    int h_W = inputHeight * inputWidth;
    int fH_fW = filterHeight * filterWidth;
    int tmp;

    // Gather the im2col matrix with flat sequential writes (faster than the
    // 2-D indexer): element (row k, col c) lives at c * outputH_W + k, which
    // looks column-major — confirm against the Tensor class.
    int aiInd, aioInd;
    for (int ch = 0; ch < channelCount; ch++)
    {
        int fH_fW_ch = fH_fW * ch;
        int h_W_ch = h_W * ch;       // offset of this channel's plane in the input
        for (int m = 0; m < fH_fW; m++)
        {
            aiInd = m * outputH_W;
            aioInd = (fH_fW_ch + m) * outputH_W;
            for (int k = 0; k < outputH_W; k++)
            {
                tmp = (int)allIndexes.memPtr[aiInd++] + h_W_ch;
                allInOne.memPtr[aioInd++] = inputTensor.memPtr[tmp];
            }
        }
    }
    allIndexes.Dispose();

    nextLayer.inputTensor.reshape(new int[] { allInOne.Dimensions[0], filterCount });
    if (useCBLAS)
    {
        // Flatten weights to 2-D for GeMM; restored to 4-D below.
        weights.reshape(new int[] { filterHeight * filterWidth * channelCount, filterCount });
        // Pre-fill each output column with its filter's bias so GeMM with
        // beta = 1 accumulates onto it: output = allInOne * weights + biases.
        int q = 0;
        for (int i = 0; i < nextLayer.inputTensor.Dimensions[1]; i++)
        {
            float f = biases.memPtr[i];
            for (int p = 0; p < nextLayer.inputTensor.Dimensions[0]; p++)
            {
                nextLayer.inputTensor.memPtr[q++] = f;
            }
        }
        nextLayer.inputTensor.GeMM(allInOne, weights, 1.0f, 1.0f);
        weights.reshape(new int[] { filterHeight, filterWidth, channelCount, filterCount });
    }
    else
    {
        // Fallback: hand-rolled multiply, parallelized over filters. Each f
        // writes a disjoint slice of the output, so no locking is needed.
        int x = allInOne.Dimensions[1];
        int y = filterCount;
        int z = allInOne.Dimensions[0];
        ParallelOptions po = new ParallelOptions();
        po.MaxDegreeOfParallelism = Environment.ProcessorCount;
        Parallel.For(0, y, po, f =>
        {
            int aioInd_;
            int outputInd_;
            int weightsInd_;
            for (int g = 0; g < z; g++)
            {
                float sum = 0;
                for (int h = 0; h < x; h++)
                {
                    aioInd_ = h * z + g;
                    weightsInd_ = f * x + h;
                    sum += weights.memPtr[weightsInd_] * allInOne.memPtr[aioInd_];
                }
                outputInd_ = f * z + g;
                nextLayer.inputTensor.memPtr[outputInd_] = sum + biases.memPtr[f];
            }
        });
    }
    nextLayer.inputTensor.reshape(outputDims);
    allInOne.Dispose();
    disposeInputTensor();
}