/// <summary>
        /// Computes the error gradient w.r.t. the inputs.
        /// </summary>
        /// <remarks>
        /// The diff of colTop[0] is first copied into the diff of colBottom[0].  When the triplet loss is
        /// enabled (and the optional iteration gate has been passed) the cache lookup in run() is performed
        /// and, for every item that has both a usable negative and a usable positive pool entry, a margin
        /// error is added to each of that item's class gradients before they are written back to colBottom[0].
        /// Note that run() may also rewrite the predictions stored in the data of colTop[0].
        /// </remarks>
        /// <param name="colTop">top output Blob vector; only colTop[0] is used, providing the error gradient
        /// with respect to the computed outputs.</param>
        /// <param name="rgbPropagateDown">propagate down see Layer::Backward; only element 0 is checked.</param>
        /// <param name="colBottom">bottom input Blob vector; colBottom[0] receives the gradient, colBottom[1]
        /// supplies the item count and dimension, and colBottom[1] and colBottom[2] are consumed by run().
        /// </param>
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            if (!rgbPropagateDown[0])
            {
                return;
            }

            // Pass the top gradient straight through to the first bottom.
            m_cuda.copy(colTop[0].count(), colTop[0].gpu_diff, colBottom[0].mutable_gpu_diff);

            // The triplet loss is optional and may be delayed until a given iteration has been passed.
            bool bTripletActive = m_param.binary_hash_param.enable_triplet_loss &&
                                  (m_param.binary_hash_param.iteration_enable == 0 || m_nIteration > m_param.binary_hash_param.iteration_enable);
            if (!bTripletActive)
            {
                return;
            }

            int nItemCount = colBottom[1].num;
            int nItemDim = colBottom[1].count() / colBottom[1].num;

            // A null result means the cache is not yet ready, so there is nothing to add to the gradient.
            List<PoolItemCollection> rgrgPools = run(nItemCount, colBottom, colTop);
            if (rgrgPools == null)
            {
                return;
            }

            float[] rgGrad = convertF(colTop[0].mutable_cpu_diff);
            bool bGradChanged = false;
            int nCacheDepth = m_param.binary_hash_param.cache_depth;

            for (int nItem = 0; nItem < nItemCount; nItem++)
            {
                PoolItemCollection pool = rgrgPools[nItem];
                int nTrueLabel = (int)m_rgLabels[nItem];

                // Negative: scanning from the front of the pool, take the first entry with a different
                // label, skipping entries that sit at the 'LastIndex' slot of their class
                // (presumably the front of the sorted pool holds the smallest distances - confirm in PoolItemCollection.Sort).
                PoolItem negative = null;
                for (int j = 0; j < pool.Count; j++)
                {
                    PoolItem candidate = pool[j];
                    int nCandidateLabel = candidate.Label;

                    if (candidate.IndexIntoClass == m_rgCache2CurrentIndex[nCandidateLabel].LastIndex)
                    {
                        continue;
                    }

                    if (nCandidateLabel != nTrueLabel)
                    {
                        negative = candidate;
                        break;
                    }
                }

                // Positive: scanning from the back of the pool, take the first entry with the same
                // label, again skipping entries that sit at the 'LastIndex' slot of their class.
                PoolItem positive = null;
                for (int j = pool.Count - 1; j >= 0; j--)
                {
                    PoolItem candidate = pool[j];
                    int nCandidateLabel = candidate.Label;

                    if (candidate.IndexIntoClass == m_rgCache2CurrentIndex[nCandidateLabel].LastIndex)
                    {
                        continue;
                    }

                    if (nCandidateLabel == nTrueLabel)
                    {
                        positive = candidate;
                        break;
                    }
                }

                if (negative == null || positive == null)
                {
                    continue;
                }

                // See page 3 of https://arxiv.org/pdf/1503.03832.pdf
                // NOTE(review): nItemDim is derived from colBottom[1], yet it is used here as the per-item
                // stride into colTop[0] (indexed by m_nLabelCount elsewhere) and into m_colBlobs[1]
                // (indexed by m_nLayer3Dim in run()) - confirm that these dimensions are meant to match.
                int nAnchorOffset = nItem * nItemDim;
                int nPositiveOffset = (positive.Label * (nCacheDepth * nItemDim)) + (positive.IndexIntoClass * nItemDim);
                double dfPositive = m_cuda.sumsqdiff(nItemDim, m_blobWork.mutable_gpu_data, colTop[0].gpu_data, m_colBlobs[1].gpu_data, nAnchorOffset, nPositiveOffset);
                int nNegativeOffset = (negative.Label * (nCacheDepth * nItemDim)) + (negative.IndexIntoClass * nItemDim);
                double dfNegative = m_cuda.sumsqdiff(nItemDim, m_blobWork.mutable_gpu_data, colTop[0].gpu_data, m_colBlobs[1].gpu_data, nAnchorOffset, nNegativeOffset);
                double dfMarginError = (Math.Sqrt(dfPositive) - Math.Sqrt(dfNegative)) + m_param.binary_hash_param.alpha;

                if (dfMarginError > 0)
                {
                    // Spread the per-dimension error evenly over all class gradients of this item.
                    float fDelta = (float)(dfMarginError / nItemDim);
                    int nGradOffset = nItem * m_nLabelCount;

                    for (int k = 0; k < m_nLabelCount; k++)
                    {
                        rgGrad[nGradOffset + k] += fDelta;
                    }

                    bGradChanged = true;
                }
            }

            // Only push the host copy back when at least one gradient was altered.
            if (bGradChanged)
            {
                colBottom[0].mutable_cpu_diff = convert(rgGrad);
            }
        }
        /// <summary>
        /// Performs the two-pass cache lookup for each of the <paramref name="nNum"/> inputs and, where the
        /// label selected from the cache differs from the label currently predicted in colTop[0], overwrites
        /// that prediction.
        /// </summary>
        /// <param name="nNum">Number of input items to process.</param>
        /// <param name="colBottom">bottom input Blob vector; colBottom[1] and colBottom[2] are normalized
        /// and compared against cache #1 (m_colBlobs[0]) and cache #2 (m_colBlobs[1]) respectively.</param>
        /// <param name="colTop">top output Blob vector; the data of colTop[0] is read and, when a
        /// prediction changes, rewritten.</param>
        /// <returns>One sorted fine-pass pool per input item, or null when the cache is not yet
        /// flagged as full.</returns>
        private List<PoolItemCollection> run(int nNum, BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            // This happens on the test or run net so we need to see if the cache that is shared (or loaded) is ready to use.
            if (!m_bIsFull)
            {
                float[] rgIsFull = convertF(m_colBlobs[2].mutable_cpu_data);
                if (rgIsFull[0] == 1 && rgIsFull[1] == 1)
                {
                    m_bIsFull = true;
                    m_log.WriteLine("The Binary Hash Cache is ready to use.");
                }
            }

            if (!m_bIsFull)
            {
                return null;
            }

            List<PoolItemCollection> rgrgPool = new List<PoolItemCollection>();
            float[] rgOutput = convertF(colTop[0].mutable_cpu_data);
            bool bUpdate = false;

            // Normalize the input to range [0,1].
            // (Pass 1 below reads m_blobNormalized.gpu_data and pass 2 reads m_blobNormalized.gpu_diff, so
            // presumably the last argument selects the diff as the destination - confirm in normalize().)
            normalize(colBottom[1], m_blobNormalized, false);
            normalize(colBottom[2], m_blobNormalized, true);

            // Loop-invariant settings.
            int nCacheDepth = m_param.binary_hash_param.cache_depth;
            int nCacheItems = m_nLabelCount * nCacheDepth;
            int nPoolSize = (int)m_param.binary_hash_param.pool_size;

            // The fine pass can never use more items than the rough pass produced; without this clamp a
            // 'pool_size' larger than the cache would index past the end of the rough-pass pool.
            if (nPoolSize > nCacheItems)
            {
                nPoolSize = nCacheItems;
            }

            DistanceMethod distMethod1 = (m_param.binary_hash_param.dist_calc_pass1 == BinaryHashParameter.DISTANCE_TYPE.EUCLIDEAN) ? DistanceMethod.EUCLIDEAN : DistanceMethod.HAMMING;
            DistanceMethod distMethod2 = (m_param.binary_hash_param.dist_calc_pass2 == BinaryHashParameter.DISTANCE_TYPE.EUCLIDEAN) ? DistanceMethod.EUCLIDEAN : DistanceMethod.HAMMING;

            // Find the distance between each input and each element of cache #1
            // and then from cache #2.
            for (int i = 0; i < nNum; i++)
            {
                //-----------------------------------------
                //  Build the pool using the Rough pass
                //-----------------------------------------
                // Each row pairs the offset of input 'i' with the offset of one cache #1 entry,
                // where the cache is laid out as [label][depth][dim].
                int[,] rgOffsets1 = new int[nCacheItems, 2];
                for (int j = 0; j < m_nLabelCount; j++)
                {
                    for (int k = 0; k < nCacheDepth; k++)
                    {
                        int nIdx1 = j * nCacheDepth + k;
                        rgOffsets1[nIdx1, 0] = i * m_nLayer2Dim;
                        rgOffsets1[nIdx1, 1] = j * m_nLayer2Dim * nCacheDepth + k * m_nLayer2Dim;
                    }
                }

                double[] rgDist1 = m_cuda.calculate_batch_distances(distMethod1, m_dfBinaryThreshold1, m_nLayer2Dim, m_blobNormalized.gpu_data, m_colBlobs[0].gpu_data, m_blobWork.mutable_gpu_data, rgOffsets1);

                PoolItemCollection rgPool1 = new PoolItemCollection();
                for (int j = 0; j < m_nLabelCount; j++)
                {
                    for (int k = 0; k < nCacheDepth; k++)
                    {
                        rgPool1.Add(new PoolItem(j, k, rgDist1[j * nCacheDepth + k]));
                    }
                }

                rgPool1.Sort();


                //-----------------------------------------
                //  Fine tuned pass
                //
                //  Find the 'pool_size' number of
                //  minimum distances.
                //-----------------------------------------
                // Only the first 'nPoolSize' entries of the sorted rough-pass pool are re-measured
                // against cache #2.
                int[,] rgOffsets2 = new int[nPoolSize, 2];
                for (int k = 0; k < nPoolSize; k++)
                {
                    PoolItem poolItem = rgPool1[k];
                    rgOffsets2[k, 0] = i * m_nLayer3Dim;
                    rgOffsets2[k, 1] = poolItem.Label * m_nLayer3Dim * nCacheDepth + poolItem.IndexIntoClass * m_nLayer3Dim;
                }

                double[] rgDist2 = m_cuda.calculate_batch_distances(distMethod2, m_dfBinaryThreshold2, m_nLayer3Dim, m_blobNormalized.gpu_diff, m_colBlobs[1].gpu_data, m_blobWork.mutable_gpu_data, rgOffsets2);

                PoolItemCollection rgPool2 = new PoolItemCollection();
                for (int k = 0; k < nPoolSize; k++)
                {
                    PoolItem poolItem = rgPool1[k];
                    rgPool2.Add(new PoolItem(poolItem.Label, poolItem.IndexIntoClass, rgDist2[k]));
                }

                rgPool2.Sort();


                //-----------------------------------------
                //  Select the label from the 'top_k'
                //  minimum items of the fine-tuned pass.
                //-----------------------------------------

                int nNewLabel = rgPool2.SelectNewLabel(m_param.binary_hash_param.selection_method, (int)m_param.binary_hash_param.top_k, m_phase);
                int nIdx = i * m_nLabelCount;
                int nPredictionLabel = getLabel(rgOutput, nIdx, m_nLabelCount);

                // If the new label is different from the previously predicted label, replace it.
                if (nNewLabel != nPredictionLabel)
                {
                    setLabel(rgOutput, nIdx, m_nLabelCount, nNewLabel);
                    bUpdate = true;
                }

                rgrgPool.Add(rgPool2);
            }

            // Only push the host copy back when at least one prediction was replaced.
            if (bUpdate)
            {
                colTop[0].mutable_cpu_data = convert(rgOutput);
            }

            return rgrgPool;
        }