/// <summary>
/// Checks the gradient of a single output with respect to particular input
/// blob(s). If nCheckBottom == i >= 0, check only the ith bottom Blob.
/// If nCheckBottom == -1, check everything -- all bottom Blobs and all
/// param Blobs. Otherwise (if nCheckBottom is less than -1), check only the param Blobs.
/// </summary>
public void CheckGradientSingle(Layer<T> layer, BlobCollection<T> colBottom, BlobCollection<T> colTop, int nCheckBottom, int nTopID, int nTopDataID, bool bElementwise = false)
{
    if (bElementwise)
    {
        m_log.CHECK_EQ(0, layer.blobs.Count(), "Cannot have blobs in the layer checked for element-wise checking.");
        m_log.CHECK_LE(0, nTopID, "The top ID '" + nTopID.ToString() + "' must be zero or greater with element-wise checking.");
        m_log.CHECK_LE(0, nTopDataID, "The top data ID '" + nTopDataID.ToString() + "' must be zero or greater with element-wise checking.");

        int nTopCount = colTop[nTopID].count();

        for (int nBlobID = 0; nBlobID < colBottom.Count(); nBlobID++)
        {
            m_log.CHECK_EQ(nTopCount, colBottom[nBlobID].count(), "The top count and blob counts must be equal for element-wise checking.");
        }
    }

    // First, figure out which blobs we need to check against, and zero-init
    // the parameter blob diffs.
    BlobCollection<T> colBlobsToCheck = new BlobCollection<T>();
    List<bool> rgPropagateDown = new List<bool>();

    for (int i = 0; i < colBottom.Count; i++)
    {
        rgPropagateDown.Add((nCheckBottom == -1) ? true : false);
    }

    for (int i = 0; i < layer.blobs.Count; i++)
    {
        Blob<T> blob = layer.blobs[i];
        blob.SetDiff(0);
        colBlobsToCheck.Add(blob);
    }

    if (nCheckBottom == -1)
    {
        for (int i = 0; i < colBottom.Count; i++)
        {
            colBlobsToCheck.Add(colBottom[i]);
        }
    }
    else if (nCheckBottom >= 0)
    {
        m_log.CHECK_LT(nCheckBottom, colBottom.Count, "The check bottom value '" + nCheckBottom.ToString() + "' must be less than the number of bottom blobs.");
        colBlobsToCheck.Add(colBottom[nCheckBottom]);
        rgPropagateDown[nCheckBottom] = true;
    }

    m_log.CHECK_GT(colBlobsToCheck.Count, 0, "No blobs to check!");

    // Compute the gradient analytically using Backward.
    m_cuda.rng_setseed(m_uiSeed);

    // Ignore the loss from the layer (it's just the weighted sum of the losses
    // from the top blobs, whose gradients we may want to test individually).
    layer.Forward(colBottom, colTop);

    // Get additional loss from the objective.
    GetObjAndGradient(layer, colTop, nTopID, nTopDataID);
    layer.Backward(colTop, rgPropagateDown, colBottom);

    // Store the computed gradients for all checked blobs.
    BlobCollection<T> colComputedGradientBlobs = new BlobCollection<T>();

    for (int nBlobID = 0; nBlobID < colBlobsToCheck.Count; nBlobID++)
    {
        Blob<T> current_blob = colBlobsToCheck[nBlobID];
        Blob<T> new_blob = new Blob<T>(m_cuda, m_log);

        if (current_blob.DiffExists)
        {
            new_blob.ReshapeLike(current_blob);
            m_cuda.copy(current_blob.count(), current_blob.gpu_diff, new_blob.mutable_gpu_data);
        }

        colComputedGradientBlobs.Add(new_blob);
    }

    // Compute the derivative of the top w.r.t. each bottom and parameter input
    // using finite differencing.
    for (int nBlobID = 0; nBlobID < colBlobsToCheck.Count; nBlobID++)
    {
        Blob<T> current_blob = colBlobsToCheck[nBlobID];

        if (!current_blob.DiffExists)
            continue;

        T[] rgdfComputedGradients = colComputedGradientBlobs[nBlobID].update_cpu_data();
        double dfData;

        for (int nFeatID = 0; nFeatID < current_blob.count(); nFeatID++)
        {
            if (m_evtCancel.WaitOne(0))
                throw new Exception("Aborted!");

            // For an element-wise layer, we only need to do finite differencing to
            // compute the derivative of top[nTopID][nTopDataID] w.r.t.
            // bottom[nBlobID][i] for i == nTopDataID only. For any other
            // i != nTopDataID, we know the derivative is 0 by definition, and simply
            // check that that's true.
            double dfEstimateGradient = 0;
            double dfPositiveObjective = 0;
            double dfNegativeObjective = 0;

            if (!bElementwise || (nFeatID == nTopDataID))
            {
                // Do finite differencing.
                // Compute the loss with the stepsize added to the input.
                dfData = (double)Convert.ChangeType(current_blob.GetData(nFeatID), typeof(double));
                dfData += m_dfStepsize;
                current_blob.SetData(dfData, nFeatID);

                m_cuda.rng_setseed(m_uiSeed);
                layer.Forward(colBottom, colTop);
                dfPositiveObjective = GetObjAndGradient(layer, colTop, nTopID, nTopDataID);

                // Compute the loss with the stepsize subtracted from the input.
                dfData = (double)Convert.ChangeType(current_blob.GetData(nFeatID), typeof(double));
                dfData -= (m_dfStepsize * 2);
                current_blob.SetData(dfData, nFeatID);

                m_cuda.rng_setseed(m_uiSeed);
                layer.Forward(colBottom, colTop);
                dfNegativeObjective = GetObjAndGradient(layer, colTop, nTopID, nTopDataID);

                // Recover the original input value.
                dfData = (double)Convert.ChangeType(current_blob.GetData(nFeatID), typeof(double));
                dfData += m_dfStepsize;
                current_blob.SetData(dfData, nFeatID);

                dfEstimateGradient = (dfPositiveObjective - dfNegativeObjective) / m_dfStepsize / 2.0;
            }

            double dfComputedGradient = (double)Convert.ChangeType(rgdfComputedGradients[nFeatID], typeof(double));
            double dfFeature = (double)Convert.ChangeType(current_blob.GetData(nFeatID), typeof(double));

            if (m_dfKink - m_dfKinkRange > Math.Abs(dfFeature) ||
                Math.Abs(dfFeature) > m_dfKink + m_dfKinkRange)
            {
                // We check the relative accuracy, but for too-small values, we
                // threshold the scale factor by 1.
                double dfScale = Math.Max(Math.Max(Math.Abs(dfComputedGradient), Math.Abs(dfEstimateGradient)), 1.0);

                m_log.EXPECT_NEAR(dfComputedGradient, dfEstimateGradient, m_dfThreshold * dfScale, "DEBUG: (nTopID, nTopDataID, nBlobID, nFeatID)=" + nTopID.ToString() + ", " + nTopDataID.ToString() + ", " + nBlobID.ToString() + ", " + nFeatID.ToString() + "; feat = " + dfFeature.ToString() + "; objective+ = " + dfPositiveObjective.ToString() + "; objective- = " + dfNegativeObjective.ToString());
            }
        }
    }
}
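// Note: the EXPECT_NEAR comparison above checks the analytic gradient from Backward
// against the standard central-difference estimate
//
//     dE/dx_i  ~=  (E(x_i + h) - E(x_i - h)) / (2h),   where h = m_dfStepsize,
//
// whose truncation error shrinks as O(h^2); this is why both a positive and a negative
// step are taken rather than a cheaper one-sided difference.
//
// A minimal usage sketch, assuming 'cuda' and 'log' are an initialized CudaDnn<float>
// and Log, and that the checker's default stepsize and threshold are acceptable:
//
//     GradientChecker<float> checker = new GradientChecker<float>(cuda, log);
//     // nCheckBottom = -1 checks all bottom and param blobs against top blob 0,
//     // data element 0.
//     checker.CheckGradientSingle(layer, colBottom, colTop, -1, 0, 0);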
private void layerSetUpCaffe(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    // Get the (recurrent) input/output names.
    List<string> rgOutputNames = new List<string>();
    OutputBlobNames(rgOutputNames);

    List<string> rgRecurInputNames = new List<string>();
    RecurrentInputBlobNames(rgRecurInputNames);

    List<string> rgRecurOutputNames = new List<string>();
    RecurrentOutputBlobNames(rgRecurOutputNames);

    int nNumRecurBlobs = rgRecurInputNames.Count;
    m_log.CHECK_EQ(nNumRecurBlobs, rgRecurOutputNames.Count, "The number of recurrent input names must equal the number of recurrent output names.");

    // If provided, bottom[2] is a static input to the recurrent net.
    int nNumHiddenExposed = (m_bExposeHidden) ? nNumRecurBlobs : 0;
    m_bStaticInput = (colBottom.Count > 2 + nNumHiddenExposed) ? true : false;

    if (m_bStaticInput)
    {
        m_log.CHECK_GE(colBottom[2].num_axes, 1, "When a static input is present, bottom[2].num_axes must be >= 1.");
        m_log.CHECK_EQ(m_nN, colBottom[2].shape(0), "When a static input is present, bottom[2].shape(0) must equal N, which is " + m_nN.ToString());
    }

    // Create a NetParameter; setup the inputs that aren't unique to particular
    // recurrent architectures.
    NetParameter net_param = new NetParameter();

    LayerParameter input_layer = new LayerParameter(LayerParameter.LayerType.INPUT);
    input_layer.top.Add("x");
    BlobShape input_shape1 = new BlobShape();
    for (int i = 0; i < colBottom[0].num_axes; i++)
    {
        input_shape1.dim.Add(colBottom[0].shape(i));
    }
    input_layer.input_param.shape.Add(input_shape1);

    input_layer.top.Add("cont");
    BlobShape input_shape2 = new BlobShape();
    for (int i = 0; i < colBottom[1].num_axes; i++)
    {
        input_shape2.dim.Add(colBottom[1].shape(i));
    }
    input_layer.input_param.shape.Add(input_shape2);

    if (m_bStaticInput)
    {
        input_layer.top.Add("x_static");
        BlobShape input_shape3 = new BlobShape();
        for (int i = 0; i < colBottom[2].num_axes; i++)
        {
            input_shape3.dim.Add(colBottom[2].shape(i));
        }
        input_layer.input_param.shape.Add(input_shape3);
    }

    net_param.layer.Add(input_layer);

    // Call the child's FillUnrolledNet implementation to specify the unrolled
    // recurrent architecture.
    FillUnrolledNet(net_param);

    // Prepend this layer's name to the names of each layer in the unrolled net.
    string strLayerName = m_param.name;
    if (strLayerName.Length > 0)
    {
        for (int i = 0; i < net_param.layer.Count; i++)
        {
            LayerParameter layer = net_param.layer[i];
            layer.name = strLayerName + "_" + layer.name;
        }
    }

    // Add 'pseudo-losses' to all outputs to force backpropagation.
    // (Setting force_backward is too aggressive as we may not need to backprop to
    // all inputs, e.g., the sequence continuation indicators.)
    List<string> rgPseudoLosses = new List<string>();
    for (int i = 0; i < rgOutputNames.Count; i++)
    {
        rgPseudoLosses.Add(rgOutputNames[i] + "_pseudoloss");
        LayerParameter layer = new LayerParameter(LayerParameter.LayerType.REDUCTION, rgPseudoLosses[i]);
        layer.bottom.Add(rgOutputNames[i]);
        layer.top.Add(rgPseudoLosses[i]);
        layer.loss_weight.Add(1.0);
        net_param.layer.Add(layer);
    }

    // Create the unrolled net.
    Net<T> sharedNet = null;
    if (m_param is LayerParameterEx<T>)
    {
        RecurrentLayer<T> sharedLayer = ((LayerParameterEx<T>)m_param).SharedLayer as RecurrentLayer<T>;
        if (sharedLayer != null)
            sharedNet = sharedLayer.m_unrolledNet;
    }

    m_unrolledNet = new Net<T>(m_cuda, m_log, net_param, m_evtCancel, null, m_phase, null, sharedNet);
    m_unrolledNet.set_debug_info(m_param.recurrent_param.debug_info);

    // Setup pointers to the inputs.
    m_blobXInputBlob = m_unrolledNet.blob_by_name("x");
    m_blobContInputBlob = m_unrolledNet.blob_by_name("cont");

    if (m_bStaticInput)
        m_blobXStaticInputBlob = m_unrolledNet.blob_by_name("x_static");

    // Setup pointers to the paired recurrent inputs/outputs.
    m_colRecurInputBlobs = new BlobCollection<T>();
    m_colRecurOutputBlobs = new BlobCollection<T>();

    for (int i = 0; i < nNumRecurBlobs; i++)
    {
        m_colRecurInputBlobs.Add(m_unrolledNet.blob_by_name(rgRecurInputNames[i]));
        m_colRecurOutputBlobs.Add(m_unrolledNet.blob_by_name(rgRecurOutputNames[i]));
    }

    // Setup pointers to the outputs.
    m_log.CHECK_EQ(colTop.Count() - nNumHiddenExposed, rgOutputNames.Count, "OutputBlobNames must provide an output blob name for each top.");
    m_colOutputBlobs = new BlobCollection<T>();

    for (int i = 0; i < rgOutputNames.Count; i++)
    {
        m_colOutputBlobs.Add(m_unrolledNet.blob_by_name(rgOutputNames[i]));
    }

    // We should have 2 inputs (x and cont), plus a number of recurrent inputs,
    // plus maybe a static input.
    int nStaticInput = (m_bStaticInput) ? 1 : 0;
    m_log.CHECK_EQ(2 + nNumRecurBlobs + nStaticInput, m_unrolledNet.input_blobs.Count, "The unrolled net input count should equal 2 + the number of recurrent blobs (" + nNumRecurBlobs.ToString() + ") + the number of static inputs (" + nStaticInput.ToString() + ").");

    // This layer's parameters are any parameters in the layers of the unrolled
    // net. We only want one copy of each parameter, so check that the parameter
    // is 'owned' by the layer, rather than shared with another.
    blobs.Clear();
    for (int i = 0; i < m_unrolledNet.parameters.Count; i++)
    {
        if (m_unrolledNet.param_owners[i] == -1)
        {
            m_log.WriteLine("Adding parameter " + i.ToString() + ": " + m_unrolledNet.param_display_names[i]);
            blobs.Add(m_unrolledNet.parameters[i]);
        }
    }

    // Check that param_propagate_down is set for all of the parameters in the
    // unrolled net; set param_propagate_down to true in this layer.
    for (int i = 0; i < m_unrolledNet.layers.Count; i++)
    {
        for (int j = 0; j < m_unrolledNet.layers[i].blobs.Count; j++)
        {
            m_log.CHECK(m_unrolledNet.layers[i].param_propagate_down(j), "param_propagate_down not set for layer " + i.ToString() + ", param " + j.ToString());
        }
    }

    m_rgbParamPropagateDown = new DictionaryMap<bool>(blobs.Count, true);

    // Set the diffs of the recurrent outputs to 0 -- we can't backpropagate
    // across batches.
    for (int i = 0; i < m_colRecurOutputBlobs.Count; i++)
    {
        m_colRecurOutputBlobs[i].SetDiff(0);
    }

    // Check that the last rgOutputNames.Count layers are the pseudo-losses;
    // set the last layer index so that we don't actually run these layers.
    List<string> rgLayerNames = m_unrolledNet.layer_names;
    m_nLastLayerIndex = rgLayerNames.Count - 1 - rgPseudoLosses.Count;

    for (int i = m_nLastLayerIndex + 1, j = 0; i < rgLayerNames.Count; i++, j++)
    {
        m_log.CHECK(rgLayerNames[i] == rgPseudoLosses[j], "The layer at idx " + i.ToString() + " should be the pseudo-loss layer named " + rgPseudoLosses[j]);
    }
}
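// A minimal setup sketch for a recurrent layer that runs through layerSetUpCaffe,
// assuming 'cuda', 'log' and 'evtCancel' are an initialized CudaDnn<float>, Log and
// CancelEvent, and that colBottom already holds the x and cont (and optionally
// x_static) blobs; the parameter values shown are illustrative:
//
//     LayerParameter p = new LayerParameter(LayerParameter.LayerType.LSTM);
//     p.recurrent_param.num_output = 32;        // size of the recurrent hidden state
//     Layer<float> lstm = Layer<float>.Create(cuda, log, p, evtCancel);
//     lstm.Setup(colBottom, colTop);            // builds and wires up the unrolled net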