/// <summary>
/// Validates the role-mapped data for this trainer: after the base checks, the label column
/// must be a key or R4 column, and a group column must exist and be U4 or a key.
/// </summary>
/// <param name="ch">Channel used to raise parameter exceptions.</param>
/// <param name="data">The role-mapped data whose label and group columns are inspected.</param>
private protected override void CheckDataValid(IChannel ch, RoleMappedData data)
{
    Host.AssertValue(ch);
    base.CheckDataValid(ch, data);

    // Label column: only key or R4 typed labels are accepted.
    var label = data.Schema.Label.Value;
    var typeOfLabel = label.Type;
    bool labelTypeOk = typeOfLabel is KeyType || typeOfLabel == NumberType.R4;
    if (!labelTypeOk)
    {
        throw ch.ExceptParam(nameof(data),
            $"Label column '{label.Name}' is of type '{typeOfLabel}', but must be key or R4.");
    }

    // Group column: must be present, and must be U4 or a key.
    ch.CheckParam(data.Schema.Group.HasValue, nameof(data), "Need a group column.");
    var group = data.Schema.Group.Value;
    var typeOfGroup = group.Type;
    bool groupTypeOk = typeOfGroup == NumberType.U4 || typeOfGroup is KeyType;
    if (!groupTypeOk)
    {
        throw ch.ExceptParam(nameof(data),
            $"Group column '{group.Name}' is of type '{typeOfGroup}', but must be U4 or a Key.");
    }
}
/// <summary>
/// Replaces NaN entries of <paramref name="labels"/> in place with an extra "missing" class index.
/// On the first call (while <c>_numClass</c> is negative) it also derives <c>_numClass</c> and
/// <c>_tlcNumClass</c> from the label column type or from the observed label range.
/// </summary>
/// <param name="ch">Channel used to raise parameter exceptions.</param>
/// <param name="data">Role-mapped data; only the label column type is read here.</param>
/// <param name="labels">Label values; NaN entries are overwritten with <c>_numClass - 1</c>.</param>
private protected override void ConvertNaNLabels(IChannel ch, RoleMappedData data, float[] labels)
{
    // Only initialize one time.
    if (_numClass < 0)
    {
        float minLabel = float.MaxValue;
        float maxLabel = float.MinValue;
        bool hasNaNLabel = false;
        bool hasValidLabel = false;
        foreach (float label in labels)
        {
            if (float.IsNaN(label))
            {
                hasNaNLabel = true;
            }
            else
            {
                hasValidLabel = true;
                minLabel = Math.Min(minLabel, label);
                maxLabel = Math.Max(maxLabel, label);
            }
        }

        ch.CheckParam(minLabel >= 0, nameof(data), "Minimum value in label column cannot be negative");
        if (maxLabel >= _maxNumClass)
        {
            throw ch.ExceptParam(nameof(data), $"Maximum value {maxLabel} in label column exceeds {_maxNumClass}");
        }

        if (data.Schema.Label.Value.Type is KeyDataViewType keyType)
        {
            // The key type fixes the class count; NaN labels get one additional class.
            int keyCount = keyType.GetCountAsInt32(Host);
            _numClass = hasNaNLabel ? keyCount + 1 : keyCount;
            _tlcNumClass = keyCount;
        }
        else
        {
            // Without any non-NaN label, maxLabel is still float.MinValue and both range checks
            // above pass. Casting it to int is out of range and would leave a garbage class
            // count, so fail with a clear message instead.
            if (!hasValidLabel)
            {
                throw ch.ExceptParam(nameof(data), "Label column contains no non-missing values, so the number of classes cannot be determined");
            }

            int observedClassCount = (int)maxLabel + 1;
            _numClass = hasNaNLabel ? observedClassCount + 1 : observedClassCount;
            _tlcNumClass = observedClassCount;
        }
    }

    // NaN labels are mapped to the last class index.
    float defaultLabel = _numClass - 1;
    for (int i = 0; i < labels.Length; ++i)
    {
        if (float.IsNaN(labels[i]))
        {
            labels[i] = defaultLabel;
        }
    }
}
/// <summary>
/// Base validation of the role-mapped data: runs the feature-column check exposed by
/// <c>CheckFeatureFloatVector</c> and requires that a label column is present.
/// </summary>
/// <param name="ch">Channel used to raise the parameter exception.</param>
/// <param name="data">The role-mapped data to validate.</param>
protected virtual void CheckDataValid(IChannel ch, RoleMappedData data)
{
    data.CheckFeatureFloatVector();

    bool labelPresent = data.Schema.Label != null;
    ch.CheckParam(labelPresent, nameof(data), "Need a label column");
}
/// <summary>
/// Replaces NaN entries of <paramref name="labels"/> in place with an extra "missing" class index.
/// On the first call (while <c>_numClass</c> is negative) it also derives <c>_numClass</c> and
/// <c>_tlcNumClass</c> from the key label type or from the observed label range.
/// </summary>
/// <param name="ch">Channel used to raise check/parameter exceptions.</param>
/// <param name="data">Role-mapped data; only the label column type is read here.</param>
/// <param name="labels">Label values; NaN entries are overwritten with <c>_numClass - 1</c>.</param>
protected override void ConvertNaNLabels(IChannel ch, RoleMappedData data, float[] labels)
{
    // Only initialize one time.
    if (_numClass < 0)
    {
        float minLabel = float.MaxValue;
        float maxLabel = float.MinValue;
        bool hasNaNLabel = false;
        bool hasValidLabel = false;
        foreach (float label in labels)
        {
            if (float.IsNaN(label))
            {
                hasNaNLabel = true;
            }
            else
            {
                hasValidLabel = true;
                minLabel = Math.Min(minLabel, label);
                maxLabel = Math.Max(maxLabel, label);
            }
        }

        ch.CheckParam(minLabel >= 0, nameof(data), "min labelColumn cannot be negative");
        if (maxLabel >= _maxNumClass)
        {
            throw ch.ExceptParam(nameof(data), $"max labelColumn cannot exceed {_maxNumClass}");
        }

        if (data.Schema.Label.Type.IsKey)
        {
            // The key type fixes the class count; NaN labels get one additional class.
            var keyType = data.Schema.Label.Type.AsKey;
            ch.Check(keyType.Contiguous, "labelColumn value should be contiguous");
            var keyCount = keyType.Count;
            _numClass = hasNaNLabel ? keyCount + 1 : keyCount;
            _tlcNumClass = keyCount;
        }
        else
        {
            // Without any non-NaN label, maxLabel is still float.MinValue and both range checks
            // above pass. Casting it to int is out of range and would leave a garbage class
            // count, so fail with a clear message instead.
            if (!hasValidLabel)
            {
                throw ch.ExceptParam(nameof(data), "labelColumn must contain at least one non-NaN value");
            }

            int observedClassCount = (int)maxLabel + 1;
            _numClass = hasNaNLabel ? observedClassCount + 1 : observedClassCount;
            _tlcNumClass = observedClassCount;
        }
    }

    // NaN labels are mapped to the last class index.
    float defaultLabel = _numClass - 1;
    for (int i = 0; i < labels.Length; ++i)
    {
        if (float.IsNaN(labels[i]))
        {
            labels[i] = defaultLabel;
        }
    }
}
/// <summary>
/// Validates the training data (and the optional validation data), then runs training through a
/// buffer obtained from <c>PrepareBuffer</c> and wraps the result in a
/// <see cref="MatrixFactorizationPredictor"/>.
/// </summary>
/// <param name="ch">Channel used for assertions and for raising exceptions.</param>
/// <param name="data">Training data; needs a floating point label plus matrix row/column index columns.</param>
/// <param name="validData">Optional validation data; when given, its index column types must equal those of <paramref name="data"/>.</param>
/// <returns>The predictor built from the trained buffer.</returns>
private MatrixFactorizationPredictor TrainCore(IChannel ch, RoleMappedData data, RoleMappedData validData = null)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);
    ch.AssertValueOrNull(validData);

    // Training set: label must exist and be R4 or R8; index columns are resolved by the helper.
    ch.CheckParam(data.Schema.Label.HasValue, nameof(data), "Input data did not have a unique label");
    RecommenderUtils.CheckAndGetMatrixIndexColumns(data, out var trainColInfo, out var trainRowInfo, isDecode: false);

    var trainLabel = data.Schema.Label.Value;
    var trainLabelType = trainLabel.Type;
    if (trainLabelType != NumberType.R4 && trainLabelType != NumberType.R8)
    {
        throw ch.Except("Column '{0}' for label should be floating point, but is instead {1}", trainLabel.Name, trainLabelType);
    }

    // Validation set (optional): same label requirement, and index column types must match training.
    if (validData != null)
    {
        ch.CheckValue(validData, nameof(validData));
        ch.CheckParam(validData.Schema.Label.HasValue, nameof(validData), "Input validation data did not have a unique label");
        RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var checkedColInfo, out var checkedRowInfo, isDecode: false);

        var validLabel = validData.Schema.Label.Value;
        var validLabelType = validLabel.Type;
        if (validLabelType != NumberType.R4 && validLabelType != NumberType.R8)
        {
            throw ch.Except("Column '{0}' for validation label should be floating point, but is instead {1}", validLabel.Name, validLabelType);
        }

        if (!trainColInfo.Type.Equals(checkedColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-column types differed, {0} vs. {1}",
                trainColInfo.Type, checkedColInfo.Type);
        }

        if (!trainRowInfo.Type.Equals(checkedRowInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-row types differed, {0} vs. {1}",
                trainRowInfo.Type, checkedRowInfo.Type);
        }
    }

    int colCount = trainColInfo.Type.GetKeyCount();
    int rowCount = trainRowInfo.Type.GetKeyCount();
    ch.Assert(rowCount > 0);
    ch.Assert(colCount > 0);

    MatrixFactorizationPredictor predictor;

    // Checks for equality on the validation set ensure it is correct here.
    using (var cursor = data.Data.GetRowCursor(
        c => c == trainColInfo.Index || c == trainRowInfo.Index || c == data.Schema.Label.Value.Index))
    {
        // LibMF works only over single precision floats, but we want to be able to consume either.
        var labGetter = RowCursorUtils.GetGetterAs<float>(NumberType.R4, cursor, data.Schema.Label.Value.Index);
        var colIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, cursor, trainColInfo.Index);
        var rowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, cursor, trainRowInfo.Index);

        if (validData != null)
        {
            RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var validColInfo, out var validRowInfo, isDecode: false);
            using (var validCursor = validData.Data.GetRowCursor(
                c => c == validColInfo.Index || c == validRowInfo.Index || c == validData.Schema.Label.Value.Index))
            {
                ValueGetter<float> validLabelGetter = RowCursorUtils.GetGetterAs<float>(NumberType.R4, validCursor, validData.Schema.Label.Value.Index);
                var validColIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, validCursor, validColInfo.Index);
                var validRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, validCursor, validRowInfo.Index);

                // Have the trainer do its work, with the validation cursor alongside.
                using (var buffer = PrepareBuffer())
                {
                    buffer.TrainWithValidation(ch, rowCount, colCount,
                        cursor, labGetter, rowIndexGetter, colIndexGetter,
                        validCursor, validLabelGetter, validRowIndexGetter, validColIndexGetter);
                    predictor = new MatrixFactorizationPredictor(Host, buffer, (KeyType)trainColInfo.Type, (KeyType)trainRowInfo.Type);
                }
            }
        }
        else
        {
            // Have the trainer do its work.
            using (var buffer = PrepareBuffer())
            {
                buffer.Train(ch, rowCount, colCount, cursor, labGetter, rowIndexGetter, colIndexGetter);
                predictor = new MatrixFactorizationPredictor(Host, buffer, (KeyType)trainColInfo.Type, (KeyType)trainRowInfo.Type);
            }
        }
    }

    return predictor;
}
GeneralFunctionAnalyzer<TIn, TDelegateInput, TOutShape>(
    IHostEnvironment env,
    IChannel ch,
    TDelegateInput input,
    ReaderReconciler<TIn> baseReconciler,
    Func<TDelegateInput, TOutShape> mapper,
    out IEstimator<ITransformer> estimator,
    Func<PipelineColumn, string> inputNameFunction)
{
    // Purpose: invokes `mapper` on `input`, walks the dependency graph of the returned columns,
    // assigns a name to every column, and asks each column group's reconciler for an estimator,
    // processing columns only once all of their dependencies have been resolved.
    //
    // Returns: the result of `baseReconciler.Reconcile(...)`, with every subsequently produced
    // estimator appended to it when that result is non-null.
    // `estimator` (out): the chain of column-copy and reconciler estimators built here, or null
    // when none was needed.
    // `inputNameFunction`: yields the pre-existing name of a base input column, or null if it has none.
    // Throws (via `ch`) when the delegate returns null columns, when base inputs carry dependencies,
    // or when columns from a different source are detected.
    Contracts.CheckValue(mapper, nameof(mapper));

    var method = mapper.Method;
    var output = mapper(input);

    KeyValuePair<string, PipelineColumn>[] outPairs = StaticPipeInternalUtils.GetNamesValues(output, method.ReturnParameter);

    // Map where the key depends on the set of things in the value. The value contains the yet unresolved dependencies.
    var keyDependsOn = new Dictionary<PipelineColumn, HashSet<PipelineColumn>>();
    // Map where the set of things in the value depend on the key.
    var dependsOnKey = new Dictionary<PipelineColumn, HashSet<PipelineColumn>>();
    // The set of columns detected with zero dependencies.
    var zeroDependencies = new List<PipelineColumn>();

    // First we build up the two structures above, using a queue and visiting from the outputs up.
    var toVisit = new Queue<PipelineColumn>(outPairs.Select(p => p.Value));
    while (toVisit.Count > 0)
    {
        var col = toVisit.Dequeue();
        ch.CheckParam(col != null, nameof(mapper), "The delegate seems to have null columns returned somewhere in the pipe.");
        if (keyDependsOn.ContainsKey(col))
        {
            continue; // Already visited.
        }

        var dependsOn = new HashSet<PipelineColumn>();
        foreach (var dep in col.Dependencies ?? Enumerable.Empty<PipelineColumn>())
        {
            dependsOn.Add(dep);
            if (!dependsOnKey.TryGetValue(dep, out var dependsOnDep))
            {
                dependsOnKey[dep] = dependsOnDep = new HashSet<PipelineColumn>();
                toVisit.Enqueue(dep);
            }
            dependsOnDep.Add(col);
        }
        keyDependsOn[col] = dependsOn;
        if (dependsOn.Count == 0)
        {
            zeroDependencies.Add(col);
        }
    }

    // Get the base input columns.
    var baseInputs = keyDependsOn.Select(p => p.Key).Where(col => col.ReconcilerObj == baseReconciler).ToArray();

    // The columns that utilize the base reconciler should have no dependencies. This could only happen if
    // the caller of this function has introduced a situation whereby they are claiming they can reconcile
    // to a data-reader object but still have input data dependencies, which does not make sense and
    // indicates that there is a bug in that component code. Unfortunately we can only detect that condition,
    // not determine exactly how it arose, but we can still do so to indicate to the user that there is a
    // problem somewhere in the stack.
    ch.CheckParam(baseInputs.All(col => keyDependsOn[col].Count == 0),
        nameof(input), "Bug detected where column producing object was yielding columns with dependencies.");

    // This holds the mappings of columns to names and back. Note that while the same column could be used on
    // the *output*, for example, you could hypothetically have `(a: r.Foo, b: r.Foo)`, we treat that as the last thing
    // that is done.
    var nameMap = new BidirectionalDictionary<string, PipelineColumn>();

    // Check to see if we have any set of initial names. This is important in the case where we are mapping
    // in an input data view.
    foreach (var col in baseInputs)
    {
        string inputName = inputNameFunction(col);
        if (inputName != null)
        {
            ch.Assert(!nameMap.ContainsKey(col));
            ch.Assert(!nameMap.ContainsKey(inputName));
            nameMap[col] = inputName;

            ch.Trace($"Using input with name {inputName}.");
        }
    }

    estimator = null;
    var toCopy = new List<(string src, string dst)>();

    int tempNum = 0;
    // For all outputs, get potential name collisions with used inputs. Resolve by assigning the input a temporary name.
    foreach (var p in outPairs)
    {
        // If the name for the output is already used by one of the inputs, and this output column does not
        // happen to have the same name, then we need to rename that input to keep it available.
        if (nameMap.TryGetValue(p.Key, out var inputCol) && p.Value != inputCol)
        {
            ch.Assert(baseInputs.Contains(inputCol));
            string tempName = $"#Temp_{tempNum++}";
            ch.Trace($"Input/output name collision: Renaming '{p.Key}' to '{tempName}'.");
            toCopy.Add((p.Key, tempName));
            nameMap[tempName] = nameMap[p.Key];
            ch.Assert(!nameMap.ContainsKey(p.Key));
        }
        // If we already have a name for this output column, maybe it is used elsewhere. (This can happen when
        // the only thing done with an input is we rename it, or output it twice, or something like this.) In
        // this case it is most appropriate to delay renaming till after all other processing has been done in
        // that case. But otherwise we may as well just take the name.
        if (!nameMap.ContainsKey(p.Value))
        {
            nameMap[p.Key] = p.Value;
        }
    }

    // If any renamings were necessary, create the CopyColumns estimator.
    if (toCopy.Count > 0)
    {
        estimator = new ColumnCopyingEstimator(env, toCopy.ToArray());
    }

    // First clear the inputs from zero-dependencies yet to be resolved.
    foreach (var col in baseInputs)
    {
        ch.Assert(zeroDependencies.Contains(col));
        ch.Assert(col.ReconcilerObj == baseReconciler);

        zeroDependencies.Remove(col); // Make more efficient...
        if (!dependsOnKey.TryGetValue(col, out var depends))
        {
            continue;
        }
        // If any of these base inputs do not have names because, for example, they do not directly appear
        // in the outputs and otherwise do not have names, assign them a name.
        if (!nameMap.ContainsKey(col))
        {
            nameMap[col] = $"Temp_{tempNum++}";
        }

        foreach (var depender in depends)
        {
            var dependencies = keyDependsOn[depender];
            ch.Assert(dependencies.Contains(col));
            dependencies.Remove(col);
            if (dependencies.Count == 0)
            {
                zeroDependencies.Add(depender);
            }
        }
        dependsOnKey.Remove(col);
    }

    // Call the reconciler to get the base reader estimator.
    var readerEstimator = baseReconciler.Reconcile(env, baseInputs, nameMap.AsOther(baseInputs));
    ch.AssertValueOrNull(readerEstimator);

    // Next we iteratively find those columns with zero dependencies, "create" them, and if anything depends on
    // these add them to the collection of zero dependencies, etc. etc.
    while (zeroDependencies.Count > 0)
    {
        // All columns with the same reconciler can be transformed together.

        // Note that the following policy of just taking the first group is not optimal. So for example, we
        // could have three columns, (a, b, c). If we had the output (a.X(), b.X() c.Y().X()), then maybe we'd
        // reconcile a.X() and b.X() together, then reconcile c.Y(), then reconcile c.Y().X() alone. Whereas, we
        // could have reconciled c.Y() first, then reconciled a.X(), b.X(), and c.Y().X() together.
        var group = zeroDependencies.GroupBy(p => p.ReconcilerObj).First();
        // Beyond that first group that *might* be a data reader reconciler, all subsequent operations will
        // be on where the data is already loaded and so accept data as an input, that is, they should produce
        // an estimator. If this is not the case something seriously wonky is going on, most probably that the
        // user tried to use a column from another source. If this is detected we can produce a sensible error
        // message to tell them not to do this.
        if (!(group.Key is EstimatorReconciler rec))
        {
            throw ch.Except("Columns from multiple sources were detected. " +
                "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
        }
        PipelineColumn[] cols = group.ToArray();
        // All dependencies should, by this time, have names.
        // Dependencies may be null (it is treated as empty everywhere else in this method), so guard
        // it here as well: SelectMany would otherwise throw on a null inner sequence instead of
        // evaluating the assertion.
        ch.Assert(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty<PipelineColumn>()).All(dep => nameMap.ContainsKey(dep)));
        foreach (var newCol in cols)
        {
            if (!nameMap.ContainsKey(newCol))
            {
                nameMap[newCol] = $"#Temp_{tempNum++}";
            }
        }

        var localInputNames = nameMap.AsOther(cols.SelectMany(c => c.Dependencies ?? Enumerable.Empty<PipelineColumn>()));
        var localOutputNames = nameMap.AsOther(cols);
        var usedNames = new HashSet<string>(nameMap.Keys1.Except(localOutputNames.Values));

        var localEstimator = rec.Reconcile(env, cols, localInputNames, localOutputNames, usedNames);
        readerEstimator = readerEstimator?.Append(localEstimator);
        estimator = estimator?.Append(localEstimator) ?? localEstimator;

        foreach (var newCol in cols)
        {
            zeroDependencies.Remove(newCol); // Make more efficient!!

            // Finally, we find all columns that depend on this one. If this happened to be the last pending
            // dependency, then we add it to the list.
            if (dependsOnKey.TryGetValue(newCol, out var depends))
            {
                foreach (var depender in depends)
                {
                    var dependencies = keyDependsOn[depender];
                    Contracts.Assert(dependencies.Contains(newCol));
                    dependencies.Remove(newCol);
                    if (dependencies.Count == 0)
                    {
                        zeroDependencies.Add(depender);
                    }
                }
                dependsOnKey.Remove(newCol);
            }
        }
    }

    if (keyDependsOn.Any(p => p.Value.Count > 0))
    {
        // This might happen if the user does something incredibly strange, like, say, take some prior
        // lambda, assign a column to a local variable, then re-use it downstream in a different lambdas.
        // The user would have to go to some extraordinary effort to do that, but nonetheless we want to
        // fail with a semi-sensible error message.
        throw ch.Except("There were some leftover columns with unresolved dependencies. " +
            "Did the caller use a " + nameof(PipelineColumn) + " from another delegate?");
    }

    // Now do the final renaming, if any is necessary.
    toCopy.Clear();
    foreach (var p in outPairs)
    {
        // TODO: Right now we just write stuff out. Once the copy-columns estimator is in place
        // we ought to do this for real.
        Contracts.Assert(nameMap.ContainsKey(p.Value));
        string currentName = nameMap[p.Value];
        if (currentName != p.Key)
        {
            ch.Trace($"Will copy '{currentName}' to '{p.Key}'");
            toCopy.Add((currentName, p.Key));
        }
    }

    // If any final renamings were necessary, insert the appropriate CopyColumns transform.
    if (toCopy.Count > 0)
    {
        var copyEstimator = new ColumnCopyingEstimator(env, toCopy.ToArray());
        if (estimator == null)
        {
            estimator = copyEstimator;
        }
        else
        {
            estimator = estimator.Append(copyEstimator);
        }
    }

    ch.Trace($"Exiting {nameof(ReaderEstimatorAnalyzerHelper)}");

    return readerEstimator;
}