/// <summary>
/// Produces a calibrated version of this predictor. When <paramref name="caliTrainer"/> reports
/// that it needs training, every row of <paramref name="data"/> is scanned and each
/// (score, label &gt; 0, weight) triple is handed to the calibrator trainer before training is finished.
/// </summary>
/// <param name="ch">Channel used for argument checks and passed to the calibrator trainer.</param>
/// <param name="data">Data view providing the rows used to train the calibrator.</param>
/// <param name="caliTrainer">The calibrator trainer to feed and finish.</param>
/// <param name="maxRows">When positive, the maximum number of examples handed to the calibrator trainer.</param>
/// <returns>The predictor created by <c>CalibratorUtils.CreateCalibratedPredictor</c>.</returns>
public IPredictor Calibrate(IChannel ch, IDataView data, ICalibratorTrainer caliTrainer, int maxRows)
{
    Host.CheckValue(ch, nameof(ch));
    ch.CheckValue(data, nameof(data));
    ch.CheckValue(caliTrainer, nameof(caliTrainer));

    if (caliTrainer.NeedsTraining)
    {
        var bound = new Bound(this, new RoleMappedSchema(data.Schema));
        using (var cursor = data.GetRowCursor(col => true))
        {
            var readScore = (ValueGetter<float>)bound.CreateScoreGetter(cursor, col => true, out Action cleanup);

            // We assume that we can use the label column of the first predictor, since if the labels
            // are not identical then the whole model is garbage anyway.
            var readLabel = bound.GetLabelGetter(cursor, 0, out Action partialCleanup);
            cleanup += partialCleanup;
            var readWeight = bound.GetWeightGetter(cursor, 0, out partialCleanup);
            cleanup += partialCleanup;

            // The cap only applies (and the counter is only advanced) for a positive maxRows.
            bool hasRowCap = maxRows > 0;
            try
            {
                int consumed = 0;
                while (cursor.MoveNext())
                {
                    // Rows with a non-finite label, score or weight are skipped; the getters are
                    // invoked lazily, in this order, so a rejected row stops being read early.
                    float label = 0;
                    readLabel(ref label);
                    if (!FloatUtils.IsFinite(label))
                    {
                        continue;
                    }

                    float score = 0;
                    readScore(ref score);
                    if (!FloatUtils.IsFinite(score))
                    {
                        continue;
                    }

                    float weight = 0;
                    readWeight(ref weight);
                    if (!FloatUtils.IsFinite(weight))
                    {
                        continue;
                    }

                    caliTrainer.ProcessTrainingExample(score, label > 0, weight);

                    if (hasRowCap && ++consumed >= maxRows)
                    {
                        break;
                    }
                }
            }
            finally
            {
                cleanup?.Invoke();
            }
        }
    }

    var calibrator = caliTrainer.FinishTraining(ch);
    return CalibratorUtils.CreateCalibratedPredictor(Host, this, calibrator);
}
/// <summary>
/// Validates the label and matrix index columns of the training data (and of the validation data
/// when supplied), then trains a matrix factorization model through the buffer returned by
/// <c>PrepareBuffer()</c>.
/// </summary>
/// <param name="ch">Channel used for assertions, checks and exceptions.</param>
/// <param name="data">Training data; must have a label column of type R4 or R8.</param>
/// <param name="validData">Optional validation data. When present its label must be R4 or R8 and its
/// matrix column/row index types must equal those of the training data.</param>
/// <returns>The trained predictor.</returns>
private MatrixFactorizationPredictor TrainCore(IChannel ch, RoleMappedData data, RoleMappedData validData = null)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);
    ch.AssertValueOrNull(validData);

    ColumnInfo matrixColumnIndexColInfo;
    ColumnInfo matrixRowIndexColInfo;
    ColumnInfo validMatrixColumnIndexColInfo = null;
    ColumnInfo validMatrixRowIndexColInfo = null;

    ch.CheckValue(data.Schema.Label, nameof(data), "Input data did not have a unique label");
    RecommenderUtils.CheckAndGetMatrixIndexColumns(data, out matrixColumnIndexColInfo, out matrixRowIndexColInfo, isDecode: false);
    if (data.Schema.Label.Type != NumberType.R4 && data.Schema.Label.Type != NumberType.R8)
    {
        throw ch.Except("Column '{0}' for label should be floating point, but is instead {1}",
            data.Schema.Label.Name, data.Schema.Label.Type);
    }

    MatrixFactorizationPredictor predictor;
    if (validData != null)
    {
        ch.CheckValue(validData.Schema.Label, nameof(validData), "Input validation data did not have a unique label");
        RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out validMatrixColumnIndexColInfo, out validMatrixRowIndexColInfo, isDecode: false);
        if (validData.Schema.Label.Type != NumberType.R4 && validData.Schema.Label.Type != NumberType.R8)
        {
            // Report the validation label's own name and type, not the training label's.
            throw ch.Except("Column '{0}' for validation label should be floating point, but is instead {1}",
                validData.Schema.Label.Name, validData.Schema.Label.Type);
        }

        if (!matrixColumnIndexColInfo.Type.Equals(validMatrixColumnIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-column types differed, {0} vs. {1}",
                matrixColumnIndexColInfo.Type, validMatrixColumnIndexColInfo.Type);
        }

        if (!matrixRowIndexColInfo.Type.Equals(validMatrixRowIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-row types differed, {0} vs. {1}",
                matrixRowIndexColInfo.Type, validMatrixRowIndexColInfo.Type);
        }
    }

    int colCount = matrixColumnIndexColInfo.Type.KeyCount;
    int rowCount = matrixRowIndexColInfo.Type.KeyCount;
    ch.Assert(rowCount > 0);
    ch.Assert(colCount > 0);

    // Checks for equality on the validation set ensure it is correct here.
    using (var cursor = data.Data.GetRowCursor(c => c == matrixColumnIndexColInfo.Index || c == matrixRowIndexColInfo.Index || c == data.Schema.Label.Index))
    {
        // LibMF works only over single precision floats, but we want to be able to consume either.
        var labGetter = RowCursorUtils.GetGetterAs<float>(NumberType.R4, cursor, data.Schema.Label.Index);
        var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, cursor, matrixColumnIndexColInfo.Index);
        var matrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, cursor, matrixRowIndexColInfo.Index);

        if (validData == null)
        {
            // Have the trainer do its work.
            using (var buffer = PrepareBuffer())
            {
                buffer.Train(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter);
                predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey);
            }
        }
        else
        {
            using (var validCursor = validData.Data.GetRowCursor(
                c => c == validMatrixColumnIndexColInfo.Index || c == validMatrixRowIndexColInfo.Index || c == validData.Schema.Label.Index))
            {
                ValueGetter<float> validLabelGetter = RowCursorUtils.GetGetterAs<float>(NumberType.R4, validCursor, validData.Schema.Label.Index);
                var validMatrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, validCursor, validMatrixColumnIndexColInfo.Index);
                var validMatrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberType.U4, validCursor, validMatrixRowIndexColInfo.Index);

                // Have the trainer do its work.
                using (var buffer = PrepareBuffer())
                {
                    buffer.TrainWithValidation(ch, rowCount, colCount,
                        cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter,
                        validCursor, validLabelGetter, validMatrixRowIndexGetter, validMatrixColumnIndexGetter);
                    predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey);
                }
            }
        }
    }

    return predictor;
}
/// <summary>
/// Appends a "MinMax" normalization transform over <paramref name="featureColumn"/> to
/// <paramref name="view"/> when the requested <paramref name="autoNorm"/> option and the trainer call for it.
/// </summary>
/// <param name="env">Host environment used to instantiate the transform.</param>
/// <param name="ch">Channel used for checks and for info/warning messages.</param>
/// <param name="trainer">Trainer consulted (through <c>ITrainerEx.NeedNormalization</c>) unless the option is Yes.</param>
/// <param name="view">Data view to normalize; replaced by the normalized view when a normalizer is added.</param>
/// <param name="featureColumn">Name of the feature column; nothing is added when null, empty or absent from the schema.</param>
/// <param name="autoNorm">The normalization option chosen by the user.</param>
/// <returns>True if a normalizer was added.</returns>
public static bool AddNormalizerIfNeeded(IHostEnvironment env, IChannel ch, ITrainer trainer, ref IDataView view, string featureColumn, NormalizeOption autoNorm)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));
    ch.CheckValue(trainer, nameof(trainer));
    ch.CheckValue(view, nameof(view));
    ch.CheckValueOrNull(featureColumn);
    ch.CheckUserArg(
        Enum.IsDefined(typeof(NormalizeOption), autoNorm),
        nameof(TrainCommand.Arguments.NormalizeFeatures),
        "Normalize option is invalid. Specify one of 'norm=No', 'norm=Warn', 'norm=Auto', or 'norm=Yes'.");

    if (autoNorm == NormalizeOption.No)
    {
        ch.Info("Not adding a normalizer.");
        return false;
    }

    if (string.IsNullOrEmpty(featureColumn))
    {
        return false;
    }

    var schema = view.Schema;
    if (!schema.TryGetColumnIndex(featureColumn, out int featureIndex))
    {
        return false;
    }

    if (autoNorm != NormalizeOption.Yes)
    {
        // Skip when the trainer does not ask for normalization, or, only checked in that case,
        // when the column metadata already marks the features as normalized.
        var normalizationAware = trainer as ITrainerEx;
        bool skip = normalizationAware == null || !normalizationAware.NeedNormalization;
        if (!skip)
        {
            DvBool alreadyNormalized = DvBool.False;
            skip = schema.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, featureIndex, ref alreadyNormalized)
                && alreadyNormalized.IsTrue;
        }

        if (skip)
        {
            ch.Info("Not adding a normalizer.");
            return false;
        }

        if (autoNorm == NormalizeOption.Warn)
        {
            ch.Warning("A normalizer is needed for this trainer. Either add a normalizing transform or use the 'norm=Auto', 'norm=Yes' or 'norm=No' options.");
            return false;
        }
    }

    ch.Info("Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.");

    // Quote the feature column name; the raw name is kept when QuoteValue returns false.
    var quoteBuffer = new StringBuilder();
    string columnArg = CmdQuoter.QuoteValue(featureColumn, quoteBuffer) ? quoteBuffer.ToString() : featureColumn;

    var minMax = new SubComponent<IDataTransform, SignatureDataTransform>(
        "MinMax",
        string.Format("col={{ name={0} source={0} }}", columnArg));

    var asLoader = view as IDataLoader;
    if (asLoader == null)
    {
        view = minMax.CreateInstance(env, view);
    }
    else
    {
        // A loader stays a loader: the transform is composed onto it.
        view = CompositeDataLoader.Create(env, asLoader,
            new KeyValuePair<string, SubComponent<IDataTransform, SignatureDataTransform>>(null, minMax));
    }

    return true;
}
/// <summary>
/// Validates the label and matrix index columns of the training data (and of the validation data
/// when supplied), then trains a matrix factorization model through the buffer returned by
/// <c>PrepareBuffer()</c>.
/// </summary>
/// <param name="ch">Channel used for assertions, checks and exceptions.</param>
/// <param name="data">Training data; must have a label column of type Single or Double.</param>
/// <param name="validData">Optional validation data. When present its label must be Single or Double and
/// its matrix column/row index types must equal those of the training data.</param>
/// <returns>The trained model parameters.</returns>
private MatrixFactorizationModelParameters TrainCore(IChannel ch, RoleMappedData data, RoleMappedData validData = null)
{
    _host.AssertValue(ch);
    ch.AssertValue(data);
    ch.AssertValueOrNull(validData);

    ch.CheckParam(data.Schema.Label.HasValue, nameof(data), "Input data did not have a unique label");
    RecommenderUtils.CheckAndGetMatrixIndexColumns(data, out var matrixColumnIndexColInfo, out var matrixRowIndexColInfo, isDecode: false);
    var labelCol = data.Schema.Label.Value;
    if (labelCol.Type != NumberDataViewType.Single && labelCol.Type != NumberDataViewType.Double)
    {
        throw ch.Except("Column '{0}' for label should be floating point, but is instead {1}", labelCol.Name, labelCol.Type);
    }

    MatrixFactorizationModelParameters predictor;
    if (validData != null)
    {
        ch.CheckParam(validData.Schema.Label.HasValue, nameof(validData), "Input validation data did not have a unique label");
        RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var validMatrixColumnIndexColInfo, out var validMatrixRowIndexColInfo, isDecode: false);
        var validLabelCol = validData.Schema.Label.Value;
        if (validLabelCol.Type != NumberDataViewType.Single && validLabelCol.Type != NumberDataViewType.Double)
        {
            throw ch.Except("Column '{0}' for validation label should be floating point, but is instead {1}", validLabelCol.Name, validLabelCol.Type);
        }

        if (!matrixColumnIndexColInfo.Type.Equals(validMatrixColumnIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-column types differed, {0} vs. {1}",
                matrixColumnIndexColInfo.Type, validMatrixColumnIndexColInfo.Type);
        }

        if (!matrixRowIndexColInfo.Type.Equals(validMatrixRowIndexColInfo.Type))
        {
            throw ch.ExceptParam(nameof(validData), "Train and validation sets' matrix-row types differed, {0} vs. {1}",
                matrixRowIndexColInfo.Type, validMatrixRowIndexColInfo.Type);
        }
    }

    int colCount = matrixColumnIndexColInfo.Type.GetKeyCountAsInt32(_host);
    int rowCount = matrixRowIndexColInfo.Type.GetKeyCountAsInt32(_host);
    ch.Assert(rowCount > 0);
    ch.Assert(colCount > 0);

    // Checks for equality on the validation set ensure it is correct here.
    using (var cursor = data.Data.GetRowCursor(matrixColumnIndexColInfo, matrixRowIndexColInfo, data.Schema.Label.Value))
    {
        // LibMF works only over single precision floats, but we want to be able to consume either.
        var labGetter = RowCursorUtils.GetGetterAs<float>(NumberDataViewType.Single, cursor, data.Schema.Label.Value.Index);
        var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, cursor, matrixColumnIndexColInfo.Index);
        var matrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, cursor, matrixRowIndexColInfo.Index);

        if (validData == null)
        {
            // Have the trainer do its work.
            using (var buffer = PrepareBuffer())
            {
                buffer.Train(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter);
                predictor = new MatrixFactorizationModelParameters(_host, buffer,
                    (KeyDataViewType)matrixColumnIndexColInfo.Type, (KeyDataViewType)matrixRowIndexColInfo.Type);
            }
        }
        else
        {
            // The columns resolved during validation above are scoped to that block, so resolve them again here.
            RecommenderUtils.CheckAndGetMatrixIndexColumns(validData, out var validMatrixColumnIndexColInfo, out var validMatrixRowIndexColInfo, isDecode: false);

            // Open the validation cursor over the validation set's own columns: the getters below
            // are requested by the validation columns' indices, so those are the columns it must cover.
            using (var validCursor = validData.Data.GetRowCursor(validMatrixColumnIndexColInfo, validMatrixRowIndexColInfo, validData.Schema.Label.Value))
            {
                ValueGetter<float> validLabelGetter = RowCursorUtils.GetGetterAs<float>(NumberDataViewType.Single, validCursor, validData.Schema.Label.Value.Index);
                var validMatrixColumnIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, validCursor, validMatrixColumnIndexColInfo.Index);
                var validMatrixRowIndexGetter = RowCursorUtils.GetGetterAs<uint>(NumberDataViewType.UInt32, validCursor, validMatrixRowIndexColInfo.Index);

                // Have the trainer do its work.
                using (var buffer = PrepareBuffer())
                {
                    buffer.TrainWithValidation(ch, rowCount, colCount,
                        cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter,
                        validCursor, validLabelGetter, validMatrixRowIndexGetter, validMatrixColumnIndexGetter);
                    predictor = new MatrixFactorizationModelParameters(_host, buffer,
                        (KeyDataViewType)matrixColumnIndexColInfo.Type, (KeyDataViewType)matrixRowIndexColInfo.Type);
                }
            }
        }
    }

    return predictor;
}
/// <summary>
/// Writes <paramref name="values"/> to the model context: first in binary form through a text-vector
/// codec obtained from <paramref name="factory"/>, then as a human-readable "Terms.txt" text stream.
/// </summary>
/// <param name="ch">Channel used for checks and assertions.</param>
/// <param name="ctx">Model save context to write into.</param>
/// <param name="factory">Codec factory used to obtain and serialize the codec.</param>
/// <param name="values">The vector of text values to save.</param>
private static void Save(IChannel ch, ModelSaveContext ctx, CodecFactory factory, ref VBuffer<ReadOnlyMemory<char>> values)
{
    Contracts.AssertValue(ch);
    ch.CheckValue(ctx, nameof(ctx));
    ctx.CheckAtModel();
    ctx.SetVersionInfo(GetVersionInfo());

    // *** Binary format ***
    // Codec parameterization: A codec parameterization that should be a ReadOnlyMemory codec
    // int: n, the number of bytes used to write the values
    // byte[n]: As encoded using the codec

    // Get the codec from the factory
    IValueCodec codec;
    var result = factory.TryGetCodec(new VectorType(TextType.Instance), out codec);
    ch.Assert(result);
    ch.Assert(codec.Type.IsVector);
    ch.Assert(codec.Type.VectorSize == 0);
    ch.Assert(codec.Type.ItemType.RawType == typeof(ReadOnlyMemory<char>));
    IValueCodec<VBuffer<ReadOnlyMemory<char>>> textCodec = (IValueCodec<VBuffer<ReadOnlyMemory<char>>>)codec;

    factory.WriteCodec(ctx.Writer.BaseStream, codec);
    using (var mem = new MemoryStream())
    {
        using (var writer = textCodec.OpenWriter(mem))
        {
            writer.Write(ref values);
            writer.Commit();
        }
        ctx.Writer.WriteByteArray(mem.ToArray());
    }

    // Make this resemble, more or less, the auxiliary output from the TermTransform.
    // It will differ somewhat due to the vector being possibly sparse. Empty values
    // are not written at all.
    // A ref parameter cannot be captured by the lambda, hence the local copy.
    var v = values;
    char[] buffer = null;
    ctx.SaveTextStream("Terms.txt",
        writer =>
        {
            writer.WriteLine("# Number of terms = {0} of length {1}", v.Count, v.Length);
            foreach (var pair in v.Items())
            {
                var text = pair.Value;
                if (text.IsEmpty)
                {
                    continue;
                }

                writer.Write("{0}\t", pair.Key);
                // REVIEW: What about escaping this, *especially* for linebreaks?
                // Do C# and .NET really have no equivalent to Python's "repr"? :(
                // The text is staged in a reusable char[] for the (char[], int, int) WriteLine overload.
                Utils.EnsureSize(ref buffer, text.Length);
                text.Span.CopyTo(buffer);
                writer.WriteLine(buffer, 0, text.Length);
            }
        });
}
/// <summary>
/// Saves <paramref name="predictor"/> in every format for which a destination stream is supplied
/// (binary, summary, text, ini, code). When no stream is supplied, only an informational message is logged.
/// </summary>
/// <param name="ch">Channel used for checks and info messages.</param>
/// <param name="predictor">The predictor to save.</param>
/// <param name="schema">Schema passed to the summary, text, ini and code writers.</param>
/// <param name="binaryModelStream">Destination for the binary model, or null to skip.</param>
/// <param name="summaryModelStream">Destination for the summary, or null to skip.</param>
/// <param name="textModelStream">Destination for the text model, or null to skip.</param>
/// <param name="iniModelStream">Destination for the ini model, or null to skip.</param>
/// <param name="codeModelStream">Destination for the code model, or null to skip.</param>
public static void SavePredictor(IChannel ch, IPredictor predictor, RoleMappedSchema schema,
    Stream binaryModelStream = null, Stream summaryModelStream = null, Stream textModelStream = null,
    Stream iniModelStream = null, Stream codeModelStream = null)
{
    Contracts.CheckValue(ch, nameof(ch));
    ch.CheckValue(predictor, nameof(predictor));
    ch.CheckValue(schema, nameof(schema));

    int count = 0;
    if (binaryModelStream != null)
    {
        ch.Info("Saving predictor as binary");
        // leaveOpen: true, so disposing the writer does not close the caller's stream.
        using (var writer = new BinaryWriter(binaryModelStream, Encoding.UTF8, true))
        {
            PredictorUtils.SaveBinary(ch, predictor, writer);
        }
        count++;
    }

    if (summaryModelStream != null)
    {
        ch.Info("Saving predictor summary");
        using (StreamWriter writer = Utils.OpenWriter(summaryModelStream))
        {
            PredictorUtils.SaveSummary(ch, predictor, schema, writer);
        }
        count++;
    }

    if (textModelStream != null)
    {
        ch.Info("Saving predictor as text");
        using (StreamWriter writer = Utils.OpenWriter(textModelStream))
        {
            PredictorUtils.SaveText(ch, predictor, schema, writer);
        }
        count++;
    }

    if (iniModelStream != null)
    {
        ch.Info("Saving predictor as ini");
        using (StreamWriter writer = Utils.OpenWriter(iniModelStream))
        {
            // Test if our predictor implements the more modern INI export interface.
            // If it does not, use the old utility method.
            if (predictor is ICanSaveInIniFormat saver)
            {
                saver.SaveAsIni(writer, schema);
            }
            else
            {
                PredictorUtils.SaveIni(ch, predictor, schema, writer);
            }
        }
        count++;
    }

    if (codeModelStream != null)
    {
        ch.Info("Saving predictor as code");
        using (StreamWriter writer = Utils.OpenWriter(codeModelStream))
        {
            PredictorUtils.SaveCode(ch, predictor, schema, writer);
        }
        count++;
    }

    // Note that we don't check for this case up front so this command can be used to simply
    // check that the predictor is loadable.
    if (count == 0)
    {
        ch.Info("No files saved. Must specify at least one output file.");
    }
}