/// <summary>
/// Writes the predictor to <paramref name="writer"/> in INI format when it implements
/// <see cref="ICanSaveInIniFormat"/>. Otherwise the lack of INI support is reported on both the
/// writer and the channel, and the model summary is written instead when the predictor
/// implements <see cref="ICanSaveSummary"/>.
/// </summary>
public static void SaveIni(IChannel ch, IPredictor predictor, RoleMappedSchema schema, TextWriter writer)
{
    Contracts.CheckValue(ch, nameof(ch));
    ch.CheckValue(predictor, nameof(predictor));
    ch.CheckValueOrNull(schema);
    ch.CheckValue(writer, nameof(writer));

    // Preferred path: the predictor can emit INI by itself.
    if (predictor is ICanSaveInIniFormat)
    {
        ((ICanSaveInIniFormat)predictor).SaveAsIni(writer, schema);
        return;
    }

    string typeName = predictor.GetType().Name;

    // Neither INI nor summary is available: only report the problem.
    if (!(predictor is ICanSaveSummary))
    {
        writer.WriteLine("'{0}' does not support saving in INI format", typeName);
        ch.Error("'{0}' doesn't currently have standardized INI format output", typeName);
        return;
    }

    // Fallback: report the problem, then write the summary in place of the INI content.
    writer.WriteLine("'{0}' does not support saving in INI format, writing out model summary instead", typeName);
    ch.Error("'{0}' doesn't currently have standardized INI format output, will save model summary instead", typeName);
    ((ICanSaveSummary)predictor).SaveSummary(writer, schema);
}
/// <summary>
/// Produces a data view summarizing <paramref name="predictor"/>, after unwrapping any
/// calibrated wrappers around it. The summary comes from <see cref="ICanGetSummaryAsIDataView"/>
/// or <see cref="ICanGetSummaryAsIRow"/> when available; otherwise a single-column view is built
/// from <see cref="ICanSaveSummary"/> output, or from the predictor type name as a last resort.
/// </summary>
/// <param name="env">Host environment used for checks and for building data views.</param>
/// <param name="predictor">The predictor to summarize.</param>
/// <param name="schema">Schema forwarded to the summary providers.</param>
/// <param name="stats">Statistics view when the row-based provider supplies one; otherwise null.</param>
/// <returns>The summary data view.</returns>
internal static IDataView GetSummaryAndStats(IHostEnvironment env, IPredictor predictor, RoleMappedSchema schema, out IDataView stats)
{
    // Peel off calibrated wrappers until the innermost sub-model is reached.
    for (var wrapper = predictor as IWeaklyTypedCalibratedModelParameters;
         wrapper != null;
         wrapper = predictor as IWeaklyTypedCalibratedModelParameters)
    {
        predictor = wrapper.WeeklyTypedSubModel;
    }

    stats = null;
    IDataView result = null;

    var viewProvider = predictor as ICanGetSummaryAsIDataView;
    var rowProvider = predictor as ICanGetSummaryAsIRow;

    if (viewProvider == null && rowProvider == null)
    {
        // No structured summary available: build a data view containing one row and one column.
        var builder = new ArrayDataViewBuilder(env);
        var textProvider = predictor as ICanSaveSummary;
        if (textProvider == null)
        {
            builder.AddColumn("PredictorName", predictor.GetType().ToString());
        }
        else
        {
            var text = new StringBuilder();
            using (StringWriter sink = new StringWriter(text))
            {
                textProvider.SaveSummary(sink, schema);
            }
            builder.AddColumn("Summary", text.ToString());
        }
        result = builder.GetDataView();
    }
    else
    {
        if (viewProvider != null)
        {
            result = viewProvider.GetSummaryDataView(schema);
        }

        if (rowProvider != null)
        {
            var summaryRow = rowProvider.GetSummaryIRowOrNull(schema);
            // Only one of the two providers may deliver a summary.
            env.Check(viewProvider == null || summaryRow == null, "Predictor outputs two summary data views, don't know which one to choose");
            if (summaryRow != null)
            {
                result = RowCursorUtils.RowAsDataView(env, summaryRow);
            }

            var statsRow = rowProvider.GetStatsIRowOrNull(schema);
            if (statsRow != null)
            {
                stats = RowCursorUtils.RowAsDataView(env, statsRow);
            }
        }
    }

    env.AssertValue(result);
    return result;
}
/// <summary>
/// Writes the model summary to <paramref name="writer"/> when the predictor implements
/// <see cref="ICanSaveSummary"/>; otherwise reports the missing support on both the writer
/// and the channel.
/// </summary>
public static void SaveSummary(IChannel ch, IPredictor predictor, RoleMappedSchema schema, TextWriter writer)
{
    Contracts.CheckValue(ch, nameof(ch));
    ch.CheckValue(predictor, nameof(predictor));
    ch.CheckValueOrNull(schema);
    ch.CheckValue(writer, nameof(writer));

    if (!(predictor is ICanSaveSummary))
    {
        string typeName = predictor.GetType().Name;
        writer.WriteLine("'{0}' does not support saving summary", typeName);
        ch.Error("'{0}' does not support saving summary", typeName);
        return;
    }

    ((ICanSaveSummary)predictor).SaveSummary(writer, schema);
}
/// <summary>
/// Writes the model as source code to <paramref name="writer"/> when the predictor implements
/// <see cref="ICanSaveInSourceCode"/>; otherwise reports the missing support on both the writer
/// and the channel.
/// </summary>
public static void SaveCode(IChannel ch, IPredictor predictor, RoleMappedSchema schema, TextWriter writer)
{
    Contracts.CheckValue(ch, nameof(ch));
    ch.CheckValue(predictor, nameof(predictor));
    ch.CheckValueOrNull(schema);
    ch.CheckValue(writer, nameof(writer));

    var codeSaver = predictor as ICanSaveInSourceCode;
    if (codeSaver == null)
    {
        string typeName = predictor.GetType().Name;
        writer.WriteLine("'{0}' does not support saving in code.", typeName);
        ch.Error("'{0}' doesn't currently support saving the model as code", typeName);
        return;
    }

    codeSaver.SaveAsCode(writer, schema);
}
/// <summary>
/// Trains <paramref name="trainer"/> on <paramref name="data"/> and returns the resulting predictor,
/// passed through <c>CalibratorUtils.TrainCalibratorIfNeeded</c>.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="ch">Channel used for checks and trace messages; must not be null.</param>
/// <param name="data">Training data; must not be null. Passed by reference to <c>AddCacheIfWanted</c>.</param>
/// <param name="trainer">Trainer to run; must also implement <c>ITrainer&lt;RoleMappedData&gt;</c>.</param>
/// <param name="name">Non-empty trainer name, used in the error message when the trainer is unsupported.</param>
/// <param name="validData">Optional validation data (may be null).</param>
/// <param name="calibrator">Calibrator trainer forwarded to <c>CalibratorUtils.TrainCalibratorIfNeeded</c>.</param>
/// <param name="maxCalibrationExamples">Forwarded to <c>CalibratorUtils.TrainCalibratorIfNeeded</c>.</param>
/// <param name="cacheData">Forwarded to <c>AddCacheIfWanted</c>.</param>
/// <param name="inpPredictor">Optional input predictor (may be null); its runtime type selects the generic instantiation.</param>
/// <returns>The value returned by <c>CalibratorUtils.TrainCalibratorIfNeeded</c>.</returns>
private static IPredictor TrainCore(IHostEnvironment env, IChannel ch, RoleMappedData data, ITrainer trainer, string name, RoleMappedData validData, ICalibratorTrainer calibrator, int maxCalibrationExamples, bool?cacheData, IPredictor inpPredictor = null)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));
    ch.CheckValue(data, nameof(data));
    ch.CheckValue(trainer, nameof(trainer));
    ch.CheckNonEmpty(name, nameof(name));
    ch.CheckValueOrNull(validData);
    ch.CheckValueOrNull(inpPredictor);
    // Only trainers that consume RoleMappedData are supported; anything else is a user-argument error.
    var trainerRmd = trainer as ITrainer <RoleMappedData>;
    if (trainerRmd == null) { throw ch.ExceptUserArg(nameof(TrainCommand.Arguments.Trainer), "Trainer '{0}' does not accept known training data type", name); }
    // Bind the TrainCore method group to a delegate typed with 'object' only to obtain a MethodInfo.
    // NOTE(review): presumably this resolves to a generic TrainCore overload defined elsewhere in the
    // file (GetGenericMethodDefinition below requires a generic method) - confirm.
    Action <IChannel, ITrainer, Action <object>, object, object, object> trainCoreAction = TrainCore;
    IPredictor predictor;
    // 'data' (and 'validData' below) are passed by ref, so AddCacheIfWanted may replace them.
    AddCacheIfWanted(env, ch, trainer, ref data, cacheData);
    ch.Trace("Training");
    if (validData != null) { AddCacheIfWanted(env, ch, trainer, ref validData, cacheData); }
    // Close the generic definition over RoleMappedData and the runtime type of the input predictor
    // (IPredictor when no input predictor was supplied).
    var genericExam = trainCoreAction.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(
        typeof(RoleMappedData), inpPredictor != null ? inpPredictor.GetType() : typeof(IPredictor));
    Action <RoleMappedData> trainExam = trainerRmd.Train;
    // Null target: the constructed method is invoked as a static method.
    genericExam.Invoke(null, new object[] { ch, trainerRmd, trainExam, data, validData, inpPredictor });
    ch.Trace("Constructing predictor");
    predictor = trainerRmd.CreatePredictor();
    return(CalibratorUtils.TrainCalibratorIfNeeded(env, ch, calibrator, maxCalibrationExamples, trainer, predictor, data));
}
/// <summary>
/// Save the model in binary format (if it can save itself).
/// When the predictor does not implement <see cref="ICanSaveInBinaryFormat"/>, an error is
/// reported on the channel and nothing is written.
/// </summary>
/// <param name="ch">Channel used for argument checks and error reporting; must not be null.</param>
/// <param name="predictor">The predictor to save; must not be null.</param>
/// <param name="writer">Destination for the binary representation; must not be null.</param>
public static void SaveBinary(IChannel ch, IPredictor predictor, BinaryWriter writer)
{
    Contracts.CheckValue(ch, nameof(ch));
    // Validate up front, matching SaveIni/SaveSummary/SaveCode. Without this, a null predictor
    // surfaces as a NullReferenceException from predictor.GetType() in the error path below.
    ch.CheckValue(predictor, nameof(predictor));
    ch.CheckValue(writer, nameof(writer));

    var saver = predictor as ICanSaveInBinaryFormat;
    if (saver == null)
    {
        ch.Error("'{0}' doesn't currently have standardized binary format for /mb", predictor.GetType().Name);
        return;
    }

    saver.SaveAsBinary(writer);
}
/// <summary>
/// Finalizes the test on a predictor, calls the predictor with a scorer,
/// saves the data, saves the models, loads it back, saves the data again,
/// checks the output is the same.
/// </summary>
/// <param name="env">environment</param>
/// <param name="outModelFilePath">output filename</param>
/// <param name="predictor">predictor</param>
/// <param name="roles">label, feature, ...</param>
/// <param name="outData">first output data</param>
/// <param name="outData2">second output data</param>
/// <param name="kind">prediction kind</param>
/// <param name="checkError">checks errors</param>
/// <param name="ratio">check the error is below that threshold (if checkError is true)</param>
/// <param name="ratioReadSave">check the predictions difference after reloading the model are below this threshold</param>
/// <param name="checkType">checks the reloaded predictor has the same runtime type as <paramref name="predictor"/></param>
public static void FinalizeSerializationTest(IHostEnvironment env, string outModelFilePath, IPredictor predictor, RoleMappedData roles, string outData, string outData2, PredictionKind kind, bool checkError = true, float ratio = 0.8f, float ratioReadSave = 0.06f, bool checkType = true)
{
    string labelColumn = kind != PredictionKind.Clustering ? roles.Schema.Label.Value.Name : null;

    // ---- save, reading, running ----

    // Saves model.
    using (var ch = env.Start("Save"))
    using (var fs = File.Create(outModelFilePath))
    {
        TrainUtils.SaveModel(env, ch, fs, predictor, roles);
    }
    if (!File.Exists(outModelFilePath))
    {
        throw new FileNotFoundException(outModelFilePath);
    }

    // Loads the model back.
    using (var fs = File.OpenRead(outModelFilePath))
    {
        var pred_local = ModelFileUtils.LoadPredictorOrNull(env, fs);
        if (pred_local == null)
        {
            throw new Exception(string.Format("Unable to load '{0}'", outModelFilePath));
        }
        if (checkType && predictor.GetType() != pred_local.GetType())
        {
            throw new Exception(string.Format("Type mismatch {0} != {1}", predictor.GetType(), pred_local.GetType()));
        }
    }

    // Checks the outputs.
    var sch1 = SchemaHelper.ToString(roles.Schema.Schema);
    var scorer = PredictorHelper.CreateDefaultScorer(env, roles, predictor);
    var sch2 = SchemaHelper.ToString(scorer.Schema);
    if (string.IsNullOrEmpty(sch1) || string.IsNullOrEmpty(sch2))
    {
        throw new Exception("Empty schemas");
    }

    // Only the columns the text saver can handle are written out.
    var saver = env.CreateSaver("Text");
    var columns = new int[scorer.Schema.Count];
    for (int i = 0; i < columns.Length; ++i)
    {
        columns[i] = saver.IsColumnSavable(scorer.Schema[i].Type) ? i : -1;
    }
    columns = columns.Where(c => c >= 0).ToArray();

    using (var fs2 = File.Create(outData))
    {
        saver.SaveData(fs2, scorer, columns);
    }
    // Check the file that was just written (the model file was already verified above).
    if (!File.Exists(outData))
    {
        throw new FileNotFoundException(outData);
    }

    // Check we have the same output.
    // Note: from here on 'scorer' is the scorer built on the reloaded model.
    using (var fs = File.OpenRead(outModelFilePath))
    {
        var model = ModelFileUtils.LoadPredictorOrNull(env, fs);
        scorer = PredictorHelper.CreateDefaultScorer(env, roles, model);
        saver = env.CreateSaver("Text");
        using (var fs2 = File.Create(outData2))
        {
            saver.SaveData(fs2, scorer, columns);
        }
    }

    var t1 = File.ReadAllLines(outData);
    var t2 = File.ReadAllLines(outData2);
    if (t1.Length != t2.Length)
    {
        throw new Exception(string.Format("Not the same number of lines: {0} != {1}", t1.Length, t2.Length));
    }
    var linesN = new List<int>();
    for (int i = 0; i < t1.Length; ++i)
    {
        if (t1[i] != t2[i])
        {
            linesN.Add(i);
        }
    }
    if (linesN.Count > (int)(t1.Length * ratioReadSave))
    {
        var rows = linesN.Select(i => string.Format("1-Mismatch on line {0}/{3}:\n{1}\n{2}", i, t1[i], t2[i], t1.Length)).ToList();
        rows.Add($"Number of differences: {linesN.Count}/{t1.Length}");
        throw new Exception(string.Join("\n", rows));
    }

    // ---- clustering ----

    if (kind == PredictionKind.Clustering)
    {
        // Nothing to do here.
        return;
    }

    // ---- supervised ----

    string expectedOutput = kind == PredictionKind.Regression ? "Score" : "PredictedLabel";

    // Get label and basic checking about performance.
    using (var cursor = scorer.GetRowCursor(scorer.Schema))
    {
        int ilabel = SchemaHelper.GetColumnIndex(cursor.Schema, labelColumn);
        int ipred = SchemaHelper.GetColumnIndex(cursor.Schema, expectedOutput);
        var ty1 = cursor.Schema[ilabel].Type;
        var ty2 = cursor.Schema[ipred].Type;

        // dist1: label distribution, dist2: prediction distribution,
        // conf: confusion counts keyed by (predicted, expected).
        var dist1 = new Dictionary<int, int>();
        var dist2 = new Dictionary<int, int>();
        var conf = new Dictionary<Tuple<int, int>, long>();

        if (kind == PredictionKind.MulticlassClassification)
        {
            if (!ty2.IsKey())
            {
                throw new Exception(string.Format("Label='{0}' Predicted={1}'\nSchema: {2}", ty1, ty2, SchemaHelper.ToString(cursor.Schema)));
            }

            if (ty1.RawKind() == DataKind.Single)
            {
                var lgetter = cursor.GetGetter<float>(SchemaHelper._dc(ilabel, cursor));
                var pgetter = cursor.GetGetter<uint>(SchemaHelper._dc(ipred, cursor));
                float ans = 0;
                uint pre = 0;
                while (cursor.MoveNext())
                {
                    lgetter(ref ans);
                    pgetter(ref pre);

                    // The scorer +1 to the argmax.
                    ++ans;

                    RecordPrediction(conf, dist1, dist2, (int)pre, (int)ans);
                }
            }
            else if (ty1.RawKind() == DataKind.UInt32 && ty1.IsKey())
            {
                var lgetter = cursor.GetGetter<uint>(SchemaHelper._dc(ilabel, cursor));
                var pgetter = cursor.GetGetter<uint>(SchemaHelper._dc(ipred, cursor));
                uint ans = 0;
                uint pre = 0;
                while (cursor.MoveNext())
                {
                    lgetter(ref ans);
                    pgetter(ref pre);
                    RecordPrediction(conf, dist1, dist2, (int)pre, (int)ans);
                }
            }
            else
            {
                throw new NotImplementedException(string.Format("Not implemented for type {0}", ty1.ToString()));
            }
        }
        else if (kind == PredictionKind.BinaryClassification)
        {
            if (ty2.RawKind() != DataKind.Boolean)
            {
                throw new Exception(string.Format("Label='{0}' Predicted={1}'\nSchema: {2}", ty1, ty2, SchemaHelper.ToString(cursor.Schema)));
            }

            if (ty1.RawKind() == DataKind.Single)
            {
                var lgetter = cursor.GetGetter<float>(SchemaHelper._dc(ilabel, cursor));
                var pgetter = cursor.GetGetter<bool>(SchemaHelper._dc(ipred, cursor));
                float ans = 0;
                bool pre = default(bool);
                while (cursor.MoveNext())
                {
                    lgetter(ref ans);
                    pgetter(ref pre);
                    if (ans != 0 && ans != 1)
                    {
                        throw Contracts.Except("The problem is not binary, expected answer is {0}", ans);
                    }
                    RecordPrediction(conf, dist1, dist2, pre ? 1 : 0, (int)ans);
                }
            }
            else if (ty1.RawKind() == DataKind.UInt32)
            {
                var lgetter = cursor.GetGetter<uint>(SchemaHelper._dc(ilabel, cursor));
                var pgetter = cursor.GetGetter<bool>(SchemaHelper._dc(ipred, cursor));
                uint ans = 0;
                bool pre = default(bool);
                while (cursor.MoveNext())
                {
                    lgetter(ref ans);
                    pgetter(ref pre);
                    // Key-typed labels are shifted down by one before the 0/1 check.
                    if (ty1.IsKey())
                    {
                        --ans;
                    }
                    if (ans != 0 && ans != 1)
                    {
                        throw Contracts.Except("The problem is not binary, expected answer is {0}", ans);
                    }
                    RecordPrediction(conf, dist1, dist2, pre ? 1 : 0, (int)ans);
                }
            }
            else if (ty1.RawKind() == DataKind.Boolean)
            {
                var lgetter = cursor.GetGetter<bool>(SchemaHelper._dc(ilabel, cursor));
                var pgetter = cursor.GetGetter<bool>(SchemaHelper._dc(ipred, cursor));
                bool ans = default(bool);
                bool pre = default(bool);
                while (cursor.MoveNext())
                {
                    lgetter(ref ans);
                    pgetter(ref pre);
                    RecordPrediction(conf, dist1, dist2, pre ? 1 : 0, ans ? 1 : 0);
                }
            }
            else
            {
                throw new NotImplementedException(string.Format("Not implemented for type {0}", ty1));
            }
        }
        else if (kind == PredictionKind.Regression)
        {
            if (ty1.RawKind() != DataKind.Single)
            {
                throw new Exception(string.Format("Label='{0}' Predicted={1}'\nSchema: {2}", ty1, ty2, SchemaHelper.ToString(cursor.Schema)));
            }
            if (ty2.RawKind() != DataKind.Single)
            {
                throw new Exception(string.Format("Label='{0}' Predicted={1}'\nSchema: {2}", ty1, ty2, SchemaHelper.ToString(cursor.Schema)));
            }

            var lgetter = cursor.GetGetter<float>(SchemaHelper._dc(ilabel, cursor));
            var pgetter = cursor.GetGetter<float>(SchemaHelper._dc(ipred, cursor));
            float ans = 0;
            float pre = 0f;
            float error = 0f;
            while (cursor.MoveNext())
            {
                lgetter(ref ans);
                pgetter(ref pre);
                error += (ans - pre) * (ans - pre);
                // Regression fills only the two distributions; the confusion table stays empty.
                CountOccurrence(dist1, (int)ans);
                CountOccurrence(dist2, (int)pre);
            }

            if (float.IsNaN(error) || float.IsInfinity(error))
            {
                throw new Exception("Regression wen wrong. Error is infinite.");
            }
        }
        else
        {
            throw new NotImplementedException(string.Format("Not implemented for kind {0}", kind));
        }

        var nbError = conf.Where(c => c.Key.Item1 != c.Key.Item2).Select(c => c.Value).Sum();
        var nbTotal = conf.Select(c => c.Value).Sum();

        if (checkError && (nbError * 1.0 > nbTotal * ratio || dist2.Count <= 1))
        {
            var sconf = string.Join("\n", conf.OrderBy(c => c.Key)
                .Select(c => string.Format("pred={0} exp={1} count={2}", c.Key.Item1, c.Key.Item2, c.Value)));
            // sdist1 is printed under the DIST1 header and sdist2 under DIST2, so each
            // must be built from the matching dictionary.
            var sdist1 = string.Join("\n", dist1.OrderBy(c => c.Key)
                .Select(c => string.Format("label={0} count={1}", c.Key, c.Value)));
            var sdist2 = string.Join("\n", dist2.OrderBy(c => c.Key).Take(20)
                .Select(c => string.Format("label={0} count={1}", c.Key, c.Value)));
            throw new Exception(string.Format("Too many errors {0}/{1}={7}\n###########\nConfusion:\n{2}\n########\nDIST1\n{3}\n###########\nDIST2\n{4}\nOutput:\n{5}\n...\n{6}",
                nbError, nbTotal,
                sconf, sdist1, sdist2,
                string.Join("\n", t1.Take(Math.Min(30, t1.Length))),
                string.Join("\n", t1.Skip(Math.Max(0, t1.Length - 30))),
                nbError * 1.0 / nbTotal));
        }
    }
}

/// <summary>
/// Adds one observation to the confusion table (keyed by predicted, expected) and to the
/// label and prediction distributions.
/// </summary>
private static void RecordPrediction(Dictionary<Tuple<int, int>, long> confusion, Dictionary<int, int> labelCounts, Dictionary<int, int> predictionCounts, int predicted, int expected)
{
    var key = new Tuple<int, int>(predicted, expected);
    long seen;
    confusion.TryGetValue(key, out seen);
    confusion[key] = seen + 1;

    CountOccurrence(labelCounts, expected);
    CountOccurrence(predictionCounts, predicted);
}

/// <summary>
/// Increments the counter stored under <paramref name="key"/>, starting from zero when the key is absent.
/// </summary>
private static void CountOccurrence(Dictionary<int, int> counts, int key)
{
    int seen;
    counts.TryGetValue(key, out seen);
    counts[key] = seen + 1;
}
/// <summary>
/// Build a Bing TreeEnsemble .ini representation of the given predictor
/// </summary>
/// <remarks>
/// Weights and bias are formatted with the invariant culture so the generated text does not
/// depend on the culture of the thread that produces it.
/// </remarks>
/// <param name="weights">Linear weights; only defined entries whose magnitude is at least Epsilon are emitted.</param>
/// <param name="bias">Bias written to the aggregator evaluator.</param>
/// <param name="predictor">Only referenced by the disabled [Comments] section.</param>
/// <param name="schema">Schema used to look up feature slot names.</param>
/// <param name="calibrator">When not null, a calibrator evaluator is appended to the ini.</param>
/// <returns>The ini text.</returns>
public static string LinearModelAsIni(ref VBuffer<Float> weights, Float bias, IPredictor predictor = null, RoleMappedSchema schema = null, PlattCalibrator calibrator = null)
{
    // TODO: Might need to consider a max line length for the Weights list, requiring us to split it up into
    //   multiple evaluators
    // Fully qualified so the block does not depend on a using directive for System.Globalization.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    StringBuilder inputBuilder = new StringBuilder();
    StringBuilder aggregatedNodesBuilder = new StringBuilder("Nodes=");
    StringBuilder weightsBuilder = new StringBuilder("Weights=");

    var featureNames = default(VBuffer<ReadOnlyMemory<char>>);
    MetadataUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, weights.Length, ref featureNames);

    int numNonZeroWeights = 0;
    const string weightsSep = "\t";
    VBufferUtils.ForEachDefined(ref weights,
        (idx, value) =>
        {
            if (Math.Abs(value - 0) >= Epsilon)
            {
                numNonZeroWeights++;

                // Each retained weight becomes one identity-transformed input node.
                var name = featureNames.GetItemOrDefault(idx);

                inputBuilder.AppendLine("[Input:" + numNonZeroWeights + "]");
                inputBuilder.AppendLine("Name=" + (featureNames.Count == 0 ? "Feature_" + idx : name.IsEmpty ? $"f{idx}" : name.ToString()));
                inputBuilder.AppendLine("Transform=linear");
                inputBuilder.AppendLine("Slope=1");
                inputBuilder.AppendLine("Intercept=0");
                inputBuilder.AppendLine();

                aggregatedNodesBuilder.Append("I:" + numNonZeroWeights + weightsSep);
                // Invariant culture: a comma decimal separator would corrupt the Weights list.
                weightsBuilder.Append(value.ToString(invariant)).Append(weightsSep);
            }
        });

    StringBuilder builder = new StringBuilder();
    builder.AppendLine("[TreeEnsemble]");
    builder.AppendLine("Inputs=" + numNonZeroWeights);
    builder.AppendLine("Evaluators=1");
    builder.AppendLine();

    builder.AppendLine(inputBuilder.ToString());

    builder.AppendLine("[Evaluator:1]");
    builder.AppendLine("EvaluatorType=Aggregator");
    builder.AppendLine("Type=Linear");
    builder.AppendLine("Bias=" + bias.ToString(invariant));
    builder.AppendLine("NumNodes=" + numNonZeroWeights);
    // Trim drops the trailing separator left by the per-weight appends.
    builder.AppendLine(aggregatedNodesBuilder.ToString().Trim());
    builder.AppendLine(weightsBuilder.ToString().Trim());

#if false // REVIEW: This should be done by the caller using the actual training args!
    builder.AppendLine();
    builder.AppendLine("[Comments]");
    builder.Append("Trained by TLC");
    if (predictor != null)
    {
        builder.Append(" as /cl " + predictor.GetType().Name);
        if (predictor is IInitializable)
        {
            string settings = string.Join(";", (predictor as IInitializable).GetSettings());
            if (!string.IsNullOrEmpty(settings))
            {
                builder.Append(" /cls " + settings);
            }
        }
    }
#endif

    string ini = builder.ToString();

    // Add the calibration if the model was trained with calibration
    if (calibrator != null)
    {
        string calibratorEvaluatorIni = IniFileUtils.GetCalibratorEvaluatorIni(ini, calibrator);
        ini = IniFileUtils.AddEvaluator(ini, calibratorEvaluatorIni);
    }
    return ini;
}