// Deserializing constructor: reconstructs the ensemble from a saved model.
// NOTE(review): the read order below must exactly mirror the writer's format;
// do not reorder the ctx.Reader calls.
protected SchemaBindablePipelineEnsembleBase(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind)
{
    Host = env.Register(LoaderSignature);
    Host.AssertNonEmpty(scoreColumnKind);
    _scoreColumnKind = scoreColumnKind;

    // *** Binary format ***
    // int: id of _scoreColumnKind (loaded in the Create method)
    // int: number of predictors
    // The predictor models (stored as separate repository entries, not in the main stream)
    // int: the number of input columns
    // for each input column:
    //   int: id of the column name
    var length = ctx.Reader.ReadInt32();
    // An ensemble with zero predictors is not a valid model.
    Host.CheckDecode(length > 0);
    PredictorModels = new IPredictorModel[length];
    for (int i = 0; i < PredictorModels.Length; i++)
    {
        // Back-compat: models written with the first version (0x00010001) stored the
        // predictor models in a top-level "PredictorModels" folder; later versions
        // nest that folder under the context's directory.
        string dir =
            ctx.Header.ModelVerWritten == 0x00010001
            ? "PredictorModels"
            : Path.Combine(ctx.Directory, "PredictorModels");
        // Entry names are zero-padded to three digits (PredictorModel_000, _001, ...).
        using (var ent = ctx.Repository.OpenEntry(dir, $"PredictorModel_{i:000}"))
            PredictorModels[i] = new PredictorModel(Host, ent.Stream);
    }

    // Input column names; an empty list is legal here (length may be zero).
    length = ctx.Reader.ReadInt32();
    Host.CheckDecode(length >= 0);
    _inputCols = new string[length];
    for (int i = 0; i < length; i++)
    {
        _inputCols[i] = ctx.LoadNonEmptyString();
    }
}
/// <summary>
/// Validates that every source column exists in <paramref name="inputSchema"/>, is not a key
/// type, and shares a single item type, then builds the <see cref="SchemaShape.Column"/>
/// describing the concatenated output column, including its propagated metadata.
/// </summary>
/// <param name="inputSchema">The schema shape the sources are resolved against.</param>
/// <param name="name">The name of the output (concatenated) column.</param>
/// <param name="sources">The names of the input columns to concatenate; must be non-empty.</param>
/// <returns>The schema column for the concatenation result.</returns>
private SchemaShape.Column CheckInputsAndMakeColumn(
    SchemaShape inputSchema, string name, string[] sources)
{
    _host.AssertNonEmpty(sources);

    // If any input is a var vector, so is the output.
    bool varVector = false;
    // If any input is not normalized, the output is not normalized.
    bool isNormalized = true;
    // If any input has categorical indices, so will the output.
    bool hasCategoricals = false;
    // If any is scalar or had slot names, then the output will have slot names.
    bool hasSlotNames = false;

    // We will get the item type from the first column.
    ColumnType itemType = null;

    for (int i = 0; i < sources.Length; ++i)
    {
        if (!inputSchema.TryFindColumn(sources[i], out var col))
            throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", sources[i]);
        if (i == 0)
            itemType = col.ItemType;
        // For the sake of an estimator I am going to have a hard policy of no keys.
        // Appending keys makes no real sense anyway.
        if (col.IsKey)
        {
            // BUGFIX: added the missing space between the concatenated literals; the
            // message previously rendered as "...is key.Concatenation of keys...".
            throw _host.Except($"Column '{sources[i]}' is key. " +
                $"Concatenation of keys is unsupported.");
        }
        if (!col.ItemType.Equals(itemType))
        {
            // BUGFIX: added the missing space so the type name does not run into "which".
            throw _host.Except($"Column '{sources[i]}' has values of {col.ItemType} " +
                $"which is not the same as earlier observed type of {itemType}.");
        }
        varVector |= col.Kind == SchemaShape.Column.VectorKind.VariableVector;
        isNormalized &= col.IsNormalized();
        hasCategoricals |= HasCategoricals(col);
        hasSlotNames |= col.Kind == SchemaShape.Column.VectorKind.Scalar || col.HasSlotNames();
    }
    var vecKind = varVector
        ? SchemaShape.Column.VectorKind.VariableVector
        : SchemaShape.Column.VectorKind.Vector;

    // Propagate the metadata kinds implied by the flags accumulated above.
    // (Removed an unused local array that was allocated here and never read.)
    var meta = new List<SchemaShape.Column>();
    if (isNormalized)
        meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false));
    if (hasCategoricals)
        meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.CategoricalSlotRanges, SchemaShape.Column.VectorKind.Vector, NumberType.I4, false));
    if (hasSlotNames)
        meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextType.Instance, false));

    return new SchemaShape.Column(name, vecKind, itemType, false, new SchemaShape(meta));
}
// Estimates, per input column, an average pairwise distance between training instances,
// used to scale the kernel bandwidth. Returns one value per entry of infos; values are
// never zero (a degenerate estimate is replaced by 1).
// NOTE(review): the branch conditions and sampling order below are intricate and
// order-sensitive; this block is documented, not restructured.
private static Float[] Train(IHost host, ColInfo[] infos, Arguments args, IDataView trainingData)
{
    Contracts.AssertValue(host, "host");
    host.AssertNonEmpty(infos);

    var avgDistances = new Float[infos.Length];
    // Maximum number of rows retained per column for the distance estimate.
    const int reservoirSize = 5000;

    // Mark only the source columns as active so the cursor skips everything else.
    bool[] activeColumns = new bool[trainingData.Schema.ColumnCount];
    for (int i = 0; i < infos.Length; i++)
    {
        activeColumns[infos[i].Source] = true;
    }

    // One reservoir sampler (with replacement) per column; each holds up to
    // reservoirSize sampled rows as Float vectors.
    var reservoirSamplers = new ReservoirSamplerWithReplacement<VBuffer<Float>>[infos.Length];
    using (var cursor = trainingData.GetRowCursor(col => activeColumns[col]))
    {
        // Use a seeded RNG when the user supplied a seed; otherwise the host's RNG.
        var rng = args.Seed.HasValue ? RandomUtils.Create(args.Seed) : host.Rand;
        for (int i = 0; i < infos.Length; i++)
        {
            if (infos[i].TypeSrc.IsVector)
            {
                var get = cursor.GetGetter<VBuffer<Float>>(infos[i].Source);
                reservoirSamplers[i] = new ReservoirSamplerWithReplacement<VBuffer<Float>>(rng, reservoirSize, get);
            }
            else
            {
                // Scalar source: wrap the scalar getter so it yields a length-1 vector.
                var getOne = cursor.GetGetter<Float>(infos[i].Source);
                Float val = 0;
                ValueGetter<VBuffer<Float>> get = (ref VBuffer<Float> dst) =>
                {
                    getOne(ref val);
                    dst = new VBuffer<float>(1, new[] { val });
                };
                reservoirSamplers[i] = new ReservoirSamplerWithReplacement<VBuffer<Float>>(rng, reservoirSize, get);
            }
        }

        // Single pass over the data: feed every row to every column's sampler.
        while (cursor.MoveNext())
        {
            for (int i = 0; i < infos.Length; i++)
            {
                reservoirSamplers[i].Sample();
            }
        }
        // Freeze the samplers before reading their contents.
        for (int i = 0; i < infos.Length; i++)
        {
            reservoirSamplers[i].Lock();
        }
    }

    for (int iinfo = 0; iinfo < infos.Length; iinfo++)
    {
        var instanceCount = reservoirSamplers[iinfo].NumSampled;

        // If the number of pairs is at most the maximum reservoir size / 2, we go over all the pairs,
        // so we get all the examples. Otherwise, get a sample with replacement.
        VBuffer<Float>[] res;
        int resLength;
        if (instanceCount < reservoirSize && instanceCount * (instanceCount - 1) <= reservoirSize)
        {
            // Small data: take the raw cache (every sampled instance, no replacement).
            res = reservoirSamplers[iinfo].GetCache();
            resLength = reservoirSamplers[iinfo].Size;
            Contracts.Assert(resLength == instanceCount);
        }
        else
        {
            // Large data: materialize a sample drawn with replacement.
            res = reservoirSamplers[iinfo].GetSample().ToArray();
            resLength = res.Length;
        }

        // If the dataset contains only one valid Instance, then we can't learn anything anyway, so just return 1.
        if (instanceCount <= 1)
        {
            avgDistances[iinfo] = 1;
        }
        else
        {
            Float[] distances;
            // Per-column generator overrides the global one when specified.
            var sub = args.Column[iinfo].MatrixGenerator;
            if (sub == null)
            {
                sub = args.MatrixGenerator;
            }
            // create a dummy generator in order to get its type.
            // REVIEW this should be refactored. See https://github.com/dotnet/machinelearning/issues/699
            var matrixGenerator = sub.CreateComponent(host, 1);
            bool gaussian = matrixGenerator is GaussianFourierSampler;

            // If the number of pairs is at most the maximum reservoir size / 2, go over all the pairs.
            if (resLength < reservoirSize)
            {
                distances = new Float[instanceCount * (instanceCount - 1) / 2];
                int count = 0;
                for (int i = 0; i < instanceCount; i++)
                {
                    for (int j = i + 1; j < instanceCount; j++)
                    {
                        distances[count++] = gaussian
                            ? VectorUtils.L2DistSquared(ref res[i], ref res[j])
                            : VectorUtils.L1Distance(ref res[i], ref res[j]);
                    }
                }
                host.Assert(count == distances.Length);
            }
            else
            {
                // Too many pairs to enumerate: pair up consecutive sampled rows instead.
                distances = new Float[reservoirSize / 2];
                for (int i = 0; i < reservoirSize - 1; i += 2)
                {
                    // For Gaussian kernels, we scale by the L2 distance squared, since the kernel function is exp(-gamma ||x-y||^2).
                    // For Laplacian kernels, we scale by the L1 distance, since the kernel function is exp(-gamma ||x-y||_1).
                    distances[i / 2] = gaussian
                        ? VectorUtils.L2DistSquared(ref res[i], ref res[i + 1])
                        : VectorUtils.L1Distance(ref res[i], ref res[i + 1]);
                }
            }

            // If by chance, in the random permutation all the pairs are the same instance we return 1.
            Float median = MathUtils.GetMedianInPlace(distances, distances.Length);
            avgDistances[iinfo] = median == 0 ? 1 : median;
        }
    }
    return (avgDistances);
}
// Applies a sequence of transforms (one per tagData entry, produced by createTransform)
// on top of srcLoader, reconciling the resulting view chain with the transforms already
// recorded in a composite loader. Returns srcLoader unchanged if every created transform
// was a no-op; otherwise wraps the accumulated chain in a new CompositeDataLoader.
// NOTE(review): the chain-walking below mutates 'exes' in order-sensitive ways; this
// block is documented rather than restructured.
private static IDataLoader ApplyTransformsCore(IHost host, IDataLoader srcLoader,
    KeyValuePair<string, string>[] tagData, Func<IHostEnvironment, int, IDataView, IDataView> createTransform)
{
    Contracts.AssertValue(host, "host");
    host.AssertValue(srcLoader, "srcLoader");
    host.AssertNonEmpty(tagData);
    host.AssertValue(createTransform, "createTransform");

    // If the loader is a composite, we need to start with its underlying pipeline end.
    var exes = new List<TransformEx>();
    var composite = srcLoader as CompositeDataLoader;
    IDataView srcView;
    IDataLoader pipeStart;
    if (composite != null)
    {
        // Seed the transform list with the composite's existing chain and unwrap its loader.
        srcView = composite.View;
        exes.AddRange(composite._transforms);
        pipeStart = composite._loader;
    }
    else
    {
        srcView = pipeStart = srcLoader;
    }

    IDataView view = srcView;
    using (var ch = host.Start("Transforms"))
    {
        int count = Utils.Size(tagData);
        var newlyCreated = new List<TransformEx>();
        for (int i = 0; i < count; i++)
        {
            // REVIEW: this might cause silent automatic tag conflicts if the pipeline is short-circuited.
            // Maybe it's better to allow empty tags?
            var tag = tagData[i].Key;
            if (string.IsNullOrEmpty(tag))
            {
                tag = GenerateTag(exes.Count);
            }

            var newDataView = createTransform(host, i, view);

            // Append the newly created transforms to the exes list.
            // If the newTransform is a 'no-op' transform, i.e. equal to the original view,
            // the exes array will not be modified: there's no reason to record details of a no-op transform,
            // especially since this would overwrite the useful details of the upstream transform.
            newlyCreated.Clear();
            IDataView curDataView = newDataView;
            // Walk backwards from the new view's output through its Source chain until we
            // hit either a transform already recorded in 'exes' or the pipeline start.
            while (true)
            {
                var cur = curDataView as IDataTransform;
                if (cur == null)
                {
                    // We reached all the way back to the pipe start. The exes accumulated so far are irrelevant.
                    ch.Check(curDataView == pipeStart,
                        "The transform has corrupted the chain (chain no longer starts with the same loader).");
                    exes.Clear();
                    break;
                }

                int index = exes.FindLastIndex(x => x.Transform == cur);
                if (index >= 0)
                {
                    // We found a transform in exes to attach to.
                    if (index < exes.Count - 1)
                    {
                        // The transform short-circuited some of the existing ones, remove them.
                        exes.RemoveRange(index + 1, exes.Count - index - 1);
                    }
                    break;
                }

                newlyCreated.Add(new TransformEx(tag, tagData[i].Value, cur));
                curDataView = cur.Source;
            }

            // The walk collected transforms output-to-input; reverse to pipeline order.
            newlyCreated.Reverse();
            exes.AddRange(newlyCreated);

            view = newDataView;
        }
    }

    return (view == srcView ? srcLoader : new CompositeDataLoader(host, exes.ToArray()));
}
/// <summary>
/// Serializes the transform: either the frozen TensorFlow graph (when no SavedModel
/// directory is present) or the full SavedModel directory contents, followed by the
/// input and output column names.
/// </summary>
public void Save(ModelSaveContext ctx)
{
    _host.AssertValue(ctx);
    ctx.CheckAtModel();
    ctx.SetVersionInfo(GetVersionInfo());

    // *** Binary format ***
    // byte: indicator for frozen models
    // stream: tensorFlow model.
    // int: number of input columns
    // for each input column
    //   int: id of input column name
    // int: number of output columns
    // for each output column
    //   int: id of output column name
    var isFrozen = string.IsNullOrEmpty(_savedModelPath);
    ctx.Writer.WriteBoolByte(isFrozen);
    if (isFrozen)
    {
        // Frozen model: serialize the in-memory graph definition directly.
        var buffer = new TFBuffer();
        Session.Graph.ToGraphDef(buffer);
        ctx.SaveBinaryStream("TFModel", w =>
        {
            w.WriteByteArray(buffer.ToArray());
        });
    }
    else
    {
        // SavedModel: archive every file under _savedModelPath as
        // (relative path, length, raw bytes) triples, preceded by the file count.
        ctx.SaveBinaryStream("TFSavedModel", w =>
        {
            string[] modelFilePaths = Directory.GetFiles(_savedModelPath, "*", SearchOption.AllDirectories);
            w.Write(modelFilePaths.Length);

            foreach (var fullPath in modelFilePaths)
            {
                // +1 skips the directory separator following _savedModelPath.
                var relativePath = fullPath.Substring(_savedModelPath.Length + 1);
                w.Write(relativePath);

                // BUGFIX: open read-only. FileMode.Open alone requests ReadWrite access,
                // which fails on read-only model files and takes an unneeded write lock.
                using (var fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read))
                {
                    long fileLength = fs.Length;
                    w.Write(fileLength);
                    long actualWritten = fs.CopyRange(w.BaseStream, fileLength);
                    _host.Assert(actualWritten == fileLength);
                }
            }
        });
    }

    _host.AssertNonEmpty(Inputs);
    ctx.Writer.Write(Inputs.Length);
    foreach (var colName in Inputs)
        ctx.SaveNonEmptyString(colName);

    _host.AssertNonEmpty(Outputs);
    ctx.Writer.Write(Outputs.Length);
    foreach (var colName in Outputs)
        ctx.SaveNonEmptyString(colName);
}