/// <summary>
        /// Deserialization constructor: registers the host, then reads the predictor models and the
        /// input column names from <paramref name="ctx"/>.
        /// </summary>
        /// <param name="env">Environment used to register <c>Host</c> under <c>LoaderSignature</c>.</param>
        /// <param name="ctx">Load context supplying the reader, the header and the model repository.</param>
        /// <param name="scoreColumnKind">Score column kind; asserted non-empty and stored unchanged.</param>
        protected SchemaBindablePipelineEnsembleBase(IHostEnvironment env, ModelLoadContext ctx, string scoreColumnKind)
        {
            Host = env.Register(LoaderSignature);
            Host.AssertNonEmpty(scoreColumnKind);
            _scoreColumnKind = scoreColumnKind;

            // *** Binary format ***
            // int: id of _scoreColumnKind (loaded in the Create method)
            // int: number of predictors
            // The predictor models
            // int: the number of input columns
            // for each input column:
            //   int: id of the column name

            int predictorCount = ctx.Reader.ReadInt32();
            Host.CheckDecode(predictorCount > 0);
            PredictorModels = new IPredictorModel[predictorCount];

            // Models written with version 0x00010001 use the bare "PredictorModels" directory;
            // every other version nests it under the context directory.
            string modelsDir;
            if (ctx.Header.ModelVerWritten == 0x00010001)
            {
                modelsDir = "PredictorModels";
            }
            else
            {
                modelsDir = Path.Combine(ctx.Directory, "PredictorModels");
            }

            for (int index = 0; index < PredictorModels.Length; index++)
            {
                string entryName = $"PredictorModel_{index:000}";
                using (var entry = ctx.Repository.OpenEntry(modelsDir, entryName))
                {
                    PredictorModels[index] = new PredictorModel(Host, entry.Stream);
                }
            }

            // Zero input columns is accepted; each stored name must be non-empty.
            int inputCount = ctx.Reader.ReadInt32();
            Host.CheckDecode(inputCount >= 0);
            _inputCols = new string[inputCount];
            for (int index = 0; index < _inputCols.Length; index++)
            {
                _inputCols[index] = ctx.LoadNonEmptyString();
            }
        }
        /// <summary>
        /// Validates that every column named in <paramref name="sources"/> exists in
        /// <paramref name="inputSchema"/>, is not a key and shares the item type of the first source,
        /// then builds the shape of the concatenated output column.
        /// </summary>
        /// <param name="inputSchema">Schema shape in which the source columns are looked up.</param>
        /// <param name="name">Name of the output column.</param>
        /// <param name="sources">Non-empty list of source column names.</param>
        /// <returns>
        /// A vector column (variable-sized if any source is a variable vector) carrying the item type
        /// of the first source, plus IsNormalized / CategoricalSlotRanges / SlotNames metadata when
        /// the corresponding conditions computed from the sources hold.
        /// </returns>
        /// <remarks>
        /// Throws through the host when a source column is missing, is a key, or has an item type
        /// different from the first source.
        /// </remarks>
        private SchemaShape.Column CheckInputsAndMakeColumn(
            SchemaShape inputSchema, string name, string[] sources)
        {
            _host.AssertNonEmpty(sources);

            // If any input is a var vector, so is the output.
            bool varVector = false;
            // If any input is not normalized, the output is not normalized.
            bool isNormalized = true;
            // If any input has categorical indices, so will the output.
            bool hasCategoricals = false;
            // If any is scalar or had slot names, then the output will have slot names.
            bool hasSlotNames = false;

            // We will get the item type from the first column.
            ColumnType itemType = null;

            for (int i = 0; i < sources.Length; ++i)
            {
                if (!inputSchema.TryFindColumn(sources[i], out var col))
                {
                    throw _host.ExceptSchemaMismatch(nameof(inputSchema), "input", sources[i]);
                }
                if (i == 0)
                {
                    itemType = col.ItemType;
                }
                // For the sake of an estimator I am going to have a hard policy of no keys.
                // Appending keys makes no real sense anyway.
                if (col.IsKey)
                {
                    // The two sentences are separate literals; keep the space between them.
                    throw _host.Except($"Column '{sources[i]}' is key. " +
                                       "Concatenation of keys is unsupported.");
                }
                if (!col.ItemType.Equals(itemType))
                {
                    // Separator after the interpolated type so it does not run into "which".
                    throw _host.Except($"Column '{sources[i]}' has values of {col.ItemType}, " +
                                       $"which is not the same as earlier observed type of {itemType}.");
                }
                varVector |= col.Kind == SchemaShape.Column.VectorKind.VariableVector;
                isNormalized &= col.IsNormalized();
                hasCategoricals |= HasCategoricals(col);
                hasSlotNames |= col.Kind == SchemaShape.Column.VectorKind.Scalar || col.HasSlotNames();
            }

            var vecKind = varVector
                ? SchemaShape.Column.VectorKind.VariableVector
                : SchemaShape.Column.VectorKind.Vector;

            var meta = new List<SchemaShape.Column>();
            if (isNormalized)
            {
                meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false));
            }
            if (hasCategoricals)
            {
                meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.CategoricalSlotRanges, SchemaShape.Column.VectorKind.Vector, NumberType.I4, false));
            }
            if (hasSlotNames)
            {
                meta.Add(new SchemaShape.Column(MetadataUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextType.Instance, false));
            }

            return new SchemaShape.Column(name, vecKind, itemType, false, new SchemaShape(meta));
        }
        // Example #3 (separator left over from the code-sample listing; score: 0)
        /// <summary>
        /// For every column described by <paramref name="infos"/>, samples rows of
        /// <paramref name="trainingData"/> into a reservoir and computes the median pairwise distance
        /// between sampled values.
        /// </summary>
        /// <param name="host">Host used for assertions, the fallback random source and component creation.</param>
        /// <param name="infos">Non-empty column descriptions; <c>Source</c> is the input column index.</param>
        /// <param name="args">Supplies the optional seed and the per-column / default matrix generator.</param>
        /// <param name="trainingData">Data to sample from.</param>
        /// <returns>
        /// One value per column: the median distance (squared L2 when the generator is a
        /// <c>GaussianFourierSampler</c>, L1 otherwise), or 1 when at most one instance was sampled
        /// or the median is zero.
        /// </returns>
        private static Float[] Train(IHost host, ColInfo[] infos, Arguments args, IDataView trainingData)
        {
            Contracts.AssertValue(host, "host");
            host.AssertNonEmpty(infos);

            const int reservoirSize = 5000;
            int columnCount = infos.Length;
            var result = new Float[columnCount];

            // Only the source columns of the requested infos are activated on the cursor.
            var isActive = new bool[trainingData.Schema.ColumnCount];
            for (int c = 0; c < columnCount; c++)
            {
                isActive[infos[c].Source] = true;
            }

            var samplers = new ReservoirSamplerWithReplacement<VBuffer<Float>>[columnCount];

            using (var cursor = trainingData.GetRowCursor(col => isActive[col]))
            {
                var rng = args.Seed.HasValue ? RandomUtils.Create(args.Seed) : host.Rand;

                for (int c = 0; c < columnCount; c++)
                {
                    ValueGetter<VBuffer<Float>> getter;
                    if (infos[c].TypeSrc.IsVector)
                    {
                        getter = cursor.GetGetter<VBuffer<Float>>(infos[c].Source);
                    }
                    else
                    {
                        // Scalar columns are wrapped so that the sampler always sees a length-1 vector.
                        var scalarGetter = cursor.GetGetter<Float>(infos[c].Source);
                        Float scalar = 0;
                        getter = (ref VBuffer<Float> dst) =>
                        {
                            scalarGetter(ref scalar);
                            dst = new VBuffer<float>(1, new[] { scalar });
                        };
                    }
                    samplers[c] = new ReservoirSamplerWithReplacement<VBuffer<Float>>(rng, reservoirSize, getter);
                }

                // Feed every row to every sampler, then lock the samplers before reading them.
                while (cursor.MoveNext())
                {
                    for (int c = 0; c < columnCount; c++)
                    {
                        samplers[c].Sample();
                    }
                }
                for (int c = 0; c < columnCount; c++)
                {
                    samplers[c].Lock();
                }
            }

            for (int c = 0; c < columnCount; c++)
            {
                var sampledCount = samplers[c].NumSampled;

                // When all ordered pairs fit in the reservoir size, use the cached instances directly
                // and enumerate every pair; otherwise draw a sample with replacement.
                bool usesCache = sampledCount < reservoirSize && sampledCount * (sampledCount - 1) <= reservoirSize;
                VBuffer<Float>[] pool;
                int poolLength;
                if (!usesCache)
                {
                    pool = samplers[c].GetSample().ToArray();
                    poolLength = pool.Length;
                }
                else
                {
                    pool = samplers[c].GetCache();
                    poolLength = samplers[c].Size;
                    Contracts.Assert(poolLength == sampledCount);
                }

                // With at most one valid instance nothing can be learned, so fall back to 1.
                if (sampledCount <= 1)
                {
                    result[c] = 1;
                    continue;
                }

                // The per-column generator wins over the default one.
                var sub = args.Column[c].MatrixGenerator ?? args.MatrixGenerator;

                // create a dummy generator in order to get its type.
                // REVIEW this should be refactored. See https://github.com/dotnet/machinelearning/issues/699
                var matrixGenerator = sub.CreateComponent(host, 1);
                bool gaussian = matrixGenerator is GaussianFourierSampler;

                // For Gaussian kernels, we scale by the L2 distance squared, since the kernel function is exp(-gamma ||x-y||^2).
                // For Laplacian kernels, we scale by the L1 distance, since the kernel function is exp(-gamma ||x-y||_1).
                Float[] distances;
                if (poolLength < reservoirSize)
                {
                    // Small pool: go over all unordered pairs.
                    distances = new Float[sampledCount * (sampledCount - 1) / 2];
                    int filled = 0;
                    for (int a = 0; a < sampledCount; a++)
                    {
                        for (int b = a + 1; b < sampledCount; b++)
                        {
                            if (gaussian)
                            {
                                distances[filled] = VectorUtils.L2DistSquared(ref pool[a], ref pool[b]);
                            }
                            else
                            {
                                distances[filled] = VectorUtils.L1Distance(ref pool[a], ref pool[b]);
                            }
                            filled++;
                        }
                    }
                    host.Assert(filled == distances.Length);
                }
                else
                {
                    // Full reservoir: pair up consecutive sampled entries (0,1), (2,3), ...
                    const int pairCount = reservoirSize / 2;
                    distances = new Float[pairCount];
                    for (int pair = 0; pair < pairCount; pair++)
                    {
                        int first = 2 * pair;
                        if (gaussian)
                        {
                            distances[pair] = VectorUtils.L2DistSquared(ref pool[first], ref pool[first + 1]);
                        }
                        else
                        {
                            distances[pair] = VectorUtils.L1Distance(ref pool[first], ref pool[first + 1]);
                        }
                    }
                }

                // If by chance all the pairs are the same instance, the median is 0 and we return 1.
                Float median = MathUtils.GetMedianInPlace(distances, distances.Length);
                if (median == 0)
                {
                    result[c] = 1;
                }
                else
                {
                    result[c] = median;
                }
            }

            return result;
        }
        /// <summary>
        /// Applies one transform per entry of <paramref name="tagData"/> on top of
        /// <paramref name="srcLoader"/> and records the resulting transform chain.
        /// </summary>
        /// <param name="host">Host used for assertions, the channel and as the environment passed to <paramref name="createTransform"/>.</param>
        /// <param name="srcLoader">Loader to extend; if it is a <c>CompositeDataLoader</c>, its existing transforms are the starting chain.</param>
        /// <param name="tagData">Non-empty (tag, value) pairs, one per transform; an empty tag is replaced by a generated one.</param>
        /// <param name="createTransform">Creates the i-th transform's output view from the current view.</param>
        /// <returns>
        /// <paramref name="srcLoader"/> itself when the final view is the source view, otherwise a new
        /// <c>CompositeDataLoader</c> built from the recorded transforms.
        /// </returns>
        private static IDataLoader ApplyTransformsCore(IHost host, IDataLoader srcLoader,
                                                       KeyValuePair<string, string>[] tagData, Func<IHostEnvironment, int, IDataView, IDataView> createTransform)
        {
            Contracts.AssertValue(host, "host");
            host.AssertValue(srcLoader, "srcLoader");
            host.AssertNonEmpty(tagData);
            host.AssertValue(createTransform, "createTransform");

            // If the loader is a composite, we need to start with its underlying pipeline end.
            var exes = new List<TransformEx>();
            IDataView srcView;
            IDataLoader pipeStart;
            if (srcLoader is CompositeDataLoader composite)
            {
                srcView = composite.View;
                exes.AddRange(composite._transforms);
                pipeStart = composite._loader;
            }
            else
            {
                pipeStart = srcLoader;
                srcView = srcLoader;
            }

            IDataView view = srcView;

            using (var ch = host.Start("Transforms"))
            {
                int count = Utils.Size(tagData);
                var pending = new List<TransformEx>();
                for (int i = 0; i < count; i++)
                {
                    // REVIEW: this might cause silent automatic tag conflicts if the pipeline is short-circuited.
                    // Maybe it's better to allow empty tags?
                    var tag = tagData[i].Key;
                    if (string.IsNullOrEmpty(tag))
                    {
                        tag = GenerateTag(exes.Count);
                    }

                    var newDataView = createTransform(host, i, view);

                    // Walk back from the new view through its sources, collecting the transforms that
                    // are not yet recorded in exes. A no-op transform (new view equal to the current
                    // one) collects nothing, so the upstream transform's details are left untouched.
                    pending.Clear();
                    IDataView link = newDataView;
                    for (;;)
                    {
                        if (!(link is IDataTransform transform))
                        {
                            // We reached all the way back to the pipe start. The exes accumulated so far are irrelevant.
                            ch.Check(link == pipeStart,
                                     "The transform has corrupted the chain (chain no longer starts with the same loader).");
                            exes.Clear();
                            break;
                        }

                        int anchor = exes.FindLastIndex(x => x.Transform == transform);
                        if (anchor >= 0)
                        {
                            // Found a recorded transform to attach to; anything recorded after it was
                            // short-circuited by the new transform and is dropped.
                            int firstStale = anchor + 1;
                            if (firstStale < exes.Count)
                            {
                                exes.RemoveRange(firstStale, exes.Count - firstStale);
                            }
                            break;
                        }

                        pending.Add(new TransformEx(tag, tagData[i].Value, transform));
                        link = transform.Source;
                    }

                    // Collected newest-first; record them in pipeline order.
                    pending.Reverse();
                    exes.AddRange(pending);

                    view = newDataView;
                }
            }

            if (view == srcView)
            {
                return srcLoader;
            }
            return new CompositeDataLoader(host, exes.ToArray());
        }
        /// <summary>
        /// Serializes the transform to <paramref name="ctx"/>: a frozen-model flag, the TensorFlow
        /// model itself (graph definition, or every file under the saved-model directory), and the
        /// input and output column names.
        /// </summary>
        /// <param name="ctx">Model save context; must be positioned at a model.</param>
        public void Save(ModelSaveContext ctx)
        {
            _host.AssertValue(ctx);
            ctx.CheckAtModel();
            ctx.SetVersionInfo(GetVersionInfo());

            // *** Binary format ***
            // byte: indicator for frozen models
            // stream: tensorFlow model.
            // int: number of input columns
            // for each input column
            //   int: id of input column name
            // int: number of output columns
            // for each output column
            //   int: id of output column name

            // A model without a saved-model directory is treated as frozen.
            var isFrozen = string.IsNullOrEmpty(_savedModelPath);

            ctx.Writer.WriteBoolByte(isFrozen);
            if (isFrozen)
            {
                // NOTE(review): assumes TFBuffer is IDisposable and that SaveBinaryStream runs the
                // callback before returning, so the buffer is still alive when read — confirm.
                using (var buffer = new TFBuffer())
                {
                    Session.Graph.ToGraphDef(buffer);
                    ctx.SaveBinaryStream("TFModel", w =>
                    {
                        w.WriteByteArray(buffer.ToArray());
                    });
                }
            }
            else
            {
                ctx.SaveBinaryStream("TFSavedModel", w =>
                {
                    string[] modelFilePaths = Directory.GetFiles(_savedModelPath, "*", SearchOption.AllDirectories);
                    w.Write(modelFilePaths.Length);

                    // Number of leading characters to strip from each full path: the directory plus
                    // one separator, unless the directory already ends with a separator.
                    int prefixLength = _savedModelPath.Length;
                    char lastChar = _savedModelPath[prefixLength - 1];
                    if (lastChar != Path.DirectorySeparatorChar && lastChar != Path.AltDirectorySeparatorChar)
                    {
                        prefixLength++;
                    }

                    foreach (var fullPath in modelFilePaths)
                    {
                        var relativePath = fullPath.Substring(prefixLength);
                        w.Write(relativePath);

                        // Open read-only: the two-argument FileStream constructor requests
                        // read/write access, which fails on read-only model files.
                        using (var fs = new FileStream(fullPath, FileMode.Open, FileAccess.Read, FileShare.Read))
                        {
                            long fileLength = fs.Length;
                            w.Write(fileLength);
                            long actualWritten = fs.CopyRange(w.BaseStream, fileLength);
                            _host.Assert(actualWritten == fileLength);
                        }
                    }
                });
            }

            _host.AssertNonEmpty(Inputs);
            ctx.Writer.Write(Inputs.Length);
            foreach (var colName in Inputs)
            {
                ctx.SaveNonEmptyString(colName);
            }

            _host.AssertNonEmpty(Outputs);
            ctx.Writer.Write(Outputs.Length);
            foreach (var colName in Outputs)
            {
                ctx.SaveNonEmptyString(colName);
            }
        }