private void CacheTypes(out ColumnType[] types, out ColumnType[] typesSlotNames, out bool[] echoSrc,
                                    out bool[] isNormalized, out ColumnType[] typesCategoricals)
            {
                Contracts.AssertNonEmpty(Infos);
                echoSrc           = new bool[Infos.Length];
                isNormalized      = new bool[Infos.Length];
                types             = new ColumnType[Infos.Length];
                typesSlotNames    = new ColumnType[Infos.Length];
                typesCategoricals = new ColumnType[Infos.Length];

                for (int i = 0; i < Infos.Length; i++)
                {
                    var info = Infos[i];
                    // REVIEW: Add support for implicit conversions?
                    if (info.SrcTypes.Length == 1 && info.SrcTypes[0].IsVector)
                    {
                        // All meta-data is passed through in this case, so don't need the slot names type.
                        echoSrc[i] = true;
                        DvBool b = DvBool.False;
                        isNormalized[i] =
                            info.SrcTypes[0].ItemType.IsNumber &&
                            Input.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, info.SrcIndices[0], ref b) &&
                            b.IsTrue;
                        types[i] = info.SrcTypes[0];
                        continue;
                    }

                    // The single scalar and multiple vector case.
                    isNormalized[i] = info.SrcTypes[0].ItemType.IsNumber;
                    if (isNormalized[i])
                    {
                        foreach (var srcCol in info.SrcIndices)
                        {
                            DvBool b = DvBool.False;
                            if (!Input.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, srcCol, ref b) ||
                                !b.IsTrue)
                            {
                                isNormalized[i] = false;
                                break;
                            }
                        }
                    }

                    types[i] = new VectorType(info.SrcTypes[0].ItemType.AsPrimitive, info.SrcSize);
                    if (info.SrcSize == 0)
                    {
                        continue;
                    }

                    bool hasCategoricals = false;
                    int  catCount        = 0;
                    for (int j = 0; j < info.SrcTypes.Length; j++)
                    {
                        if (info.SrcTypes[j].ValueCount == 0)
                        {
                            hasCategoricals = false;
                            break;
                        }

                        if (MetadataUtils.TryGetCategoricalFeatureIndices(Input, info.SrcIndices[j], out int[] typeCat))
Esempio n. 2
0
        /// <summary>
        /// Wraps <paramref name="view"/> in a MinMax normalizer over <paramref name="featureColumn"/> when the
        /// <paramref name="autoNorm"/> option (and, for the non-forcing options, the trainer) call for it.
        /// </summary>
        /// <param name="env">Host environment; must not be null.</param>
        /// <param name="ch">Channel used for validation, info and warning messages; must not be null.</param>
        /// <param name="trainer">Trainer whose <c>Info.NeedNormalization</c> is consulted; must not be null.</param>
        /// <param name="view">Data to normalize. Replaced with the normalized view when a normalizer is added.</param>
        /// <param name="featureColumn">Name of the feature column; null or empty means nothing is added.</param>
        /// <param name="autoNorm">Normalization policy; must be a defined <c>NormalizeOption</c> value.</param>
        /// <returns>True if a normalizer was added; false otherwise.</returns>
        public static bool AddNormalizerIfNeeded(IHostEnvironment env, IChannel ch, ITrainer trainer, ref IDataView view, string featureColumn, NormalizeOption autoNorm)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ch, nameof(ch));
            ch.CheckValue(trainer, nameof(trainer));
            ch.CheckValue(view, nameof(view));
            ch.CheckValueOrNull(featureColumn);
            ch.CheckUserArg(Enum.IsDefined(typeof(NormalizeOption), autoNorm), nameof(TrainCommand.Arguments.NormalizeFeatures),
                            "Normalize option is invalid. Specify one of 'norm=No', 'norm=Warn', 'norm=Auto', or 'norm=Yes'.");

            if (autoNorm == NormalizeOption.No)
            {
                ch.Info("Not adding a normalizer.");
                return false;
            }

            if (string.IsNullOrEmpty(featureColumn))
            {
                return false;
            }

            var schema = view.Schema;

            // A feature column that is not present in the schema silently results in no normalizer.
            if (!schema.TryGetColumnIndex(featureColumn, out int featCol))
            {
                return false;
            }

            // 'Yes' forces normalization; every other option first asks whether it is needed at all.
            if (autoNorm != NormalizeOption.Yes)
            {
                if (!trainer.Info.NeedNormalization || schema.IsNormalized(featCol))
                {
                    ch.Info("Not adding a normalizer.");
                    return false;
                }
                if (autoNorm == NormalizeOption.Warn)
                {
                    ch.Warning("A normalizer is needed for this trainer. Either add a normalizing transform or use the 'norm=Auto', 'norm=Yes' or 'norm=No' options.");
                    return false;
                }
            }

            ch.Info("Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.");
            IDataView ApplyNormalizer(IHostEnvironment innerEnv, IDataView input)
            => NormalizeTransform.CreateMinMaxNormalizer(innerEnv, input, featureColumn);

            // When the view is a loader, the transform is applied through CompositeDataLoader so the result
            // is still a loader; otherwise the normalizer is applied to the view directly.
            if (view is IDataLoader loader)
            {
                view = CompositeDataLoader.ApplyTransform(env, loader, tag: null, creationArgs: null, ApplyNormalizer);
            }
            else
            {
                view = ApplyNormalizer(env, view);
            }
            return true;
        }
        /// <summary>
        /// Builds the node for a "normalize if needed" macro: a MinMax normalizer restricted to the requested
        /// columns that are not flagged as normalized, or a no-operation node when no such column remains.
        /// </summary>
        /// <param name="env">Host environment used to raise the user-argument error and to create the node.</param>
        /// <param name="input">MinMax arguments. When at least one column needs normalizing, its <c>Column</c>
        /// array is overwritten with only those columns before the normalizer node is created.</param>
        /// <param name="node">Macro node whose catalog, context and input/output maps are passed to the created node.</param>
        /// <returns>A macro output whose <c>Nodes</c> list contains the single created node.</returns>
        public static CommonOutputs.MacroOutput<CommonOutputs.TransformOutput> IfNeeded(
            IHostEnvironment env,
            NormalizeTransform.MinMaxArguments input,
            EntryPointNode node)
        {
            var schema = input.Data.Schema;
            var columnsToNormalize = new List<NormalizeTransform.AffineColumn>();

            foreach (var column in input.Column)
            {
                int col;
                if (!schema.TryGetColumnIndex(column.Source, out col))
                {
                    throw env.ExceptUserArg(nameof(input.Column), $"Column '{column.Source}' does not exist.");
                }

                // Use a fresh flag per column so a value read for one column can never leak into the next.
                // Only an explicit True counts as "already normalized": missing metadata, False and NA all
                // lead to the column being normalized.
                DvBool isNormalized = DvBool.False;
                if (!schema.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, col, ref isNormalized) ||
                    !isNormalized.IsTrue)
                {
                    columnsToNormalize.Add(column);
                }
            }

            var entryPoints = new List<EntryPointNode>();

            if (columnsToNormalize.Count == 0)
            {
                // Nothing to normalize: emit a no-op node wired to the same inputs and outputs.
                var entryPointNode = EntryPointNode.Create(env, "Transforms.NoOperation", new NopTransform.NopInput(),
                                                           node.Catalog, node.Context, node.InputBindingMap, node.InputMap, node.OutputMap);
                entryPoints.Add(entryPointNode);
            }
            else
            {
                input.Column = columnsToNormalize.ToArray();
                var entryPointNode = EntryPointNode.Create(env, "Transforms.MinMaxNormalizer", input,
                                                           node.Catalog, node.Context, node.InputBindingMap, node.InputMap, node.OutputMap);
                entryPoints.Add(entryPointNode);
            }

            return new CommonOutputs.MacroOutput<CommonOutputs.TransformOutput>()
            {
                Nodes = entryPoints
            };
        }
Esempio n. 4
0
        /// <summary>
        /// Creates a <see cref="Single"/>-valued label getter for a label column that is neither R4 nor R8.
        /// Boolean and key typed columns are supported; any other type fails the check below.
        /// </summary>
        /// <param name="cursor">Row to read the label from.</param>
        /// <param name="labelIndex">Index of the label column in the row's schema.</param>
        /// <returns>A getter that writes the converted label into its <c>dst</c> argument.</returns>
        private static ValueGetter<Single> GetLabelGetterNotFloat(IRow cursor, int labelIndex)
        {
            var labelType = cursor.Schema.GetColumnType(labelIndex);
            Contracts.Assert(labelType != NumberType.R4 && labelType != NumberType.R8);

            if (labelType.IsBool)
            {
                // DvBool type label mapping: True -> 1, False -> 0, NA -> NaN.
                var boolGetter = cursor.GetGetter<DvBool>(labelIndex);
                return (ref Single dst) =>
                {
                    DvBool flag = DvBool.NA;
                    boolGetter(ref flag);
                    dst = (Single)flag;
                };
            }

            Contracts.Check(labelType.IsKey, "Only floating point number, boolean, and key type values can be used as label.");
            Contracts.Assert(TestGetLabelGetter(labelType) == null);

            // A key count of zero is replaced by the largest ulong, so every non-zero raw value is accepted.
            ulong declaredCount = (ulong)labelType.KeyCount;
            ulong upperBound = declaredCount == 0 ? ulong.MaxValue : declaredCount;
            var keyGetter = RowCursorUtils.GetGetterAs<ulong>(NumberType.U8, cursor, labelIndex);

            return (ref Single dst) =>
            {
                ulong raw = 0;
                keyGetter(ref raw);

                // Raw value 0 and values above the bound map to NaN; valid keys are shifted to be zero-based.
                if (raw == 0 || raw > upperBound)
                {
                    dst = Single.NaN;
                    return;
                }
                dst = raw - 1;
            };
        }
        // The multi-output regression evaluator prints only the per-label metrics for each fold.
        //
        // Reads the data view stored under MetricKinds.OverallMetrics, and for every visible column whose type
        // is a known-size vector of R8 writes one text line per processed row with the value of each slot.
        // The assembled text is sent to ch.Info. Throws (via ch.Except) when the overall metrics view is
        // missing or when more than one weighted / unweighted row is encountered.
        protected override void PrintFoldResultsCore(IChannel ch, Dictionary <string, IDataView> metrics)
        {
            IDataView fold;

            if (!metrics.TryGetValue(MetricKinds.OverallMetrics, out fold))
            {
                throw ch.Except("No overall metrics found");
            }

            // Optional bookkeeping columns: the is-weighted flag and the stratification column/value pair.
            int  isWeightedCol;
            bool needWeighted = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.IsWeighted, out isWeightedCol);

            int  stratCol;
            bool hasStrats = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.StratCol, out stratCol);
            int  stratVal;
            bool hasStratVals = fold.Schema.TryGetColumnIndex(MetricKinds.ColumnNames.StratVal, out stratVal);

            // The two stratification columns are expected to be present together or absent together.
            ch.Assert(hasStrats == hasStratVals);

            var colCount       = fold.Schema.ColumnCount;
            // Indexed by column; entries stay null for columns that are not printed.
            var vBufferGetters = new ValueGetter <VBuffer <double> > [colCount];

            using (var cursor = fold.GetRowCursor(col => true))
            {
                DvBool isWeighted = DvBool.False;
                ValueGetter <DvBool> isWeightedGetter;
                if (needWeighted)
                {
                    isWeightedGetter = cursor.GetGetter <DvBool>(isWeightedCol);
                }
                else
                {
                    // No is-weighted column: every row is reported as unweighted.
                    isWeightedGetter = (ref DvBool dst) => dst = DvBool.False;
                }

                ValueGetter <uint> stratGetter;
                if (hasStrats)
                {
                    var type = cursor.Schema.GetColumnType(stratCol);
                    stratGetter = RowCursorUtils.GetGetterAs <uint>(type, cursor, stratCol);
                }
                else
                {
                    // No stratification column: every row reports 0, so no row is skipped below.
                    stratGetter = (ref uint dst) => dst = 0;
                }

                // Collect a getter for each metric column to print, and establish the common slot count.
                int labelCount = 0;
                for (int i = 0; i < fold.Schema.ColumnCount; i++)
                {
                    // Skip hidden columns and the bookkeeping columns found above.
                    if (fold.Schema.IsHidden(i) || (needWeighted && i == isWeightedCol) ||
                        (hasStrats && (i == stratCol || i == stratVal)))
                    {
                        continue;
                    }

                    var type = fold.Schema.GetColumnType(i);
                    if (type.IsKnownSizeVector && type.ItemType == NumberType.R8)
                    {
                        vBufferGetters[i] = cursor.GetGetter <VBuffer <double> >(i);
                        // The first vector column fixes the slot count; later ones must match it.
                        if (labelCount == 0)
                        {
                            labelCount = type.VectorSize;
                        }
                        else
                        {
                            ch.Check(labelCount == type.VectorSize, "All vector metrics should contain the same number of slots");
                        }
                    }
                }
                // Header labels are generated as Label_0, Label_1, ... one per slot.
                var labelNames = new DvText[labelCount];
                for (int j = 0; j < labelCount; j++)
                {
                    labelNames[j] = new DvText(string.Format("Label_{0}", j));
                }

                var sb = new StringBuilder();
                sb.AppendLine("Per-label metrics:");
                // Blank cell above the metric-name column, then one right-aligned header per label.
                sb.AppendFormat("{0,12} ", " ");
                for (int i = 0; i < labelCount; i++)
                {
                    sb.AppendFormat(" {0,20}", labelNames[i]);
                }
                sb.AppendLine();

                VBuffer <Double> metricVals      = default(VBuffer <Double>);
                // Without an is-weighted column no weighted row can appear, so it is treated as already found.
                bool             foundWeighted   = !needWeighted;
                bool             foundUnweighted = false;
                uint             strat           = 0;
                while (cursor.MoveNext())
                {
                    isWeightedGetter(ref isWeighted);
                    // A second row of a kind that was already seen is an error.
                    if (foundWeighted && isWeighted.IsTrue || foundUnweighted && isWeighted.IsFalse)
                    {
                        throw ch.Except("Multiple {0} rows found in overall metrics data view",
                                        isWeighted.IsTrue ? "weighted" : "unweighted");
                    }
                    // Any value other than True (including NA) is recorded as an unweighted row.
                    if (isWeighted.IsTrue)
                    {
                        foundWeighted = true;
                    }
                    else
                    {
                        foundUnweighted = true;
                    }

                    // Rows with a non-zero stratification value are not printed. Note that the found-flags
                    // above have already been updated for such a row.
                    // NOTE(review): this presumably relies on the non-stratified rows coming first - confirm.
                    stratGetter(ref strat);
                    if (strat > 0)
                    {
                        continue;
                    }

                    // One output line per collected metric column: name, then every slot value.
                    for (int i = 0; i < colCount; i++)
                    {
                        if (vBufferGetters[i] != null)
                        {
                            vBufferGetters[i](ref metricVals);
                            ch.Assert(metricVals.Length == labelCount);

                            sb.AppendFormat("{0}{1,12}:", isWeighted.IsTrue ? "Weighted " : "", fold.Schema.GetColumnName(i));
                            foreach (var metric in metricVals.Items(all: true))
                            {
                                sb.AppendFormat(" {0,20:G20}", metric.Value);
                            }
                            sb.AppendLine();
                        }
                    }
                    // Stop reading once both kinds of row have been seen.
                    if (foundUnweighted && foundWeighted)
                    {
                        break;
                    }
                }
                ch.Assert(foundUnweighted && foundWeighted);
                ch.Info(sb.ToString());
            }
        }
Esempio n. 6
0
 /// <summary>
 /// Maps a <c>DvBool</c> to an integer code: NA becomes -1, False becomes 0, anything else becomes 1.
 /// </summary>
 /// <param name="src">Value to map.</param>
 /// <param name="dst">Receives the integer code.</param>
 private void BinOneBool(ref DvBool src, ref int dst)
 {
     if (src.IsNA)
     {
         dst = -1;
     }
     else if (src.IsFalse)
     {
         dst = 0;
     }
     else
     {
         dst = 1;
     }
 }
Esempio n. 7
0
 /// <summary>
 /// Converts a score into a predicted label by comparing it against <c>_threshold</c>.
 /// </summary>
 /// <param name="score">Score to classify.</param>
 /// <param name="value">Receives True when the score is above the threshold, False when it is at or
 /// below it, and NA when neither comparison holds.</param>
 private void GetPredictedLabelCore(Float score, ref DvBool value)
 {
     if (score > _threshold)
     {
         value = DvBool.True;
         return;
     }

     if (score <= _threshold)
     {
         value = DvBool.False;
         return;
     }

     // Neither comparison succeeded (presumably a NaN operand, assuming Float is a floating-point alias).
     value = DvBool.NA;
 }
 /// <summary>
 /// Sets <paramref name="dst"/> to <c>DvBool.True</c> for the given info index.
 /// </summary>
 /// <param name="iinfo">Info index; asserted to lie in [0, InfoCount).</param>
 /// <param name="dst">Receives <c>DvBool.True</c>.</param>
 private void IsNormalized(int iinfo, ref DvBool dst)
 {
     // Non-short-circuit '&' is kept on purpose: both comparisons are always evaluated.
     Contracts.Assert(iinfo >= 0 & InfoCount > iinfo);
     dst = DvBool.True;
 }
 /// <summary>
 /// Sets <paramref name="dst"/> to <c>DvBool.True</c>; <paramref name="iinfo"/> is not consulted.
 /// </summary>
 private void IsNormalized(int iinfo, ref DvBool dst) => dst = DvBool.True;
Esempio n. 10
0
        /// <summary>
        /// Adds a "MinMax" normalizing transform over <paramref name="featureColumn"/> to
        /// <paramref name="view"/> when the <paramref name="autoNorm"/> option and the trainer require it.
        /// </summary>
        /// <returns>True if a normalizer was added; false otherwise.</returns>
        public static bool AddNormalizerIfNeeded(IHostEnvironment env, IChannel ch, ITrainer trainer, ref IDataView view, string featureColumn, NormalizeOption autoNorm)
        {
            Contracts.CheckValue(env, nameof(env));
            env.CheckValue(ch, nameof(ch));
            ch.CheckValue(trainer, nameof(trainer));
            ch.CheckValue(view, nameof(view));
            ch.CheckValueOrNull(featureColumn);
            ch.CheckUserArg(Enum.IsDefined(typeof(NormalizeOption), autoNorm), nameof(TrainCommand.Arguments.NormalizeFeatures),
                            "Normalize option is invalid. Specify one of 'norm=No', 'norm=Warn', 'norm=Auto', or 'norm=Yes'.");

            if (autoNorm == NormalizeOption.No)
            {
                ch.Info("Not adding a normalizer.");
                return false;
            }

            if (string.IsNullOrEmpty(featureColumn))
            {
                return false;
            }

            var schema = view.Schema;
            int featureIndex;
            if (!schema.TryGetColumnIndex(featureColumn, out featureIndex))
            {
                // Unknown feature column: nothing to normalize.
                return false;
            }

            if (autoNorm != NormalizeOption.Yes)
            {
                // Normalization is skipped when the trainer does not ask for it...
                var trainerEx = trainer as ITrainerEx;
                bool skip = trainerEx == null || !trainerEx.NeedNormalization;
                if (!skip)
                {
                    // ...or when the column's metadata already marks it as normalized.
                    DvBool normalizedFlag = DvBool.False;
                    skip = schema.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, featureIndex, ref normalizedFlag) &&
                           normalizedFlag.IsTrue;
                }

                if (skip)
                {
                    ch.Info("Not adding a normalizer.");
                    return false;
                }

                if (autoNorm == NormalizeOption.Warn)
                {
                    ch.Warning("A normalizer is needed for this trainer. Either add a normalizing transform or use the 'norm=Auto', 'norm=Yes' or 'norm=No' options.");
                    return false;
                }
            }

            ch.Info("Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.");

            // Use the quoted form of the column name whenever the quoter reports success.
            var quoted = new StringBuilder();
            string columnArg = CmdQuoter.QuoteValue(featureColumn, quoted) ? quoted.ToString() : featureColumn;

            var minMax = new SubComponent<IDataTransform, SignatureDataTransform>("MinMax", string.Format("col={{ name={0} source={0} }}", columnArg));

            // A loader is extended through CompositeDataLoader; any other view is wrapped directly.
            var asLoader = view as IDataLoader;
            if (asLoader == null)
            {
                view = minMax.CreateInstance(env, view);
            }
            else
            {
                view = CompositeDataLoader.Create(env, asLoader,
                                                  new KeyValuePair<string, SubComponent<IDataTransform, SignatureDataTransform>>(null, minMax));
            }
            return true;
        }
Esempio n. 11
0
 /// <summary>
 /// Converts a nullable <c>bool</c> to <c>DvBool</c>: a null source becomes <c>DvBool.NA</c>, otherwise
 /// the contained value is converted.
 /// </summary>
 public void Conv(ref bool? src, ref DvBool dst)
 {
     if (src.HasValue)
     {
         dst = src.Value;
     }
     else
     {
         dst = DvBool.NA;
     }
 }
Esempio n. 12
0
        /// <summary>
        ///  Fills indicator values for vectors. <paramref name="indices"/> holds either all of the NA
        ///  positions (<paramref name="sense"/> is true) or all of the non-NA positions
        ///  (<paramref name="sense"/> is false). In both cases the output is True at the positions described
        ///  as NA by that convention, and is produced in sparse or dense form depending on how many there are.
        /// </summary>
        /// <param name="srcLength">Logical length of the source vector, and of the produced buffer.</param>
        /// <param name="dst">Output buffer; its existing Values and Indices arrays are reused.</param>
        /// <param name="indices">Positions list. The asserts below expect it to be strictly increasing.
        /// Note that <paramref name="srcLength"/> is appended to this list as a sentinel in two of the
        /// branches, so the caller's list is modified.</param>
        /// <param name="sense">True when <paramref name="indices"/> lists the positions to mark True;
        /// false when it lists the positions to leave unmarked.</param>
        private void FillValues(int srcLength, ref VBuffer <DvBool> dst, List <int> indices, bool sense)
        {
            var dstValues  = dst.Values;
            var dstIndices = dst.Indices;

            if (indices.Count == 0)
            {
                if (sense)
                {
                    // Return empty VBuffer.
                    dst = new VBuffer <DvBool>(srcLength, 0, dstValues, dstIndices);
                    return;
                }

                // Return VBuffer filled with 1's.
                Utils.EnsureSize(ref dstValues, srcLength, false);
                for (int i = 0; i < srcLength; i++)
                {
                    dstValues[i] = DvBool.True;
                }
                dst = new VBuffer <DvBool>(srcLength, dstValues, dstIndices);
                return;
            }

            // Sparse output is chosen when fewer than srcLength / 2 positions will be True.
            if (sense && indices.Count < srcLength / 2)
            {
                // Will produce sparse output.
                // The listed positions are exactly the True positions, so they are copied as-is.
                int dstCount = indices.Count;
                Utils.EnsureSize(ref dstValues, dstCount, false);
                Utils.EnsureSize(ref dstIndices, dstCount, false);

                indices.CopyTo(dstIndices);
                for (int ii = 0; ii < dstCount; ii++)
                {
                    dstValues[ii] = DvBool.True;
                }

                Host.Assert(dstCount <= srcLength);
                dst = new VBuffer <DvBool>(srcLength, dstCount, dstValues, dstIndices);
            }
            else if (!sense && srcLength - indices.Count < srcLength / 2)
            {
                // Will produce sparse output.
                // The True positions are the complement of the listed positions.
                int dstCount = srcLength - indices.Count;
                Utils.EnsureSize(ref dstValues, dstCount, false);
                Utils.EnsureSize(ref dstIndices, dstCount, false);

                // Appends the length of the src to make the loop simpler,
                // as the length of src will never be reached in the loop.
                indices.Add(srcLength);

                // iiSrc walks the listed positions, iiDst counts emitted entries; together they always sum to i.
                int iiDst = 0;
                int iiSrc = 0;
                int iNext = indices[iiSrc];
                for (int i = 0; i < srcLength; i++)
                {
                    Host.Assert(0 <= i && i <= iNext);
                    Host.Assert(iiSrc + iiDst == i);
                    if (i < iNext)
                    {
                        // Position i is not listed: emit a True entry for it.
                        Host.Assert(iiDst < dstCount);
                        dstValues[iiDst]    = DvBool.True;
                        dstIndices[iiDst++] = i;
                    }
                    else
                    {
                        // Position i is listed: skip it and advance to the next listed position.
                        Host.Assert(iiSrc + 1 < indices.Count);
                        Host.Assert(iNext < indices[iiSrc + 1]);
                        iNext = indices[++iiSrc];
                    }
                }
                Host.Assert(srcLength == iiSrc + iiDst);
                Host.Assert(iiDst == dstCount);

                dst = new VBuffer <DvBool>(srcLength, dstCount, dstValues, dstIndices);
            }
            else
            {
                // Will produce dense output.
                Utils.EnsureSize(ref dstValues, srcLength, false);

                // Appends the length of the src to make the loop simpler,
                // as the length of src will never be reached in the loop.
                indices.Add(srcLength);

                int ii = 0;
                // Assigns values correctly depending on the sense.
                // 'hit' is written at listed positions, 'miss' everywhere else.
                DvBool hit  = sense ? DvBool.True : DvBool.False;
                DvBool miss = sense ? DvBool.False : DvBool.True;
                for (int i = 0; i < srcLength; i++)
                {
                    Host.Assert(0 <= i && i <= indices[ii]);
                    if (i == indices[ii])
                    {
                        dstValues[i] = hit;
                        ii++;
                        Host.Assert(ii < indices.Count);
                        Host.Assert(indices[ii - 1] < indices[ii]);
                    }
                    else
                    {
                        dstValues[i] = miss;
                    }
                }

                dst = new VBuffer <DvBool>(srcLength, dstValues, dstIndices);
            }
        }