private void CacheTypes(out ColumnType[] types, out ColumnType[] typesSlotNames, out bool[] echoSrc, out bool[] isNormalized, out ColumnType[] typesCategoricals) { Contracts.AssertNonEmpty(Infos); echoSrc = new bool[Infos.Length]; isNormalized = new bool[Infos.Length]; types = new ColumnType[Infos.Length]; typesSlotNames = new ColumnType[Infos.Length]; typesCategoricals = new ColumnType[Infos.Length]; for (int i = 0; i < Infos.Length; i++) { var info = Infos[i]; // REVIEW: Add support for implicit conversions? if (info.SrcTypes.Length == 1 && info.SrcTypes[0].IsVector) { // All meta-data is passed through in this case, so don't need the slot names type. echoSrc[i] = true; DvBool b = DvBool.False; isNormalized[i] = info.SrcTypes[0].ItemType.IsNumber && Input.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, info.SrcIndices[0], ref b) && b.IsTrue; types[i] = info.SrcTypes[0]; continue; } // The single scalar and multiple vector case. isNormalized[i] = info.SrcTypes[0].ItemType.IsNumber; if (isNormalized[i]) { foreach (var srcCol in info.SrcIndices) { DvBool b = DvBool.False; if (!Input.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, srcCol, ref b) || !b.IsTrue) { isNormalized[i] = false; break; } } } types[i] = new VectorType(info.SrcTypes[0].ItemType.AsPrimitive, info.SrcSize); if (info.SrcSize == 0) { continue; } bool hasCategoricals = false; int catCount = 0; for (int j = 0; j < info.SrcTypes.Length; j++) { if (info.SrcTypes[j].ValueCount == 0) { hasCategoricals = false; break; } if (MetadataUtils.TryGetCategoricalFeatureIndices(Input, info.SrcIndices[j], out int[] typeCat))
/// <summary>
/// Wraps <paramref name="view"/> in a MinMax normalizer over <paramref name="featureColumn"/>
/// when the <paramref name="autoNorm"/> option and the trainer call for one.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="ch">Channel used for validation and for info/warning messages.</param>
/// <param name="trainer">Trainer whose <c>Info.NeedNormalization</c> flag is consulted.</param>
/// <param name="view">Data to normalize; replaced with the normalized view when a normalizer is added.</param>
/// <param name="featureColumn">Name of the column to normalize; null or empty means nothing is added.</param>
/// <param name="autoNorm">The requested normalization policy.</param>
/// <returns>True if a normalizer was added.</returns>
public static bool AddNormalizerIfNeeded(IHostEnvironment env, IChannel ch, ITrainer trainer, ref IDataView view, string featureColumn, NormalizeOption autoNorm)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));
    ch.CheckValue(trainer, nameof(trainer));
    ch.CheckValue(view, nameof(view));
    ch.CheckValueOrNull(featureColumn);
    ch.CheckUserArg(Enum.IsDefined(typeof(NormalizeOption), autoNorm), nameof(TrainCommand.Arguments.NormalizeFeatures),
        "Normalize option is invalid. Specify one of 'norm=No', 'norm=Warn', 'norm=Auto', or 'norm=Yes'.");

    if (autoNorm == NormalizeOption.No)
    {
        ch.Info("Not adding a normalizer.");
        return false;
    }

    if (string.IsNullOrEmpty(featureColumn))
    {
        return false;
    }

    int featCol;
    var schema = view.Schema;
    if (!schema.TryGetColumnIndex(featureColumn, out featCol))
    {
        // The feature column is not present in the schema, so there is nothing to normalize.
        return false;
    }

    // 'Yes' forces a normalizer; every other remaining option first checks whether one is needed.
    if (autoNorm != NormalizeOption.Yes)
    {
        if (!trainer.Info.NeedNormalization || schema.IsNormalized(featCol))
        {
            ch.Info("Not adding a normalizer.");
            return false;
        }

        if (autoNorm == NormalizeOption.Warn)
        {
            ch.Warning("A normalizer is needed for this trainer. Either add a normalizing transform or use the 'norm=Auto', 'norm=Yes' or 'norm=No' options.");
            return false;
        }
    }

    ch.Info("Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.");

    IDataView ApplyNormalizer(IHostEnvironment innerEnv, IDataView input)
        => NormalizeTransform.CreateMinMaxNormalizer(innerEnv, input, featureColumn);

    // A loader goes through CompositeDataLoader.ApplyTransform; any other view is wrapped directly.
    if (view is IDataLoader loader)
    {
        view = CompositeDataLoader.ApplyTransform(env, loader, tag: null, creationArgs: null, ApplyNormalizer);
    }
    else
    {
        view = ApplyNormalizer(env, view);
    }

    return true;
}
/// <summary>
/// Macro that expands to a MinMax normalizer node covering only those requested columns that are
/// not already flagged as normalized, or to a no-op node when every requested column is flagged.
/// </summary>
/// <param name="env">Host environment, used to raise the user-argument exception.</param>
/// <param name="input">Normalizer arguments; <c>input.Column</c> is overwritten with the columns that still need normalizing.</param>
/// <param name="node">The macro node whose catalog, context and input/output maps are forwarded to the created node.</param>
/// <returns>A macro output holding the single entry point node that was created.</returns>
public static CommonOutputs.MacroOutput<CommonOutputs.TransformOutput> IfNeeded(
    IHostEnvironment env,
    NormalizeTransform.MinMaxArguments input,
    EntryPointNode node)
{
    var schema = input.Data.Schema;
    var columnsToNormalize = new List<NormalizeTransform.AffineColumn>();
    foreach (var column in input.Column)
    {
        int col;
        if (!schema.TryGetColumnIndex(column.Source, out col))
        {
            throw env.ExceptUserArg(nameof(input.Column), $"Column '{column.Source}' does not exist.");
        }

        // Fresh flag per column so a value read for one column can never leak into the next.
        DvBool isNormalized = DvBool.False;

        // Only an explicit 'true' counts as normalized: missing metadata, 'false' and NA all
        // require normalization, consistent with the other IsNormalized checks in this code base.
        if (!schema.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, col, ref isNormalized) || !isNormalized.IsTrue)
        {
            columnsToNormalize.Add(column);
        }
    }

    var entryPoints = new List<EntryPointNode>();
    if (columnsToNormalize.Count == 0)
    {
        // Nothing to do: emit a pass-through node so the graph wiring stays intact.
        var entryPointNode = EntryPointNode.Create(env, "Transforms.NoOperation", new NopTransform.NopInput(),
            node.Catalog, node.Context, node.InputBindingMap, node.InputMap, node.OutputMap);
        entryPoints.Add(entryPointNode);
    }
    else
    {
        // Restrict the normalizer to the columns that actually need it.
        input.Column = columnsToNormalize.ToArray();
        var entryPointNode = EntryPointNode.Create(env, "Transforms.MinMaxNormalizer", input,
            node.Catalog, node.Context, node.InputBindingMap, node.InputMap, node.OutputMap);
        entryPoints.Add(entryPointNode);
    }

    return new CommonOutputs.MacroOutput<CommonOutputs.TransformOutput>() { Nodes = entryPoints };
}
/// <summary>
/// Builds a getter that exposes a boolean or key-typed label column as a <see cref="Single"/>.
/// Boolean labels are converted with the DvBool-to-Single cast; key labels map a value k in
/// [1, key count] to k - 1, and any other key value to NaN.
/// </summary>
/// <param name="cursor">Row to read the label from.</param>
/// <param name="labelIndex">Index of the label column in the row's schema.</param>
/// <returns>A getter producing the label as a <see cref="Single"/>.</returns>
private static ValueGetter<Single> GetLabelGetterNotFloat(IRow cursor, int labelIndex)
{
    var labelType = cursor.Schema.GetColumnType(labelIndex);
    Contracts.Assert(labelType != NumberType.R4 && labelType != NumberType.R8);

    // DvBool type label mapping: True -> 1, False -> 0, NA -> NaN.
    if (labelType.IsBool)
    {
        var boolGetter = cursor.GetGetter<DvBool>(labelIndex);
        return (ref Single dst) =>
        {
            DvBool value = DvBool.NA;
            boolGetter(ref value);
            dst = (Single)value;
        };
    }

    Contracts.Check(labelType.IsKey, "Only floating point number, boolean, and key type values can be used as label.");
    Contracts.Assert(TestGetLabelGetter(labelType) == null);

    // A key count of zero is treated as "no upper bound" on the key value.
    ulong declaredCount = (ulong)labelType.KeyCount;
    ulong keyLimit = declaredCount == 0 ? ulong.MaxValue : declaredCount;

    var keyGetter = RowCursorUtils.GetGetterAs<ulong>(NumberType.U8, cursor, labelIndex);
    return (ref Single dst) =>
    {
        ulong raw = 0;
        keyGetter(ref raw);
        if (raw == 0 || raw > keyLimit)
        {
            dst = Single.NaN;
            return;
        }

        dst = raw - 1;
    };
}
// The multi-output regression evaluator prints only the per-label metrics for each fold.
// Every visible, known-size R8 vector column of the overall-metrics view is written as one line
// per (weighted / unweighted) row, with one value per label slot.
protected override void PrintFoldResultsCore(IChannel ch, Dictionary<string, IDataView> metrics)
{
    IDataView fold;
    if (!metrics.TryGetValue(MetricKinds.OverallMetrics, out fold))
    {
        throw ch.Except("No overall metrics found");
    }

    var foldSchema = fold.Schema;

    int weightedColIndex;
    bool needWeighted = foldSchema.TryGetColumnIndex(MetricKinds.ColumnNames.IsWeighted, out weightedColIndex);
    int stratColIndex;
    bool hasStrats = foldSchema.TryGetColumnIndex(MetricKinds.ColumnNames.StratCol, out stratColIndex);
    int stratValIndex;
    bool hasStratVals = foldSchema.TryGetColumnIndex(MetricKinds.ColumnNames.StratVal, out stratValIndex);
    ch.Assert(hasStrats == hasStratVals);

    int colCount = foldSchema.ColumnCount;
    var vectorGetters = new ValueGetter<VBuffer<double>>[colCount];
    using (var cursor = fold.GetRowCursor(col => true))
    {
        DvBool isWeighted = DvBool.False;

        // Without an IsWeighted column every row is reported as unweighted.
        ValueGetter<DvBool> isWeightedGetter = (ref DvBool dst) => dst = DvBool.False;
        if (needWeighted)
        {
            isWeightedGetter = cursor.GetGetter<DvBool>(weightedColIndex);
        }

        // Without a stratification column every row reports stratum 0.
        ValueGetter<uint> stratGetter = (ref uint dst) => dst = 0;
        if (hasStrats)
        {
            var stratType = cursor.Schema.GetColumnType(stratColIndex);
            stratGetter = RowCursorUtils.GetGetterAs<uint>(stratType, cursor, stratColIndex);
        }

        // Collect a getter for every metric column that is a known-size vector of R8.
        int labelCount = 0;
        for (int colIndex = 0; colIndex < colCount; colIndex++)
        {
            if (foldSchema.IsHidden(colIndex) ||
                (needWeighted && colIndex == weightedColIndex) ||
                (hasStrats && (colIndex == stratColIndex || colIndex == stratValIndex)))
            {
                continue;
            }

            var colType = foldSchema.GetColumnType(colIndex);
            if (!(colType.IsKnownSizeVector && colType.ItemType == NumberType.R8))
            {
                continue;
            }

            vectorGetters[colIndex] = cursor.GetGetter<VBuffer<double>>(colIndex);
            if (labelCount == 0)
            {
                labelCount = colType.VectorSize;
            }
            else
            {
                ch.Check(labelCount == colType.VectorSize, "All vector metrics should contain the same number of slots");
            }
        }

        var labelNames = new DvText[labelCount];
        for (int slot = 0; slot < labelCount; slot++)
        {
            labelNames[slot] = new DvText(string.Format("Label_{0}", slot));
        }

        // Header: a blank cell followed by one column per label.
        var report = new StringBuilder();
        report.AppendLine("Per-label metrics:");
        report.AppendFormat("{0,12} ", " ");
        foreach (var labelName in labelNames)
        {
            report.AppendFormat(" {0,20}", labelName);
        }
        report.AppendLine();

        VBuffer<Double> metricVals = default(VBuffer<Double>);
        // When there is no IsWeighted column, the weighted row is considered already seen.
        bool foundWeighted = !needWeighted;
        bool foundUnweighted = false;
        uint strat = 0;
        while (cursor.MoveNext())
        {
            isWeightedGetter(ref isWeighted);
            if (foundWeighted && isWeighted.IsTrue || foundUnweighted && isWeighted.IsFalse)
            {
                throw ch.Except("Multiple {0} rows found in overall metrics data view",
                    isWeighted.IsTrue ? "weighted" : "unweighted");
            }

            if (isWeighted.IsTrue)
            {
                foundWeighted = true;
            }
            else
            {
                foundUnweighted = true;
            }

            // Rows with a non-zero stratum are not printed.
            stratGetter(ref strat);
            if (strat > 0)
            {
                continue;
            }

            string rowPrefix = isWeighted.IsTrue ? "Weighted " : "";
            for (int colIndex = 0; colIndex < colCount; colIndex++)
            {
                var getter = vectorGetters[colIndex];
                if (getter == null)
                {
                    continue;
                }

                getter(ref metricVals);
                ch.Assert(metricVals.Length == labelCount);

                report.AppendFormat("{0}{1,12}:", rowPrefix, foldSchema.GetColumnName(colIndex));
                foreach (var metric in metricVals.Items(all: true))
                {
                    report.AppendFormat(" {0,20:G20}", metric.Value);
                }
                report.AppendLine();
            }

            // Stop as soon as both kinds of row have been seen.
            if (foundUnweighted && foundWeighted)
            {
                break;
            }
        }

        ch.Assert(foundUnweighted && foundWeighted);
        ch.Info(report.ToString());
    }
}
/// <summary>
/// Maps a <see cref="DvBool"/> to a bin index: NA -> -1, False -> 0, anything else -> 1.
/// </summary>
private void BinOneBool(ref DvBool src, ref int dst)
{
    if (src.IsNA)
    {
        dst = -1;
    }
    else if (src.IsFalse)
    {
        dst = 0;
    }
    else
    {
        dst = 1;
    }
}
/// <summary>
/// Converts a score into a predicted label by comparing it with <c>_threshold</c>:
/// above the threshold is True, at or below is False, and a score for which neither
/// comparison holds yields NA.
/// </summary>
private void GetPredictedLabelCore(Float score, ref DvBool value)
{
    if (score > _threshold)
    {
        value = DvBool.True;
    }
    else if (score <= _threshold)
    {
        value = DvBool.False;
    }
    else
    {
        // Neither comparison succeeded, so no label can be assigned.
        value = DvBool.NA;
    }
}
/// <summary>
/// IsNormalized metadata getter: reports True for any valid column info index.
/// </summary>
/// <param name="iinfo">Column info index; asserted to lie in [0, InfoCount).</param>
/// <param name="dst">Receives <see cref="DvBool.True"/>.</param>
private void IsNormalized(int iinfo, ref DvBool dst)
{
    Contracts.Assert(iinfo >= 0 & iinfo < InfoCount);
    dst = DvBool.True;
}
/// <summary>
/// IsNormalized metadata getter: unconditionally reports True; <paramref name="iinfo"/> is not inspected.
/// </summary>
private void IsNormalized(int iinfo, ref DvBool dst) => dst = DvBool.True;
/// <summary>
/// Adds a "MinMax" normalizing transform over <paramref name="featureColumn"/> to
/// <paramref name="view"/> when the <paramref name="autoNorm"/> option and the trainer call for one.
/// </summary>
/// <returns>True if a normalizer was added.</returns>
public static bool AddNormalizerIfNeeded(IHostEnvironment env, IChannel ch, ITrainer trainer, ref IDataView view, string featureColumn, NormalizeOption autoNorm)
{
    Contracts.CheckValue(env, nameof(env));
    env.CheckValue(ch, nameof(ch));
    ch.CheckValue(trainer, nameof(trainer));
    ch.CheckValue(view, nameof(view));
    ch.CheckValueOrNull(featureColumn);
    ch.CheckUserArg(Enum.IsDefined(typeof(NormalizeOption), autoNorm), nameof(TrainCommand.Arguments.NormalizeFeatures),
        "Normalize option is invalid. Specify one of 'norm=No', 'norm=Warn', 'norm=Auto', or 'norm=Yes'.");

    if (autoNorm == NormalizeOption.No)
    {
        ch.Info("Not adding a normalizer.");
        return false;
    }

    if (string.IsNullOrEmpty(featureColumn))
    {
        return false;
    }

    int featureIndex;
    var viewSchema = view.Schema;
    if (!viewSchema.TryGetColumnIndex(featureColumn, out featureIndex))
    {
        return false;
    }

    if (autoNorm != NormalizeOption.Yes)
    {
        // Only trainers implementing ITrainerEx can declare that they need normalization.
        var trainerEx = trainer as ITrainerEx;
        bool wantsNormalization = trainerEx != null && trainerEx.NeedNormalization;

        DvBool alreadyNormalized = DvBool.False;
        if (!wantsNormalization ||
            (viewSchema.TryGetMetadata(BoolType.Instance, MetadataUtils.Kinds.IsNormalized, featureIndex, ref alreadyNormalized) && alreadyNormalized.IsTrue))
        {
            ch.Info("Not adding a normalizer.");
            return false;
        }

        if (autoNorm == NormalizeOption.Warn)
        {
            ch.Warning("A normalizer is needed for this trainer. Either add a normalizing transform or use the 'norm=Auto', 'norm=Yes' or 'norm=No' options.");
            return false;
        }
    }

    ch.Info("Automatically adding a MinMax normalization transform, use 'norm=Warn' or 'norm=No' to turn this behavior off.");

    // Quote the feature column name; the raw name is used when QuoteValue returns false.
    var quoteBuffer = new StringBuilder();
    string quotedName = CmdQuoter.QuoteValue(featureColumn, quoteBuffer) ? quoteBuffer.ToString() : featureColumn;

    var minMax = new SubComponent<IDataTransform, SignatureDataTransform>("MinMax",
        string.Format("col={{ name={0} source={0} }}", quotedName));

    // A loader is extended through CompositeDataLoader.Create; any other view is wrapped directly.
    var asLoader = view as IDataLoader;
    if (asLoader == null)
    {
        view = minMax.CreateInstance(env, view);
    }
    else
    {
        view = CompositeDataLoader.Create(env, asLoader,
            new KeyValuePair<string, SubComponent<IDataTransform, SignatureDataTransform>>(null, minMax));
    }

    return true;
}
/// <summary>
/// Converts a nullable <see cref="bool"/> to a <see cref="DvBool"/>; a null source becomes NA.
/// </summary>
public void Conv(ref bool? src, ref DvBool dst)
{
    dst = src ?? DvBool.NA;
}
/// <summary>
/// Writes the indicator vector into <paramref name="dst"/>. <paramref name="indices"/> lists either
/// every slot whose indicator is true (<paramref name="sense"/> is true) or every slot whose
/// indicator is false (<paramref name="sense"/> is false). The output is sparse when fewer than
/// srcLength / 2 slots are true, and dense otherwise.
/// </summary>
/// <param name="srcLength">Logical length of the source, and of the produced vector.</param>
/// <param name="dst">Destination buffer; its value and index arrays are reused where possible.</param>
/// <param name="indices">Slot indices, assumed strictly increasing (asserted below). On the two looping
/// paths <paramref name="srcLength"/> is appended to this list as a sentinel.</param>
/// <param name="sense">Whether <paramref name="indices"/> holds the true slots (true) or the false slots (false).</param>
private void FillValues(int srcLength, ref VBuffer<DvBool> dst, List<int> indices, bool sense)
{
    var values = dst.Values;
    var slots = dst.Indices;

    if (indices.Count == 0)
    {
        if (!sense)
        {
            // No false slots listed: dense vector with every slot true.
            Utils.EnsureSize(ref values, srcLength, false);
            for (int slot = 0; slot < srcLength; slot++)
            {
                values[slot] = DvBool.True;
            }
            dst = new VBuffer<DvBool>(srcLength, values, slots);
        }
        else
        {
            // No true slots listed: empty sparse vector.
            dst = new VBuffer<DvBool>(srcLength, 0, values, slots);
        }
        return;
    }

    if (sense && indices.Count < srcLength / 2)
    {
        // Sparse output: the listed slots are exactly the explicit entries.
        int explicitCount = indices.Count;
        Utils.EnsureSize(ref values, explicitCount, false);
        Utils.EnsureSize(ref slots, explicitCount, false);
        indices.CopyTo(slots);
        for (int k = 0; k < explicitCount; k++)
        {
            values[k] = DvBool.True;
        }
        Host.Assert(explicitCount <= srcLength);
        dst = new VBuffer<DvBool>(srcLength, explicitCount, values, slots);
        return;
    }

    if (!sense && srcLength - indices.Count < srcLength / 2)
    {
        // Sparse output: the explicit entries are the slots NOT present in the list.
        int explicitCount = srcLength - indices.Count;
        Utils.EnsureSize(ref values, explicitCount, false);
        Utils.EnsureSize(ref slots, explicitCount, false);

        // Sentinel: srcLength is never reached by the loop, so there is always a "next" listed slot.
        indices.Add(srcLength);

        int written = 0;
        int listPos = 0;
        int nextListed = indices[listPos];
        for (int slot = 0; slot < srcLength; slot++)
        {
            Host.Assert(0 <= slot && slot <= nextListed);
            Host.Assert(listPos + written == slot);
            if (slot >= nextListed)
            {
                // This slot is listed: skip it and advance to the following listed slot.
                Host.Assert(listPos + 1 < indices.Count);
                Host.Assert(nextListed < indices[listPos + 1]);
                listPos++;
                nextListed = indices[listPos];
            }
            else
            {
                Host.Assert(written < explicitCount);
                values[written] = DvBool.True;
                slots[written] = slot;
                written++;
            }
        }

        Host.Assert(srcLength == listPos + written);
        Host.Assert(written == explicitCount);
        dst = new VBuffer<DvBool>(srcLength, explicitCount, values, slots);
        return;
    }

    // Dense output.
    Utils.EnsureSize(ref values, srcLength, false);

    // Sentinel: srcLength is never reached by the loop, so indices[cursor] is always valid.
    indices.Add(srcLength);

    // Value written for listed and unlisted slots, depending on the sense.
    DvBool listedValue;
    DvBool unlistedValue;
    if (sense)
    {
        listedValue = DvBool.True;
        unlistedValue = DvBool.False;
    }
    else
    {
        listedValue = DvBool.False;
        unlistedValue = DvBool.True;
    }

    int cursor = 0;
    for (int slot = 0; slot < srcLength; slot++)
    {
        Host.Assert(0 <= slot && slot <= indices[cursor]);
        if (slot != indices[cursor])
        {
            values[slot] = unlistedValue;
            continue;
        }

        values[slot] = listedValue;
        cursor++;
        Host.Assert(cursor < indices.Count);
        Host.Assert(indices[cursor - 1] < indices[cursor]);
    }

    dst = new VBuffer<DvBool>(srcLength, values, slots);
}