/// <summary>
/// Computes mutual information scores between the label column and each of the requested columns.
/// </summary>
/// <param name="input">Data view that holds both the label column and the columns to score.</param>
/// <param name="labelColumnName">Name of the label column; it must exist in the input schema and pass <c>IsValidColumnType</c>.</param>
/// <param name="columns">Names of the columns to score. Vector columns must have a known size.</param>
/// <param name="numBins">Bin count; stored in <c>_numBins</c> before any other work is done.</param>
/// <param name="colSizes">Output array: entry <c>i</c> is set to the value count of <c>columns[i]</c>.</param>
/// <returns>One score array per entry of <paramref name="columns"/>, in the same order.</returns>
/// <remarks>
/// A missing column, a variable length vector column, or an incompatible column type is reported by
/// throwing the exception produced by <c>_host.ExceptUserArg</c>.
/// </remarks>
public float[][] GetScores(IDataView input, string labelColumnName, string[] columns, int numBins, int[] colSizes)
{
    _numBins = numBins;
    var inputSchema = input.Schema;
    var columnCount = columns.Length;

    // The label column has to exist and be of a supported type before anything else is looked at.
    if (!inputSchema.TryGetColumnIndex(labelColumnName, out int labelIndex))
    {
        throw _host.ExceptUserArg(nameof(MutualInformationFeatureSelectingEstimator.Options.LabelColumnName),
            "Label column '{0}' not found", labelColumnName);
    }

    var labelType = inputSchema[labelIndex].Type;
    if (!IsValidColumnType(labelType))
    {
        throw _host.ExceptUserArg(nameof(MutualInformationFeatureSelectingEstimator.Options.LabelColumnName),
            "Label column '{0}' does not have compatible type", labelColumnName);
    }

    // Indices of every column handed to the transposer; the label index occupies the extra trailing slot.
    var sourceIndices = new int[columnCount + 1];
    sourceIndices[columnCount] = labelIndex;
    for (int c = 0; c < columnCount; c++)
    {
        var sourceName = columns[c];
        if (!inputSchema.TryGetColumnIndex(sourceName, out int sourceIndex))
        {
            throw _host.ExceptUserArg(nameof(MutualInformationFeatureSelectingEstimator.Options.Columns),
                "Source column '{0}' not found", sourceName);
        }

        var sourceType = inputSchema[sourceIndex].Type;
        if (sourceType is VectorDataViewType vectorType && !vectorType.IsKnownSize)
        {
            throw _host.ExceptUserArg(nameof(MutualInformationFeatureSelectingEstimator.Options.Columns),
                "Variable length column '{0}' is not allowed", sourceName);
        }

        if (!IsValidColumnType(sourceType.GetItemType()))
        {
            throw _host.ExceptUserArg(nameof(MutualInformationFeatureSelectingEstimator.Options.Columns),
                "Column '{0}' of type '{1}' does not have compatible type.", sourceName, sourceType);
        }

        sourceIndices[c] = sourceIndex;
        colSizes[c] = sourceType.GetValueCount();
    }

    var scores = new float[columnCount][];
    using (var channel = _host.Start("Computing mutual information scores"))
    using (var progress = _host.StartProgressChannel("Computing mutual information scores"))
    using (var transposer = Transposer.Create(_host, input, false, sourceIndices))
    {
        // Declared outside the loop on purpose: the progress callback below captures this variable,
        // so it always reports the index of the column currently being processed.
        int current = 0;

        // Look the label up again, this time in the transposer's schema, and load the labels from there.
        var found = transposer.Schema.TryGetColumnIndex(labelColumnName, out labelIndex);
        Contracts.Assert(found);
        GetLabels(transposer, labelType, labelIndex);
        _contingencyTable = new int[_numLabels][];
        _labelSums = new int[_numLabels];

        var header = new ProgressHeader(new[] { "columns" });
        progress.SetHeader(header, e => e.SetProgress(0, current, columnCount));
        for (current = 0; current < columnCount; current++)
        {
            found = transposer.Schema.TryGetColumnIndex(columns[current], out int transposedIndex);
            Contracts.Assert(found);
            channel.Trace("Computing scores for column '{0}'", columns[current]);
            scores[current] = ComputeMutualInformation(transposer, transposedIndex);
#if DEBUG
            channel.Trace("Scores for column '{0}': {1}", columns[current], string.Join(", ", scores[current]));
#endif
            progress.Checkpoint(current + 1);
        }
    }

    return scores;
}
/// <summary>
/// Builds an in-memory data view with a mix of sparse and dense, vector and scalar columns, then checks
/// that <c>Transposer</c> exposes the requested columns at their original indices and with matching
/// contents, both without and with forced saving.
/// </summary>
public void TransposerTest()
{
    const int rowCount = 1000;
    // Fixed seed so that the generated data is the same on every run.
    Random random = new Random(0);
    ArrayDataViewBuilder columnBuilder = new ArrayDataViewBuilder(Env);

    // A: a sparse-ish vector column, to check splitting.
    var columnA = GenerateHelper(rowCount, 0.1, random, () => random.Next(), 50, 5, 10, 15);
    // Swap one row for a vbuffer with null arrays to cover that case.
    columnA[rowCount / 2] = new VBuffer<int>(50, 0, null, null);
    columnBuilder.AddColumn("A", NumberType.I4, columnA);

    // B: a dense-ish vector column, to check splitting.
    var columnB = GenerateHelper(rowCount, 0.8, random, random.NextDouble, 50, 0, 25, 49);
    columnBuilder.AddColumn("B", NumberType.R8, columnB);

    // C: a column that is never requested from the transposer.
    var columnC = GenerateHelper(rowCount, 0.1, random, () => (short)1, 30, 3, 10, 24);
    columnBuilder.AddColumn("C", NumberType.I2, columnC);

    // D: a vector column small enough that it does not need to be split.
    var columnD = GenerateHelper(rowCount, 0.1, random, random.NextDouble, 3, 1);
    columnBuilder.AddColumn("D", NumberType.R8, columnD);

    // E: a sparse scalar column.
    var columnE = GenerateHelper(rowCount, 0.1, random, () => (uint)random.Next(int.MinValue, int.MaxValue));
    columnBuilder.AddColumn("E", NumberType.U4, columnE);

    // F: a dense-ish scalar column.
    var columnF = GenerateHelper(rowCount, 0.8, random, () => random.Next());
    columnBuilder.AddColumn("F", NumberType.I4, columnF);

    IDataView dataView = columnBuilder.GetDataView();

    // First pass: no forced saving, so the result is a mix of passthrough and saved columns.
    // "D" is listed twice on purpose, to verify that a duplicate specification has no ill effects.
    string[] requested = { "B", "A", "E", "D", "F", "D" };
    using (Transposer transposer = Transposer.Create(Env, dataView, false, requested))
    {
        // Names first: each requested column must sit at the same index as in the source view.
        foreach (string name in requested)
        {
            Assert.True(transposer.Schema.TryGetColumnIndex(name, out int transposedIndex), $"Transpose schema couldn't find column '{name}'");
            bool foundInView = dataView.Schema.TryGetColumnIndex(name, out int expectedIndex);
            Contracts.Assert(foundInView);
            Assert.True(expectedIndex == transposedIndex, $"Transpose schema had column '{name}' at unexpected index");
        }

        // Then the contents.
        Assert.Null(transposer.TransposeSchema.GetSlotType(2)); // C was not requested, so it must not be transposable.
        TransposeCheckHelper<int>(dataView, 0, transposer); // A
        TransposeCheckHelper<double>(dataView, 1, transposer); // B
        TransposeCheckHelper<double>(dataView, 3, transposer); // D
        TransposeCheckHelper<uint>(dataView, 4, transposer); // E
        TransposeCheckHelper<int>(dataView, 5, transposer); // F
    }

    // Second pass: force saving and recheck the columns that were passthrough before. The main benefit
    // is coverage of the binary saving / loading of scalars, which would otherwise always be
    // passthrough. Columns are selected by index here to exercise that overload as well.
    using (Transposer transposer = Transposer.Create(Env, dataView, true, 3, 5, 4))
    {
        // A, B and C were not selected, so none of them may show up as transposed.
        Assert.Null(transposer.TransposeSchema.GetSlotType(0));
        Assert.Null(transposer.TransposeSchema.GetSlotType(1));
        Assert.Null(transposer.TransposeSchema.GetSlotType(2));
        TransposeCheckHelper<double>(dataView, 3, transposer); // D
        TransposeCheckHelper<uint>(dataView, 4, transposer); // E
        TransposeCheckHelper<int>(dataView, 5, transposer); // F
    }
}