/// <summary>
/// Determines, for every factor variable, which of its values should be encoded as binary indicator variables.
/// </summary>
/// <param name="ds">The dataset the string values are read from.</param>
/// <param name="factorVariables">Names of the factor (string) variables to inspect.</param>
/// <param name="rows">Row indices that are passed on to <c>GetStringValues</c>.</param>
/// <returns>
/// A lazily evaluated sequence with one entry per factor variable; the key is the variable name and the value
/// is the (possibly reduced) collection of distinct values found in the given rows.
/// </returns>
/// <remarks>
/// With at most two distinct values the last one is dropped: a constant factor yields no values at all and a
/// two-valued factor yields only its first distinct value. With three or more, every distinct value is kept.
/// </remarks>
public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
  this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows) {
  return factorVariables.Select(name => {
    var observed = ds.GetStringValues(name, rows).Distinct().ToArray();

    // >= 3 distinct values => one binary variable per value
    IEnumerable<string> encoded = observed;
    if (observed.Length <= 2) {
      // 1 distinct value  => nothing left (constant factor is skipped)
      // 2 distinct values => keep only the first of the two
      encoded = observed.Take(observed.Length - 1);
    }

    return new KeyValuePair<string, IEnumerable<string>>(name, encoded);
  });
}
/// <summary>
/// Builds a new dataset whose leading variables are filled from the columns of <paramref name="pcs"/>
/// and whose remaining variables are copied from <paramref name="data"/>.
/// </summary>
/// <param name="data">Dataset providing the double, date-time and string variables that are carried over.</param>
/// <param name="pcs">Matrix read as [row, column]; column k supplies the values of the k-th entry of <c>VariableNames</c>.</param>
/// <returns>
/// A <c>Dataset</c> with variables in this order: <c>VariableNames</c>, the double variables of
/// <paramref name="data"/> that are not listed in <c>ComponentNames</c>, its date-time variables, its string variables.
/// </returns>
private IDataset CreateRevertedDataset(IDataset data, double[,] pcs) {
  var revertedNames = VariableNames;
  var keptDoubleNames = data.DoubleVariables.Where(name => !ComponentNames.Contains(name)).ToArray();
  var dateTimeNames = data.DateTimeVariables.ToArray();
  var stringNames = data.StringVariables.ToArray();

  int rowCount = pcs.GetLength(0);

  // Copies one column of the matrix into a list of doubles.
  Func<int, IList> matrixColumn = columnIndex => {
    var column = new List<double>();
    for (int r = 0; r < rowCount; r++) {
      column.Add(pcs[r, columnIndex]);
    }
    return column;
  };

  // Value columns are collected in exactly the same order as the variable names below.
  var columns = new List<IList>();
  columns.AddRange(revertedNames.Select((unusedName, columnIndex) => matrixColumn(columnIndex)));
  columns.AddRange(keptDoubleNames.Select(name => (IList)data.GetDoubleValues(name).ToList()));
  columns.AddRange(dateTimeNames.Select(name => (IList)data.GetDateTimeValues(name).ToList()));
  columns.AddRange(stringNames.Select(name => (IList)data.GetStringValues(name).ToList()));

  var allNames = revertedNames.Concat(keptDoubleNames).Concat(dateTimeNames).Concat(stringNames);
  return new Dataset(allNames, columns.ToArray());
}
/// <summary>
/// Prepares a binary data matrix from a number of factors and specified factor values.
/// </summary>
/// <param name="dataset">A dataset that contains the variable values</param>
/// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
/// <param name="rows">An enumerable of row indices for the dataset</param>
/// <returns>
/// A matrix with one row per entry of <paramref name="rows"/> and one column per (factor, value) pair,
/// in the order the pairs are supplied. A cell is 1 when the factor has that value in the row, otherwise 0.
/// </returns>
/// <exception cref="NotSupportedException">Thrown when a factor variable is not a string variable of the dataset.</exception>
/// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
public static double[,] ToArray(
  this IDataset dataset,
  IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
  IEnumerable<int> rows) {
  // Materialize the factor list once. The argument may be a deferred query,
  // and enumerating it repeatedly would redo the work behind it on every pass.
  var factors = factorVariables.ToList();

  // check input variables. Only string variables are allowed.
  var invalidInputs = factors
    .Select(kvp => kvp.Key)
    .Where(name => !dataset.VariableHasType<string>(name))
    .ToList();
  if (invalidInputs.Count > 0) {
    throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
  }

  // Snapshot the values of every factor so that the column count and the
  // columns actually written are derived from the very same sequences.
  var valuesPerFactor = factors.Select(kvp => kvp.Value.ToArray()).ToList();
  int numBinaryColumns = valuesPerFactor.Sum(v => v.Length);

  List<int> rowsList = rows.ToList();
  double[,] matrix = new double[rowsList.Count, numBinaryColumns];

  int col = 0;
  for (int f = 0; f < factors.Count; f++) {
    var cats = valuesPerFactor[f];
    if (cats.Length == 0) {
      continue; // nothing to encode, so do not read the variable at all
    }

    // The observed values are the same for every category of this factor: read them once,
    // using the materialized row list so the row count always matches the matrix.
    var observed = dataset.GetStringValues(factors[f].Key, rowsList).ToList();

    foreach (var cat in cats) {
      for (int row = 0; row < observed.Count; row++) {
        matrix[row, col] = observed[row] == cat ? 1 : 0;
      }
      col++;
    }
  }
  return matrix;
}
/// <summary>
/// Produces the estimated class value for each requested row by mapping the string value of
/// <c>Variable</c> in that row through <c>GetPredictedValueForInput</c>.
/// </summary>
/// <param name="dataset">Dataset the string values are read from.</param>
/// <param name="rows">Row indices passed on to <c>GetStringValues</c>.</param>
/// <returns>A lazily evaluated sequence with one estimated value per input value.</returns>
public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows) {
  var inputs = dataset.GetStringValues(Variable, rows);
  return from input in inputs
         select GetPredictedValueForInput(input);
}
/// <summary>
/// Builds the table of coefficient paths: one data row per coefficient column, holding
/// (lambda, coefficient * standard deviation of the corresponding input) for every lambda,
/// plus one extra row with the number of non-zero coefficients per lambda.
/// </summary>
/// <param name="coeff">Coefficient matrix read as [lambda index, coefficient index].</param>
/// <param name="lambda">Lambda values; entry r belongs to row r of <paramref name="coeff"/>.</param>
/// <param name="allowedVars">Input variable names; split into string (factor) and double variables via <c>VariableHasType</c>.</param>
/// <param name="ds">Dataset used to determine factor values and standard deviations.</param>
/// <param name="showOnlyRelevantBasisFuncs">Not read by this method.</param>
/// <returns>The populated table; the x axis is logarithmic and the count row is drawn as points on the second y axis.</returns>
/// <remarks>
/// Coefficient columns are consumed in this order: first one column per factor value (in the order produced by
/// <c>GetFactorVariableValues</c> over all rows of the dataset), then one column per double variable.
/// </remarks>
private static IndexedDataTable<double> CoefficientGraph(double[,] coeff, double[] lambda, IEnumerable<string> allowedVars, IDataset ds, bool showOnlyRelevantBasisFuncs = true) {
  const string countRowName = "Number of variables";

  var table = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");
  table.VisualProperties.YAxisMaximumAuto = false;
  table.VisualProperties.YAxisMinimumAuto = false;
  table.VisualProperties.XAxisMaximumAuto = false;
  table.VisualProperties.XAxisMinimumAuto = false;
  table.VisualProperties.XAxisLogScale = true;
  table.VisualProperties.XAxisTitle = "Lambda";
  table.VisualProperties.YAxisTitle = "Coefficients";
  table.VisualProperties.SecondYAxisTitle = countRowName;

  int lambdaCount = lambda.Length;
  int coeffCount = coeff.GetLength(1);
  var pathRows = new IndexedDataRow<double>[coeffCount];
  var nonZeroPerStep = new int[lambdaCount];

  var numericInputs = allowedVars.Where(name => ds.VariableHasType<double>(name));
  var factorInputs = allowedVars.Where(name => ds.VariableHasType<string>(name));
  // must consider all factor values (in train and test set)
  var factorsWithValues = ds.GetFactorVariableValues(factorInputs, Enumerable.Range(0, ds.Rows));

  // Count the non-zero coefficients of every step of the path.
  int stepCount = coeff.GetLength(0);
  for (int step = 0; step < stepCount; step++) {
    for (int c = 0; c < coeffCount; c++) {
      if (coeff[step, c].IsAlmost(0.0)) {
        continue;
      }
      nonZeroPerStep[step]++;
    }
  }

  // Builds the (lambda, scaled coefficient) points of a single coefficient column.
  Func<int, double, Tuple<double, double>[]> scaledPath = (coeffIndex, scale) => {
    var points = new Tuple<double, double>[lambdaCount];
    for (int p = 0; p < lambdaCount; p++) {
      points[p] = Tuple.Create(lambda[p], coeff[p, coeffIndex] * scale);
    }
    return points;
  };

  int next = 0; // index of the coefficient column handled next

  foreach (var factor in factorsWithValues) {
    foreach (var level in factor.Value) {
      // calc std dev of binary indicator
      double indicatorSigma = ds.GetStringValues(factor.Key)
        .Select(s => s == level ? 1.0 : 0.0)
        .StandardDeviation();
      string label = factor.Key + "=" + level;
      pathRows[next] = new IndexedDataRow<double>(label, label, scaledPath(next, indicatorSigma));
      next++;
    }
  }

  foreach (var variableName in numericInputs) {
    double variableSigma = ds.GetDoubleValues(variableName).StandardDeviation();
    pathRows[next] = new IndexedDataRow<double>(variableName, variableName, scaledPath(next, variableSigma));
    next++;
  }

  // add to the table by total weight (larger area under the curve => more important)
  var rankedRows = pathRows.OrderByDescending(row => row.Values.Sum(point => Math.Abs(point.Item2)));
  foreach (var row in rankedRows) {
    table.Rows.Add(row);
  }

  if (lambda.Length > 2) {
    // Lower bound from the last lambda, upper bound from the second one (the first lambda is skipped).
    double lastLambda = lambda[lambda.Length - 1];
    double secondLambda = lambda[1];
    table.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lastLambda)));
    table.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(secondLambda)));
  }

  var countPoints = lambda.Zip(nonZeroPerStep, (l, count) => Tuple.Create(l, (double)count));
  table.Rows.Add(new IndexedDataRow<double>(countRowName, "The number of non-zero coefficients for each step in the path", countPoints));

  var countRow = table.Rows[countRowName];
  countRow.VisualProperties.ChartType = DataRowVisualProperties.DataRowChartType.Points;
  countRow.VisualProperties.SecondYAxis = true;

  return table;
}