/// <summary>
/// Determines, for each factor variable, which of its values should be encoded as binary indicator columns.
/// </summary>
/// <param name="ds">The dataset the string values are read from.</param>
/// <param name="factorVariables">Names of the factor (string) variables to inspect.</param>
/// <param name="rows">Row indices over which the distinct values are collected.</param>
/// <returns>
/// A lazily evaluated sequence with one entry per factor variable: the variable name paired with
/// the values that are kept for it.
/// </returns>
public static IEnumerable<KeyValuePair<string, IEnumerable<string>>> GetFactorVariableValues(
    this IDataset ds, IEnumerable<string> factorVariables, IEnumerable<int> rows)
{
    return factorVariables.Select(factorName =>
    {
        var observed = ds.GetStringValues(factorName, rows).Distinct().ToArray();

        // 1 distinct value   => nothing is kept (the variable is constant)
        // 2 distinct values  => only the first one is kept (the second is implied)
        // >=3 distinct values => every value is kept
        IEnumerable<string> kept;
        if (observed.Length > 2)
        {
            kept = observed;
        }
        else
        {
            kept = observed.Take(observed.Length - 1);
        }

        return new KeyValuePair<string, IEnumerable<string>>(factorName, kept);
    });
}
        /// <summary>
        /// Builds a new dataset whose first columns are taken from <paramref name="pcs"/> (one column per entry
        /// of <c>VariableNames</c>), followed by the double variables of <paramref name="data"/> that are not
        /// listed in <c>ComponentNames</c>, then its date/time variables and finally its string variables.
        /// </summary>
        /// <param name="data">The dataset whose remaining variables are copied over.</param>
        /// <param name="pcs">Matrix indexed as [row, column]; column x supplies the values for the x-th variable name.</param>
        /// <returns>The assembled dataset.</returns>
        private IDataset CreateRevertedDataset(IDataset data, double[,] pcs)
        {
            var revertedNames = VariableNames;
            var keptDoubleNames = data.DoubleVariables
                                      .Where(name => !ComponentNames.Contains(name))
                                      .ToArray();
            var dateTimeNames = data.DateTimeVariables.ToArray();
            var stringNames = data.StringVariables.ToArray();

            // Column x of pcs becomes the value list of the x-th reverted variable.
            IEnumerable<IList> revertedColumns = revertedNames.Select(
                (_, columnIndex) => Enumerable.Range(0, pcs.GetLength(0))
                                              .Select(rowIndex => pcs[rowIndex, columnIndex])
                                              .ToList());
            IEnumerable<IList> doubleColumns = keptDoubleNames.Select(name => data.GetDoubleValues(name).ToList());
            IEnumerable<IList> dateTimeColumns = dateTimeNames.Select(name => data.GetDateTimeValues(name).ToList());
            IEnumerable<IList> stringColumns = stringNames.Select(name => data.GetStringValues(name).ToList());

            // Names and columns are concatenated in the same order so they stay aligned.
            var allNames = revertedNames
                .Concat(keptDoubleNames)
                .Concat(dateTimeNames)
                .Concat(stringNames);
            var allColumns = revertedColumns
                .Concat(doubleColumns)
                .Concat(dateTimeColumns)
                .Concat(stringColumns)
                .ToArray();

            return new Dataset(allNames, allColumns);
        }
        /// <summary>
        /// Prepares a binary data matrix from a number of factors and specified factor values
        /// </summary>
        /// <param name="dataset">A dataset that contains the variable values</param>
        /// <param name="factorVariables">An enumerable of categorical variables (factors). For each variable an enumerable of values must be specified.</param>
        /// <param name="rows">An enumerable of row indices for the dataset</param>
        /// <returns>
        /// A matrix with one row per entry of <paramref name="rows"/> and one column per specified factor value
        /// (in the order the factors and their values are given). An element is 1 when the row's value of the
        /// factor equals the column's factor value and 0 otherwise.
        /// </returns>
        /// <exception cref="NotSupportedException">At least one of the given factor variables is not a string variable of the dataset.</exception>
        /// <remarks>Factor variables (categorical variables) are split up into multiple binary variables one for each specified value.</remarks>
        public static double[,] ToArray(
            this IDataset dataset,
            IEnumerable<KeyValuePair<string, IEnumerable<string>>> factorVariables,
            IEnumerable<int> rows)
        {
            // Materialize the (possibly lazy) factor sequence once. It is needed for validation,
            // for sizing the matrix and for filling it; re-enumerating a deferred query each time
            // would repeat its work and could yield a column count that disagrees with the fill loop.
            var factorList = factorVariables.ToList();

            // check input variables. Only string variables are allowed.
            var invalidInputs = factorList
                .Select(kvp => kvp.Key)
                .Where(name => !dataset.VariableHasType<string>(name))
                .ToList();

            if (invalidInputs.Count > 0)
            {
                throw new NotSupportedException("Unsupported inputs: " + string.Join(", ", invalidInputs));
            }

            // Snapshot the values of every factor so that counting and filling see the same data.
            var factors = factorList
                .Select(kvp => new { Name = kvp.Key, Categories = kvp.Value.ToArray() })
                .ToList();

            int numBinaryColumns = factors.Sum(f => f.Categories.Length);

            List<int> rowsList = rows.ToList();

            double[,] matrix = new double[rowsList.Count, numBinaryColumns];

            int col = 0;

            foreach (var factor in factors)
            {
                if (factor.Categories.Length == 0)
                {
                    continue;
                }

                // The values of a factor do not depend on the category, so read them once per factor
                // (from the materialized row list) instead of once per category.
                var values = dataset.GetStringValues(factor.Name, rowsList).ToArray();

                foreach (var cat in factor.Categories)
                {
                    for (int row = 0; row < values.Length; row++)
                    {
                        matrix[row, col] = values[row] == cat ? 1.0 : 0.0;
                    }
                    col++;
                }
            }
            return matrix;
        }
 /// <summary>
 /// Reads the string values of <c>Variable</c> for the given rows and maps each of them
 /// through <c>GetPredictedValueForInput</c>.
 /// </summary>
 /// <param name="dataset">The dataset the input values are read from.</param>
 /// <param name="rows">Row indices for which values are produced.</param>
 /// <returns>A lazily evaluated sequence with one value per input value.</returns>
 public override IEnumerable<double> GetEstimatedClassValues(IDataset dataset, IEnumerable<int> rows)
 {
     var inputs = dataset.GetStringValues(Variable, rows);

     return from input in inputs
            select GetPredictedValueForInput(input);
 }
Example #5
0
        /// <summary>
        /// Builds a table of coefficient paths: for every coefficient column one data row containing, for each
        /// lambda, the coefficient multiplied by the standard deviation of the corresponding input column.
        /// An additional row holds the number of non-zero coefficients per lambda.
        /// </summary>
        /// <param name="coeff">Coefficient matrix indexed as [lambda index, coefficient index].</param>
        /// <param name="lambda">The lambda values; entry r belongs to row r of <paramref name="coeff"/>.</param>
        /// <param name="allowedVars">
        /// Variable names. Coefficient columns are consumed in this order: first the binary indicators of the
        /// string (factor) variables, one per retained factor value, then the double variables.
        /// </param>
        /// <param name="ds">The dataset used to determine variable types, factor values and standard deviations.</param>
        /// <param name="showOnlyRelevantBasisFuncs">Not used by this method; kept so existing callers keep compiling.</param>
        /// <returns>The populated coefficient table.</returns>
        private static IndexedDataTable<double> CoefficientGraph(double[,] coeff, double[] lambda, IEnumerable<string> allowedVars, IDataset ds, bool showOnlyRelevantBasisFuncs = true)
        {
            var coeffTable = new IndexedDataTable<double>("Coefficients", "The paths of standarized coefficient values over different lambda values");

            coeffTable.VisualProperties.YAxisMaximumAuto = false;
            coeffTable.VisualProperties.YAxisMinimumAuto = false;
            coeffTable.VisualProperties.XAxisMaximumAuto = false;
            coeffTable.VisualProperties.XAxisMinimumAuto = false;

            coeffTable.VisualProperties.XAxisLogScale    = true;
            coeffTable.VisualProperties.XAxisTitle       = "Lambda";
            coeffTable.VisualProperties.YAxisTitle       = "Coefficients";
            coeffTable.VisualProperties.SecondYAxisTitle = "Number of variables";

            var nLambdas         = lambda.Length;
            var nCoeff           = coeff.GetLength(1);
            var dataRows         = new IndexedDataRow<double>[nCoeff];
            var numNonZeroCoeffs = new int[nLambdas];

            // Materialize once: the names are filtered twice below (double and string variables).
            var allowedVarList           = allowedVars.ToList();
            var doubleVariables          = allowedVarList.Where(ds.VariableHasType<double>);
            var factorVariableNames      = allowedVarList.Where(ds.VariableHasType<string>);
            var factorVariablesAndValues = ds.GetFactorVariableValues(factorVariableNames, Enumerable.Range(0, ds.Rows)); //must consider all factor values (in train and test set)

            // Count the non-zero coefficients for every lambda (one row of coeff per lambda).
            for (int i = 0; i < coeff.GetLength(0); i++)
            {
                for (int j = 0; j < nCoeff; j++)
                {
                    if (!coeff[i, j].IsAlmost(0.0))
                    {
                        numNonZeroCoeffs[i]++;
                    }
                }
            }

            // Index of the coefficient column that is consumed next.
            int coeffIdx = 0;

            foreach (var factorVariableAndValues in factorVariablesAndValues)
            {
                var factorName   = factorVariableAndValues.Key;
                var factorValues = factorVariableAndValues.Value.ToArray();
                if (factorValues.Length == 0)
                {
                    // no indicator columns for this factor => no need to read its values
                    continue;
                }

                // The column only depends on the factor, not on the factor value:
                // read it once instead of once per factor value.
                var factorColumn = ds.GetStringValues(factorName).ToArray();

                foreach (var factorValue in factorValues)
                {
                    double sigma = factorColumn
                                   .Select(s => s == factorValue ? 1.0 : 0.0)
                                   .StandardDeviation(); // calc std dev of binary indicator
                    var path  = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, coeffIdx] * sigma)).ToArray();
                    var label = factorName + "=" + factorValue;
                    dataRows[coeffIdx] = new IndexedDataRow<double>(label, label, path);
                    coeffIdx++;
                }
            }

            foreach (var doubleVariable in doubleVariables)
            {
                double sigma = ds.GetDoubleValues(doubleVariable).StandardDeviation();
                var    path  = Enumerable.Range(0, nLambdas).Select(r => Tuple.Create(lambda[r], coeff[r, coeffIdx] * sigma)).ToArray();
                dataRows[coeffIdx] = new IndexedDataRow<double>(doubleVariable, doubleVariable, path);
                coeffIdx++;
            }

            // add to coeffTable by total weight (larger area under the curve => more important);
            foreach (var dataRow in dataRows.OrderByDescending(row => row.Values.Select(t => t.Item2).Sum(x => Math.Abs(x))))
            {
                coeffTable.Rows.Add(dataRow);
            }

            if (lambda.Length > 2)
            {
                // x-axis range in whole decades: lower bound from the last lambda, upper bound from the second one
                coeffTable.VisualProperties.XAxisMinimumFixedValue = Math.Pow(10, Math.Floor(Math.Log10(lambda.Last())));
                coeffTable.VisualProperties.XAxisMaximumFixedValue = Math.Pow(10, Math.Ceiling(Math.Log10(lambda.Skip(1).First())));
            }

            // The count of non-zero coefficients is drawn as points against the second y-axis.
            coeffTable.Rows.Add(new IndexedDataRow<double>("Number of variables", "The number of non-zero coefficients for each step in the path", lambda.Zip(numNonZeroCoeffs, (l, v) => Tuple.Create(l, (double)v))));
            coeffTable.Rows["Number of variables"].VisualProperties.ChartType   = DataRowVisualProperties.DataRowChartType.Points;
            coeffTable.Rows["Number of variables"].VisualProperties.SecondYAxis = true;

            return coeffTable;
        }