Esempio n. 1
0
        /// <summary>
        /// Converts every raw numerical column into bin indices.
        /// For each entry of <c>_rawNumerical</c>, quantile thresholds are requested from
        /// <c>StatsFunctions.GetQuantiles</c>; a <c>NumericalBin</c> built from those thresholds
        /// maps each value to an index. The bin is stored in <c>_binCollection</c> and the
        /// index array in <c>_integerFrame</c>, both under the column's key.
        /// </summary>
        /// <param name="maxBins">Value forwarded to <c>GetQuantiles</c> for every column.</param>
        public void CreateNumericalBins(int maxBins)
        {
            // The destination dictionary is created on first use.
            if (_integerFrame == null)
            {
                _integerFrame = new Dictionary<string, int[]>();
            }

            var quantileHelper = new StatsFunctions(_blas);

            foreach (var column in _rawNumerical)
            {
                var columnName = column.Key;
                var values     = column.Value;
                var cutPoints  = quantileHelper.GetQuantiles(values, maxBins);

                // Columns for which no thresholds are returned are skipped entirely.
                if (cutPoints != null)
                {
                    var binner   = new NumericalBin(cutPoints);
                    var rowCount = values.Count;
                    var indices  = new int[rowCount];

                    for (int row = 0; row < rowCount; row++)
                    {
                        indices[row] = binner.GetIndex(values[row]);
                    }

                    _binCollection.Add(columnName, binner);
                    _integerFrame.Add(columnName, indices);
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Computes, for every cell of the 3x3 <c>analysisResults</c> grid, the correlation
        /// between that result's <c>Distances</c> and <c>CopheneticDistances</c>, prints each
        /// value, reports the combinations with the smallest and largest correlation, and
        /// stores the result with the largest correlation in <c>bestClusterAnalysis</c>.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// Thrown when no minimum or maximum could be determined, e.g. because every
        /// computed correlation was NaN (NaN never compares less or greater than anything).
        /// </exception>
        public void GetCopheneticCorrelations()
        {
            // The grid is indexed as [distance function, linkage function]; the loops below
            // use the same fixed 3x3 extent as the original code.
            const int distanceFunctionCount = 3;
            const int linkageFunctionCount  = 3;

            ConsoleWriter.WriteSystemMessage("Cophenetic correlations:");

            Tuple<int, int> minCopheneticCorrelationIndex = null;
            Tuple<int, int> maxCopheneticCorrelationIndex = null;
            double minCorrelation = double.MaxValue;
            double maxCorrelation = double.MinValue;

            for (int i = 0; i < distanceFunctionCount; i++)
            {
                for (int j = 0; j < linkageFunctionCount; j++)
                {
                    double correlation =
                        StatsFunctions.Correlation(this.analysisResults[i, j].Distances, this.analysisResults[i, j].CopheneticDistances);

                    if (correlation < minCorrelation)
                    {
                        minCorrelation = correlation;
                        minCopheneticCorrelationIndex = new Tuple<int, int>(i, j);
                    }
                    if (correlation > maxCorrelation)
                    {
                        maxCorrelation = correlation;
                        maxCopheneticCorrelationIndex = new Tuple<int, int>(i, j);
                    }
                    ConsoleWriter.WriteMessage($"{this.DistanceFunctions[i]}, {this.LinkageFunctions[j]}: {correlation}");
                }
                Console.WriteLine();
            }

            // Both comparisons above are false for NaN, so the indices can still be null here.
            // Fail with a descriptive error instead of a NullReferenceException further down.
            if (minCopheneticCorrelationIndex == null || maxCopheneticCorrelationIndex == null)
            {
                throw new InvalidOperationException(
                    "Unable to determine the minimal/maximal cophenetic correlation: no comparable correlation value was computed.");
            }

            ConsoleWriter.WriteSystemMessage("Minimal cophenetic correlation:");
            ConsoleWriter.WriteMessage(distanceFunctions[minCopheneticCorrelationIndex.Item1] +
                                       " and " + linkageFunctions[minCopheneticCorrelationIndex.Item2]);

            ConsoleWriter.WriteSystemMessage("Maximal cophenetic correlation:");
            ConsoleWriter.WriteMessage(distanceFunctions[maxCopheneticCorrelationIndex.Item1] +
                                       " and " + linkageFunctions[maxCopheneticCorrelationIndex.Item2]);

            this.bestClusterAnalysis = this.analysisResults[maxCopheneticCorrelationIndex.Item1, maxCopheneticCorrelationIndex.Item2];
        }
        /// <summary>
        /// Feature-engineering entry point: loads <c>data-clean.csv</c>, aggregates
        /// per-customer features, prints their quartiles, filters out customers with
        /// negative revenue/quantity or a cancellation ratio of 0.5 or more, shows
        /// histograms, adds percentile-rank columns, and writes <c>features.csv</c>
        /// into the same directory.
        /// </summary>
        /// <param name="args">
        /// Optional. When at least one argument is supplied, the first one is used as the
        /// data directory; otherwise the built-in default path is used.
        /// </param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 50);

            // Read in the Online Retail dataset.
            // The data directory can be passed as the first command-line argument;
            // without one, the original hard-coded location is used.
            string dataDirPath = args != null && args.Length > 0
                ? args[0]
                : @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.6\input-data";

            // Load the data into a data frame
            string dataPath = Path.Combine(dataDirPath, "data-clean.csv");

            Console.WriteLine("Loading {0}\n\n", dataPath);
            var ecommerceDF = Frame.ReadCsv(
                dataPath,
                hasHeaders: true,
                inferTypes: true
                );

            Console.WriteLine("* Shape: {0}, {1}\n\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);

            // 1. Net Revenue per Customer
            var revPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
                new string[] { "CustomerID" },
                new string[] { "Amount" },
                x => x.Sum()
                );
            // 2. # of Total Transactions per Customer
            var numTransactionsPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
                new string[] { "CustomerID" },
                new string[] { "Quantity" },
                x => x.ValueCount
                );
            // 3. # of Cancelled Transactions per Customer (rows with a negative Quantity)
            var numCancelledPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
                new string[] { "CustomerID" },
                new string[] { "Quantity" },
                x => x.Select(y => y.Value >= 0 ? 0.0 : 1.0).Sum()
                );
            // 4. Average UnitPrice per Customer
            var avgUnitPricePerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
                new string[] { "CustomerID" },
                new string[] { "UnitPrice" },
                x => x.Sum() / x.ValueCount
                );
            // 5. Average Quantity per Customer
            var avgQuantityPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
                new string[] { "CustomerID" },
                new string[] { "Quantity" },
                x => x.Sum() / x.ValueCount
                );

            // Aggregate all results
            var featuresDF = Frame.CreateEmpty<int, string>();

            featuresDF.AddColumn("CustomerID", revPerCustomerDF.GetColumn<double>("CustomerID"));
            // NOTE(review): this column comes from the un-aggregated transaction frame while
            // every other column is a per-customer aggregate — confirm the row keys are
            // really meant to line up. Kept as-is to preserve the exported CSV layout.
            featuresDF.AddColumn("Description", ecommerceDF.GetColumn<string>("Description"));
            featuresDF.AddColumn("NetRevenue", revPerCustomerDF.GetColumn<double>("Amount"));
            featuresDF.AddColumn("NumTransactions", numTransactionsPerCustomerDF.GetColumn<double>("Quantity"));
            featuresDF.AddColumn("NumCancelled", numCancelledPerCustomerDF.GetColumn<double>("Quantity"));
            featuresDF.AddColumn("AvgUnitPrice", avgUnitPricePerCustomerDF.GetColumn<double>("UnitPrice"));
            featuresDF.AddColumn("AvgQuantity", avgQuantityPerCustomerDF.GetColumn<double>("Quantity"));
            featuresDF.AddColumn("PercentageCancelled", featuresDF["NumCancelled"] / featuresDF["NumTransactions"]);

            Console.WriteLine("\n\n* Feature Set:");
            featuresDF.Print();

            // The features that are summarised, plotted and ranked below, in display order.
            string[] featureNames =
            {
                "NetRevenue", "NumTransactions", "AvgUnitPrice", "AvgQuantity", "PercentageCancelled"
            };

            // Distribution of every feature before filtering
            foreach (var featureName in featureNames)
            {
                PrintQuartiles(featuresDF, featureName);
            }
            Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

            // 1. Drop Customers with Negative NetRevenue
            featuresDF = featuresDF.Rows[
                featuresDF["NetRevenue"].Where(x => x.Value >= 0.0).Keys
                         ];
            // 2. Drop Customers with Negative AvgQuantity
            featuresDF = featuresDF.Rows[
                featuresDF["AvgQuantity"].Where(x => x.Value >= 0.0).Keys
                         ];
            // 3. Drop Customers who have more cancel orders than purchase orders
            featuresDF = featuresDF.Rows[
                featuresDF["PercentageCancelled"].Where(x => x.Value < 0.5).Keys
                         ];

            Console.WriteLine("\n\n\n\n* After dropping customers with potential orphan cancel orders:");
            // Distribution of every feature after filtering
            foreach (var featureName in featureNames)
            {
                PrintQuartiles(featuresDF, featureName);
            }
            Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

            HistogramBox.CheckForIllegalCrossThreadCalls = false;

            // The frame is not modified while the histograms are created, so the
            // sparse-row filtering only has to be done once for the whole batch.
            var denseFeaturesDF = featuresDF.DropSparseRows();
            foreach (var featureName in featureNames)
            {
                HistogramBox
                .Show(
                    denseFeaturesDF[featureName].ValuesAll.ToArray(),
                    title: featureName + " Distribution"
                    )
                .SetNumberOfBins(50);
            }

            // Create Percentile Features
            foreach (var featureName in featureNames)
            {
                // Materialise the column once instead of once per row inside the lambda.
                // Assumes PercentileRank only reads the array it is given.
                var columnValues = featuresDF[featureName].Values.ToArray();

                featuresDF.AddColumn(
                    featureName + "Percentile",
                    featuresDF[featureName].Select(
                        x => StatsFunctions.PercentileRank(columnValues, x.Value)
                        )
                    );
            }
            Console.WriteLine("\n\n\n* Percentile Features:");
            featuresDF.Columns[
                new string[] { "NetRevenue", "NetRevenuePercentile", "NumTransactions", "NumTransactionsPercentile" }
            ].Print();

            // Histograms of the percentile columns; recompute the dense frame because
            // new columns were added above.
            var densePercentileDF = featuresDF.DropSparseRows();
            foreach (var featureName in featureNames)
            {
                string percentileName = featureName + "Percentile";
                HistogramBox
                .Show(
                    densePercentileDF[percentileName].ValuesAll.ToArray(),
                    title: percentileName + " Distribution"
                    )
                .SetNumberOfBins(50);
            }

            string outputPath = Path.Combine(dataDirPath, "features.csv");

            Console.WriteLine("* Exporting features data: {0}", outputPath);
            featuresDF.SaveCsv(outputPath);

            Console.WriteLine("\n\n\n\nDONE!!");
            Console.ReadKey();
        }