/// <summary>
/// Converts every entry of <c>_rawNumerical</c> into an array of bin indices.
/// For each column, quantile thresholds are requested from
/// <c>StatsFunctions.GetQuantiles</c>; a <c>NumericalBin</c> built from them maps
/// each raw value to an index. The bin is stored in <c>_binCollection</c> and the
/// index array in <c>_integerFrame</c>, both keyed by the column name.
/// </summary>
/// <param name="maxBins">
/// Forwarded unchanged to <c>GetQuantiles</c>; presumably the upper bound on the
/// number of bins per column (confirm against <c>StatsFunctions</c>).
/// </param>
public void CreateNumericalBins(int maxBins)
{
    // Lazily create the destination frame on first use.
    if (_integerFrame == null)
    {
        _integerFrame = new Dictionary<string, int[]>();
    }

    var stats = new StatsFunctions(_blas);

    foreach (var column in _rawNumerical)
    {
        var values = column.Value;
        var cutPoints = stats.GetQuantiles(values, maxBins);

        // Columns for which no thresholds are returned are left out entirely.
        if (cutPoints != null)
        {
            var binner = new NumericalBin(cutPoints);
            var indices = new int[values.Count];

            for (int row = 0; row < indices.Length; row++)
            {
                indices[row] = binner.GetIndex(values[row]);
            }

            _binCollection.Add(column.Key, binner);
            _integerFrame.Add(column.Key, indices);
        }
    }
}
/// <summary>
/// Computes the correlation between <c>Distances</c> and
/// <c>CopheneticDistances</c> for each of the 3 x 3 entries of
/// <c>analysisResults</c>, prints every value, reports which combination has the
/// smallest and the largest correlation, and stores the entry with the largest
/// correlation in <c>bestClusterAnalysis</c>.
/// </summary>
/// <exception cref="InvalidOperationException">
/// Thrown when no correlation could be ranked (for example when every value is
/// NaN), so no minimum or maximum exists.
/// </exception>
public void GetCopheneticCorrelations()
{
    ConsoleWriter.WriteSystemMessage("Cophenetic correlations:");

    Tuple<int, int> minCopheneticCorrelationIndex = null;
    Tuple<int, int> maxCopheneticCorrelationIndex = null;
    double minCorrelation = double.MaxValue;
    double maxCorrelation = double.MinValue;

    for (int i = 0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            double correlation = StatsFunctions.Correlation(
                this.analysisResults[i, j].Distances,
                this.analysisResults[i, j].CopheneticDistances);

            // NaN compares false against everything, so a NaN correlation is
            // printed below but never becomes the minimum or the maximum.
            if (correlation < minCorrelation)
            {
                minCorrelation = correlation;
                minCopheneticCorrelationIndex = new Tuple<int, int>(i, j);
            }

            if (correlation > maxCorrelation)
            {
                maxCorrelation = correlation;
                maxCopheneticCorrelationIndex = new Tuple<int, int>(i, j);
            }

            ConsoleWriter.WriteMessage($"{this.DistanceFunctions[i]}, {this.LinkageFunctions[j]}: {correlation}");
        }

        // Blank line between the rows of the printed table.
        Console.WriteLine();
    }

    // Without this guard an all-NaN table would surface as a bare
    // NullReferenceException half-way through printing the summary.
    if (minCopheneticCorrelationIndex == null || maxCopheneticCorrelationIndex == null)
    {
        throw new InvalidOperationException(
            "No comparable cophenetic correlation was produced; cannot select a best cluster analysis.");
    }

    // NOTE(review): the loop above reads DistanceFunctions/LinkageFunctions while
    // the summary reads distanceFunctions/linkageFunctions - presumably a property
    // and its backing field; confirm they always agree.
    ConsoleWriter.WriteSystemMessage("Minimal cophenetic correlation:");
    ConsoleWriter.WriteMessage(
        distanceFunctions[minCopheneticCorrelationIndex.Item1]
        + " and "
        + linkageFunctions[minCopheneticCorrelationIndex.Item2]);

    ConsoleWriter.WriteSystemMessage("Maximal cophenetic correlation:");
    ConsoleWriter.WriteMessage(
        distanceFunctions[maxCopheneticCorrelationIndex.Item1]
        + " and "
        + linkageFunctions[maxCopheneticCorrelationIndex.Item2]);

    this.bestClusterAnalysis = this.analysisResults[
        maxCopheneticCorrelationIndex.Item1,
        maxCopheneticCorrelationIndex.Item2];
}
/// <summary>
/// Loads <c>data-clean.csv</c>, aggregates it into one row of features per
/// customer, prints quartiles and shows histograms for each feature, filters out
/// customers with negative revenue/quantity or a cancellation share of 0.5 or
/// more, adds a percentile-rank column per feature, and writes the result to
/// <c>features.csv</c> in the same directory.
/// </summary>
/// <param name="args">
/// Optional. When a first argument is present it is used as the data directory;
/// otherwise the original hard-coded directory is used.
/// </param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // Read in the Online Retail dataset.
    // The directory can be overridden from the command line instead of editing
    // the source; with no argument the original location is used.
    string dataDirPath = (args != null && args.Length > 0)
        ? args[0]
        : @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.6\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "data-clean.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var ecommerceDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);

    // 1. Net Revenue per Customer
    var revPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Amount" },
        x => x.Sum()
    );

    // 2. # of Total Transactions per Customer
    var numTransactionsPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.ValueCount
    );

    // 3. # of Cancelled Transactions per Customer (rows with a negative Quantity)
    var numCancelledPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.Select(y => y.Value >= 0 ? 0.0 : 1.0).Sum()
    );

    // 4. Average UnitPrice per Customer
    var avgUnitPricePerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "UnitPrice" },
        x => x.Sum() / x.ValueCount
    );

    // 5. Average Quantity per Customer
    var avgQuantityPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.Sum() / x.ValueCount
    );

    // Aggregate all results
    var featuresDF = Frame.CreateEmpty<int, string>();
    featuresDF.AddColumn("CustomerID", revPerCustomerDF.GetColumn<double>("CustomerID"));
    featuresDF.AddColumn("Description", ecommerceDF.GetColumn<string>("Description"));
    featuresDF.AddColumn("NetRevenue", revPerCustomerDF.GetColumn<double>("Amount"));
    featuresDF.AddColumn("NumTransactions", numTransactionsPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("NumCancelled", numCancelledPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("AvgUnitPrice", avgUnitPricePerCustomerDF.GetColumn<double>("UnitPrice"));
    featuresDF.AddColumn("AvgQuantity", avgQuantityPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("PercentageCancelled", featuresDF["NumCancelled"] / featuresDF["NumTransactions"]);

    Console.WriteLine("\n\n* Feature Set:");
    featuresDF.Print();

    // The features that are summarised, plotted and ranked below. The order
    // here is the order in which output is produced and columns are added.
    string[] featureNames =
    {
        "NetRevenue",
        "NumTransactions",
        "AvgUnitPrice",
        "AvgQuantity",
        "PercentageCancelled"
    };

    // Feature distributions before filtering
    foreach (string feature in featureNames)
    {
        PrintQuartiles(featuresDF, feature);
    }
    Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

    // 1. Drop Customers with Negative NetRevenue
    featuresDF = featuresDF.Rows[
        featuresDF["NetRevenue"].Where(x => x.Value >= 0.0).Keys
    ];
    // 2. Drop Customers with Negative AvgQuantity
    featuresDF = featuresDF.Rows[
        featuresDF["AvgQuantity"].Where(x => x.Value >= 0.0).Keys
    ];
    // 3. Drop Customers who have more cancel orders than purchase orders
    featuresDF = featuresDF.Rows[
        featuresDF["PercentageCancelled"].Where(x => x.Value < 0.5).Keys
    ];

    Console.WriteLine("\n\n\n\n* After dropping customers with potential orphan cancel orders:");
    // Feature distributions after filtering
    foreach (string feature in featureNames)
    {
        PrintQuartiles(featuresDF, feature);
    }
    Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

    // NOTE(review): presumably needed because the histogram windows are driven
    // from this console thread - confirm against the HistogramBox documentation.
    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    ShowFeatureHistograms(featuresDF, featureNames);

    // Create Percentile Features
    foreach (string feature in featureNames)
    {
        // Materialise the column once; the original rebuilt this array inside
        // the lambda, i.e. once for every row being ranked.
        // NOTE(review): assumes PercentileRank does not depend on being handed a
        // fresh array on each call - confirm against StatsFunctions.
        var observed = featuresDF[feature].Values.ToArray();

        featuresDF.AddColumn(
            feature + "Percentile",
            featuresDF[feature].Select(
                x => StatsFunctions.PercentileRank(observed, x.Value)
            )
        );
    }

    Console.WriteLine("\n\n\n* Percentile Features:");
    featuresDF.Columns[
        new string[] { "NetRevenue", "NetRevenuePercentile", "NumTransactions", "NumTransactionsPercentile" }
    ].Print();

    string[] percentileNames = new string[featureNames.Length];
    for (int i = 0; i < featureNames.Length; i++)
    {
        percentileNames[i] = featureNames[i] + "Percentile";
    }
    ShowFeatureHistograms(featuresDF, percentileNames);

    string outputPath = Path.Combine(dataDirPath, "features.csv");
    Console.WriteLine("* Exporting features data: {0}", outputPath);
    featuresDF.SaveCsv(outputPath);

    Console.WriteLine("\n\n\n\nDONE!!");
    Console.ReadKey();
}

/// <summary>
/// Shows one 50-bin histogram per requested column, titled
/// "&lt;column&gt; Distribution", using only the rows of
/// <paramref name="features"/> that remain after <c>DropSparseRows()</c>.
/// </summary>
/// <param name="features">Frame holding the columns to plot.</param>
/// <param name="columns">Column names, plotted in the given order.</param>
private static void ShowFeatureHistograms(Frame<int, string> features, string[] columns)
{
    // The frame is not modified while plotting, so the sparse-row filter is
    // computed once instead of once per histogram.
    var denseRows = features.DropSparseRows();

    foreach (string column in columns)
    {
        HistogramBox
            .Show(
                denseRows[column].ValuesAll.ToArray(),
                title: column + " Distribution"
            )
            .SetNumberOfBins(50);
    }
}