/// <summary>
/// Displays a histogram window for 100 samples generated from
/// <c>NormalDistribution.Standard</c>.
/// </summary>
public void HistogramBox_ShowTest1()
{
    // Draw the sample set from the standard normal distribution.
    double[] samples = NormalDistribution.Standard.Generate(100);

    // Open the histogram window, then call Hold() on it
    // (presumably blocks until the window is closed - confirm against Accord docs).
    var histogramWindow = HistogramBox.Show(samples);
    histogramWindow.Hold();
}
/// <summary>
/// Loads "eurusd-daily-ohlc.csv" into a data frame, shows line charts of the
/// "Close" and "DailyReturn" columns and a 20-bin histogram of "DailyReturn",
/// then prints summary statistics of "DailyReturn" to the console.
/// </summary>
/// <remarks>
/// Mean, standard deviation, min and max are computed from the column as loaded,
/// while the charts and the quartiles use a copy of the frame with missing values
/// replaced by 0.0.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // Read in the OHLC dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-your-data-dir>";

    // Load the OHLC data into a data frame
    string ohlcDataPath = Path.Combine(dataDirPath, "eurusd-daily-ohlc.csv");
    Console.WriteLine("Loading {0}\n", ohlcDataPath);

    var ohlcDF = Frame.ReadCsv(
        ohlcDataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Every chart window is resized to the same dimensions.
    var chartSize = new System.Drawing.Size(700, 500);

    // Row keys converted to double serve as the x-axis of both line charts.
    var rowPositions = ohlcDF.RowKeys.Select(x => (double)x);

    // Time-series line chart of close prices
    var closePriceLineChart = DataSeriesBox.Show(
        rowPositions,
        ohlcDF.GetColumn<double>("Close").ValuesAll
    );

    // Pause before resizing (presumably to let the window finish opening - confirm).
    System.Threading.Thread.Sleep(3000);
    closePriceLineChart.Invoke(
        new Action(() =>
        {
            closePriceLineChart.Size = chartSize;
        })
    );

    // Daily returns with missing values replaced by 0.0; built once and shared by
    // the line chart, the histogram and the quartile computation below.
    var filledDailyReturns = ohlcDF.FillMissing(0.0)["DailyReturn"];

    // Time-series line chart of daily returns
    var dailyReturnLineChart = DataSeriesBox.Show(
        rowPositions,
        filledDailyReturns.ValuesAll
    );

    System.Threading.Thread.Sleep(3000);
    dailyReturnLineChart.Invoke(
        new Action(() =>
        {
            dailyReturnLineChart.Size = chartSize;
        })
    );

    // Histogram of daily returns
    double[] filledDailyReturnValues = filledDailyReturns.ValuesAll.ToArray();
    var dailyReturnHistogram = HistogramBox
        .Show(filledDailyReturnValues)
        .SetNumberOfBins(20);

    System.Threading.Thread.Sleep(3000);
    dailyReturnHistogram.Invoke(
        new Action(() =>
        {
            dailyReturnHistogram.Size = chartSize;
        })
    );

    // Check the distribution of daily returns
    var dailyReturns = ohlcDF["DailyReturn"];
    double returnMax = dailyReturns.Max();
    double returnMean = dailyReturns.Mean();
    double returnMin = dailyReturns.Min();
    double returnStdDev = dailyReturns.StdDev();

    // The median is reported as the 0.5 quantile (quantiles[1]).
    double[] quantiles = Accord.Statistics.Measures.Quantiles(
        filledDailyReturnValues,
        new double[] { 0.25, 0.5, 0.75 }
    );

    Console.WriteLine("-- DailyReturn Distribution-- ");
    Console.WriteLine("Mean: \t\t\t{0:0.00}\nStdDev: \t\t{1:0.00}\n", returnMean, returnStdDev);
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        returnMin, quantiles[0], quantiles[1], quantiles[2], returnMax
    );

    Console.WriteLine("\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Exploratory look at "creditcard.csv": prints the frame shape and the row count
/// per "Class" value (also shown as a bar chart), prints quartiles and shows a
/// 50-bin histogram for every column except "Class" and "Time", and finally shows
/// three scatter plots of adjacent column pairs labelled by "Class".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 55);

    // Read in the Credit Card Fraud dataset
    // TODO: change the path to point to your data directory
    string inputDir = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.10\input-data";

    // Load the data into a data frame
    string csvPath = Path.Combine(inputDir, "creditcard.csv");
    Console.WriteLine("Loading {0}\n\n", csvPath);

    var df = Frame.ReadCsv(csvPath, hasHeaders: true, inferTypes: true);
    Console.WriteLine("* Shape: {0}, {1}\n\n", df.RowCount, df.ColumnCount);

    // Target variable distribution: count the "V1" values within each "Class" group
    var classCounts = df
        .AggregateRowsBy<string, int>(
            new string[] { "Class" },
            new string[] { "V1" },
            x => x.ValueCount
        )
        .SortRows("V1");
    classCounts.RenameColumns(new string[] { "is_fraud", "count" });
    classCounts.Print();

    var classNames = classCounts.GetColumn<string>("is_fraud").Values.ToArray();
    var classTotals = classCounts["count"].Values.ToArray();
    DataBarBox.Show(classNames, classTotals).SetTitle("Counts by Target Class");

    // Feature distributions
    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    double[] quartileProbabilities = new double[] { 0, 0.25, 0.5, 0.75, 1.0 };
    foreach (string column in df.ColumnKeys)
    {
        // The target ("Class") and "Time" columns are not charted.
        bool isFeature = column != "Class" && column != "Time";
        if (isFeature)
        {
            double[] observed = df[column].DropMissing().ValuesAll.ToArray();

            // Compute Quartiles
            Console.WriteLine("\n\n-- {0} Distribution -- ", column);
            double[] q = Accord.Statistics.Measures.Quantiles(observed, quartileProbabilities);
            Console.WriteLine(
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
                q[0], q[1], q[2], q[3], q[4]
            );

            // Visualize Distributions
            HistogramBox.Show(observed, title: column).SetNumberOfBins(50);
        }
    }

    // Target Var Distributions on 2-dimensional feature space
    double[][] rows = BuildJaggedArray(
        df.ToArray2D<double>(),
        df.RowCount,
        df.ColumnCount
    );
    int[] labels = df.GetColumn<int>("Class").ValuesAll.ToArray();

    // Columns at positions 0 and 1 of every row.
    double[][] firstPair = rows.Select(row => row.Take(2).ToArray()).ToArray();
    ScatterplotBox.Show("Feature #1 vs. Feature #2", firstPair, labels);

    // Columns at positions 1 and 2 of every row.
    double[][] secondPair = rows.Select(row => row.Skip(1).Take(2).ToArray()).ToArray();
    ScatterplotBox.Show("Feature #2 vs. Feature #3", secondPair, labels);

    // Columns at positions 2 and 3 of every row.
    double[][] thirdPair = rows.Select(row => row.Skip(2).Take(2).ToArray()).ToArray();
    ScatterplotBox.Show("Feature #3 vs. Feature #4", thirdPair, labels);

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Exploratory look at the house price "train.csv": shows bar charts of the value
/// counts of "BldgType", "LotConfig", "OverallQual" and "ExterQual", then 20-bin
/// histograms of "1stFlrSF", "GarageArea" and "SalePrice", each both as-is and
/// log-transformed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // Read in the House Price dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.5\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "train.csv");
    Console.WriteLine("Loading {0}\n", dataPath);

    var houseDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Every chart window is resized to the same dimensions. Each resize is preceded
    // by a pause (presumably to let the window finish opening - confirm).
    var chartSize = new System.Drawing.Size(1000, 700);

    // Categorical Variable #1: Building Type
    Console.WriteLine("\nCategorical Variable #1: Building Type");
    var buildingTypeDistribution = houseDF.GetColumn<string>(
        "BldgType"
    ).GroupBy<string>(x => x.Value).Select(x => (double)x.Value.KeyCount);
    buildingTypeDistribution.Print();

    var buildingTypeBarChart = DataBarBox.Show(
        buildingTypeDistribution.Keys.ToArray(),
        buildingTypeDistribution.Values.ToArray()
    );
    buildingTypeBarChart.SetTitle("Building Type Distribution (Categorical)");
    System.Threading.Thread.Sleep(3000);
    buildingTypeBarChart.Invoke(
        new Action(() =>
        {
            buildingTypeBarChart.Size = chartSize;
        })
    );

    // Categorical Variable #2: Lot Configuration
    // (the heading previously repeated the Building Type text by mistake)
    Console.WriteLine("\nCategorical Variable #2: Lot Configuration");
    var lotConfigDistribution = houseDF.GetColumn<string>(
        "LotConfig"
    ).GroupBy<string>(x => x.Value).Select(x => (double)x.Value.KeyCount);
    lotConfigDistribution.Print();

    var lotConfigBarChart = DataBarBox.Show(
        lotConfigDistribution.Keys.ToArray(),
        lotConfigDistribution.Values.ToArray()
    );
    lotConfigBarChart.SetTitle("Lot Configuration Distribution (Categorical)");
    System.Threading.Thread.Sleep(3000);
    lotConfigBarChart.Invoke(
        new Action(() =>
        {
            lotConfigBarChart.Size = chartSize;
        })
    );

    // Ordinal Categorical Variable #1: Overall material and finish of the house
    // Grouped by the integer rating and listed in reversed key order.
    Console.WriteLine("\nOrdinal Categorical #1: Overall material and finish of the house");
    var overallQualDistribution = houseDF.GetColumn<string>(
        "OverallQual"
    ).GroupBy<int>(
        x => Convert.ToInt32(x.Value)
    ).Select(
        x => (double)x.Value.KeyCount
    ).SortByKey().Reversed;
    overallQualDistribution.Print();

    var overallQualBarChart = DataBarBox.Show(
        overallQualDistribution.Keys.Select(x => x.ToString()),
        overallQualDistribution.Values.ToArray()
    );
    overallQualBarChart.SetTitle("Overall House Quality Distribution (Ordinal)");
    System.Threading.Thread.Sleep(3000);
    overallQualBarChart.Invoke(
        new Action(() =>
        {
            overallQualBarChart.Size = chartSize;
        })
    );

    // Ordinal Categorical Variable #2: Exterior Quality
    // The counts are looked up in the explicit key order "Ex", "Gd", "TA", "Fa".
    Console.WriteLine("\nOrdinal Categorical #2: Exterior Quality");
    var exteriorQualDistribution = houseDF.GetColumn<string>(
        "ExterQual"
    ).GroupBy<string>(x => x.Value).Select(
        x => (double)x.Value.KeyCount
    )[new string[] { "Ex", "Gd", "TA", "Fa" }];
    exteriorQualDistribution.Print();

    var exteriorQualBarChart = DataBarBox.Show(
        exteriorQualDistribution.Keys.Select(x => x.ToString()),
        exteriorQualDistribution.Values.ToArray()
    );
    exteriorQualBarChart.SetTitle("Exterior Quality Distribution (Ordinal)");
    System.Threading.Thread.Sleep(3000);
    exteriorQualBarChart.Invoke(
        new Action(() =>
        {
            exteriorQualBarChart.Size = chartSize;
        })
    );

    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    // All histograms below read from the same sparse-row-free copy of the frame;
    // houseDF is not modified in this method, so it is built once and reused.
    var completeRowsDF = houseDF.DropSparseRows();

    // Continuous Variable #1-1: First Floor Square Feet
    var firstFloorHistogram = HistogramBox
        .Show(
            completeRowsDF["1stFlrSF"].ValuesAll.ToArray(),
            title: "First Floor Square Feet (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    firstFloorHistogram.Invoke(
        new Action(() =>
        {
            firstFloorHistogram.Size = chartSize;
        })
    );

    // Continuous Variable #1-2: Log of First Floor Square Feet
    var logFirstFloorHistogram = HistogramBox
        .Show(
            completeRowsDF["1stFlrSF"].Log().ValuesAll.ToArray(),
            title: "First Floor Square Feet - Log Transformed (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    logFirstFloorHistogram.Invoke(
        new Action(() =>
        {
            logFirstFloorHistogram.Size = chartSize;
        })
    );

    // Continuous Variable #2-1: Size of garage in square feet
    var garageHistogram = HistogramBox
        .Show(
            completeRowsDF["GarageArea"].ValuesAll.ToArray(),
            title: "Size of garage in square feet (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    garageHistogram.Invoke(
        new Action(() =>
        {
            garageHistogram.Size = chartSize;
        })
    );

    // Continuous Variable #2-2: Log of size of garage in square feet
    // NOTE(review): if GarageArea contains zeros, their logarithm is not finite -
    // confirm how Log() and the histogram treat such rows.
    var logGarageHistogram = HistogramBox
        .Show(
            completeRowsDF["GarageArea"].Log().ValuesAll.ToArray(),
            title: "Size of garage in square feet - Log Transformed (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    logGarageHistogram.Invoke(
        new Action(() =>
        {
            logGarageHistogram.Size = chartSize;
        })
    );

    // Target Variable: Sale Price
    var salePriceHistogram = HistogramBox
        .Show(
            completeRowsDF["SalePrice"].ValuesAll.ToArray(),
            title: "Sale Price (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    salePriceHistogram.Invoke(
        new Action(() =>
        {
            salePriceHistogram.Size = chartSize;
        })
    );

    // Target Variable: Sale Price - Log Transformed
    var logSalePriceHistogram = HistogramBox
        .Show(
            completeRowsDF["SalePrice"].Log().ValuesAll.ToArray(),
            title: "Sale Price - Log Transformed (Continuous)"
        )
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    logSalePriceHistogram.Invoke(
        new Action(() =>
        {
            logSalePriceHistogram.Size = chartSize;
        })
    );

    Console.WriteLine("\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Builds per-customer features from "data-clean.csv" (net revenue, transaction
/// counts, cancelled counts, average unit price, average quantity and cancelled
/// ratio), drops customers with negative net revenue, negative average quantity or
/// a cancelled ratio of 0.5 or more, adds a percentile-rank column per feature,
/// shows histograms along the way, and writes the result to "features.csv" in the
/// same directory.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // Read in the Online Retail dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.6\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "data-clean.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var ecommerceDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);

    // 1. Net Revenue per Customer
    var revPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Amount" },
        x => x.Sum()
    );

    // 2. # of Total Transactions per Customer
    var numTransactionsPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.ValueCount
    );

    // 3. # of Cancelled Transactions per Customer (rows with a negative Quantity)
    var numCancelledPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.Select(y => y.Value >= 0 ? 0.0 : 1.0).Sum()
    );

    // 4. Average UnitPrice per Customer
    var avgUnitPricePerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "UnitPrice" },
        x => x.Sum() / x.ValueCount
    );

    // 5. Average Quantity per Customer
    var avgQuantityPerCustomerDF = ecommerceDF.AggregateRowsBy<double, double>(
        new string[] { "CustomerID" },
        new string[] { "Quantity" },
        x => x.Sum() / x.ValueCount
    );

    // Aggregate all results
    var featuresDF = Frame.CreateEmpty<int, string>();
    featuresDF.AddColumn("CustomerID", revPerCustomerDF.GetColumn<double>("CustomerID"));
    // NOTE(review): "Description" comes from the per-transaction frame rather than a
    // per-customer aggregate, so it looks like it is matched by row key only - confirm
    // this column is intended. Kept so the exported CSV keeps its columns.
    featuresDF.AddColumn("Description", ecommerceDF.GetColumn<string>("Description"));
    featuresDF.AddColumn("NetRevenue", revPerCustomerDF.GetColumn<double>("Amount"));
    featuresDF.AddColumn("NumTransactions", numTransactionsPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("NumCancelled", numCancelledPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("AvgUnitPrice", avgUnitPricePerCustomerDF.GetColumn<double>("UnitPrice"));
    featuresDF.AddColumn("AvgQuantity", avgQuantityPerCustomerDF.GetColumn<double>("Quantity"));
    featuresDF.AddColumn("PercentageCancelled", featuresDF["NumCancelled"] / featuresDF["NumTransactions"]);

    Console.WriteLine("\n\n* Feature Set:");
    featuresDF.Print();

    // Features that are summarized, charted and ranked below, in reporting order.
    string[] featureNames =
    {
        "NetRevenue",
        "NumTransactions",
        "AvgUnitPrice",
        "AvgQuantity",
        "PercentageCancelled"
    };

    // Feature distributions before filtering
    foreach (string featureName in featureNames)
    {
        PrintQuartiles(featuresDF, featureName);
    }
    Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

    // 1. Drop Customers with Negative NetRevenue
    featuresDF = featuresDF.Rows[
        featuresDF["NetRevenue"].Where(x => x.Value >= 0.0).Keys
    ];
    // 2. Drop Customers with Negative AvgQuantity
    featuresDF = featuresDF.Rows[
        featuresDF["AvgQuantity"].Where(x => x.Value >= 0.0).Keys
    ];
    // 3. Drop Customers who have more cancel orders than purchase orders
    featuresDF = featuresDF.Rows[
        featuresDF["PercentageCancelled"].Where(x => x.Value < 0.5).Keys
    ];

    Console.WriteLine("\n\n\n\n* After dropping customers with potential orphan cancel orders:");
    foreach (string featureName in featureNames)
    {
        PrintQuartiles(featuresDF, featureName);
    }
    Console.WriteLine("\n\n* Feature DF Shape: ({0}, {1})", featuresDF.RowCount, featuresDF.ColumnCount);

    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    // featuresDF is unchanged while these histograms are drawn, so the
    // sparse-row-free copy is built once instead of once per chart.
    var completeFeaturesDF = featuresDF.DropSparseRows();
    foreach (string featureName in featureNames)
    {
        HistogramBox
            .Show(
                completeFeaturesDF[featureName].ValuesAll.ToArray(),
                title: featureName + " Distribution"
            )
            .SetNumberOfBins(50);
    }

    // Create Percentile Features
    foreach (string featureName in featureNames)
    {
        // Materialize the reference values once per feature rather than once per row.
        double[] observedValues = featuresDF[featureName].Values.ToArray();

        featuresDF.AddColumn(
            featureName + "Percentile",
            featuresDF[featureName].Select(
                x => StatsFunctions.PercentileRank(observedValues, x.Value)
            )
        );
    }

    Console.WriteLine("\n\n\n* Percentile Features:");
    featuresDF.Columns[
        new string[] { "NetRevenue", "NetRevenuePercentile", "NumTransactions", "NumTransactionsPercentile" }
    ].Print();

    // Recomputed because the percentile columns were added after the first copy.
    var completeWithPercentilesDF = featuresDF.DropSparseRows();
    foreach (string featureName in featureNames)
    {
        string percentileName = featureName + "Percentile";
        HistogramBox
            .Show(
                completeWithPercentilesDF[percentileName].ValuesAll.ToArray(),
                title: percentileName + " Distribution"
            )
            .SetNumberOfBins(50);
    }

    string outputPath = Path.Combine(dataDirPath, "features.csv");
    Console.WriteLine("* Exporting features data: {0}", outputPath);
    featuresDF.SaveCsv(outputPath);

    Console.WriteLine("\n\n\n\nDONE!!");
    Console.ReadKey();
}
/// <summary>
/// Distance-based anomaly detection on "pca-transformed-features.csv": computes
/// distances for the rows labelled 0 ("normal") over the first 27 components,
/// discards the largest 1% of those distances, and then, for target false alarm
/// rates of 5% through 10%, flags rows with a label above 0 whose distance exceeds
/// the matching quantile of the normal distances and passes the result to
/// <c>EvaluateResults</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Cyber Attack dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.9\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "pca-transformed-features.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: false,
        inferTypes: true
    );
    // The file has no header row, so name the columns component-1, component-2, ...
    featuresDF.RenameColumns(
        featuresDF.ColumnKeys.Select((x, i) => String.Format("component-{0}", i + 1))
    );

    // One integer label per line; added as the "attack_category" column.
    int[] labels = File.ReadLines(
        Path.Combine(dataDirPath, "pca-transformed-labels.csv")
    ).Select(x => int.Parse(x)).ToArray();
    featuresDF.AddColumn("attack_category", labels);

    Console.WriteLine("* Shape: ({0}, {1})\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    var count = featuresDF.AggregateRowsBy<string, int>(
        new string[] { "attack_category" },
        new string[] { "component-1" },
        x => x.ValueCount
    ).SortRows("component-1");
    count.RenameColumns(new string[] { "attack_category", "count" });
    count.Print();

    // First 13 components explain about 50% of the variance
    // First 19 components explain about 60% of the variance
    // First 27 components explain about 70% of the variance
    // First 34 components explain about 80% of the variance
    int numComponents = 27;
    string[] cols = featuresDF.ColumnKeys.Where((x, i) => i < numComponents).ToArray();

    // First, compute distances from the center/mean among normal events
    var normalDF = featuresDF.Rows[
        featuresDF["attack_category"].Where(x => x.Value == 0).Keys
    ].Columns[cols];

    double[][] normalData = BuildJaggedArray(
        normalDF.ToArray2D<double>(), normalDF.RowCount, cols.Length
    );
    double[] normalVariances = ComputeVariances(normalData);
    double[] rawDistances = ComputeDistances(normalData, normalVariances);

    // Filter out extreme values: keep the positions of the smallest 99% of the
    // distances. A hash set gives constant-time membership tests, so the filter
    // below is linear instead of scanning an index array for every element.
    var keptPositions = new System.Collections.Generic.HashSet<int>(
        Matrix.ArgSort(rawDistances)
            .Where((x, i) => i < rawDistances.Length * 0.99)
    );
    double[] distances = rawDistances.Where((x, i) => keptPositions.Contains(i)).ToArray();

    double meanDistance = distances.Average();
    // Population standard deviation (divides by the number of values).
    double stdDistance = Math.Sqrt(
        distances
            .Select(x => Math.Pow(x - meanDistance, 2))
            .Sum() / distances.Length
    );

    Console.WriteLine(
        "\n\n* Normal - mean: {0:0.0000}, std: {1:0.0000}",
        meanDistance, stdDistance
    );

    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    HistogramBox.Show(
        distances,
        title: "Distances"
    )
    .SetNumberOfBins(50);

    // Detection: rows whose label is above 0, selected once and reused for both
    // the component values and the labels.
    var attackRowsDF = featuresDF.Rows[
        featuresDF["attack_category"].Where(x => x.Value > 0).Keys
    ];
    var attackDF = attackRowsDF.Columns[cols];

    double[][] attackData = BuildJaggedArray(
        attackDF.ToArray2D<double>(), attackDF.RowCount, cols.Length
    );
    // Distances of attack rows are measured with the variances of the normal rows.
    double[] attackDistances = ComputeDistances(attackData, normalVariances);
    int[] attackLabels = attackRowsDF.GetColumn<int>("attack_category").ValuesAll.ToArray();

    // 5-10% false alarm rate
    for (int i = 4; i < 10; i++)
    {
        double targetFalseAlarmRate = 0.01 * (i + 1);
        // Threshold is the (1 - rate) quantile of the filtered normal distances.
        double threshold = Accord.Statistics.Measures.Quantile(
            distances,
            1 - targetFalseAlarmRate
        );
        Console.WriteLine(threshold);

        int[] detected = attackDistances.Select(x => x > threshold ? 1 : 0).ToArray();

        EvaluateResults(attackLabels, detected, targetFalseAlarmRate);
    }

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}