/// <summary>
/// Loads the EUR/USD daily OHLC CSV, shows line charts of the close prices and the
/// daily returns plus a histogram of the daily returns, and prints summary
/// statistics of the DailyReturn column to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-your-data-dir>";

    // Read the OHLC dataset into a data frame
    string ohlcDataPath = Path.Combine(dataDirPath, "eurusd-daily-ohlc.csv");
    Console.WriteLine("Loading {0}\n", ohlcDataPath);
    var ohlcDF = Frame.ReadCsv(ohlcDataPath, hasHeaders: true, inferTypes: true);

    // Every chart window is resized to the same dimensions once it is up.
    var chartSize = new System.Drawing.Size(700, 500);

    // Row keys as doubles: shared x-axis of both line charts.
    var xAxis = ohlcDF.RowKeys.Select(key => (double)key);

    // --- Close prices over time ---
    var closeChart = DataSeriesBox.Show(
        xAxis,
        ohlcDF.GetColumn<double>("Close").ValuesAll
    );
    // Wait before resizing so the chart window has time to come up.
    System.Threading.Thread.Sleep(3000);
    closeChart.Invoke((Action)(() => closeChart.Size = chartSize));

    // Daily returns with missing values replaced by 0.0; reused by the charts
    // below and by the quantile computation.
    var filledReturns = ohlcDF.FillMissing(0.0)["DailyReturn"];

    // --- Daily returns over time ---
    var returnChart = DataSeriesBox.Show(xAxis, filledReturns.ValuesAll);
    System.Threading.Thread.Sleep(3000);
    returnChart.Invoke((Action)(() => returnChart.Size = chartSize));

    // --- Histogram of daily returns ---
    var returnHistogram = HistogramBox
        .Show(filledReturns.ValuesAll.ToArray())
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    returnHistogram.Invoke((Action)(() => returnHistogram.Size = chartSize));

    // --- Distribution of daily returns ---
    // These statistics are taken from the unfilled column.
    var rawReturns = ohlcDF["DailyReturn"];
    double returnMax = rawReturns.Max();
    double returnMean = rawReturns.Mean();
    // Computed but not printed; the report shows Q2 from the quantiles instead.
    double returnMedian = rawReturns.Median();
    double returnMin = rawReturns.Min();
    double returnStdDev = rawReturns.StdDev();

    // The quartiles are taken from the zero-filled column.
    double[] quartileLevels = { 0.25, 0.5, 0.75 };
    double[] quantiles = Accord.Statistics.Measures.Quantiles(
        filledReturns.ValuesAll.ToArray(),
        quartileLevels
    );

    Console.WriteLine("-- DailyReturn Distribution-- ");
    Console.WriteLine(
        "Mean: \t\t\t{0:0.00}\nStdDev: \t\t{1:0.00}\n",
        returnMean,
        returnStdDev
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        returnMin,
        quantiles[0],
        quantiles[1],
        quantiles[2],
        returnMax
    );

    Console.WriteLine("\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Loads the image-features CSV, randomly splits its rows into a train and a test
/// set, charts the per-label row counts of both sets, fits a PCA on the train set
/// and exports the explained variance and the PCA-projected train/test sets as CSV.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Image Features dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.8\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "train.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Numbers written to the CSV files are formatted with the invariant culture so
    // that the decimal separator can never collide with the comma column separator.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // --- Train/test split ---
    double trainSetProportion = 0.7;
    var rnd = new Random();

    // Materialize the random sample exactly once. A lazy Where(...) query would
    // draw fresh random numbers on every enumeration, so the train rows and the
    // "everything not in train" test rows would come from different samples.
    int[] trainIdx = featuresDF.RowKeys
        .Where(key => rnd.NextDouble() <= trainSetProportion)
        .ToArray();
    // The test set is the exact complement of the train set.
    int[] testIdx = featuresDF.RowKeys.Except(trainIdx).ToArray();

    var trainset = featuresDF.Rows[trainIdx];
    var testset = featuresDF.Rows[testIdx];

    var trainLabels = trainset.GetColumn<int>("label").Values.ToArray();

    // Keep only pixel columns whose maximum over the train set is above zero.
    string[] nonZeroPixelCols = trainset.ColumnKeys
        .Where(x => !x.Equals("label") && trainset[x].Max() > 0)
        .ToArray();

    double[][] data = trainset.Columns[nonZeroPixelCols].Rows.Select(
        x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
    ).ValuesAll.ToArray();

    Console.WriteLine("* Shape: {0}, {1}\n\n", data.Length, data[0].Length);

    // --- Label distribution in the train set ---
    var digitCount = trainset.AggregateRowsBy<string, int>(
        new string[] { "label" },
        new string[] { "pixel0" },
        x => x.ValueCount
    ).SortRows("pixel0");

    digitCount.Print();

    var barChart = DataBarBox.Show(
        digitCount.GetColumn<string>("label").Values.ToArray(),
        digitCount["pixel0"].Values.ToArray()
    ).SetTitle(
        "Train Set - Digit Count"
    );

    // --- Label distribution in the test set ---
    digitCount = testset.AggregateRowsBy<string, int>(
        new string[] { "label" },
        new string[] { "pixel0" },
        x => x.ValueCount
    ).SortRows("pixel0");

    digitCount.Print();

    barChart = DataBarBox.Show(
        digitCount.GetColumn<string>("label").Values.ToArray(),
        digitCount["pixel0"].Values.ToArray()
    ).SetTitle(
        "Test Set - Digit Count"
    );

    // --- PCA fitted on the train set ---
    var pca = new PrincipalComponentAnalysis(
        PrincipalComponentMethod.Standardize
    );
    pca.Learn(data);

    double[][] transformed = pca.Transform(data);
    double[][] first2Components = transformed
        .Select(x => x.Take(2).ToArray())
        .ToArray();
    ScatterplotBox.Show("Component #1 vs. Component #2", first2Components, trainLabels);

    DataSeriesBox.Show(
        pca.Components.Select((x, i) => (double)i),
        pca.Components.Select(x => x.CumulativeProportion)
    ).SetTitle("Explained Variance");

    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "explained-variance.csv"),
        pca.Components.Select(
            (x, i) => String.Format(invariant, "{0},{1:0.0000}", i, x.CumulativeProportion)
        )
    );

    // --- Export: one line per row, projected values followed by the label ---
    Console.WriteLine("exporting train set...");
    // "transformed" already is the projection of the train rows; no need to
    // rebuild the matrix and project it a second time.
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "pca-train.csv"),
        transformed.Select((x, i) => String.Format(
            invariant,
            "{0},{1}",
            String.Join(",", x.Select(v => v.ToString(invariant))),
            trainset["label"].GetAt(i)
        ))
    );

    Console.WriteLine("exporting test set...");
    var testTransformed = pca.Transform(
        testset.Columns[nonZeroPixelCols].Rows.Select(
            x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
        ).ValuesAll.ToArray()
    );
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "pca-test.csv"),
        testTransformed.Select((x, i) => String.Format(
            invariant,
            "{0},{1}",
            String.Join(",", x.Select(v => v.ToString(invariant))),
            testset["label"].GetAt(i)
        ))
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Loads the cyber-attack CSV, encodes the target and the categorical columns,
/// exports the encoded feature set, fits a PCA on a random sample of "normal"
/// rows, plots pairs of principal components for a mixed normal/attack sample
/// and exports the explained variance and the PCA-projected sample as CSV.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Cyber Attack dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.9\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "data.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var rawDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Numbers written by hand to CSV files are formatted with the invariant culture
    // so that the decimal separator can never collide with the comma separator.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Columns that are one-hot encoded
    string[] categoricalVars = { "protocol_type", "service", "flag", "land" };

    // Integer code of each attack category (0 = normal, > 0 = attack)
    IDictionary<string, int> targetVarEncoding = new Dictionary<string, int>
    {
        { "normal", 0 },
        { "dos", 1 },
        { "probe", 2 },
        { "r2l", 3 },
        { "u2r", 4 }
    };

    // --- Build the encoded feature frame column by column ---
    var featuresDF = Frame.CreateEmpty<int, string>();

    foreach (string col in rawDF.ColumnKeys)
    {
        if (col.Equals("attack_type"))
        {
            // Not carried over into the feature set.
            continue;
        }
        else if (col.Equals("attack_category"))
        {
            featuresDF.AddColumn(
                col,
                rawDF.GetColumn<string>(col).Select(x => targetVarEncoding[x.Value])
            );
        }
        else if (categoricalVars.Contains(col))
        {
            var categoryDF = EncodeOneHot(rawDF.GetColumn<string>(col), col);

            foreach (string newCol in categoryDF.ColumnKeys)
            {
                featuresDF.AddColumn(newCol, categoryDF.GetColumn<int>(newCol));
            }
        }
        else
        {
            // Numeric column: NaN values are replaced by 0.0.
            featuresDF.AddColumn(
                col,
                rawDF[col].Select((x, i) => double.IsNaN(x.Value) ? 0.0 : x.Value)
            );
        }
    }

    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);
    Console.WriteLine("* Exporting feature set...");
    featuresDF.SaveCsv(Path.Combine(dataDirPath, "features.csv"));

    // --- Sample rows: up to 90000 normal and up to 10000 attack rows ---
    var rnd = new Random();
    int[] normalIdx = featuresDF["attack_category"]
        .Where(x => x.Value == 0)
        .Keys
        .OrderBy(x => rnd.Next())
        .Take(90000).ToArray();
    int[] attackIdx = featuresDF["attack_category"]
        .Where(x => x.Value > 0)
        .Keys
        .OrderBy(x => rnd.Next())
        .Take(10000).ToArray();
    int[] totalIdx = normalIdx.Concat(attackIdx).ToArray();

    var normalSet = featuresDF.Rows[normalIdx];

    // Feature columns that are not constant over the normal sample.
    string[] nonZeroValueCols = normalSet.ColumnKeys.Where(
        x => !x.Equals("attack_category") && normalSet[x].Max() != normalSet[x].Min()
    ).ToArray();

    double[][] normalData = BuildJaggedArray(
        normalSet.Columns[nonZeroValueCols].ToArray2D<double>(),
        normalSet.RowCount,
        nonZeroValueCols.Length
    );
    double[][] wholeData = BuildJaggedArray(
        featuresDF.Rows[totalIdx].Columns[nonZeroValueCols].ToArray2D<double>(),
        totalIdx.Length,
        nonZeroValueCols.Length
    );
    int[] labels = featuresDF
        .Rows[totalIdx]
        .GetColumn<int>("attack_category")
        .ValuesAll.ToArray();

    // --- PCA: fitted on normal rows only, applied to the mixed sample ---
    var pca = new PrincipalComponentAnalysis(
        PrincipalComponentMethod.Standardize
    );
    pca.Learn(normalData);

    double[][] transformed = pca.Transform(wholeData);

    // Scatter plots of consecutive component pairs: (#1, #2) through (#5, #6).
    for (int first = 0; first < 5; first++)
    {
        int skip = first;
        double[][] componentPair = transformed
            .Select(row => row.Skip(skip).Take(2).ToArray())
            .ToArray();

        ScatterplotBox.Show(
            String.Format("Component #{0} vs. Component #{1}", first + 1, first + 2),
            componentPair,
            labels
        );
    }

    // Cumulative explained variance, keeping only the values below 1.
    double[] explainedVariance = pca.Components
        .Select(x => x.CumulativeProportion)
        .Where(x => x < 1)
        .ToArray();

    DataSeriesBox.Show(
        explainedVariance.Select((x, i) => (double)i),
        explainedVariance
    ).SetTitle("Explained Variance");

    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "explained-variance.csv"),
        explainedVariance.Select((x, i) => String.Format(invariant, "{0},{1:0.0000}", i, x))
    );

    Console.WriteLine("* Exporting pca-transformed feature set...");
    System.IO.File.WriteAllLines(
        Path.Combine(
            dataDirPath,
            "pca-transformed-features.csv"
        ),
        transformed.Select(x => String.Join(",", x.Select(v => v.ToString(invariant))))
    );
    System.IO.File.WriteAllLines(
        Path.Combine(
            dataDirPath,
            "pca-transformed-labels.csv"
        ),
        labels.Select(x => x.ToString(invariant))
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Loads the EUR/USD daily OHLC CSV, derives moving-average, Bollinger-band and
/// lagged features from it, charts a slice of the rows, drops rows with missing
/// values and saves the resulting feature frame as "eurusd-features.csv".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 69);

    // Read in the OHLC dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-your-dir>";

    // Load the OHLC data into a data frame
    string ohlcDataPath = Path.Combine(dataDirPath, "eurusd-daily-ohlc.csv");
    Console.WriteLine("Loading {0}", ohlcDataPath);

    var ohlcDF = Frame.ReadCsv(
        ohlcDataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Window lengths of the moving averages; they yield the columns
    // "10_MA", "20_MA", "50_MA" and "200_MA".
    int[] maWindows = { 10, 20, 50, 200 };
    var chartSize = new System.Drawing.Size(900, 700);

    // 1. Moving Averages
    foreach (int days in maWindows)
    {
        ohlcDF.AddColumn(
            String.Format("{0}_MA", days),
            ohlcDF.Window(days).Select(x => x.Value["Close"].Mean())
        );
    }

    // Time-series line chart of close prices & moving averages.
    // The row filter is evaluated once and shared by every charted series.
    var maChartRows = ohlcDF.Where(x => x.Key > 4400 && x.Key < 4900);
    var maLineChart = DataSeriesBox.Show(
        maChartRows.RowKeys.Select(x => (double)x),
        maChartRows.GetColumn<double>("Close").ValuesAll,
        maChartRows.GetColumn<double>("10_MA").ValuesAll,
        maChartRows.GetColumn<double>("20_MA").ValuesAll,
        maChartRows.GetColumn<double>("50_MA").ValuesAll,
        maChartRows.GetColumn<double>("200_MA").ValuesAll
    );

    // Wait before resizing so the chart window has time to come up.
    System.Threading.Thread.Sleep(3000);
    maLineChart.Invoke(
        new Action(() =>
        {
            maLineChart.Size = chartSize;
        })
    );

    // Distance from moving averages
    foreach (int days in maWindows)
    {
        ohlcDF.AddColumn(
            String.Format("Close_minus_{0}_MA", days),
            ohlcDF["Close"] - ohlcDF[String.Format("{0}_MA", days)]
        );
    }

    // 2. Bollinger Band: 20-row moving average +/- 2 standard deviations
    ohlcDF.AddColumn("20_day_std", ohlcDF.Window(20).Select(x => x.Value["Close"].StdDev()));
    ohlcDF.AddColumn("BollingerUpperBound", ohlcDF["20_MA"] + ohlcDF["20_day_std"] * 2);
    ohlcDF.AddColumn("BollingerLowerBound", ohlcDF["20_MA"] - ohlcDF["20_day_std"] * 2);

    // Time-series line chart of close prices & bollinger bands.
    // Filtered again because the band columns were added after the first filter.
    var bbChartRows = ohlcDF.Where(x => x.Key > 4400 && x.Key < 4900);
    var bbLineChart = DataSeriesBox.Show(
        bbChartRows.RowKeys.Select(x => (double)x),
        bbChartRows.GetColumn<double>("Close").ValuesAll,
        bbChartRows.GetColumn<double>("BollingerUpperBound").ValuesAll,
        bbChartRows.GetColumn<double>("20_MA").ValuesAll,
        bbChartRows.GetColumn<double>("BollingerLowerBound").ValuesAll
    );

    System.Threading.Thread.Sleep(3000);
    bbLineChart.Invoke(
        new Action(() =>
        {
            bbLineChart.Size = chartSize;
        })
    );

    // Distance from Bollinger Bands
    ohlcDF.AddColumn("Close_minus_BollingerUpperBound", ohlcDF["Close"] - ohlcDF["BollingerUpperBound"]);
    ohlcDF.AddColumn("Close_minus_BollingerLowerBound", ohlcDF["Close"] - ohlcDF["BollingerLowerBound"]);

    // 3. Lagging Variables: "<column>_T-<lag>" for lags 1..5, added column by
    // column so the order of the columns in the saved CSV stays the same.
    // NOTE(review): "Close_minus_BollingerLowerBound" is not lagged here, which
    // matches the existing feature set - confirm whether that is intentional.
    string[] laggedColumns =
    {
        "DailyReturn",
        "Close_minus_10_MA",
        "Close_minus_20_MA",
        "Close_minus_50_MA",
        "Close_minus_200_MA",
        "Close_minus_BollingerUpperBound"
    };
    const int maxLag = 5;

    foreach (string column in laggedColumns)
    {
        for (int lag = 1; lag <= maxLag; lag++)
        {
            ohlcDF.AddColumn(
                String.Format("{0}_T-{1}", column, lag),
                ohlcDF[column].Shift(lag)
            );
        }
    }

    Console.WriteLine("Saving features DF into a CSV file...");

    Console.WriteLine("\n\nDF Shape BEFORE Dropping Missing Values: ({0}, {1})", ohlcDF.RowCount, ohlcDF.ColumnCount);
    ohlcDF = ohlcDF.DropSparseRows();
    Console.WriteLine("\nDF Shape AFTER Dropping Missing Values: ({0}, {1})\n\n", ohlcDF.RowCount, ohlcDF.ColumnCount);

    ohlcDF.SaveCsv(Path.Combine(dataDirPath, "eurusd-features.csv"));

    Console.WriteLine("\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Loads the credit-card CSV, fits a PCA on the rows whose Class is 0, projects
/// all rows, plots pairs of principal components and exports the explained
/// variance and the projected features (with their Class label) as CSV.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Credit Card Fraud dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.10\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "creditcard.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);

    var df = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );

    Console.WriteLine("* Shape: {0}, {1}\n\n", df.RowCount, df.ColumnCount);

    // Numbers written to the CSV files are formatted with the invariant culture so
    // that the decimal separator can never collide with the comma column separator.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Every column except "Time" and the "Class" label is a feature.
    string[] featureCols = df.ColumnKeys.Where(
        x => !x.Equals("Time") && !x.Equals("Class")
    ).ToArray();

    // Rows whose Class is 0: the only rows the PCA is fitted on.
    var noFraudData = df.Rows[
        df["Class"].Where(x => x.Value == 0.0).Keys
    ].Columns[featureCols];

    double[][] data = BuildJaggedArray(
        noFraudData.ToArray2D<double>(),
        noFraudData.RowCount,
        featureCols.Length
    );
    double[][] wholeData = BuildJaggedArray(
        df.Columns[featureCols].ToArray2D<double>(),
        df.RowCount,
        featureCols.Length
    );
    int[] labels = df.GetColumn<int>("Class").ValuesAll.ToArray();

    var pca = new PrincipalComponentAnalysis(
        PrincipalComponentMethod.Standardize
    );
    pca.Learn(data);

    double[][] transformed = pca.Transform(wholeData);

    // Scatter plots of consecutive component pairs: (#1, #2) through (#4, #5).
    for (int first = 0; first < 4; first++)
    {
        int skip = first;
        double[][] componentPair = transformed
            .Select(row => row.Skip(skip).Take(2).ToArray())
            .ToArray();

        ScatterplotBox.Show(
            String.Format("Component #{0} vs. Component #{1}", first + 1, first + 2),
            componentPair,
            labels
        );
    }

    DataSeriesBox.Show(
        pca.Components.Select((x, i) => (double)i),
        pca.Components.Select(x => x.CumulativeProportion)
    ).SetTitle("Explained Variance");

    // One line per component: 1-based component number, cumulative proportion.
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "explained-variance.csv"),
        pca.Components.Select(
            (x, i) => String.Format(invariant, "{0},{1:0.0000}", i + 1, x.CumulativeProportion)
        )
    );

    // One line per row: projected values followed by the Class label.
    Console.WriteLine("exporting train set...");
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "pca-features.csv"),
        transformed.Select((x, i) => String.Format(
            invariant,
            "{0},{1}",
            String.Join(",", x.Select(v => v.ToString(invariant))),
            labels[i]
        ))
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}