Exemple #1
0
        // Shows a bar chart for three sample values labelled "1", "2" and "3",
        // then calls Hold() on the returned box.
        public void DataBarBox_ShowTest1()
        {
            string[] categories = new string[] { "1", "2", "3" };
            double[] heights    = { 100.0, 150.0, 42.0 };

            var box = DataBarBox.Show(categories, heights);

            box.Hold();
        }
        // Displays a bar chart with one bar per entry of the series and titles it.
        //   values: keys are used as the bar labels, integer values (converted to
        //           double) as the bar heights.
        //   title:  text passed to SetTitle; defaults to "Classification Frequency".
        public void GenerateTotalsGraph(Series <string, int> values, string title = "Classification Frequency")
        {
            var labels  = values.Keys;
            var heights = values.Values.Select(count => Convert.ToDouble(count)).ToArray();

            DataBarBox.Show(labels, heights).SetTitle(title);
        }
        // Displays a two-bar chart ("Ham" / "Spam") with the given totals.
        //   frame:     not read by this method.
        //   hamTotal:  height of the "Ham" bar.
        //   spamTotal: height of the "Spam" bar.
        public void GenerateTotalsGraph(Frame <int, string> frame, int hamTotal, int spamTotal)
        {
            string[] categories = { "Ham", "Spam" };
            double[] totals     = { hamTotal, spamTotal };

            DataBarBox.Show(categories, totals).SetTitle("Ham vs. Spam in Sample Set");
        }
        // Displays a two-bar chart comparing the number of ham and spam emails.
        //   hamEmailCount:  height of the "Ham" bar.
        //   spamEmailCount: height of the "Spam" bar.
        public static void HamVsSpamBarChart(int hamEmailCount, int spamEmailCount)
        {
            string[] groupNames = { "Ham", "Spam" };
            double[] groupSizes = { hamEmailCount, spamEmailCount };

            DataBarBox.Show(groupNames, groupSizes).SetTitle("Ham vs. Spam in sample set");
        }
Exemple #5
0
        // Charts the leading terms of one series next to the same terms from a
        // comparison series, both expressed as proportions of their totals.
        //   termsSeries:      per-term values; divided by totalTerms.
        //   totalTerms:       divisor for termsSeries.
        //   comparisonSeries: per-term values of the other group; divided by comparisonTotal.
        //   comparisonTotal:  divisor for comparisonSeries.
        //   graphTitle:       text passed to SetTitle.
        // The number of terms shown is _appSettings.NumberTermsToGraph, taken in the
        // order in which termsSeries yields its keys.
        public void GenerateTermProportionsGraph(Series <string, double> termsSeries, int totalTerms, Series <string, double> comparisonSeries, int comparisonTotal, string graphTitle)
        {
            var primaryShare = termsSeries / totalTerms;
            var labels       = primaryShare.Keys.Take(_appSettings.NumberTermsToGraph).ToList();
            var primaryBars  = primaryShare.Values.Take(_appSettings.NumberTermsToGraph);

            var comparisonShare = comparisonSeries / comparisonTotal;

            // First row: the primary group's proportions; second row: the
            // comparison group's proportions for the same terms.
            string[]   barLabels = labels.ToArray();
            double[][] barGroups =
            {
                primaryBars.ToArray(),
                comparisonShare.GetItems(labels).Values.ToArray(),
            };

            DataBarBox.Show(barLabels, barGroups).SetTitle(graphTitle);
        }
        // Shows a grouped bar chart of the given ham terms: the first group holds
        // the ham proportions, the second the spam proportions looked up for the
        // same terms.
        //   topHamTerms:            terms used as bar labels and as lookup keys.
        //   topHamTermsProportions: ham-side bar heights, in the same order as topHamTerms.
        //   spamTermProportions:    series from which the spam-side heights are fetched by term.
        public static void Top10HamTermsChart(
            IEnumerable <string> topHamTerms,
            IEnumerable <double> topHamTermsProportions,
            Series <string, double> spamTermProportions)
        {
            // Materialize once: the sequence is needed both for the labels and for
            // the spam lookup, and a deferred source must not be enumerated twice.
            string[] terms = topHamTerms.ToArray();

            var hamBarChart = DataBarBox.Show(
                terms,
                new double[][] {
                    topHamTermsProportions.ToArray(),
                    spamTermProportions.GetItems(terms).Values.ToArray()
                }
                );

            hamBarChart.SetTitle("Top 10 Terms in Ham Emails (blue: HAM, red: SPAM)");

            // NOTE(review): the pause presumably gives the chart window time to be
            // created before Invoke is called on it - confirm before removing.
            System.Threading.Thread.Sleep(3000);
            hamBarChart.Invoke(
                new Action(() =>
                {
                    hamBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );
        }
        // Loads train.csv, splits its rows at random into a train and a test set,
        // charts the per-label row counts of both, fits a standardizing PCA on the
        // train set's non-zero pixel columns, plots the first two components and the
        // cumulative explained variance, and exports the PCA-transformed train and
        // test sets (with their labels) as CSV files next to the input.
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Read in the Image Features dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.8\input-data";

            // Load the data into a data frame
            string dataPath = Path.Combine(dataDirPath, "train.csv");

            Console.WriteLine("Loading {0}\n\n", dataPath);
            var featuresDF = Frame.ReadCsv(
                dataPath,
                hasHeaders: true,
                inferTypes: true
                );

            Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

            double trainSetProportion = 0.7;

            // Draw the random split exactly once. A deferred Where over
            // rnd.NextDouble() would produce a different sample on every
            // enumeration, so the train and test sets would not be complementary.
            var rnd       = new Random();
            var trainIdx  = featuresDF.RowKeys.Where(x => rnd.NextDouble() <= trainSetProportion).ToArray();
            var trainKeys = new System.Collections.Generic.HashSet <int>(trainIdx);
            var testIdx   = featuresDF.RowKeys.Where(x => !trainKeys.Contains(x)).ToArray();

            var trainset = featuresDF.Rows[trainIdx];
            var testset  = featuresDF.Rows[testIdx];

            var trainLabels = trainset.GetColumn <int>("label").Values.ToArray();

            // Keep only pixel columns that are non-zero somewhere in the train set
            string[] nonZeroPixelCols = trainset.ColumnKeys.Where(x => trainset[x].Max() > 0 && !x.Equals("label")).ToArray();

            double[][] data = trainset.Columns[nonZeroPixelCols].Rows.Select(
                x => Array.ConvertAll <object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
                ).ValuesAll.ToArray();

            Console.WriteLine("* Shape: {0}, {1}\n\n", data.Length, data[0].Length);

            // Rows per label in the train set (counted via the "pixel0" column)
            var digitCount = trainset.AggregateRowsBy <string, int>(
                new string[] { "label" },
                new string[] { "pixel0" },
                x => x.ValueCount
                ).SortRows("pixel0");

            digitCount.Print();

            var barChart = DataBarBox.Show(
                digitCount.GetColumn <string>("label").Values.ToArray(),
                digitCount["pixel0"].Values.ToArray()
                ).SetTitle(
                "Train Set - Digit Count"
                );

            // Rows per label in the test set
            digitCount = testset.AggregateRowsBy <string, int>(
                new string[] { "label" },
                new string[] { "pixel0" },
                x => x.ValueCount
                ).SortRows("pixel0");

            digitCount.Print();

            barChart = DataBarBox.Show(
                digitCount.GetColumn <string>("label").Values.ToArray(),
                digitCount["pixel0"].Values.ToArray()
                ).SetTitle(
                "Test Set - Digit Count"
                );

            // Fit PCA on the train set only
            var pca = new PrincipalComponentAnalysis(
                PrincipalComponentMethod.Standardize
                );

            pca.Learn(data);

            double[][] transformed      = pca.Transform(data);
            double[][] first2Components = transformed.Select(x => x.Take(2).ToArray()).ToArray();
            ScatterplotBox.Show("Component #1 vs. Component #2", first2Components, trainLabels);

            DataSeriesBox.Show(
                pca.Components.Select((x, i) => (double)i),
                pca.Components.Select(x => x.CumulativeProportion)
                ).SetTitle("Explained Variance");
            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "explained-variance.csv"),
                pca.Components.Select((x, i) => String.Format("{0},{1:0.0000}", i, x.CumulativeProportion))
                );

            Console.WriteLine("exporting train set...");

            // The train matrix was already projected above; reuse it rather than
            // rebuilding and transforming the same rows a second time.
            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "pca-train.csv"),
                transformed.Select((x, i) => String.Format("{0},{1}", String.Join(",", x), trainset["label"].GetAt(i)))
                );

            Console.WriteLine("exporting test set...");
            var testTransformed = pca.Transform(
                testset.Columns[nonZeroPixelCols].Rows.Select(
                    x => Array.ConvertAll <object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
                    ).ValuesAll.ToArray()
                );

            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "pca-test.csv"),
                testTransformed.Select((x, i) => String.Format("{0},{1}", String.Join(",", x), testset["label"].GetAt(i)))
                );

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }
        // Loads processed-training.csv, prints and charts how many tweets fall into
        // each airline_sentiment group, then builds a word vector frame and a lemma
        // vector frame from the "tweet" column, appends a numeric "tweet_polarity"
        // column to each, and writes them to tweet-words.csv and tweet-lemma.csv.
        static void Main(string[] args)
        {
            Console.SetWindowSize(150, 80);

            // Read in the file we created in the Data Preparation step
            // TODO: change the path to point to your data directory
            string dataDirPath = @"<path-to-data-dir>";

            // Read in stopwords list that we used in Chapter #2
            ISet <string> stopWords = new HashSet <string>(
                File.ReadLines("<path-to-stopwords.txt>")
                );

            // Load the data into a data frame
            string trainDataPath = Path.Combine(dataDirPath, "processed-training.csv");

            Console.WriteLine("- Loading {0}", trainDataPath);
            var rawDF = Frame.ReadCsv(
                trainDataPath,
                hasHeaders: true,
                inferTypes: true
                );

            // Look at the sentiment distributions in our sample set
            var sampleSetDistribution = rawDF.GetColumn <string>(
                "airline_sentiment"
                ).GroupBy <string>(x => x.Value).Select(x => x.Value.KeyCount);

            sampleSetDistribution.Print();
            Console.WriteLine(String.Join(",", sampleSetDistribution.Values.ToArray()));

            // Take the labels from the grouped series itself so every bar is
            // labelled with the sentiment it actually counts; the group order
            // depends on the data and is not a fixed neutral/positive/negative.
            var barChart = DataBarBox.Show(
                sampleSetDistribution.Keys.ToArray(),
                sampleSetDistribution.Values.Select(i => (double)i).ToArray()
                );

            barChart.SetTitle("Sentiment Distribution in Sample Set");

            // Numeric encoding of the sentiment: neutral = 0, positive = 1,
            // anything else = 2. Shared by both output frames.
            var tweets        = rawDF.GetColumn <string>("tweet");
            var tweetPolarity = rawDF.GetColumn <string>("airline_sentiment").Select(
                x => x.Value == "neutral" ? 0 : x.Value == "positive" ? 1 : 2
                );

            // Look at words in pre-processed Tweets
            var tweetWordVecDF = CreateWordVec(tweets, stopWords, useLemma: false);

            tweetWordVecDF.AddColumn("tweet_polarity", tweetPolarity);
            WriteDataFrameRowByRow(tweetWordVecDF, Path.Combine(dataDirPath, "tweet-words.csv"));
            Console.WriteLine("* Tweet Word Vec DF Shape ({0}, {1})", tweetWordVecDF.RowCount, tweetWordVecDF.ColumnCount);


            // Look at lemmas in pre-processed Tweets
            var tweetLemmaVecDF = CreateWordVec(tweets, stopWords, useLemma: true);

            tweetLemmaVecDF.AddColumn("tweet_polarity", tweetPolarity);
            WriteDataFrameRowByRow(tweetLemmaVecDF, Path.Combine(dataDirPath, "tweet-lemma.csv"));
            Console.WriteLine("* Tweet Lemma Vec DF Shape ({0}, {1})", tweetLemmaVecDF.RowCount, tweetLemmaVecDF.ColumnCount);

            Console.WriteLine("Done!!!");
            Console.ReadKey();
        }
Exemple #9
0
        // Loads tweet-lemma.csv, splits it by tweet_polarity (0 = neutral,
        // 1 = positive, 2 = negative), computes per-term frequencies and proportions
        // for each group, writes the frequencies to CSV files, and shows one grouped
        // bar chart per sentiment comparing its top terms across all three groups.
        static void Main(string[] args)
        {
            Console.SetWindowSize(125, 50);
            // Read in the file we created in the Data Preparation (TwitterTokenizer) step
            // TODO: change the path to point to your data directory
            string dataDirPath = @"<path-to-data-dir>";

            // Load the twitter-lemma data into a data frame
            var tweetLemmaDF = Frame.ReadCsv(
                Path.Combine(dataDirPath, "tweet-lemma.csv"),
                hasHeaders: true,
                inferTypes: true
                );

            Console.WriteLine("* DF shape: ({0}, {1})", tweetLemmaDF.RowCount, tweetLemmaDF.ColumnCount);

            // Split once by polarity and reuse the sub-frames for both the sample
            // sizes and the term sums. Counting from the filtered frames ties each
            // size to its polarity value instead of relying on group order.
            var neutralTweets  = tweetLemmaDF.Where(x => x.Value.GetAs <int>("tweet_polarity") == 0);
            var positiveTweets = tweetLemmaDF.Where(x => x.Value.GetAs <int>("tweet_polarity") == 1);
            var negativeTweets = tweetLemmaDF.Where(x => x.Value.GetAs <int>("tweet_polarity") == 2);

            int neutralSampleSize  = neutralTweets.RowCount;
            int positiveSampleSize = positiveTweets.RowCount;
            int negativeSampleSize = negativeTweets.RowCount;

            Console.WriteLine("* sentiment distribution - neutral: {0}, positive: {1}, negative: {2}", neutralSampleSize, positiveSampleSize, negativeSampleSize);

            // Term frequencies per group, most frequent first
            var neutralTermFrequencies  = ColumnWiseSum(neutralTweets, "tweet_polarity").Sort().Reversed;
            var positiveTermFrequencies = ColumnWiseSum(positiveTweets, "tweet_polarity").Sort().Reversed;
            var negativeTermFrequencies = ColumnWiseSum(negativeTweets, "tweet_polarity").Sort().Reversed;

            // Look at Top N terms that appear in Neutral vs. Positive vs. Negative tweets
            var topN = 7;

            var neutralTermProportions  = neutralTermFrequencies / neutralSampleSize;
            var positiveTermProportions = positiveTermFrequencies / positiveSampleSize;
            var negativeTermProportions = negativeTermFrequencies / negativeSampleSize;

            var topNeutralTerms            = neutralTermProportions.Keys.Take(topN);
            var topNeutralTermsProportions = neutralTermProportions.Values.Take(topN);

            var topPositiveTerms            = positiveTermProportions.Keys.Take(topN);
            var topPositiveTermsProportions = positiveTermProportions.Values.Take(topN);

            var topNegativeTerms            = negativeTermProportions.Keys.Take(topN);
            var topNegativeTermsProportions = negativeTermProportions.Values.Take(topN);

            // Export the raw frequencies as "term,frequency" lines
            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "neutral-frequencies.csv"),
                neutralTermFrequencies.Keys.Zip(
                    neutralTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );
            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "positive-frequencies.csv"),
                positiveTermFrequencies.Keys.Zip(
                    positiveTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );
            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "negative-frequencies.csv"),
                negativeTermFrequencies.Keys.Zip(
                    negativeTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );

            // In every chart the series order is neutral, negative, positive,
            // matching the colour legend in the titles.
            var topNeutralBarChart = DataBarBox.Show(
                topNeutralTerms.ToArray(),
                new double[][] {
                    topNeutralTermsProportions.ToArray(),
                    negativeTermProportions.GetItems(topNeutralTerms).Values.ToArray(),
                    positiveTermProportions.GetItems(topNeutralTerms).Values.ToArray()
                }
                );

            topNeutralBarChart.SetTitle(
                String.Format(
                    "Top {0} Terms in Neutral Tweets (blue: neutral, red: negative, green: positive)",
                    topN
                    )
                );
            // NOTE(review): the pause presumably lets the chart window finish
            // initializing before Invoke is called on it - confirm before removing.
            System.Threading.Thread.Sleep(3000);
            topNeutralBarChart.Invoke(
                new Action(() =>
                {
                    topNeutralBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            var topPositiveBarChart = DataBarBox.Show(
                topPositiveTerms.ToArray(),
                new double[][] {
                    neutralTermProportions.GetItems(topPositiveTerms).Values.ToArray(),
                    negativeTermProportions.GetItems(topPositiveTerms).Values.ToArray(),
                    topPositiveTermsProportions.ToArray()
                }
                );

            topPositiveBarChart.SetTitle(
                String.Format(
                    "Top {0} Terms in Positive Tweets (blue: neutral, red: negative, green: positive)",
                    topN
                    )
                );
            System.Threading.Thread.Sleep(3000);
            topPositiveBarChart.Invoke(
                new Action(() =>
                {
                    topPositiveBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            var topNegativeBarChart = DataBarBox.Show(
                topNegativeTerms.ToArray(),
                new double[][] {
                    neutralTermProportions.GetItems(topNegativeTerms).Values.ToArray(),
                    topNegativeTermsProportions.ToArray(),
                    positiveTermProportions.GetItems(topNegativeTerms).Values.ToArray()
                }
                );

            topNegativeBarChart.SetTitle(
                String.Format(
                    "Top {0} Terms in Negative Tweets (blue: neutral, red: negative, green: positive)",
                    topN
                    )
                );
            System.Threading.Thread.Sleep(3000);
            topNegativeBarChart.Invoke(
                new Action(() =>
                {
                    topNegativeBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            Console.WriteLine("Done");
            Console.ReadKey();
        }
Exemple #10
0
        // Loads data.csv, drops rows with missing values, adds a per-transaction
        // Amount column (Quantity * UnitPrice), saves the cleaned frame, charts the
        // countries with the most transactions, prints quantile summaries for
        // Quantity, UnitPrice and Amount, and charts purchase vs. cancel counts.
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Read in the Online Retail dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.6\input-data";

            // Load the data into a data frame
            string dataPath = Path.Combine(dataDirPath, "data.csv");

            Console.WriteLine("Loading {0}\n\n", dataPath);
            var ecommerceDF = Frame.ReadCsv(
                dataPath,
                hasHeaders: true,
                inferTypes: true
                );

            Console.WriteLine("* Shape: {0}, {1}\n\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);

            // 1. Missing CustomerID Values
            ecommerceDF
            .Columns[new string[] { "CustomerID", "InvoiceNo", "StockCode", "Quantity", "UnitPrice", "Country" }]
            .GetRowsAt(new int[] { 1440, 1441, 1442, 1443, 1444, 1445, 1446 })
            .Print();
            Console.WriteLine("\n\n* # of values in CustomerID column: {0}", ecommerceDF["CustomerID"].ValueCount);
            // Drop missing values
            ecommerceDF = ecommerceDF
                          .Columns[new string[] { "CustomerID", "Description", "Quantity", "UnitPrice", "Country" }]
                          .DropSparseRows();
            // Per-Transaction Purchase Amount = Quantity * UnitPrice
            ecommerceDF.AddColumn("Amount", ecommerceDF["Quantity"] * ecommerceDF["UnitPrice"]);

            Console.WriteLine("\n\n* Shape (After dropping missing values): {0}, {1}\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);
            Console.WriteLine("* After dropping missing values and unnecessary columns:");
            ecommerceDF.GetRowsAt(new int[] { 0, 1, 2, 3, 4 }).Print();
            // Export Data
            ecommerceDF.SaveCsv(Path.Combine(dataDirPath, "data-clean.csv"));

            // 2. Number of transactions by country
            var numTransactionsByCountry = ecommerceDF
                                           .AggregateRowsBy <string, int>(
                new string[] { "Country" },
                new string[] { "CustomerID" },
                x => x.ValueCount
                ).SortRows("CustomerID");

            // Rows are sorted by the count column, so the top entries sit at the
            // end; walk backwards from the last row. Clamp to the number of
            // countries so a small dataset never produces a negative row index.
            int countryCount = numTransactionsByCountry.RowCount;
            int topCount     = Math.Min(5, countryCount);
            var top5         = numTransactionsByCountry.GetRowsAt(
                Enumerable.Range(1, topCount).Select(offset => countryCount - offset).ToArray()
                );

            top5.Print();

            var topTransactionByCountryBarChart = DataBarBox.Show(
                top5.GetColumn <string>("Country").Values.ToArray().Select(x => x.Equals("United Kingdom") ? "UK" : x),
                top5["CustomerID"].Values.ToArray()
                );

            topTransactionByCountryBarChart.SetTitle(
                "Top 5 Countries with the most number of transactions"
                );

            // 3. Per-Transaction Quantity Distributions
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Order Quantity Distribution-- ",
                ecommerceDF["Quantity"].ValuesAll.ToArray()
                );
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Purchase-Order Quantity Distribution-- ",
                ecommerceDF["Quantity"].Where(x => x.Value >= 0).ValuesAll.ToArray()
                );
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Cancel-Order Quantity Distribution-- ",
                ecommerceDF["Quantity"].Where(x => x.Value < 0).ValuesAll.ToArray()
                );

            // 4. Per-Transaction Unit Price Distributions
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Unit Price Distribution-- ",
                ecommerceDF["UnitPrice"].ValuesAll.ToArray()
                );

            // 5. Per-Transaction Purchase Price Distributions
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Total Amount Distribution-- ",
                ecommerceDF["Amount"].ValuesAll.ToArray()
                );
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Purchase-Order Total Amount Distribution-- ",
                ecommerceDF["Amount"].Where(x => x.Value >= 0).ValuesAll.ToArray()
                );
            PrintQuantileSummary(
                "\n\n-- Per-Transaction Cancel-Order Total Amount Distribution-- ",
                ecommerceDF["Amount"].Where(x => x.Value < 0).ValuesAll.ToArray()
                );

            // 6. # of Purchase vs. Cancelled Transactions
            // Rows with a negative Quantity are counted as cancellations.
            var purchaseVSCancelBarChart = DataBarBox.Show(
                new string[] { "Purchase", "Cancel" },
                new double[] {
                    ecommerceDF["Quantity"].Where(x => x.Value >= 0).ValueCount,
                    ecommerceDF["Quantity"].Where(x => x.Value < 0).ValueCount
                }
                );

            purchaseVSCancelBarChart.SetTitle(
                "Purchase vs. Cancel"
                );


            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }

        // Writes the heading, then the min, quartiles and max of the given values
        // to the console, each formatted with two decimals.
        private static void PrintQuantileSummary(string heading, double[] values)
        {
            Console.WriteLine(heading);
            double[] quantiles = Accord.Statistics.Measures.Quantiles(
                values,
                new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                );
            Console.WriteLine(
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
                quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
                );
        }
Exemple #11
0
        // Loads transformed.csv, builds a word vector frame from the subject lines,
        // computes per-term frequencies and proportions for ham and spam emails,
        // writes the frequencies to CSV files, and shows bar charts of the ham/spam
        // counts and of the top terms of each group, before and after removing
        // stopwords.
        static void Main(string[] args)
        {
            Console.SetWindowSize(125, 50);
            // Read in the file we created in the Data Preparation step
            // TODO: change the path to point to your data directory
            string dataDirPath = "\\\\Mac\\Home\\Documents\\c-sharp-machine-learning\\ch.2\\output";
            // Read in stopwords list
            ISet <string> stopWords = new HashSet <string>(
                File.ReadLines("\\\\Mac\\Home\\Documents\\c-sharp-machine-learning\\ch.2\\stopwords.txt")
                );

            // Load the data into a data frame and set the "emailNum" column as an index
            var rawDF = Frame.ReadCsv(
                Path.Combine(dataDirPath, "data-preparation-step\\transformed.csv"),
                hasHeaders: true,
                inferTypes: false,
                schema: "int,string,string,int"
                ).IndexRows <int>("emailNum").SortRowsByKey();

            // Look at words used in Subject lines
            var subjectWordVecDF = CreateWordVec(rawDF.GetColumn <string>("subject"));

            subjectWordVecDF.SaveCsv(Path.Combine(dataDirPath, "data-preparation-step\\subjectWordVec-alphaonly.csv"));
            Console.WriteLine("* Subject Word Vec DF Shape ({0}, {1})", subjectWordVecDF.RowCount, subjectWordVecDF.ColumnCount);

            // Get term frequencies by each group (ham vs. spam)
            var hamEmailCount  = rawDF.GetColumn <int>("is_ham").NumSum();
            var spamEmailCount = subjectWordVecDF.RowCount - hamEmailCount;

            // "is_ham" is added only to filter rows; it is not a term, so it is
            // removed again from both frequency series after summing.
            subjectWordVecDF.AddColumn("is_ham", rawDF.GetColumn <int>("is_ham"));
            var hamTermFrequencies = subjectWordVecDF.Where(
                x => x.Value.GetAs <int>("is_ham") == 1
                ).Sum().Sort().Reversed.Where(x => x.Key != "is_ham");

            var spamTermFrequencies = subjectWordVecDF.Where(
                x => x.Value.GetAs <int>("is_ham") == 0
                ).Sum().Sort().Reversed.Where(x => x.Key != "is_ham");

            // Look at Top 10 terms that appear in Ham vs. Spam emails
            var topN = 10;

            var hamTermProportions     = hamTermFrequencies / hamEmailCount;
            var topHamTerms            = hamTermProportions.Keys.Take(topN);
            var topHamTermsProportions = hamTermProportions.Values.Take(topN);

            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "ham-frequencies.csv"),
                hamTermFrequencies.Keys.Zip(
                    hamTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );

            var spamTermProportions     = spamTermFrequencies / spamEmailCount;
            var topSpamTerms            = spamTermProportions.Keys.Take(topN);
            var topSpamTermsProportions = spamTermProportions.Values.Take(topN);

            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "spam-frequencies.csv"),
                spamTermFrequencies.Keys.Zip(
                    spamTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );

            var barChart = DataBarBox.Show(
                new string[] { "Ham", "Spam" },
                new double[] {
                    hamEmailCount,
                    spamEmailCount
                }
                );

            barChart.SetTitle("Ham vs. Spam in Sample Set");

            // In every term chart the first series is ham and the second is spam,
            // matching the colour legend in the titles.
            var hamBarChart = DataBarBox.Show(
                topHamTerms.ToArray(),
                new double[][] {
                    topHamTermsProportions.ToArray(),
                    spamTermProportions.GetItems(topHamTerms).Values.ToArray()
                }
                );

            hamBarChart.SetTitle("Top 10 Terms in Ham Emails (blue: HAM, red: SPAM)");
            // NOTE(review): the pause presumably lets the chart window finish
            // initializing before Invoke is called on it - confirm before removing.
            System.Threading.Thread.Sleep(3000);
            hamBarChart.Invoke(
                new Action(() =>
                {
                    hamBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            var spamBarChart = DataBarBox.Show(
                topSpamTerms.ToArray(),
                new double[][] {
                    hamTermProportions.GetItems(topSpamTerms).Values.ToArray(),
                    topSpamTermsProportions.ToArray()
                }
                );

            spamBarChart.SetTitle("Top 10 Terms in Spam Emails (blue: HAM, red: SPAM)");
            System.Threading.Thread.Sleep(3000);
            spamBarChart.Invoke(
                new Action(() =>
                {
                    spamBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            // Look at top terms appear in Ham vs. Spam emails after filtering out stopwords
            var hamTermFrequenciesAfterStopWords = hamTermFrequencies.Where(
                x => !stopWords.Contains(x.Key)
                );
            var hamTermProportionsAfterStopWords = hamTermProportions.Where(
                x => !stopWords.Contains(x.Key)
                );
            var topHamTermsAfterStopWords            = hamTermProportionsAfterStopWords.Keys.Take(topN);
            var topHamTermsProportionsAfterStopWords = hamTermProportionsAfterStopWords.Values.Take(topN);

            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "ham-frequencies-after-stopwords.csv"),
                hamTermFrequenciesAfterStopWords.Keys.Zip(
                    hamTermFrequenciesAfterStopWords.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );

            var spamTermFrequenciesAfterStopWords = spamTermFrequencies.Where(
                x => !stopWords.Contains(x.Key)
                );
            var spamTermProportionsAfterStopWords = spamTermProportions.Where(
                x => !stopWords.Contains(x.Key)
                );
            var topSpamTermsAfterStopWords            = spamTermProportionsAfterStopWords.Keys.Take(topN);
            var topSpamTermsProportionsAfterStopWords = spamTermProportionsAfterStopWords.Values.Take(topN);

            System.IO.File.WriteAllLines(
                Path.Combine(dataDirPath, "spam-frequencies-after-stopwords.csv"),
                spamTermFrequenciesAfterStopWords.Keys.Zip(
                    spamTermFrequenciesAfterStopWords.Values, (a, b) => string.Format("{0},{1}", a, b)
                    )
                );

            hamBarChart = DataBarBox.Show(
                topHamTermsAfterStopWords.ToArray(),
                new double[][] {
                    topHamTermsProportionsAfterStopWords.ToArray(),
                    spamTermProportionsAfterStopWords.GetItems(topHamTermsAfterStopWords).Values.ToArray()
                }
                );
            hamBarChart.SetTitle("Top 10 Terms in Ham Emails - after filtering out stopwords (blue: HAM, red: SPAM)");
            System.Threading.Thread.Sleep(3000);
            hamBarChart.Invoke(
                new Action(() =>
                {
                    hamBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            spamBarChart = DataBarBox.Show(
                topSpamTermsAfterStopWords.ToArray(),
                new double[][] {
                    hamTermProportionsAfterStopWords.GetItems(topSpamTermsAfterStopWords).Values.ToArray(),
                    topSpamTermsProportionsAfterStopWords.ToArray()
                }
                );
            spamBarChart.SetTitle("Top 10 Terms in Spam Emails - after filtering out stopwords (blue: HAM, red: SPAM)");
            System.Threading.Thread.Sleep(3000);
            spamBarChart.Invoke(
                new Action(() =>
                {
                    spamBarChart.Size = new System.Drawing.Size(5000, 1500);
                })
                );

            Console.WriteLine("Data Analysis Step Done!");
            Console.ReadKey();
        }
        /// <summary>
        /// Exploratory analysis of the credit card fraud dataset. Loads
        /// <c>creditcard.csv</c> into a data frame, charts the row count per value of
        /// the <c>Class</c> column, prints quartiles and opens a 50-bin histogram for
        /// every column other than <c>Class</c> and <c>Time</c>, and finally opens three
        /// scatter plots of adjacent column pairs with the <c>Class</c> values passed
        /// as point labels.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 55);

            // Locate the Credit Card Fraud dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.10\input-data";
            string dataPath    = Path.Combine(dataDirPath, "creditcard.csv");

            // Load the data into a data frame
            Console.WriteLine("Loading {0}\n\n", dataPath);
            var df = Frame.ReadCsv(dataPath, hasHeaders: true, inferTypes: true);

            Console.WriteLine("* Shape: {0}, {1}\n\n", df.RowCount, df.ColumnCount);

            // Target variable distribution: count the V1 values inside each Class group
            string[] groupByKeys = { "Class" };
            string[] countedKeys = { "V1" };
            var targetVarCount = df
                                 .AggregateRowsBy<string, int>(groupByKeys, countedKeys, x => x.ValueCount)
                                 .SortRows("V1");

            targetVarCount.RenameColumns(new string[] { "is_fraud", "count" });
            targetVarCount.Print();

            var classNames  = targetVarCount.GetColumn<string>("is_fraud").Values.ToArray();
            var classCounts = targetVarCount["count"].Values.ToArray();
            DataBarBox.Show(classNames, classCounts).SetTitle("Counts by Target Class");

            // Feature distributions
            HistogramBox.CheckForIllegalCrossThreadCalls = false;

            const string quartileFormat =
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}";

            // Class and Time are excluded from the per-column summaries
            var summarizedColumns = df.ColumnKeys.Where(
                key => !key.Equals("Class") && !key.Equals("Time")
                );

            foreach (string col in summarizedColumns)
            {
                double[] values = df[col].DropMissing().ValuesAll.ToArray();

                // Compute Quartiles
                Console.WriteLine("\n\n-- {0} Distribution -- ", col);
                double[] quartiles = Accord.Statistics.Measures.Quantiles(
                    values,
                    new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                    );
                Console.WriteLine(
                    quartileFormat,
                    quartiles[0], quartiles[1], quartiles[2], quartiles[3], quartiles[4]
                    );

                // Visualize Distributions
                HistogramBox.Show(values, title: col).SetNumberOfBins(50);
            }

            // Target Var Distributions on 2-dimensional feature space
            double[][] data = BuildJaggedArray(
                df.ToArray2D<double>(), df.RowCount, df.ColumnCount
                );
            int[] labels = df.GetColumn<int>("Class").ValuesAll.ToArray();

            // Each plot uses two neighbouring positions of every row: (0,1), (1,2), (2,3)
            double[][] positions0And1 = data.Select(row => row.Take(2).ToArray()).ToArray();
            ScatterplotBox.Show("Feature #1 vs. Feature #2", positions0And1, labels);

            double[][] positions1And2 = data.Select(row => row.Skip(1).Take(2).ToArray()).ToArray();
            ScatterplotBox.Show("Feature #2 vs. Feature #3", positions1And2, labels);

            double[][] positions2And3 = data.Select(row => row.Skip(2).Take(2).ToArray()).ToArray();
            ScatterplotBox.Show("Feature #3 vs. Feature #4", positions2And3, labels);

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }
        /// <summary>
        /// Exploratory analysis of the cyber attack dataset. Loads the header-less
        /// <c>kddcup.data_10_percent</c> file, names its columns, derives an
        /// <c>attack_category</c> column from <c>attack_type</c>, exports the result to
        /// <c>data.csv</c>, charts the row count per category, then prints and charts
        /// per-value counts of the categorical variables and prints quartiles of the
        /// continuous variables, each for the attack rows and the normal rows.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Locate the Cyber Attack dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.9\input-data";
            string dataPath    = Path.Combine(dataDirPath, "kddcup.data_10_percent");

            // Load the data into a data frame (the file carries no header row)
            Console.WriteLine("Loading {0}\n\n", dataPath);
            var featuresDF = Frame.ReadCsv(dataPath, hasHeaders: false, inferTypes: true);

            // Column names, in file order
            string[] colnames =
            {
                "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
                "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
                "num_compromised", "root_shell", "su_attempted", "num_root",
                "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
                "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
                "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
                "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
                "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
                "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
                "dst_host_srv_rerror_rate", "attack_type"
            };
            featuresDF.RenameColumns(colnames);

            Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

            // Attack type -> attack category ("normal" is kept for now for plotting purposes)
            IDictionary<string, string> attackCategories = new Dictionary<string, string>
            {
                // dos
                { "back", "dos" }, { "land", "dos" }, { "neptune", "dos" },
                { "pod", "dos" }, { "smurf", "dos" }, { "teardrop", "dos" },
                // probe
                { "ipsweep", "probe" }, { "nmap", "probe" },
                { "portsweep", "probe" }, { "satan", "probe" },
                // r2l
                { "ftp_write", "r2l" }, { "guess_passwd", "r2l" }, { "imap", "r2l" },
                { "multihop", "r2l" }, { "phf", "r2l" }, { "spy", "r2l" },
                { "warezclient", "r2l" }, { "warezmaster", "r2l" },
                // u2r
                { "buffer_overflow", "u2r" }, { "loadmodule", "u2r" },
                { "perl", "u2r" }, { "rootkit", "u2r" },
                // normal traffic
                { "normal", "normal" }
            };

            // Dots are removed from the raw attack_type value before the lookup
            var derivedCategories = featuresDF
                                    .GetColumn<string>("attack_type")
                                    .Select(x => attackCategories[x.Value.Replace(".", "")]);
            featuresDF.AddColumn("attack_category", derivedCategories);

            // Export with Categories
            Console.WriteLine("* Exporting data...");
            featuresDF.SaveCsv(Path.Combine(dataDirPath, "data.csv"));

            // 1. Target Variable Distribution
            Console.WriteLine("\n\n-- Counts by Attack Category --\n");
            var attackCount = featuresDF
                              .AggregateRowsBy<string, int>(
                new string[] { "attack_category" },
                new string[] { "duration" },
                x => x.ValueCount
                )
                              .SortRows("duration");

            attackCount.RenameColumns(new string[] { "attack_category", "count" });
            attackCount.Print();

            var categoryNames  = attackCount.GetColumn<string>("attack_category").Values.ToArray();
            var categoryCounts = attackCount["count"].Values.ToArray();
            DataBarBox.Show(categoryNames, categoryCounts).SetTitle("Counts by Attack Category");

            // Now, split the rows into attack records and normal records
            var categoryColumn = featuresDF.GetColumn<string>("attack_category");
            var attackSubset   = featuresDF.Rows[categoryColumn.Where(x => !x.Value.Equals("normal")).Keys];
            var normalSubset   = featuresDF.Rows[categoryColumn.Where(x => x.Value.Equals("normal")).Keys];

            // 2. Categorical Variable Distribution
            string[] categoricalVars = { "protocol_type", "service", "flag", "land" };

            foreach (string variable in categoricalVars)
            {
                string[] groupKey   = { variable };
                string[] renamedTo  = { variable, "count" };

                Console.WriteLine("\n\n-- Counts by {0} --\n", variable);

                Console.WriteLine("* Attack:");
                var attackCountDF = attackSubset.AggregateRowsBy<string, int>(
                    groupKey,
                    new string[] { "duration" },
                    x => x.ValueCount
                    );
                attackCountDF.RenameColumns(renamedTo);
                attackCountDF.SortRows("count").Print();

                Console.WriteLine("* Normal:");
                var countDF = normalSubset.AggregateRowsBy<string, int>(
                    groupKey,
                    new string[] { "duration" },
                    x => x.ValueCount
                    );
                countDF.RenameColumns(renamedTo);
                countDF.SortRows("count").Print();

                // NOTE(review): the bar labels come from the normal-subset counts only and
                // the attack counts are not aligned to them by value - confirm both frames
                // list the same values in the same order.
                var barLabels    = countDF.GetColumn<string>(variable).Values.ToArray();
                var attackSeries = attackCountDF["count"].Values.ToArray();
                var normalSeries = countDF["count"].Values.ToArray();

                DataBarBox
                .Show(barLabels, new double[][] { attackSeries, normalSeries })
                .SetTitle(String.Format("Counts by {0} (0 - Attack, 1 - Normal)", variable));
            }

            // 3. Continuous Variable Distribution
            string[] continuousVars =
            {
                "duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent", "hot",
                "num_failed_logins", "num_compromised", "root_shell", "su_attempted",
                "num_root", "num_file_creations", "num_shells", "num_access_files",
                "num_outbound_cmds", "count", "srv_count", "serror_rate", "srv_serror_rate",
                "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
                "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
                "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
                "dst_host_serror_rate", "dst_host_srv_serror_rate",
                "dst_host_rerror_rate", "dst_host_srv_rerror_rate"
            };

            const string quartileFormat =
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}";

            foreach (string variable in continuousVars)
            {
                // Quartiles over the attack rows
                Console.WriteLine("\n\n-- {0} Distribution (Attack) -- ", variable);
                double[] attackValues    = attackSubset[variable].DropMissing().ValuesAll.ToArray();
                double[] attackQuartiles = Accord.Statistics.Measures.Quantiles(
                    attackValues,
                    new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                    );
                Console.WriteLine(
                    quartileFormat,
                    attackQuartiles[0], attackQuartiles[1], attackQuartiles[2], attackQuartiles[3], attackQuartiles[4]
                    );

                // Quartiles over the normal rows
                Console.WriteLine("\n\n-- {0} Distribution (Normal) -- ", variable);
                double[] normalValues    = normalSubset[variable].DropMissing().ValuesAll.ToArray();
                double[] normalQuartiles = Accord.Statistics.Measures.Quantiles(
                    normalValues,
                    new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                    );
                Console.WriteLine(
                    quartileFormat,
                    normalQuartiles[0], normalQuartiles[1], normalQuartiles[2], normalQuartiles[3], normalQuartiles[4]
                    );
            }

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }
        /// <summary>
        /// Exploratory analysis of the house price dataset. Loads <c>train.csv</c>
        /// into a data frame and opens one chart per variable of interest: bar charts
        /// for the categorical columns (<c>BldgType</c>, <c>LotConfig</c>) and the
        /// ordinal columns (<c>OverallQual</c>, <c>ExterQual</c>), and 20-bin histograms
        /// of the raw and log-transformed <c>1stFlrSF</c>, <c>GarageArea</c> and
        /// <c>SalePrice</c> columns. Each chart window is resized after a short pause.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 50);

            // Read in the House Price dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.5\input-data";

            // Load the data into a data frame
            string dataPath = Path.Combine(dataDirPath, "train.csv");

            Console.WriteLine("Loading {0}\n", dataPath);
            var houseDF = Frame.ReadCsv(
                dataPath,
                hasHeaders: true,
                inferTypes: true
                );

            // Every chart is resized to chartSize on its own UI thread (via Invoke) after
            // waiting resizeDelayMs - presumably so the window has finished opening first.
            const int resizeDelayMs = 3000;
            var chartSize = new System.Drawing.Size(1000, 700);

            // Categorical Variable #1: Building Type
            Console.WriteLine("\nCategorical Variable #1: Building Type");
            var buildingTypeDistribution = houseDF.GetColumn <string>(
                "BldgType"
                ).GroupBy <string>(x => x.Value).Select(x => (double)x.Value.KeyCount);

            buildingTypeDistribution.Print();

            var buildingTypeBarChart = DataBarBox.Show(
                buildingTypeDistribution.Keys.ToArray(),
                buildingTypeDistribution.Values.ToArray()
                );

            buildingTypeBarChart.SetTitle("Building Type Distribution (Categorical)");
            System.Threading.Thread.Sleep(resizeDelayMs);
            buildingTypeBarChart.Invoke(
                new Action(() =>
            {
                buildingTypeBarChart.Size = chartSize;
            })
                );

            // Categorical Variable #2: Lot Configuration
            // (the heading previously repeated the Building Type text by mistake)
            Console.WriteLine("\nCategorical Variable #2: Lot Configuration");
            var lotConfigDistribution = houseDF.GetColumn <string>(
                "LotConfig"
                ).GroupBy <string>(x => x.Value).Select(x => (double)x.Value.KeyCount);

            lotConfigDistribution.Print();

            var lotConfigBarChart = DataBarBox.Show(
                lotConfigDistribution.Keys.ToArray(),
                lotConfigDistribution.Values.ToArray()
                );

            lotConfigBarChart.SetTitle("Lot Configuration Distribution (Categorical)");
            System.Threading.Thread.Sleep(resizeDelayMs);
            lotConfigBarChart.Invoke(
                new Action(() =>
            {
                lotConfigBarChart.Size = chartSize;
            })
                );

            // Ordinal Categorical Variable #1: Overall material and finish of the house
            // Grouped by the integer value and listed in descending key order.
            Console.WriteLine("\nOrdinal Categorical #1: Overall material and finish of the house");
            var overallQualDistribution = houseDF.GetColumn <string>(
                "OverallQual"
                ).GroupBy <int>(
                x => Convert.ToInt32(x.Value)
                ).Select(
                x => (double)x.Value.KeyCount
                ).SortByKey().Reversed;

            overallQualDistribution.Print();

            var overallQualBarChart = DataBarBox.Show(
                overallQualDistribution.Keys.Select(x => x.ToString()),
                overallQualDistribution.Values.ToArray()
                );

            overallQualBarChart.SetTitle("Overall House Quality Distribution (Ordinal)");
            System.Threading.Thread.Sleep(resizeDelayMs);
            overallQualBarChart.Invoke(
                new Action(() =>
            {
                overallQualBarChart.Size = chartSize;
            })
                );

            // Ordinal Categorical Variable #2: Exterior Quality
            // The explicit key list fixes the order in which the grades are shown.
            Console.WriteLine("\nOrdinal Categorical #2: Exterior Quality");
            var exteriorQualDistribution = houseDF.GetColumn <string>(
                "ExterQual"
                ).GroupBy <string>(x => x.Value).Select(
                x => (double)x.Value.KeyCount
                )[new string[] { "Ex", "Gd", "TA", "Fa" }];

            exteriorQualDistribution.Print();

            var exteriorQualBarChart = DataBarBox.Show(
                exteriorQualDistribution.Keys.Select(x => x.ToString()),
                exteriorQualDistribution.Values.ToArray()
                );

            exteriorQualBarChart.SetTitle("Exterior Quality Distribution (Ordinal)");
            System.Threading.Thread.Sleep(resizeDelayMs);
            exteriorQualBarChart.Invoke(
                new Action(() =>
            {
                exteriorQualBarChart.Size = chartSize;
            })
                );

            HistogramBox.CheckForIllegalCrossThreadCalls = false;

            // All histograms read from the same sparse-row-free frame, so compute it
            // once instead of once per histogram.
            var denseHouseDF = houseDF.DropSparseRows();

            // Continuous Variable #1-1: First Floor Square Feet
            var firstFloorHistogram = HistogramBox
                                      .Show(
                denseHouseDF["1stFlrSF"].ValuesAll.ToArray(),
                title: "First Floor Square Feet (Continuous)"
                )
                                      .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            firstFloorHistogram.Invoke(
                new Action(() =>
            {
                firstFloorHistogram.Size = chartSize;
            })
                );

            // Continuous Variable #1-2: Log of First Floor Square Feet
            var logFirstFloorHistogram = HistogramBox
                                         .Show(
                denseHouseDF["1stFlrSF"].Log().ValuesAll.ToArray(),
                title: "First Floor Square Feet - Log Transformed (Continuous)"
                )
                                         .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            logFirstFloorHistogram.Invoke(
                new Action(() =>
            {
                logFirstFloorHistogram.Size = chartSize;
            })
                );

            // Continuous Variable #2-1: Size of garage in square feet
            var garageHistogram = HistogramBox
                                  .Show(
                denseHouseDF["GarageArea"].ValuesAll.ToArray(),
                title: "Size of garage in square feet (Continuous)"
                )
                                  .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            garageHistogram.Invoke(
                new Action(() =>
            {
                garageHistogram.Size = chartSize;
            })
                );

            // Continuous Variable #2-2: Log of size of garage in square feet
            // NOTE(review): the log of a zero area is -Infinity - confirm how the
            // histogram treats such values if GarageArea can be 0.
            var logGarageHistogram = HistogramBox
                                     .Show(
                denseHouseDF["GarageArea"].Log().ValuesAll.ToArray(),
                title: "Size of garage in square feet - Log Transformed (Continuous)"
                )
                                     .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            logGarageHistogram.Invoke(
                new Action(() =>
            {
                logGarageHistogram.Size = chartSize;
            })
                );

            // Target Variable: Sale Price
            var salePriceHistogram = HistogramBox
                                     .Show(
                denseHouseDF["SalePrice"].ValuesAll.ToArray(),
                title: "Sale Price (Continuous)"
                )
                                     .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            salePriceHistogram.Invoke(
                new Action(() =>
            {
                salePriceHistogram.Size = chartSize;
            })
                );

            // Target Variable: Sale Price - Log Transformed
            var logSalePriceHistogram = HistogramBox
                                        .Show(
                denseHouseDF["SalePrice"].Log().ValuesAll.ToArray(),
                title: "Sale Price - Log Transformed (Continuous)"
                )
                                        .SetNumberOfBins(20);

            System.Threading.Thread.Sleep(resizeDelayMs);
            logSalePriceHistogram.Invoke(
                new Action(() =>
            {
                logSalePriceHistogram.Size = chartSize;
            })
                );


            Console.WriteLine("\nDONE!!!");
            Console.ReadKey();
        }
        /// <summary>
        /// Exploratory analysis of the audio features dataset. Loads <c>sample.csv</c>,
        /// charts the track count per <c>genre_top</c> value, prints quartiles for the
        /// <c>mfcc</c> columns whose trailing index is at most 4, and opens one scatter
        /// plot per (attribute, genre) pair built from the columns whose name contains
        /// that attribute.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Locate the Audio Features dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.7\input-data";
            string dataPath    = Path.Combine(dataDirPath, "sample.csv");

            // Load the data into a data frame
            Console.WriteLine("Loading {0}\n\n", dataPath);
            var featuresDF = Frame.ReadCsv(dataPath, hasHeaders: true, inferTypes: true);

            Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

            // Number of tracks per genre
            var genreCount = featuresDF
                             .AggregateRowsBy<string, int>(
                new string[] { "genre_top" },
                new string[] { "track_id" },
                x => x.ValueCount
                )
                             .SortRows("track_id");

            genreCount.Print();

            // Bars are labelled with the first three characters of each genre name
            var genreAbbreviations = genreCount
                                     .GetColumn<string>("genre_top").Values.ToArray()
                                     .Select(name => name.Substring(0, 3));
            var tracksPerGenre = genreCount["track_id"].Values.ToArray();
            DataBarBox.Show(genreAbbreviations, tracksPerGenre).SetTitle("Genre Count");

            const string quantileFormat =
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}";

            foreach (string col in featuresDF.ColumnKeys)
            {
                if (!col.StartsWith("mfcc"))
                {
                    continue;
                }

                // The third dot-separated part of the column name is its numeric index
                int idx = int.Parse(col.Split('.')[2]);
                if (idx > 4)
                {
                    continue;
                }

                Console.WriteLine("\n\n-- {0} Distribution -- ", col);
                double[] quantiles = Accord.Statistics.Measures.Quantiles(
                    featuresDF[col].ValuesAll.ToArray(),
                    new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                    );
                Console.WriteLine(
                    quantileFormat,
                    quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
                    );
            }

            string[] attributes = { "kurtosis", "min", "max", "mean", "median", "skew", "std" };
            foreach (string attribute in attributes)
            {
                // Columns whose name contains the attribute text
                string[] featureColumns = featuresDF.ColumnKeys
                                          .Where(key => key.Contains(attribute))
                                          .ToArray();

                foreach (string genre in genreCount.GetColumn<string>("genre_top").Values)
                {
                    // Rows of this genre, restricted to the attribute's columns
                    var genreRowKeys = featuresDF
                                       .GetColumn<string>("genre_top")
                                       .Where(x => x.Value == genre)
                                       .Keys;
                    var genreDF = featuresDF.Rows[genreRowKeys].Columns[featureColumns];

                    var xyPairs = BuildXYPairs(
                        genreDF.Columns[featureColumns].ToArray2D<double>(),
                        genreDF.RowCount,
                        genreDF.ColumnCount
                        );
                    ScatterplotBox.Show(xyPairs).SetTitle(String.Format("{0}-{1}", genre, attribute));
                }
            }

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }
        /// <summary>
        /// Exploratory analysis of the image features dataset. Loads <c>train.csv</c>,
        /// passes rows to <c>CreateImage</c> until ten distinct labels have been seen,
        /// charts the row count per label, prints quartiles for the first 20
        /// <c>pixel</c> columns whose maximum is above zero, opens one scatter plot per
        /// label over those columns, and finally plots the 16th against the 17th
        /// selected column with the labels attached.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.SetWindowSize(100, 60);

            // Locate the Image Features dataset
            // TODO: change the path to point to your data directory
            string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.8\input-data";
            string dataPath    = Path.Combine(dataDirPath, "train.csv");

            // Load the data into a data frame
            Console.WriteLine("Loading {0}\n\n", dataPath);
            var featuresDF = Frame.ReadCsv(dataPath, hasHeaders: true, inferTypes: true);

            Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

            // Walk the rows in order, handing each one to CreateImage, and stop once
            // ten different labels have been encountered.
            ISet<string> exportedLabels = new HashSet<string>();

            for (int rowIndex = 0; rowIndex < featuresDF.RowCount; rowIndex++)
            {
                var row = featuresDF.Rows[rowIndex];
                string rowLabel = row.GetAs<string>("label");

                exportedLabels.Add(rowLabel);

                // Every value except the one at position 0 is passed on as pixel data
                int[] pixels = row.ValuesAll
                               .Select(x => (int)x)
                               .Where((x, idx) => idx > 0)
                               .ToArray();
                CreateImage(pixels, rowLabel);

                if (exportedLabels.Count >= 10)
                {
                    break;
                }
            }

            // Number of rows per label
            var digitCount = featuresDF
                             .AggregateRowsBy<string, int>(
                new string[] { "label" },
                new string[] { "pixel0" },
                x => x.ValueCount
                )
                             .SortRows("pixel0");

            digitCount.Print();

            var digitNames   = digitCount.GetColumn<string>("label").Values.ToArray();
            var rowsPerDigit = digitCount["pixel0"].Values.ToArray();
            DataBarBox.Show(digitNames, rowsPerDigit).SetTitle("Digit Count");

            const string quantileFormat =
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}";

            // Collect up to 20 pixel columns that contain at least one value above zero
            List<string> featureCols = new List<string>();

            foreach (string col in featuresDF.ColumnKeys)
            {
                if (featureCols.Count >= 20)
                {
                    break;
                }

                if (col.StartsWith("pixel") && featuresDF[col].Max() > 0)
                {
                    featureCols.Add(col);

                    Console.WriteLine("\n\n-- {0} Distribution -- ", col);
                    double[] quantiles = Accord.Statistics.Measures.Quantiles(
                        featuresDF[col].ValuesAll.ToArray(),
                        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                        );
                    Console.WriteLine(
                        quantileFormat,
                        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
                        );
                }
            }

            string[] featureColumns = featureCols.ToArray();

            // One scatter plot per label over the selected pixel columns
            foreach (string label in digitCount.GetColumn<string>("label").Values)
            {
                var labelRowKeys = featuresDF
                                   .GetColumn<string>("label")
                                   .Where(x => x.Value == label)
                                   .Keys;
                var subfeaturesDF = featuresDF.Rows[labelRowKeys].Columns[featureColumns];

                var xyPairs = BuildXYPairs(
                    subfeaturesDF.Columns[featureColumns].ToArray2D<double>(),
                    subfeaturesDF.RowCount,
                    subfeaturesDF.ColumnCount
                    );
                ScatterplotBox.Show(xyPairs).SetTitle(String.Format("Digit: {0} - 20 sample Pixels", label));
            }

            // Two of the selected columns plotted against each other
            string firstPixel  = featureColumns[15];
            string secondPixel = featureColumns[16];

            double[][] twoPixels = featuresDF
                                   .Columns[new string[] { firstPixel, secondPixel }]
                                   .Rows
                                   .Select(x => Array.ConvertAll<object, double>(
                                               x.Value.ValuesAll.ToArray(),
                                               cell => Convert.ToDouble(cell)))
                                   .ValuesAll
                                   .ToArray();

            ScatterplotBox.Show(
                String.Format("{0} vs. {1}", firstPixel, secondPixel),
                twoPixels,
                featuresDF.GetColumn<int>("label").Values.ToArray()
                );

            Console.WriteLine("\n\n\n\n\nDONE!!!");
            Console.ReadKey();
        }