public void DataBarBox_ShowTest1()
{
    // Smoke test: render a bar chart with three labeled values and keep
    // the window open (Hold) until the user closes it.
    double[] values = { 100.0, 150.0, 42.0 };
    string[] categories = { "1", "2", "3" };

    DataBarBox
        .Show(categories, values)
        .Hold();
}
/// <summary>
/// Renders a bar chart of classification counts keyed by label.
/// </summary>
/// <param name="values">Series mapping each class label to its count.</param>
/// <param name="title">Chart title; defaults to "Classification Frequency".</param>
public void GenerateTotalsGraph(Series<string, int> values, string title = "Classification Frequency")
{
    // Counts are widened to double because the chart expects floating-point heights.
    double[] heights = values.Values.Select(Convert.ToDouble).ToArray();

    var barChart = DataBarBox.Show(values.Keys, heights);
    barChart.SetTitle(title);
}
/// <summary>
/// Renders a bar chart comparing ham vs. spam counts in the sample set.
/// Note: the <paramref name="frame"/> argument is not read by this method;
/// it is kept only for signature compatibility with existing callers.
/// </summary>
/// <param name="frame">Unused data frame (see note above).</param>
/// <param name="hamTotal">Number of ham records.</param>
/// <param name="spamTotal">Number of spam records.</param>
public void GenerateTotalsGraph(Frame<int, string> frame, int hamTotal, int spamTotal)
{
    string[] categories = { "Ham", "Spam" };
    double[] counts = { hamTotal, spamTotal };

    var barChart = DataBarBox.Show(categories, counts);
    barChart.SetTitle("Ham vs. Spam in Sample Set");
}
/// <summary>
/// Renders a two-bar chart of ham vs. spam email counts.
/// </summary>
/// <param name="hamEmailCount">Number of ham emails.</param>
/// <param name="spamEmailCount">Number of spam emails.</param>
public static void HamVsSpamBarChart(int hamEmailCount, int spamEmailCount)
{
    string[] categories = { "Ham", "Spam" };
    double[] counts = { hamEmailCount, spamEmailCount };

    var barChart = DataBarBox.Show(categories, counts);
    barChart.SetTitle("Ham vs. Spam in sample set");
}
/// <summary>
/// Plots the top-N term proportions of one group side by side with the
/// proportions of the same terms in a comparison group.
/// </summary>
/// <param name="termsSeries">Term counts for the primary group.</param>
/// <param name="totalTerms">Divisor used to turn primary counts into proportions.</param>
/// <param name="comparisonSeries">Term counts for the comparison group.</param>
/// <param name="comparisonTotal">Divisor for the comparison group.</param>
/// <param name="graphTitle">Title shown on the chart.</param>
public void GenerateTermProportionsGraph(Series<string, double> termsSeries, int totalTerms, Series<string, double> comparisonSeries, int comparisonTotal, string graphTitle)
{
    int topCount = _appSettings.NumberTermsToGraph;

    // Normalize raw counts into proportions for both groups.
    var primaryProportions = termsSeries / totalTerms;
    var comparisonProportions = comparisonSeries / comparisonTotal;

    // Assumes the series is ordered most-frequent first, so the first N
    // keys are the top terms of the primary group.
    var leadingTerms = primaryProportions.Keys.Take(topCount).ToList();
    var leadingProportions = primaryProportions.Values.Take(topCount);

    // Series 0: primary group; series 1: the same terms looked up in the
    // comparison group.
    var barChart = DataBarBox.Show(
        leadingTerms.ToArray(),
        new double[][]
        {
            leadingProportions.ToArray(),
            comparisonProportions.GetItems(leadingTerms).Values.ToArray(),
        });
    barChart.SetTitle(graphTitle);
}
/// <summary>
/// Charts the top ham terms against the proportions of those same terms in
/// spam emails, then enlarges the chart window after a short render delay.
/// </summary>
/// <param name="topHamTerms">Top terms found in ham emails.</param>
/// <param name="topHamTermsProportions">Proportions matching <paramref name="topHamTerms"/>.</param>
/// <param name="spamTermProportions">All spam term proportions, keyed by term.</param>
public static void Top10HamTermsChart(
    IEnumerable<string> topHamTerms,
    IEnumerable<double> topHamTermsProportions,
    Series<string, double> spamTermProportions)
{
    // Series 0: ham proportions; series 1: the same terms in spam.
    double[][] chartSeries =
    {
        topHamTermsProportions.ToArray(),
        spamTermProportions.GetItems(topHamTerms).Values.ToArray()
    };

    var hamBarChart = DataBarBox.Show(topHamTerms.ToArray(), chartSeries);
    hamBarChart.SetTitle("Top 10 Terms in Ham Emails (blue: HAM, red: SPAM)");

    // Give the chart a moment to render, then resize it on its own UI thread.
    System.Threading.Thread.Sleep(3000);
    hamBarChart.Invoke(
        new Action(() =>
        {
            hamBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );
}
// Entry point for the digit-recognition (ch.8) data-exploration step:
// loads train.csv, makes a random ~70/30 train/test split, charts digit
// counts, fits a standardized PCA, and exports the transformed sets.
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Image Features dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.8\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "train.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Random train/test split on row keys.
    // NOTE(review): "trainSetProportiona" looks like a typo for "trainSetProportion".
    double trainSetProportiona = 0.7;

    var rnd = new Random();
    // NOTE(review): this Where is lazy and calls rnd.NextDouble() on every
    // enumeration, so trainIdx can yield a DIFFERENT subset each time it is
    // enumerated (it is enumerated by testIdx's Contains below and again by
    // Rows[trainIdx]); materializing with ToList() would pin the split.
    var trainIdx = featuresDF.RowKeys.Where((x, i) => rnd.NextDouble() <= trainSetProportiona);
    // NOTE(review): Contains(i) compares the positional index against the
    // key sequence; this only lines up when row keys are the default 0..n-1.
    var testIdx = featuresDF.RowKeys.Where((x, i) => !trainIdx.Contains(i));
    var trainset = featuresDF.Rows[trainIdx];
    var testset = featuresDF.Rows[testIdx];

    var trainLabels = trainset.GetColumn<int>("label").Values.ToArray();

    // Keep only pixel columns that are not all-zero in the train split.
    string[] nonZeroPixelCols = trainset.ColumnKeys.Where(
        x => trainset[x].Max() > 0 && !x.Equals("label")
    ).ToArray();

    // Convert the selected columns into a jagged double[][] matrix for PCA.
    double[][] data = trainset.Columns[nonZeroPixelCols].Rows.Select(
        x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
    ).ValuesAll.ToArray();
    Console.WriteLine("* Shape: {0}, {1}\n\n", data.Length, data[0].Length);

    // Row count per digit label ("pixel0" is just a column to count over).
    var digitCount = trainset.AggregateRowsBy<string, int>(
        new string[] { "label" },
        new string[] { "pixel0" },
        x => x.ValueCount
    ).SortRows("pixel0");
    digitCount.Print();

    var barChart = DataBarBox.Show(
        digitCount.GetColumn<string>("label").Values.ToArray(),
        digitCount["pixel0"].Values.ToArray()
    ).SetTitle(
        "Train Set - Digit Count"
    );

    // Same digit-count chart for the test split.
    digitCount = testset.AggregateRowsBy<string, int>(
        new string[] { "label" },
        new string[] { "pixel0" },
        x => x.ValueCount
    ).SortRows("pixel0");
    digitCount.Print();

    barChart = DataBarBox.Show(
        digitCount.GetColumn<string>("label").Values.ToArray(),
        digitCount["pixel0"].Values.ToArray()
    ).SetTitle(
        "Test Set - Digit Count"
    );

    // PCA with per-column standardization.
    var pca = new PrincipalComponentAnalysis(
        PrincipalComponentMethod.Standardize
    );
    pca.Learn(data);

    double[][] transformed = pca.Transform(data);
    // First two principal components for a 2-D scatter plot colored by label.
    double[][] first2Components = transformed.Select(x => x.Where((y, i) => i < 2).ToArray()).ToArray();
    ScatterplotBox.Show("Component #1 vs. Component #2", first2Components, trainLabels);

    // Cumulative explained-variance curve over component index.
    DataSeriesBox.Show(
        pca.Components.Select((x, i) => (double)i),
        pca.Components.Select(x => x.CumulativeProportion)
    ).SetTitle("Explained Variance");

    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "explained-variance.csv"),
        pca.Components.Select((x, i) => String.Format("{0},{1:0.0000}", i, x.CumulativeProportion))
    );

    // Export PCA-transformed train rows with their labels appended.
    Console.WriteLine("exporting train set...");
    var trainTransformed = pca.Transform(
        trainset.Columns[nonZeroPixelCols].Rows.Select(
            x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
        ).ValuesAll.ToArray()
    );
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "pca-train.csv"),
        trainTransformed.Select((x, i) => String.Format("{0},{1}", String.Join(",", x), trainset["label"].GetAt(i)))
    );

    // Export PCA-transformed test rows with their labels appended.
    Console.WriteLine("exporting test set...");
    var testTransformed = pca.Transform(
        testset.Columns[nonZeroPixelCols].Rows.Select(
            x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
        ).ValuesAll.ToArray()
    );
    System.IO.File.WriteAllLines(
        Path.Combine(dataDirPath, "pca-test.csv"),
        testTransformed.Select((x, i) => String.Format("{0},{1}", String.Join(",", x), testset["label"].GetAt(i)))
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
// Entry point for the Twitter sentiment data-preparation step: loads the
// processed tweets, charts the sentiment distribution, and exports word-
// and lemma-based term matrices with a numeric polarity column.
static void Main(string[] args)
{
    Console.SetWindowSize(150, 80);

    // Read in the file we created in the Data Preparation step
    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-data-dir>";

    // Read in stopwords list that we used in Chapter #2
    ISet<string> stopWords = new HashSet<string>(
        File.ReadLines("<path-to-stopwords.txt>")
    );

    // Load the data into a data frame
    string trainDataPath = Path.Combine(dataDirPath, "processed-training.csv");
    Console.WriteLine("- Loading {0}", trainDataPath);
    var rawDF = Frame.ReadCsv(
        trainDataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Look at the sentiment distributions in our sample set
    var sampleSetDistribution = rawDF.GetColumn<string>(
        "airline_sentiment"
    ).GroupBy<string>(x => x.Value).Select(x => x.Value.KeyCount);
    sampleSetDistribution.Print();
    Console.WriteLine(String.Join(",", sampleSetDistribution.Values.ToArray()));

    // NOTE(review): the hard-coded label order assumes the grouped counts
    // come back as neutral, positive, negative — verify against the
    // distribution printed above.
    var barChart = DataBarBox.Show(
        new string[] { "neutral", "positive", "negative" },
        sampleSetDistribution.Values.Select(i => (double)i).ToArray()
    );
    barChart.SetTitle("Sentiment Distribution in Sample Set");

    // Look at words in pre-processed Tweets
    var tweetWordVecDF = CreateWordVec(rawDF.GetColumn<string>("tweet"), stopWords, useLemma: false);
    // Encode sentiment as an integer: neutral = 0, positive = 1, negative = 2.
    tweetWordVecDF.AddColumn(
        "tweet_polarity",
        rawDF.GetColumn<string>("airline_sentiment").Select(
            x => x.Value == "neutral" ? 0 : x.Value == "positive" ? 1 : 2
        )
    );
    WriteDataFrameRowByRow(tweetWordVecDF, Path.Combine(dataDirPath, "tweet-words.csv"));
    Console.WriteLine("* Tweet Word Vec DF Shape ({0}, {1})", tweetWordVecDF.RowCount, tweetWordVecDF.ColumnCount);

    // Look at lemmas in pre-processed Tweets (same polarity encoding).
    var tweetLemmaVecDF = CreateWordVec(rawDF.GetColumn<string>("tweet"), stopWords, useLemma: true);
    tweetLemmaVecDF.AddColumn(
        "tweet_polarity",
        rawDF.GetColumn<string>("airline_sentiment").Select(
            x => x.Value == "neutral" ? 0 : x.Value == "positive" ? 1 : 2
        )
    );
    WriteDataFrameRowByRow(tweetLemmaVecDF, Path.Combine(dataDirPath, "tweet-lemma.csv"));
    Console.WriteLine("* Tweet Lemma Vec DF Shape ({0}, {1})", tweetLemmaVecDF.RowCount, tweetLemmaVecDF.ColumnCount);

    Console.WriteLine("Done!!!");
    Console.ReadKey();
}
// Entry point for the Twitter sentiment data-analysis step: loads the
// tweet-lemma term matrix, computes per-sentiment term frequencies and
// proportions, exports them to CSV, and charts the top terms per sentiment.
static void Main(string[] args)
{
    Console.SetWindowSize(125, 50);

    // Read in the file we created in the Data Preparation (TwitterTokenizer) step
    // TODO: change the path to point to your data directory
    string dataDirPath = @"<path-to-data-dir>";

    // Load the twitter-lemma data into a data frame
    var tweetLemmaDF = Frame.ReadCsv(
        Path.Combine(dataDirPath, "tweet-lemma.csv"),
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* DF shape: ({0}, {1})", tweetLemmaDF.RowCount, tweetLemmaDF.ColumnCount);

    // Sample counts per polarity (0 = neutral, 1 = positive, 2 = negative).
    var sampleSetDistribution = tweetLemmaDF.GetColumn<string>(
        "tweet_polarity"
    ).GroupBy<string>(x => x.Value).Select(x => x.Value.KeyCount);
    int[] sampleSizes = sampleSetDistribution.Values.ToArray();
    // NOTE(review): positional indexing assumes the grouped keys come back
    // in the order 0, 1, 2 — confirm against the printed distribution.
    int neutralSampleSize = sampleSizes[0];
    int positiveSampleSize = sampleSizes[1];
    int negativeSampleSize = sampleSizes[2];
    Console.WriteLine("* sentiment distribution - neutral: {0}, positive: {1}, negative: {2}",
        neutralSampleSize, positiveSampleSize, negativeSampleSize);

    // Column-wise term totals per polarity, sorted most-frequent first.
    var neutralTermFrequencies = ColumnWiseSum(
        tweetLemmaDF.Where(
            x => x.Value.GetAs<int>("tweet_polarity") == 0
        ),
        "tweet_polarity"
    ).Sort().Reversed;
    var positiveTermFrequencies = ColumnWiseSum(
        tweetLemmaDF.Where(
            x => x.Value.GetAs<int>("tweet_polarity") == 1
        ),
        "tweet_polarity"
    ).Sort().Reversed;
    var negativeTermFrequencies = ColumnWiseSum(
        tweetLemmaDF.Where(
            x => x.Value.GetAs<int>("tweet_polarity") == 2
        ),
        "tweet_polarity"
    ).Sort().Reversed;

    // Look at the top terms (topN = 7) in Neutral vs. Positive vs. Negative tweets
    var topN = 7;

    // Normalize frequencies by the number of tweets in each polarity group.
    var neutralTermProportions = neutralTermFrequencies / neutralSampleSize;
    var positiveTermProportions = positiveTermFrequencies / positiveSampleSize;
    var negativeTermProportions = negativeTermFrequencies / negativeSampleSize;

    var topNeutralTerms = neutralTermProportions.Keys.Take(topN);
    var topNeutralTermsProportions = neutralTermProportions.Values.Take(topN);

    var topPositiveTerms = positiveTermProportions.Keys.Take(topN);
    var topPositiveTermsProportions = positiveTermProportions.Values.Take(topN);

    var topNegativeTerms = negativeTermProportions.Keys.Take(topN);
    var topNegativeTermsProportions = negativeTermProportions.Values.Take(topN);

    // Export the full frequency table for each polarity as "term,count".
    System.IO.File.WriteAllLines(
        dataDirPath + "\\neutral-frequencies.csv",
        neutralTermFrequencies.Keys.Zip(
            neutralTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );
    System.IO.File.WriteAllLines(
        dataDirPath + "\\positive-frequencies.csv",
        positiveTermFrequencies.Keys.Zip(
            positiveTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );
    System.IO.File.WriteAllLines(
        dataDirPath + "\\negative-frequencies.csv",
        negativeTermFrequencies.Keys.Zip(
            negativeTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );

    // Top neutral terms with the same terms' proportions in the other groups.
    var topNeutralBarChart = DataBarBox.Show(
        topNeutralTerms.ToArray(),
        new double[][]
        {
            topNeutralTermsProportions.ToArray(),
            negativeTermProportions.GetItems(topNeutralTerms).Values.ToArray(),
            positiveTermProportions.GetItems(topNeutralTerms).Values.ToArray()
        }
    );
    topNeutralBarChart.SetTitle(
        String.Format(
            "Top {0} Terms in Neutral Tweets (blue: neutral, red: negative, green: positive)", topN
        )
    );
    // Give the chart time to render before resizing it on its UI thread.
    System.Threading.Thread.Sleep(3000);
    topNeutralBarChart.Invoke(
        new Action(() =>
        {
            topNeutralBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    var topPositiveBarChart = DataBarBox.Show(
        topPositiveTerms.ToArray(),
        new double[][]
        {
            neutralTermProportions.GetItems(topPositiveTerms).Values.ToArray(),
            negativeTermProportions.GetItems(topPositiveTerms).Values.ToArray(),
            topPositiveTermsProportions.ToArray()
        }
    );
    topPositiveBarChart.SetTitle(
        String.Format(
            "Top {0} Terms in Positive Tweets (blue: neutral, red: negative, green: positive)", topN
        )
    );
    System.Threading.Thread.Sleep(3000);
    topPositiveBarChart.Invoke(
        new Action(() =>
        {
            topPositiveBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    // NOTE(review): "topNegattiveBarChart" is a typo for "topNegativeBarChart"
    // (local-only, so harmless at runtime).
    var topNegattiveBarChart = DataBarBox.Show(
        topNegativeTerms.ToArray(),
        new double[][]
        {
            neutralTermProportions.GetItems(topNegativeTerms).Values.ToArray(),
            topNegativeTermsProportions.ToArray(),
            positiveTermProportions.GetItems(topNegativeTerms).Values.ToArray()
        }
    );
    topNegattiveBarChart.SetTitle(
        String.Format(
            "Top {0} Terms in Negative Tweets (blue: neutral, red: negative, green: positive)", topN
        )
    );
    System.Threading.Thread.Sleep(3000);
    topNegattiveBarChart.Invoke(
        new Action(() =>
        {
            topNegattiveBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    Console.WriteLine("Done");
    Console.ReadKey();
}
// Entry point for the Online Retail (ch.6) data-analysis step: cleans the
// transaction data, exports it, then prints quantile summaries and charts
// for transactions by country, quantities, unit prices, and amounts.
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Online Retail dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.6\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "data.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var ecommerceDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);

    // 1. Missing CustomerID Values — print a slice of rows around known gaps.
    ecommerceDF
        .Columns[new string[] { "CustomerID", "InvoiceNo", "StockCode", "Quantity", "UnitPrice", "Country" }]
        .GetRowsAt(new int[] { 1440, 1441, 1442, 1443, 1444, 1445, 1446 })
        .Print();
    Console.WriteLine("\n\n* # of values in CustomerID column: {0}", ecommerceDF["CustomerID"].ValueCount);

    // Drop missing values, keeping only the columns used downstream.
    ecommerceDF = ecommerceDF
        .Columns[new string[] { "CustomerID", "Description", "Quantity", "UnitPrice", "Country" }]
        .DropSparseRows();

    // Per-Transaction Purchase Amount = Quantity * UnitPrice
    ecommerceDF.AddColumn("Amount", ecommerceDF["Quantity"] * ecommerceDF["UnitPrice"]);

    Console.WriteLine("\n\n* Shape (After dropping missing values): {0}, {1}\n", ecommerceDF.RowCount, ecommerceDF.ColumnCount);
    Console.WriteLine("* After dropping missing values and unnecessary columns:");
    ecommerceDF.GetRowsAt(new int[] { 0, 1, 2, 3, 4 }).Print();

    // Export Data
    ecommerceDF.SaveCsv(Path.Combine(dataDirPath, "data-clean.csv"));

    // 2. Number of transactions by country (counts over "CustomerID").
    var numTransactionsByCountry = ecommerceDF
        .AggregateRowsBy<string, int>(
            new string[] { "Country" },
            new string[] { "CustomerID" },
            x => x.ValueCount
        ).SortRows("CustomerID");

    // Last five rows of the ascending sort = the five largest counts.
    var top5 = numTransactionsByCountry
        .GetRowsAt(new int[]
        {
            numTransactionsByCountry.RowCount - 1,
            numTransactionsByCountry.RowCount - 2,
            numTransactionsByCountry.RowCount - 3,
            numTransactionsByCountry.RowCount - 4,
            numTransactionsByCountry.RowCount - 5
        });
    top5.Print();

    // Shorten "United Kingdom" to "UK" for the x-axis labels.
    var topTransactionByCountryBarChart = DataBarBox.Show(
        top5.GetColumn<string>("Country").Values.ToArray().Select(x => x.Equals("United Kingdom") ? "UK" : x),
        top5["CustomerID"].Values.ToArray()
    );
    topTransactionByCountryBarChart.SetTitle(
        "Top 5 Countries with the most number of transactions"
    );

    // 3. Per-Transaction Quantity Distributions
    Console.WriteLine("\n\n-- Per-Transaction Order Quantity Distribution-- ");
    double[] quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Quantity"].ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    // Non-negative quantities are treated as purchases, negative as cancels.
    Console.WriteLine("\n\n-- Per-Transaction Purchase-Order Quantity Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Quantity"].Where(x => x.Value >= 0).ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    Console.WriteLine("\n\n-- Per-Transaction Cancel-Order Quantity Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Quantity"].Where(x => x.Value < 0).ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    // 4. Per-Transaction Unit Price Distributions
    Console.WriteLine("\n\n-- Per-Transaction Unit Price Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["UnitPrice"].ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    // 5. Per-Transaction Purchase Price Distributions
    Console.WriteLine("\n\n-- Per-Transaction Total Amount Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Amount"].ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    Console.WriteLine("\n\n-- Per-Transaction Purchase-Order Total Amount Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Amount"].Where(x => x.Value >= 0).ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    Console.WriteLine("\n\n-- Per-Transaction Cancel-Order Total Amount Distribution-- ");
    quantiles = Accord.Statistics.Measures.Quantiles(
        ecommerceDF["Amount"].Where(x => x.Value < 0).ValuesAll.ToArray(),
        new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
    );
    Console.WriteLine(
        "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
        quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
    );

    // 6. # of Purchase vs. Cancelled Transactions
    var purchaseVSCancelBarChart = DataBarBox.Show(
        new string[] { "Purchase", "Cancel" },
        new double[]
        {
            ecommerceDF["Quantity"].Where(x => x.Value >= 0).ValueCount,
            ecommerceDF["Quantity"].Where(x => x.Value < 0).ValueCount
        }
    );
    purchaseVSCancelBarChart.SetTitle(
        "Purchase vs. Cancel"
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
// Entry point for the spam-filter (ch.2) data-analysis step: builds a
// subject-line term matrix, compares term frequencies between ham and
// spam, and charts the top terms before and after stopword filtering.
static void Main(string[] args)
{
    Console.SetWindowSize(125, 50);

    // Read in the file we created in the Data Preparation step
    // TODO: change the path to point to your data directory
    string dataDirPath = "\\\\Mac\\Home\\Documents\\c-sharp-machine-learning\\ch.2\\output";

    // Read in stopwords list
    ISet<string> stopWords = new HashSet<string>(
        File.ReadLines("\\\\Mac\\Home\\Documents\\c-sharp-machine-learning\\ch.2\\stopwords.txt")
    );

    // Load the data into a data frame and set the "emailNum" column as an index
    var rawDF = Frame.ReadCsv(
        Path.Combine(dataDirPath, "data-preparation-step\\transformed.csv"),
        hasHeaders: true,
        inferTypes: false,
        schema: "int,string,string,int"
    ).IndexRows<int>("emailNum").SortRowsByKey();; // NOTE(review): stray second ';' (harmless empty statement)

    // Look at words used in Subject lines
    var subjectWordVecDF = CreateWordVec(rawDF.GetColumn<string>("subject"));
    subjectWordVecDF.SaveCsv(Path.Combine(dataDirPath, "data-preparation-step\\subjectWordVec-alphaonly.csv"));
    Console.WriteLine("* Subject Word Vec DF Shape ({0}, {1})", subjectWordVecDF.RowCount, subjectWordVecDF.ColumnCount);

    // Get term frequencies by each group (ham vs. spam).
    // "is_ham" is 1 for ham, so its sum is the ham email count.
    var hamEmailCount = rawDF.GetColumn<int>("is_ham").NumSum();
    var spamEmailCount = subjectWordVecDF.RowCount - hamEmailCount;
    subjectWordVecDF.AddColumn("is_ham", rawDF.GetColumn<int>("is_ham"));
    var hamTermFrequencies = subjectWordVecDF.Where(
        x => x.Value.GetAs<int>("is_ham") == 1
    ).Sum().Sort().Reversed.Where(x => x.Key != "is_ham");
    // NOTE(review): unlike the ham side, the "is_ham" key is NOT filtered
    // out here; its sum over spam rows is 0, so it lands at the tail of the
    // sorted series rather than among the top terms.
    var spamTermFrequencies = subjectWordVecDF.Where(
        x => x.Value.GetAs<int>("is_ham") == 0
    ).Sum().Sort().Reversed;

    // Look at Top 10 terms that appear in Ham vs. Spam emails
    var topN = 10;

    // Proportion = term frequency / number of emails in the group.
    var hamTermProportions = hamTermFrequencies / hamEmailCount;
    var topHamTerms = hamTermProportions.Keys.Take(topN);
    var topHamTermsProportions = hamTermProportions.Values.Take(topN);
    System.IO.File.WriteAllLines(
        dataDirPath + "\\ham-frequencies.csv",
        hamTermFrequencies.Keys.Zip(
            hamTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );

    var spamTermProportions = spamTermFrequencies / spamEmailCount;
    var topSpamTerms = spamTermProportions.Keys.Take(topN);
    var topSpamTermsProportions = spamTermProportions.Values.Take(topN);
    System.IO.File.WriteAllLines(
        dataDirPath + "\\spam-frequencies.csv",
        spamTermFrequencies.Keys.Zip(
            spamTermFrequencies.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );

    // Overall ham vs. spam counts.
    var barChart = DataBarBox.Show(
        new string[] { "Ham", "Spam" },
        new double[] { hamEmailCount, spamEmailCount }
    );
    barChart.SetTitle("Ham vs. Spam in Sample Set");

    // Top ham terms, paired with the same terms' proportions in spam.
    var hamBarChart = DataBarBox.Show(
        topHamTerms.ToArray(),
        new double[][]
        {
            topHamTermsProportions.ToArray(),
            spamTermProportions.GetItems(topHamTerms).Values.ToArray()
        }
    );
    hamBarChart.SetTitle("Top 10 Terms in Ham Emails (blue: HAM, red: SPAM)");
    // Give the chart time to render before resizing it on its UI thread.
    System.Threading.Thread.Sleep(3000);
    hamBarChart.Invoke(
        new Action(() =>
        {
            hamBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    var spamBarChart = DataBarBox.Show(
        topSpamTerms.ToArray(),
        new double[][]
        {
            hamTermProportions.GetItems(topSpamTerms).Values.ToArray(),
            topSpamTermsProportions.ToArray()
        }
    );
    spamBarChart.SetTitle("Top 10 Terms in Spam Emails (blue: HAM, red: SPAM)");
    System.Threading.Thread.Sleep(3000);
    spamBarChart.Invoke(
        new Action(() =>
        {
            spamBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    // Look at top terms appear in Ham vs. Spam emails after filtering out stopwords
    var hamTermFrequenciesAfterStopWords = hamTermFrequencies.Where(
        x => !stopWords.Contains(x.Key)
    );
    var hamTermProportionsAfterStopWords = hamTermProportions.Where(
        x => !stopWords.Contains(x.Key)
    );
    var topHamTermsAfterStopWords = hamTermProportionsAfterStopWords.Keys.Take(topN);
    var topHamTermsProportionsAfterStopWords = hamTermProportionsAfterStopWords.Values.Take(topN);
    System.IO.File.WriteAllLines(
        dataDirPath + "\\ham-frequencies-after-stopwords.csv",
        hamTermFrequenciesAfterStopWords.Keys.Zip(
            hamTermFrequenciesAfterStopWords.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );

    var spamTermFrequenciesAfterStopWords = spamTermFrequencies.Where(
        x => !stopWords.Contains(x.Key)
    );
    var spamTermProportionsAfterStopWords = spamTermProportions.Where(
        x => !stopWords.Contains(x.Key)
    );
    var topSpamTermsAfterStopWords = spamTermProportionsAfterStopWords.Keys.Take(topN);
    var topSpamTermsProportionsAfterStopWords = spamTermProportionsAfterStopWords.Values.Take(topN);
    System.IO.File.WriteAllLines(
        dataDirPath + "\\spam-frequencies-after-stopwords.csv",
        spamTermFrequenciesAfterStopWords.Keys.Zip(
            spamTermFrequenciesAfterStopWords.Values, (a, b) => string.Format("{0},{1}", a, b)
        )
    );

    hamBarChart = DataBarBox.Show(
        topHamTermsAfterStopWords.ToArray(),
        new double[][]
        {
            topHamTermsProportionsAfterStopWords.ToArray(),
            spamTermProportionsAfterStopWords.GetItems(topHamTermsAfterStopWords).Values.ToArray()
        }
    );
    hamBarChart.SetTitle("Top 10 Terms in Ham Emails - after filtering out stopwords (blue: HAM, red: SPAM)");
    System.Threading.Thread.Sleep(3000);
    hamBarChart.Invoke(
        new Action(() =>
        {
            hamBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    spamBarChart = DataBarBox.Show(
        topSpamTermsAfterStopWords.ToArray(),
        new double[][]
        {
            hamTermProportionsAfterStopWords.GetItems(topSpamTermsAfterStopWords).Values.ToArray(),
            topSpamTermsProportionsAfterStopWords.ToArray()
        }
    );
    spamBarChart.SetTitle("Top 10 Terms in Spam Emails - after filtering out stopwords (blue: HAM, red: SPAM)");
    System.Threading.Thread.Sleep(3000);
    spamBarChart.Invoke(
        new Action(() =>
        {
            spamBarChart.Size = new System.Drawing.Size(5000, 1500);
        })
    );

    Console.WriteLine("Data Analysis Step Done!");
    Console.ReadKey();
}
// Entry point for the credit-card fraud (ch.10) data-analysis step: shows
// the class imbalance, per-feature quartiles and histograms, and scatter
// plots of the first few feature pairs colored by class.
static void Main(string[] args)
{
    Console.SetWindowSize(100, 55);

    // Read in the Credit Card Fraud dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.10\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "creditcard.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var df = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", df.RowCount, df.ColumnCount);

    // Target variable distribution ("V1" is just a column to count over).
    var targetVarCount = df.AggregateRowsBy<string, int>(
        new string[] { "Class" },
        new string[] { "V1" },
        x => x.ValueCount
    ).SortRows("V1");
    targetVarCount.RenameColumns(new string[] { "is_fraud", "count" });
    targetVarCount.Print();

    DataBarBox.Show(
        targetVarCount.GetColumn<string>("is_fraud").Values.ToArray(),
        targetVarCount["count"].Values.ToArray()
    ).SetTitle(
        "Counts by Target Class"
    );

    // Feature distributions — skip the label and the Time column.
    HistogramBox.CheckForIllegalCrossThreadCalls = false;
    foreach (string col in df.ColumnKeys)
    {
        if (col.Equals("Class") || col.Equals("Time"))
        {
            continue;
        }

        double[] values = df[col].DropMissing().ValuesAll.ToArray();

        // Compute Quartiles
        Console.WriteLine(String.Format("\n\n-- {0} Distribution -- ", col));
        double[] quartiles = Accord.Statistics.Measures.Quantiles(
            values,
            new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
        );
        Console.WriteLine(
            "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
            quartiles[0], quartiles[1], quartiles[2], quartiles[3], quartiles[4]
        );

        // Visualize Distributions
        HistogramBox.Show(
            values,
            title: col
        )
        .SetNumberOfBins(50);
    }

    // Target Var Distributions on 2-dimensional feature space
    double[][] data = BuildJaggedArray(
        df.ToArray2D<double>(), df.RowCount, df.ColumnCount
    );
    int[] labels = df.GetColumn<int>("Class").ValuesAll.ToArray();

    // NOTE(review): these slices index raw column positions of the FULL
    // frame, which still contains "Time" — so "Feature #1" here presumably
    // refers to the Time column, not V1. Verify the intended columns.
    double[][] first2Components = data.Select(
        x => x.Where((y, i) => i < 2).ToArray()
    ).ToArray();
    ScatterplotBox.Show("Feature #1 vs. Feature #2", first2Components, labels);

    double[][] next2Components = data.Select(
        x => x.Where((y, i) => i >= 1 && i <= 2).ToArray()
    ).ToArray();
    ScatterplotBox.Show("Feature #2 vs. Feature #3", next2Components, labels);

    next2Components = data.Select(
        x => x.Where((y, i) => i >= 2 && i <= 3).ToArray()
    ).ToArray();
    ScatterplotBox.Show("Feature #3 vs. Feature #4", next2Components, labels);

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
// Entry point for the network-intrusion (ch.9) data-analysis step: labels
// each KDD Cup record with an attack category, exports the labeled data,
// then summarizes the target, categorical, and continuous distributions.
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Cyber Attack dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.9\input-data";

    // Load the data into a data frame (the file has no header row).
    string dataPath = Path.Combine(dataDirPath, "kddcup.data_10_percent");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: false,
        inferTypes: true
    );

    // Assign the KDD Cup column names (41 features + the attack label).
    string[] colnames =
    {
        "duration", "protocol_type", "service", "flag", "src_bytes",
        "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
        "num_failed_logins", "logged_in", "num_compromised", "root_shell",
        "su_attempted", "num_root", "num_file_creations", "num_shells",
        "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate",
        "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
        "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
        "dst_host_srv_serror_rate", "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate", "attack_type"
    };
    featuresDF.RenameColumns(colnames);
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Map each specific attack type to a category (dos/probe/r2l/u2r);
    // keeping "normal" for now for plotting purposes
    IDictionary<string, string> attackCategories = new Dictionary<string, string>
    {
        { "back", "dos" },
        { "land", "dos" },
        { "neptune", "dos" },
        { "pod", "dos" },
        { "smurf", "dos" },
        { "teardrop", "dos" },
        { "ipsweep", "probe" },
        { "nmap", "probe" },
        { "portsweep", "probe" },
        { "satan", "probe" },
        { "ftp_write", "r2l" },
        { "guess_passwd", "r2l" },
        { "imap", "r2l" },
        { "multihop", "r2l" },
        { "phf", "r2l" },
        { "spy", "r2l" },
        { "warezclient", "r2l" },
        { "warezmaster", "r2l" },
        { "buffer_overflow", "u2r" },
        { "loadmodule", "u2r" },
        { "perl", "u2r" },
        { "rootkit", "u2r" },
        { "normal", "normal" }
    };

    // Attack labels carry a trailing '.', which is stripped before lookup.
    featuresDF.AddColumn(
        "attack_category",
        featuresDF.GetColumn<string>("attack_type")
            .Select(x => attackCategories[x.Value.Replace(".", "")])
    );

    // Export with Categories
    Console.WriteLine("* Exporting data...");
    featuresDF.SaveCsv(Path.Combine(dataDirPath, "data.csv"));

    // 1. Target Variable Distribution ("duration" is just a column to count over).
    Console.WriteLine("\n\n-- Counts by Attack Category --\n");
    var attackCount = featuresDF.AggregateRowsBy<string, int>(
        new string[] { "attack_category" },
        new string[] { "duration" },
        x => x.ValueCount
    ).SortRows("duration");
    attackCount.RenameColumns(new string[] { "attack_category", "count" });
    attackCount.Print();

    DataBarBox.Show(
        attackCount.GetColumn<string>("attack_category").Values.ToArray(),
        attackCount["count"].Values.ToArray()
    ).SetTitle(
        "Counts by Attack Category"
    );

    // Now, remove normal records
    var attackSubset = featuresDF.Rows[
        featuresDF.GetColumn<string>("attack_category").Where(
            x => !x.Value.Equals("normal")
        ).Keys
    ];
    var normalSubset = featuresDF.Rows[
        featuresDF.GetColumn<string>("attack_category").Where(
            x => x.Value.Equals("normal")
        ).Keys
    ];

    // 2. Categorical Variable Distribution
    string[] categoricalVars =
    {
        "protocol_type", "service", "flag", "land"
    };
    foreach (string variable in categoricalVars)
    {
        Console.WriteLine("\n\n-- Counts by {0} --\n", variable);
        Console.WriteLine("* Attack:");
        var attackCountDF = attackSubset.AggregateRowsBy<string, int>(
            new string[] { variable },
            new string[] { "duration" },
            x => x.ValueCount
        );
        attackCountDF.RenameColumns(new string[] { variable, "count" });
        attackCountDF.SortRows("count").Print();

        Console.WriteLine("* Normal:");
        var countDF = normalSubset.AggregateRowsBy<string, int>(
            new string[] { variable },
            new string[] { "duration" },
            x => x.ValueCount
        );
        countDF.RenameColumns(new string[] { variable, "count" });
        countDF.SortRows("count").Print();

        // NOTE(review): the x-axis labels come from the NORMAL subset; if the
        // attack subset has a different category set or row order, the two
        // series may not line up with the labels — verify.
        DataBarBox.Show(
            countDF.GetColumn<string>(variable).Values.ToArray(),
            new double[][]
            {
                attackCountDF["count"].Values.ToArray(),
                countDF["count"].Values.ToArray()
            }
        ).SetTitle(
            String.Format("Counts by {0} (0 - Attack, 1 - Normal)", variable)
        );
    }

    // 3. Continuous Variable Distribution — print quartiles for the attack
    // and normal subsets of every numeric column.
    string[] continuousVars =
    {
        "duration", "src_bytes", "dst_bytes", "wrong_fragment", "urgent",
        "hot", "num_failed_logins", "num_compromised", "root_shell",
        "su_attempted", "num_root", "num_file_creations", "num_shells",
        "num_access_files", "num_outbound_cmds", "count", "srv_count",
        "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
        "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
        "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
        "dst_host_srv_serror_rate", "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate"
    };
    foreach (string variable in continuousVars)
    {
        Console.WriteLine(String.Format("\n\n-- {0} Distribution (Attack) -- ", variable));
        // NOTE(review): "attachQuartiles" looks like a typo for "attackQuartiles".
        double[] attachQuartiles = Accord.Statistics.Measures.Quantiles(
            attackSubset[variable].DropMissing().ValuesAll.ToArray(),
            new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
        );
        Console.WriteLine(
            "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
            attachQuartiles[0], attachQuartiles[1], attachQuartiles[2], attachQuartiles[3], attachQuartiles[4]
        );

        Console.WriteLine(String.Format("\n\n-- {0} Distribution (Normal) -- ", variable));
        double[] normalQuantiles = Accord.Statistics.Measures.Quantiles(
            normalSubset[variable].DropMissing().ValuesAll.ToArray(),
            new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
        );
        Console.WriteLine(
            "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
            normalQuantiles[0], normalQuantiles[1], normalQuantiles[2], normalQuantiles[3], normalQuantiles[4]
        );
    }

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Exploratory data analysis of the House Price dataset: prints and charts the
/// distributions of two categorical variables, two ordinal variables, and several
/// continuous variables (raw and log-transformed), then waits for a key press.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 50);

    // Read in the House Price dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.5\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "train.csv");
    Console.WriteLine("Loading {0}\n", dataPath);
    var houseDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );

    // Categorical Variable #1: Building Type
    Console.WriteLine("\nCategorical Variable #1: Building Type");
    var buildingTypeDistribution = houseDF.GetColumn<string>(
        "BldgType"
    ).GroupBy<string>(x => x.Value).Select(x => (double)x.Value.KeyCount);
    buildingTypeDistribution.Print();
    ShowBarChart(
        buildingTypeDistribution.Keys.ToArray(),
        buildingTypeDistribution.Values.ToArray(),
        "Building Type Distribution (Categorical)"
    );

    // Categorical Variable #2: Lot Configuration
    // BUGFIX: this message previously repeated "Categorical Variable #1: Building Type".
    Console.WriteLine("\nCategorical Variable #2: Lot Configuration");
    var lotConfigDistribution = houseDF.GetColumn<string>(
        "LotConfig"
    ).GroupBy<string>(x => x.Value).Select(x => (double)x.Value.KeyCount);
    lotConfigDistribution.Print();
    ShowBarChart(
        lotConfigDistribution.Keys.ToArray(),
        lotConfigDistribution.Values.ToArray(),
        "Lot Configuration Distribution (Categorical)"
    );

    // Ordinal Categorical Variable #1: Overall material and finish of the house
    Console.WriteLine("\nOrdinal Categorical #1: Overall material and finish of the house");
    var overallQualDistribution = houseDF.GetColumn<string>(
        "OverallQual"
    ).GroupBy<int>(
        x => Convert.ToInt32(x.Value)
    ).Select(
        x => (double)x.Value.KeyCount
    ).SortByKey().Reversed;
    overallQualDistribution.Print();
    ShowBarChart(
        overallQualDistribution.Keys.Select(x => x.ToString()).ToArray(),
        overallQualDistribution.Values.ToArray(),
        "Overall House Quality Distribution (Ordinal)"
    );

    // Ordinal Categorical Variable #2: Exterior Quality
    Console.WriteLine("\nOrdinal Categorical #2: Exterior Quality");
    var exteriorQualDistribution = houseDF.GetColumn<string>(
        "ExterQual"
    ).GroupBy<string>(x => x.Value).Select(
        x => (double)x.Value.KeyCount
    )[new string[] { "Ex", "Gd", "TA", "Fa" }]; // explicit ordinal order, best to worst
    exteriorQualDistribution.Print();
    ShowBarChart(
        exteriorQualDistribution.Keys.Select(x => x.ToString()).ToArray(),
        exteriorQualDistribution.Values.ToArray(),
        "Exterior Quality Distribution (Ordinal)"
    );

    // Histograms are resized from this thread, so cross-thread checks must be off.
    HistogramBox.CheckForIllegalCrossThreadCalls = false;

    // Hoisted: DropSparseRows() was previously recomputed for every histogram.
    var denseDF = houseDF.DropSparseRows();

    // Continuous Variable #1-1: First Floor Square Feet
    ShowHistogram(
        denseDF["1stFlrSF"].ValuesAll.ToArray(),
        "First Floor Square Feet (Continuous)"
    );
    // Continuous Variable #1-2: Log of First Floor Square Feet
    ShowHistogram(
        denseDF["1stFlrSF"].Log().ValuesAll.ToArray(),
        "First Floor Square Feet - Log Transformed (Continuous)"
    );
    // Continuous Variable #2-1: Size of garage in square feet
    ShowHistogram(
        denseDF["GarageArea"].ValuesAll.ToArray(),
        "Size of garage in square feet (Continuous)"
    );
    // Continuous Variable #2-2: Log of size of garage in square feet
    // (comment previously mislabeled this as "Value of miscellaneous feature")
    ShowHistogram(
        denseDF["GarageArea"].Log().ValuesAll.ToArray(),
        "Size of garage in square feet - Log Transformed (Continuous)"
    );
    // Target Variable: Sale Price
    ShowHistogram(
        denseDF["SalePrice"].ValuesAll.ToArray(),
        "Sale Price (Continuous)"
    );
    // Target Variable: Sale Price - Log Transformed
    ShowHistogram(
        denseDF["SalePrice"].Log().ValuesAll.ToArray(),
        "Sale Price - Log Transformed (Continuous)"
    );

    Console.WriteLine("\nDONE!!!");
    Console.ReadKey();
}

/// <summary>
/// Shows a titled bar chart, waits briefly for it to render, then resizes it
/// to 1000x700 on the chart's own UI thread.
/// </summary>
private static void ShowBarChart(string[] labels, double[] values, string title)
{
    var chart = DataBarBox.Show(labels, values);
    chart.SetTitle(title);
    System.Threading.Thread.Sleep(3000);
    chart.Invoke(
        new Action(() =>
        {
            chart.Size = new System.Drawing.Size(1000, 700);
        })
    );
}

/// <summary>
/// Shows a 20-bin histogram with the given title, waits briefly for it to
/// render, then resizes it to 1000x700 on the histogram's own UI thread.
/// </summary>
private static void ShowHistogram(double[] values, string title)
{
    var histogram = HistogramBox
        .Show(values, title: title)
        .SetNumberOfBins(20);
    System.Threading.Thread.Sleep(3000);
    histogram.Invoke(
        new Action(() =>
        {
            histogram.Size = new System.Drawing.Size(1000, 700);
        })
    );
}
/// <summary>
/// Exploratory data analysis of the Audio Features dataset: charts track counts
/// per genre, prints quantiles for the first few MFCC columns, and renders a
/// scatter plot of each summary-statistic feature group per genre.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Audio Features dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.7\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "sample.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Count tracks per genre and chart the result.
    var genreCount = featuresDF.AggregateRowsBy<string, int>(
        new string[] { "genre_top" },
        new string[] { "track_id" },
        x => x.ValueCount
    ).SortRows("track_id");
    genreCount.Print();

    // The chart handle was previously captured in an unused local; the fluent
    // call is now a plain statement. Genre labels are shortened to 3 chars.
    DataBarBox.Show(
        genreCount.GetColumn<string>("genre_top").Values.ToArray().Select(x => x.Substring(0, 3)),
        genreCount["track_id"].Values.ToArray()
    ).SetTitle(
        "Genre Count"
    );

    // Print quantiles for the first few MFCC columns only.
    foreach (string col in featuresDF.ColumnKeys)
    {
        if (col.StartsWith("mfcc"))
        {
            // Column names appear to follow "mfcc.<stat>.<index>" — the third
            // dot-separated token is the coefficient index. TODO confirm schema.
            int idx = int.Parse(col.Split('.')[2]);
            if (idx <= 4)
            {
                Console.WriteLine("\n\n-- {0} Distribution -- ", col);
                double[] quantiles = Accord.Statistics.Measures.Quantiles(
                    featuresDF[col].ValuesAll.ToArray(),
                    new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
                );
                Console.WriteLine(
                    "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
                    quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
                );
            }
        }
    }

    // For each summary statistic, scatter-plot its feature columns per genre.
    string[] attributes = new string[] { "kurtosis", "min", "max", "mean", "median", "skew", "std" };
    foreach (string attribute in attributes)
    {
        string[] featureColumns = featuresDF.ColumnKeys.Where(x => x.Contains(attribute)).ToArray();
        foreach (string genre in genreCount.GetColumn<string>("genre_top").Values)
        {
            // Rows for this genre, restricted to the columns for this statistic.
            var genreDF = featuresDF.Rows[
                featuresDF.GetColumn<string>("genre_top").Where(x => x.Value == genre).Keys
            ].Columns[featureColumns];

            ScatterplotBox.Show(
                BuildXYPairs(
                    genreDF.Columns[featureColumns].ToArray2D<double>(),
                    genreDF.RowCount,
                    genreDF.ColumnCount
                )
            ).SetTitle(String.Format("{0}-{1}", genre, attribute));
        }
    }

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}
/// <summary>
/// Exploratory data analysis of the Image Features (digit pixel) dataset:
/// exports one sample image per digit, charts digit counts, prints quantiles
/// for 20 non-empty pixel columns, and renders per-digit scatter plots.
/// </summary>
/// <param name="args">Unused command-line arguments.</param>
static void Main(string[] args)
{
    Console.SetWindowSize(100, 60);

    // Read in the Image Features dataset
    // TODO: change the path to point to your data directory
    string dataDirPath = @"\\Mac\Home\Documents\c-sharp-machine-learning\ch.8\input-data";

    // Load the data into a data frame
    string dataPath = Path.Combine(dataDirPath, "train.csv");
    Console.WriteLine("Loading {0}\n\n", dataPath);
    var featuresDF = Frame.ReadCsv(
        dataPath,
        hasHeaders: true,
        inferTypes: true
    );
    Console.WriteLine("* Shape: {0}, {1}\n\n", featuresDF.RowCount, featuresDF.ColumnCount);

    // Export one sample image per label until 10 distinct labels are seen.
    ISet<string> exportedLabels = new HashSet<string>();
    for (int i = 0; i < featuresDF.RowCount; i++)
    {
        exportedLabels.Add(featuresDF.Rows[i].GetAs<string>("label"));
        // BUGFIX: the index filter now runs before the cast, so the first cell
        // (the label) is never unboxed as an int; only pixel values are cast.
        CreateImage(
            featuresDF.Rows[i].ValuesAll.Where((x, idx) => idx > 0).Select(x => (int)x).ToArray(),
            featuresDF.Rows[i].GetAs<string>("label")
        );
        // Count property instead of the LINQ Count() extension (CA1829).
        if (exportedLabels.Count >= 10)
        {
            break;
        }
    }

    // Count rows per digit label and chart the result.
    var digitCount = featuresDF.AggregateRowsBy<string, int>(
        new string[] { "label" },
        new string[] { "pixel0" },
        x => x.ValueCount
    ).SortRows("pixel0");
    digitCount.Print();

    // The chart handle was previously captured in an unused local.
    DataBarBox.Show(
        digitCount.GetColumn<string>("label").Values.ToArray(),
        digitCount["pixel0"].Values.ToArray()
    ).SetTitle(
        "Digit Count"
    );

    // Collect the first 20 pixel columns that are not uniformly zero and
    // print their quantiles.
    List<string> featureCols = new List<string>();
    foreach (string col in featuresDF.ColumnKeys)
    {
        if (featureCols.Count >= 20)
        {
            break;
        }
        if (col.StartsWith("pixel") && featuresDF[col].Max() > 0)
        {
            featureCols.Add(col);
            Console.WriteLine("\n\n-- {0} Distribution -- ", col);
            double[] quantiles = Accord.Statistics.Measures.Quantiles(
                featuresDF[col].ValuesAll.ToArray(),
                new double[] { 0, 0.25, 0.5, 0.75, 1.0 }
            );
            Console.WriteLine(
                "Min: \t\t\t{0:0.00}\nQ1 (25% Percentile): \t{1:0.00}\nQ2 (Median): \t\t{2:0.00}\nQ3 (75% Percentile): \t{3:0.00}\nMax: \t\t\t{4:0.00}",
                quantiles[0], quantiles[1], quantiles[2], quantiles[3], quantiles[4]
            );
        }
    }

    // Scatter-plot the 20 sampled pixel columns for each digit.
    string[] featureColumns = featureCols.ToArray();
    foreach (string label in digitCount.GetColumn<string>("label").Values)
    {
        var subfeaturesDF = featuresDF.Rows[
            featuresDF.GetColumn<string>("label").Where(x => x.Value == label).Keys
        ].Columns[featureColumns];

        ScatterplotBox.Show(
            BuildXYPairs(
                subfeaturesDF.Columns[featureColumns].ToArray2D<double>(),
                subfeaturesDF.RowCount,
                subfeaturesDF.ColumnCount
            )
        ).SetTitle(String.Format("Digit: {0} - 20 sample Pixels", label));
    }

    // Scatter two sample pixel columns against each other, colored by label.
    double[][] twoPixels = featuresDF.Columns[
        new string[] { featureColumns[15], featureColumns[16] }
    ].Rows.Select(
        x => Array.ConvertAll<object, double>(x.Value.ValuesAll.ToArray(), o => Convert.ToDouble(o))
    ).ValuesAll.ToArray();
    ScatterplotBox.Show(
        String.Format("{0} vs. {1}", featureColumns[15], featureColumns[16]),
        twoPixels,
        featuresDF.GetColumn<int>("label").Values.ToArray()
    );

    Console.WriteLine("\n\n\n\n\nDONE!!!");
    Console.ReadKey();
}