// Fits a two-predictor logistic regression to 100 simulated observations and checks
// that (i) the fit has three parameters, (ii) the 99% confidence intervals of the
// fitted coefficients contain the generating values, and (iii) the fitted model
// classifies more than half of the training rows correctly.
public void MultivariateLinearLogisticRegressionSimple() {

    // define model: logit(P(y)) = a + b0 * x0 + b1 * x1
    double a = 1.0;
    double b0 = -1.0 / 2.0;
    double b1 = 1.0 / 3.0;
    ContinuousDistribution x0distribution = new LaplaceDistribution();
    ContinuousDistribution x1distribution = new NormalDistribution();

    // draw a sample from the model, storing it both in a MultivariateSample
    // and in a FrameTable
    Random rng = new Random(1);
    MultivariateSample old = new MultivariateSample("y", "x0", "x1");
    FrameTable table = new FrameTable();
    table.AddColumn<double>("x0");
    table.AddColumn<double>("x1");
    table.AddColumn<bool>("y");

    for (int i = 0; i < 100; i++) {
        double x0 = x0distribution.GetRandomValue(rng);
        double x1 = x1distribution.GetRandomValue(rng);
        double t = a + b0 * x0 + b1 * x1;
        double p = 1.0 / (1.0 + Math.Exp(-t));
        bool y = (rng.NextDouble() < p);
        old.Add(y ? 1.0 : 0.0, x0, x1);
        table.AddRow(x0, x1, y);
    }

    // do a logistic regression fit on the model; the MultivariateSample-based
    // result is computed but not inspected
    MultiLinearLogisticRegressionResult oldResult = old.LogisticLinearRegression(0);
    MultiLinearLogisticRegressionResult newResult =
        table["y"].As<bool>().MultiLinearLogisticRegression(
            table["x0"].As<double>(), table["x1"].As<double>()
        );

    // the result should have the appropriate dimension
    Assert.IsTrue(newResult.Parameters.Count == 3);

    // The parameters should match the model
    Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
    Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
    Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

    // Our predictions should be better than chance.
    int correct = 0;
    for (int i = 0; i < table.Rows.Count; i++) {
        FrameRow row = table.Rows[i];
        double x0 = (double)row["x0"];
        double x1 = (double)row["x1"];
        double p = newResult.Predict(x0, x1).Value;
        bool y = (bool)row["y"];
        // Logical (short-circuit) operators on both sides; the original mixed
        // in a bitwise '&' on the second clause.
        if ((y && p > 0.5) || (!y && p < 0.5)) {
            correct++;
        }
    }
    Assert.IsTrue(correct > 0.5 * table.Rows.Count);

}
// Converts a frame to a list of dictionaries and back, then checks that row count,
// column count, a column name, a column storage type, and a null cell all survive.
public void FrameTableDictionariesRoundtrip() {

    FrameTable frame = new FrameTable();
    frame.AddColumn<string>("name");
    frame.AddColumn<double>("height");
    frame.AddColumn<bool?>("male");

    frame.AddRow("a", 5.0, false);
    frame.AddRow("b", 6.0, true);
    frame.AddRow("c", 5.5, null);

    List<Dictionary<string, object>> dictionaries = frame.ToDictionaries().ToList();

    Assert.IsTrue(dictionaries.Count == frame.Rows.Count);
    Assert.IsTrue(dictionaries[0].Count == frame.Columns.Count);

    FrameTable frame2 = FrameTable.FromDictionaries(dictionaries);

    Assert.IsTrue(frame2.Rows.Count == frame.Rows.Count);
    Assert.IsTrue(frame2.Columns.Count == frame.Columns.Count);
    Assert.IsTrue(frame2.Columns[0].Name == frame.Columns[0].Name);
    Assert.IsTrue(frame2.Columns[1].StorageType == frame.Columns[1].StorageType);

    // Compare the round-tripped cell against the ORIGINAL frame (the previous check
    // compared frame2 with itself and so could never fail). Object.Equals handles
    // the null stored in this cell and avoids a reference comparison of boxed values.
    Assert.IsTrue(Object.Equals(frame2.Rows[2]["male"], frame.Rows[2]["male"]));

}
// Simulates 24 boolean outcomes from a one-predictor logistic model, stores them as
// strings, and checks that the fitted intercept and slope have 99% confidence
// intervals containing the generating polynomial's coefficients.
public void LinearLogisticRegressionSimple() {

    // Linear predictor: -1 + 2 x
    Polynomial m = Polynomial.FromCoefficients(-1.0, 2.0);

    FrameTable sample = new FrameTable();
    sample.AddColumn<double>("x");
    sample.AddColumn<string>("z");

    Random rng = new Random(2);
    ContinuousDistribution xDistribution = new CauchyDistribution(4.0, 2.0);
    for (int draw = 0; draw < 24; draw++) {
        double x = xDistribution.GetRandomValue(rng);
        double linearPredictor = m.Evaluate(x);
        double probability = 1.0 / (1.0 + Math.Exp(-linearPredictor));
        bool outcome = (rng.NextDouble() < probability);
        // The outcome is deliberately stored as text and parsed back below.
        sample.AddRow(x, outcome.ToString());
    }

    var outcomes = sample["z"].As((string s) => Boolean.Parse(s));
    var predictors = sample["x"].As<double>();
    LinearLogisticRegressionResult fit = outcomes.LinearLogisticRegression(predictors);

    var interceptInterval = fit.Intercept.ConfidenceInterval(0.99);
    Assert.IsTrue(interceptInterval.ClosedContains(m.Coefficient(0)));

    var slopeInterval = fit.Slope.ConfidenceInterval(0.99);
    Assert.IsTrue(slopeInterval.ClosedContains(m.Coefficient(1)));

}
// Checks column coercion via As<T>: a nullable-double column read as double yields
// the value for a non-null cell and throws for a null cell, and an int column can
// be read as double.
public void FrameViewColumnCoercion() {

    // Create nullable double and integer columns.
    FrameTable table = new FrameTable();
    table.AddColumn<double?>("one");
    table.AddColumn<int>("two");

    table.AddRow(1.1, 2);
    table.AddRow(null, 3);

    // Coerce the nullable double into a non-nullable double.
    // Should work when the value is non-null, and fail when the value is null.
    IReadOnlyList<double> one = table["one"].As<double>();
    Assert.IsTrue(one[0] == 1.1);

    // Record whether the read threw and assert afterwards. Calling Assert.Fail()
    // inside the try block would itself be caught by catch (Exception), so the
    // test could never fail.
    bool nullReadThrew = false;
    try {
        double v = one[1];
    } catch (Exception) {
        nullReadThrew = true;
    }
    Assert.IsTrue(nullReadThrew);

    // Coerce the integer to a double.
    IReadOnlyList<double> two = table.Columns[1].As<double>();
    Assert.IsTrue(two[0] == 2.0);

}
// Fits AR(1) models to 128 generated series of length 24 and checks that the fitted
// parameters, their reported uncertainties, the reported mu-alpha covariance, and
// the goodness-of-fit probabilities are statistically consistent with the inputs.
public void TimeSeriesFitAR1() {

    double alpha = 0.3;
    double mu = 0.2;
    double sigma = 0.4;

    // For our fit to AR(1), we have incorporated bias correction (at least
    // for the most important parameter alpha), so we can do a small-n test.
    int n = 24;

    // Storing UncertainValue and SymmetricMatrix cells is an onerous way to hold
    // these results, but it lets us test how the data-frame machinery deals with
    // non-trivial storage types.
    FrameTable fits = new FrameTable();
    fits.AddColumn<UncertainValue>("mu");
    fits.AddColumn<UncertainValue>("alpha");
    fits.AddColumn<UncertainValue>("sigma");
    fits.AddColumn<SymmetricMatrix>("covariance");
    fits.AddColumn<double>("p");

    for (int trial = 0; trial < 128; trial++) {
        TimeSeries series = GenerateAR1TimeSeries(alpha, mu, sigma, n, n * trial + 271828);
        AR1FitResult fit = series.FitToAR1();
        fits.AddRow(
            fit.Mu,
            fit.Alpha,
            fit.Sigma,
            fit.Parameters.CovarianceMatrix,
            fit.GoodnessOfFit.Probability
        );
    }

    // Computed columns are added but not read by the assertions below.
    fits.AddComputedColumn("alphaValue", r => ((UncertainValue)r["alpha"]).Value);
    fits.AddComputedColumn("muValue", r => ((UncertainValue)r["mu"]).Value);

    // Best-fit values and reported uncertainties of each parameter, as double views.
    var muValues = fits["mu"].As((UncertainValue v) => v.Value);
    var alphaValues = fits["alpha"].As((UncertainValue v) => v.Value);
    var sigmaValues = fits["sigma"].As((UncertainValue v) => v.Value);
    var muErrors = fits["mu"].As((UncertainValue v) => v.Uncertainty);
    var alphaErrors = fits["alpha"].As((UncertainValue v) => v.Uncertainty);
    var sigmaErrors = fits["sigma"].As((UncertainValue v) => v.Uncertainty);

    // Check that fit parameters agree with inputs
    Assert.IsTrue(muValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(mu));
    Assert.IsTrue(alphaValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(alpha));
    Assert.IsTrue(sigmaValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(sigma));

    // Check that reported variances agree with actual variances.
    // For small n, the fitted alpha can vary considerably, and the formula for var(m) varies
    // quite strongly with alpha, so the computed var(m) have a very long tail. This pushes the
    // mean computed var(m) quite a bit higher than a typical value, so we use medians instead
    // of means for our best guess for the predicted variance.
    Assert.IsTrue(muValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(muErrors.Median()));
    Assert.IsTrue(alphaValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(alphaErrors.Median()));
    Assert.IsTrue(sigmaValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(sigmaErrors.Median()));

    // Check that reported co-variances agree with actual co-variances
    var reportedMuAlphaCovariances = fits["covariance"].As((SymmetricMatrix c) => c[0, 1]);
    Assert.IsTrue(
        muValues.PopulationCovariance(alphaValues).ConfidenceInterval(0.99).ClosedContains(reportedMuAlphaCovariances.Median())
    );

    // Goodness-of-fit probabilities are tested against a uniform distribution.
    TestResult ks = fits["p"].As<double>().KolmogorovSmirnovTest(new UniformDistribution());
    Assert.IsTrue(ks.Probability > 0.05);

}
// Runs Pearson, Spearman, and Kendall tests on 100 independently drawn bivariate
// samples of size 100 and checks, with a Kolmogorov-Smirnov test, that each
// statistic follows the distribution the test itself claims for it.
public void BivariateNullAssociation() {

    Random rng = new Random(31415926);

    // Table of observed test statistics, one row per simulated sample.
    FrameTable statistics = new FrameTable();
    statistics.AddColumn<double>("r");
    statistics.AddColumn<double>("ρ");
    statistics.AddColumn<double>("τ");

    // Claimed distribution of each statistic; overwritten on every iteration, so
    // the values from the final sample are the ones tested against.
    ContinuousDistribution pearsonDistribution = null;
    ContinuousDistribution spearmanDistribution = null;
    ContinuousDistribution kendallDistribution = null;

    // x and y are drawn from separate distributions on each pass.
    ContinuousDistribution xDistribution = new LognormalDistribution();
    ContinuousDistribution yDistribution = new CauchyDistribution();

    for (int sampleIndex = 0; sampleIndex < 100; sampleIndex++) {

        List<double> x = new List<double>();
        List<double> y = new List<double>();
        int pointIndex = 0;
        while (pointIndex < 100) {
            x.Add(xDistribution.GetRandomValue(rng));
            y.Add(yDistribution.GetRandomValue(rng));
            pointIndex++;
        }

        TestResult pearson = Bivariate.PearsonRTest(x, y);
        TestResult spearman = Bivariate.SpearmanRhoTest(x, y);
        TestResult kendall = Bivariate.KendallTauTest(x, y);

        pearsonDistribution = pearson.Statistic.Distribution;
        spearmanDistribution = spearman.Statistic.Distribution;
        kendallDistribution = kendall.Statistic.Distribution;

        Dictionary<string, object> row = new Dictionary<string, object>();
        row.Add("r", pearson.Statistic.Value);
        row.Add("ρ", spearman.Statistic.Value);
        row.Add("τ", kendall.Statistic.Value);
        statistics.AddRow(row);
    }

    TestResult pearsonKs = statistics["r"].As<double>().KolmogorovSmirnovTest(pearsonDistribution);
    Assert.IsTrue(pearsonKs.Probability > 0.05);

    TestResult spearmanKs = statistics["ρ"].As<double>().KolmogorovSmirnovTest(spearmanDistribution);
    Assert.IsTrue(spearmanKs.Probability > 0.05);

    TestResult kendallKs = statistics["τ"].As<double>().KolmogorovSmirnovTest(kendallDistribution);
    Assert.IsTrue(kendallKs.Probability > 0.05);

}
// Adds and removes columns on an empty frame, checking the column count after each
// step and that a column looked up by name matches the one looked up by position.
public void FrameTableColumnManipulations() {

    FrameTable frame = new FrameTable();
    frame.AddColumn<int>("Integer");
    frame.AddColumn<double>("Double");
    Assert.IsTrue(frame.Columns.Count == 2);

    // Name-based and position-based lookup should agree on name and storage type.
    var firstByPosition = frame.Columns[0];
    var firstByName = frame[firstByPosition.Name];
    Assert.IsTrue(firstByPosition.Name == firstByName.Name);
    Assert.IsTrue(firstByPosition.StorageType == firstByName.StorageType);

    // Adding a third column grows the count...
    frame.AddColumn<DateTime>("Timestamp");
    Assert.IsTrue(frame.Columns.Count == 3);

    // ...and removing the first column by name shrinks it again.
    string firstColumnName = frame.Columns[0].Name;
    frame.RemoveColumn(firstColumnName);
    Assert.IsTrue(frame.Columns.Count == 2);

}
// Example: the different ways of getting data into a FrameTable -- from a local CSV
// file, from a CSV file fetched over HTTP, from JSON deserialized to dictionaries,
// and by defining a schema and adding rows by hand.
// Requires "test.csv" in the working directory and network access.
public static void ImportingData() {

    // Local CSV file.
    FrameTable data;
    using (TextReader reader = File.OpenText("test.csv")) {
        data = FrameTable.FromCsv(reader);
    }

    Console.WriteLine($"Imported CSV file with {data.Rows.Count} rows.");
    Console.WriteLine("The names and types of the columns are:");
    foreach (FrameColumn column in data.Columns) {
        Console.WriteLine($" {column.Name} of type {column.StorageType}");
    }

    // CSV file fetched from a URL.
    FrameTable titanic;
    Uri url = new Uri("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv");
    WebRequest request = WebRequest.Create(url);
    using (WebResponse response = request.GetResponse()) {
        using (StreamReader reader = new StreamReader(response.GetResponseStream())) {
            titanic = FrameTable.FromCsv(reader);
        }
    }

    // JSON fetched from a URL and deserialized into a list of dictionaries.
    // WebClient is disposable, so it is scoped with using (previously never disposed).
    Uri jsonUrl = new Uri("https://raw.githubusercontent.com/dcwuser/metanumerics/master/Examples/Data/example.json");
    string input;
    using (WebClient client = new WebClient()) {
        input = client.DownloadString(jsonUrl);
    }
    List<Dictionary<string, object>> output = JsonConvert.DeserializeObject<List<Dictionary<string, object>>>(input);
    FrameTable jsonExample = FrameTable.FromDictionaries(output);

    // Define the schema.
    FrameTable table = new FrameTable();
    table.AddColumn<int>("Id");
    table.AddColumn<string>("Name");
    table.AddColumn<string>("Sex");
    table.AddColumn<DateTime>("Birthdate");
    table.AddColumn<double>("Height");
    table.AddColumn<double?>("Weight");
    table.AddColumn<bool>("Result");

    // Add rows using as arrays of objects.
    table.AddRow(1, "John", "M", DateTime.Parse("1970-01-02"), 190.0, 75.0, true);
    table.AddRow(2, "Mary", "F", DateTime.Parse("1980-02-03"), 155.0, null, true);

    // Add a row using a dictionary. This is more verbose, but very clear.
    table.AddRow(new Dictionary<string, object>() {
        { "Id", 3 },
        { "Name", null },
        { "Sex", "M" },
        { "Birthdate", DateTime.Parse("1990-03-04") },
        { "Height", 180.0 },
        { "Weight", 60.0 },
        { "Result", false }
    });

}
// Checks dictionary-based row insertion: a complete row is accepted, while rows
// with a missing key or an extra key are rejected with an exception.
public void FrameTableRowManipulations() {

    // In future, test with computed columns

    FrameTable frame = new FrameTable();
    frame.AddColumn<double>("Height");
    frame.AddColumn<string>("Name");

    Assert.IsTrue(frame.Columns.Count == 2);
    Assert.IsTrue(frame.Rows.Count == 0);

    // Insert a row
    Dictionary<string, object> row = new Dictionary<string, object>() {
        { "Name", "John" },
        { "Height", 1.1 }
    };
    frame.AddRow(row);
    Assert.IsTrue(frame.Rows.Count == 1);

    // Try to insert a row with missing values.
    // The outcome is recorded in a flag and asserted outside the try block;
    // Assert.Fail() inside the try would be swallowed by catch (Exception).
    Dictionary<string, object> smallRow = new Dictionary<string, object>() {
        { "Name", "Mark" }
    };
    bool smallRowRejected = false;
    try {
        frame.AddRow(smallRow);
    } catch (Exception) {
        smallRowRejected = true;
    }
    Assert.IsTrue(smallRowRejected);

    // Try to insert a row with too many values
    Dictionary<string, object> bigRow = new Dictionary<string, object>() {
        { "Name", "Luke" },
        { "Height", 1.2 },
        { "Weight", 60.0 }
    };
    bool bigRowRejected = false;
    try {
        frame.AddRow(bigRow);
    } catch (Exception) {
        bigRowRejected = true;
    }
    Assert.IsTrue(bigRowRejected);

}
// Builds a small fixture frame: seven rows with string, double, double, and
// nullable-bool columns (including null names and a null "male" entry), plus a
// computed "bmi" column equal to weight divided by height squared.
private FrameTable GetTestFrame() {

    FrameTable fixture = new FrameTable();

    // Stored columns.
    fixture.AddColumn<string>("name");
    fixture.AddColumn<double>("height");
    fixture.AddColumn<double>("weight");
    fixture.AddColumn<bool?>("male");

    // Rows are given in (name, height, weight, male) order.
    fixture.AddRow("a", 7.0, 10.0, false);
    fixture.AddRow(null, 6.5, 11.0, true);
    fixture.AddRow("c", 6.0, 12.0, false);
    fixture.AddRow("d", 5.5, 11.0, true);
    fixture.AddRow("e", 5.0, 12.0, null);
    fixture.AddRow("f", 4.5, 13.0, true);
    fixture.AddRow(null, 4.0, 12.0, false);

    // Derived column, evaluated from each row's weight and height.
    fixture.AddComputedColumn("bmi", r => {
        double weight = (double)r["weight"];
        double height = (double)r["height"];
        return weight / MoreMath.Sqr(height);
    });

    return fixture;

}
// Example: builds a table of generated people records (plus two hand-written rows
// containing nulls) and writes it to "example.csv" and "example.json" in the
// working directory.
public static void ConstructExampleData() {

    FrameTable table = new FrameTable();
    table.AddColumn<int>("Id");
    table.AddColumn<string>("Name");
    table.AddColumn<string>("Sex");
    table.AddColumn<DateTime>("Birthdate");
    table.AddColumn<double>("Height");
    table.AddColumn<double?>("Weight");
    table.AddColumn<bool>("Result");

    // A fixed seed is used for the generated rows.
    Random rng = new Random(1000001);

    string[] maleNames = new string[] {
        "Alex", "Chris", "David", "Eric", "Frederic", "George", "Hans", "Igor", "John",
        "Kevin", "Luke", "Mark", "Oscar", "Peter", "Richard", "Stephan", "Thomas", "Vincent"
    };
    AddRows(table, maleNames, "M", 175.0, 12.0, 24.0, 3.0, 1, rng);

    string[] femaleNames = new string[] {
        "Anne", "Belle", "Dorothy", "Elizabeth", "Fiona", "Helen", "Julia", "Kate",
        "Louise", "Mary", "Natalie", "Olivia", "Ruth", "Sarah", "Theresa", "Viola"
    };
    AddRows(table, femaleNames, "F", 160.0, 10.0, 24.0, 3.0, 0, rng);

    // add rows with nulls
    table.AddRow(table.Rows.Count, null, "M", DateTime.Parse("1970-07-27"), 183.0, 74.0, false);
    table.AddRow(table.Rows.Count, "Zoey", "F", DateTime.Parse("2007-09-17"), 138.0, null, false);

    // File.CreateText creates or truncates the file. The previous File.OpenWrite
    // does not truncate, so writing over a longer existing example.csv would leave
    // stale bytes at the end of the file.
    string path = @"example.csv";
    using (StreamWriter writer = File.CreateText(path)) {
        table.ToCsv(writer);
    }
    Console.WriteLine(File.Exists(path));

    string json = JsonConvert.SerializeObject(table.ToDictionaries(), Formatting.Indented);
    File.WriteAllText("example.json", json);

}
// Writes a frame with string, nullable double, int, nullable DateTime, TimeSpan, and
// nullable bool columns to CSV, reads it back, and checks that column names, storage
// types, and every cell (including nulls) match the original.
public void FrameTableCsvRoundtrip2() {

    // Let's exercise all our data adaptors
    FrameTable original = new FrameTable();
    original.AddColumn<string>("String");
    original.AddColumn<double?>("Double?");
    original.AddColumn<int>("Int");
    original.AddColumn<DateTime?>("DateTime?");
    original.AddColumn<TimeSpan>("TimeSpan");
    original.AddColumn<Boolean?>("Boolean?");

    original.AddRow("z", null, 1, DateTime.Today, TimeSpan.FromMinutes(5.0), true);
    original.AddRow("y", 4.3, 2, null, TimeSpan.FromHours(4.0), null);
    original.AddRow("x", 2.0, 3, DateTime.UtcNow.Date, TimeSpan.FromDays(3.0), false);

    // Round-trip through an in-memory CSV document.
    FrameTable copy;
    using (TextWriter storage = new StringWriter()) {
        original.ToCsv(storage);
        using (TextReader reader = new StringReader(storage.ToString())) {
            copy = FrameTable.FromCsv(reader);
        }
    }

    for (int i = 0; i < original.Columns.Count; i++) {
        Assert.IsTrue(original.Columns[i].Name == copy.Columns[i].Name);
        Assert.IsTrue(original.Columns[i].StorageType == copy.Columns[i].StorageType);
    }

    for (int i = 0; i < original.Rows.Count; i++) {
        for (int j = 0; j < original.Columns.Count; j++) {
            // object == object is a reference check, which fails for equal boxed
            // structures, and instance Equals fails on a null receiver. The static
            // Object.Equals handles both: it is true when both cells are null,
            // false when exactly one is null, and otherwise defers to the virtual
            // Equals. This also verifies that null cells in the original are null
            // in the copy, which the previous null branch (it re-tested the
            // original against null) never did.
            object expected = original.Rows[i][j];
            object actual = copy.Rows[i][j];
            Assert.IsTrue(Object.Equals(expected, actual));
        }
    }

}
// Scatters 100 points with unit normal noise around four known centers, clusters
// them via the data-frame API, and checks that every fitted centroid lies within
// distance 1.0 of some known center and that at least half of the points are
// classified into the cluster they were generated from.
public void MeansClustering2() {

    ColumnVector[] centers = new ColumnVector[] {
        new ColumnVector(0.0, 0.0, 0.0),
        new ColumnVector(2.0, 0.0, 0.0),
        new ColumnVector(0.0, 2.0, 0.0),
        new ColumnVector(0.0, 0.0, 2.0)
    };

    // One double column per coordinate, named by successive letters.
    string alphabet = "abcdefghijklmnopqrstuvwxyz";
    FrameTable table = new FrameTable();
    for (int column = 0; column < 3; column++) {
        table.AddColumn<double>(alphabet[column].ToString());
    }

    // Generate the points, remembering which center each one came from.
    List<int> trueClusters = new List<int>();
    List<ColumnVector> points = new List<ColumnVector>();
    Random rng = new Random(2);
    ContinuousDistribution noise = new NormalDistribution(0.0, 1.0);
    for (int pointIndex = 0; pointIndex < 100; pointIndex++) {
        int trueCluster = rng.Next(0, centers.Length);
        trueClusters.Add(trueCluster);

        ColumnVector point = centers[trueCluster].Copy();
        for (int k = 0; k < point.Dimension; k++) {
            point[k] += noise.GetRandomValue(rng);
        }
        points.Add(point);
        table.AddRow<double>(point);
    }

    MeansClusteringResult result = table.AsColumns<double>().MeansClustering(centers.Length);

    // Classify every generated point with the fitted clustering.
    List<int> fittedClusters = new List<int>();
    foreach (ColumnVector point in points) {
        fittedClusters.Add(result.Classify(point));
    }

    // Map the output centroids to the original centroids: each fitted centroid is
    // paired with the nearest known center (the first one wins on ties).
    Dictionary<int, int> fittedToTrue = new Dictionary<int, int>();
    for (int fittedIndex = 0; fittedIndex < result.Count; fittedIndex++) {
        ColumnVector centroid = result.Centroid(fittedIndex);

        int nearestIndex = -1;
        double nearestDistance = Double.MaxValue;
        for (int centerIndex = 0; centerIndex < centers.Length; centerIndex++) {
            double distance = (centroid - centers[centerIndex]).Norm();
            if (distance < nearestDistance) {
                nearestIndex = centerIndex;
                nearestDistance = distance;
            }
        }

        Assert.IsTrue(nearestIndex >= 0);
        Assert.IsTrue(nearestDistance < 1.0);
        fittedToTrue.Add(fittedIndex, nearestIndex);
    }

    // Count the points whose fitted cluster maps back to their generating center.
    int correctCount = 0;
    for (int pointIndex = 0; pointIndex < fittedClusters.Count; pointIndex++) {
        int mappedCluster = fittedToTrue[fittedClusters[pointIndex]];
        if (mappedCluster == trueClusters[pointIndex]) {
            correctCount++;
        }
    }
    Assert.IsTrue(correctCount >= 0.50 * fittedClusters.Count);

}
// Repeats a two-predictor logistic regression on 32 simulated samples of 32 points
// each and checks that the observed scatter of each fitted parameter (and of one
// prediction) is consistent with the mean uncertainty the fit reports for it.
public void MultivariateLinearLogisticRegressionVariances() {

    // define model: logit(P(y)) = a + b0 * x0 + b1 * x1
    double a = -3.0;
    double b0 = 2.0;
    double b1 = 1.0;
    ContinuousDistribution x0distribution = new ExponentialDistribution();
    ContinuousDistribution x1distribution = new LognormalDistribution();

    // One row per fit: each estimate next to its reported uncertainty.
    FrameTable data = new FrameTable();
    data.AddColumns<double>("a", "da", "b0", "db0", "b1", "db1", "p", "dp");

    // draw a sample from the model
    Random rng = new Random(2);
    for (int j = 0; j < 32; j++) {

        // The sample is held in plain lists; the per-iteration FrameTable that
        // used to be built here was never read and has been removed.
        List<double> x0s = new List<double>();
        List<double> x1s = new List<double>();
        List<bool> ys = new List<bool>();

        for (int i = 0; i < 32; i++) {
            double x0 = x0distribution.GetRandomValue(rng);
            double x1 = x1distribution.GetRandomValue(rng);
            double t = a + b0 * x0 + b1 * x1;
            double p = 1.0 / (1.0 + Math.Exp(-t));
            bool y = (rng.NextDouble() < p);
            x0s.Add(x0);
            x1s.Add(x1);
            ys.Add(y);
        }

        // do a logistic regression fit on the model
        MultiLinearLogisticRegressionResult result = ys.MultiLinearLogisticRegression(
            new Dictionary<string, IReadOnlyList<double>> {
                { "x0", x0s },
                { "x1", x1s }
            }
        );
        UncertainValue pp = result.Predict(0.0, 1.0);

        data.AddRow(
            result.Intercept.Value, result.Intercept.Uncertainty,
            result.CoefficientOf("x0").Value, result.CoefficientOf("x0").Uncertainty,
            result.CoefficientOf("x1").Value, result.CoefficientOf("x1").Uncertainty,
            pp.Value, pp.Uncertainty
        );

    }

    // The estimated parameters should agree with the model that generated the data.
    // NOTE(review): no assertion currently backs the statement above; only the
    // variance checks below are performed.

    // The variances of the estimates should agree with the claimed variances
    Assert.IsTrue(data["a"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["da"].As<double>().Mean()));
    Assert.IsTrue(data["b0"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db0"].As<double>().Mean()));
    Assert.IsTrue(data["b1"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db1"].As<double>().Mean()));
    Assert.IsTrue(data["p"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As<double>().Mean()));

}
// Exercises the main FrameTable mutation paths: adding columns and rows, rejecting
// a short row and a column added without values to a non-empty table, adding a
// column from a list, adding a computed column that tracks cell changes, and
// clearing the table.
public void FrameTableManipulation() {

    FrameTable table = new FrameTable();
    table.AddColumn<int>("Id");
    table.AddColumn<DateTime?>("Birthdate");
    table.AddColumns<string>("FirstName", "LastName");
    Assert.IsTrue(table.Columns.Count == 4);

    // Index lookup should work
    Assert.IsTrue(table.GetColumnIndex("Birthdate") >= 0);
    Assert.IsTrue(table.GetColumnIndex("None") < 0);

    // Add rows
    Assert.IsTrue(table.Rows.Count == 0);
    table.AddRow(1, DateTime.Parse("1990-01-01"), "a", "p");
    table.AddRow(2, DateTime.Parse("2000-02-02"), null, null);
    table.AddRow(new Dictionary<string, object>() {
        { "Id", 3 },
        { "Birthdate", null },
        { "FirstName", "c" },
        { "LastName", "r" }
    });
    Assert.IsTrue(table.Rows.Count == 3);

    // Adding rows with the wrong types and/or entries should fail
    // Careful, some of these will leave the table in a bad state
    //try {
    //    table.AddRow(4, DateTime.Parse("2010-04-04"), 1.0, "s");
    //    Assert.Fail();
    //} catch (Exception) { }

    // Outcomes of "should throw" checks are recorded in a flag and asserted
    // outside the try block; Assert.Fail() inside the try would be swallowed by
    // catch (Exception), so the check could never fail.
    bool shortRowRejected = false;
    try {
        table.AddRow(4, DateTime.Parse("2010-04-04"));
    } catch (Exception) {
        shortRowRejected = true;
    }
    Assert.IsTrue(shortRowRejected);

    //try {
    //    table.AddRow(new Dictionary<string, object>() {
    //        {"Id", 4}, { "FirstName", "d" }, { "LastName", "r" }
    //    });
    //    Assert.Fail();
    //} catch (Exception) { }
    //try {
    //    table.AddRow(new Dictionary<string, object>() {
    //        {"Id", 4}, { "Birthdate", null }, { "FirstName", "d" }, { "LastName", "r" }, { "MiddleName", "u" }
    //    });
    //    Assert.Fail();
    //} catch (Exception) { }

    // Adding a new column with the wrong length should fail
    bool emptyColumnRejected = false;
    try {
        table.AddColumn<double>("Score");
    } catch (Exception) {
        emptyColumnRejected = true;
    }
    Assert.IsTrue(emptyColumnRejected);
    Assert.IsTrue(table.GetColumnIndex("Score") < 0);

    // Adding a new column with the right length should work
    List<double> scores = new List<double>() { 1.1, 1.2, 1.3 };
    table.AddColumn("Score", scores);
    Assert.IsTrue(table.GetColumnIndex("Score") >= 0);

    // Adding a new computed column should work
    table.AddComputedColumn<TimeSpan?>("Age", r => {
        DateTime? b = (DateTime?)r["Birthdate"];
        if (b.HasValue) {
            return (DateTime.Now - b.Value);
        } else {
            return (null);
        }
    });
    Assert.IsTrue(table.GetColumnIndex("Age") >= 0);

    // Changing a value should change the result of the computed column that depends on it
    int birthdateIndex = table.GetColumnIndex("Birthdate");
    int ageIndex = table.GetColumnIndex("Age");
    TimeSpan age1 = (TimeSpan)table[0, ageIndex];
    table[0, birthdateIndex] = DateTime.Parse("2010-01-01");
    TimeSpan age2 = (TimeSpan)table[0, ageIndex];
    Assert.IsTrue(age2 != age1);

    // Clearing a table should work: rows go away, columns stay
    table.Clear();
    Assert.IsTrue(table.Columns.Count > 0);
    Assert.IsTrue(table.Rows.Count == 0);

}
// Builds a group-by-outcome contingency table from 512 simulated rows containing
// both null strings and null nullable-bools, and checks its totals, cell counts,
// and inferred probabilities against the frame and the generating model.
public void ContingencyTableProbabilities() {

    // Construct data where (i) there are both reference-nulls and nullable-struct-nulls,
    // (ii) all values of one column are equally likely, and (iii) values of the other
    // column depend on the value of the first column.
    List<string> groups = new List<string>() { "A", "B", "C", null };

    FrameTable data = new FrameTable();
    data.AddColumn<string>("Group");
    data.AddColumn<bool?>("Outcome");

    int n = 512;
    double pOutcomeNull = 0.05;
    Func<int, double> pOutcome = groupIndex => 0.8 - 0.2 * groupIndex;
    Random rng = new Random(10101010);
    for (int i = 0; i < n; i++) {
        int groupIndex = rng.Next(0, groups.Count);
        string group = groups[groupIndex];
        // Both draws are always made, in this order: first the outcome, then
        // whether that outcome is replaced by null.
        bool drawnOutcome = (rng.NextDouble() < pOutcome(groupIndex));
        bool isMissing = (rng.NextDouble() < pOutcomeNull);
        bool? outcome = isMissing ? (bool?)null : drawnOutcome;
        data.AddRow(group, outcome);
    }

    // Form a contingency table.
    var groupColumn = data["Group"].As<string>();
    var outcomeColumn = data["Outcome"].As<bool?>();
    ContingencyTable<string, bool?> table = Bivariate.Crosstabs(groupColumn, outcomeColumn);

    // Total counts should match
    Assert.IsTrue(table.Total == n);

    // Every row value in the table should be one of the generated groups
    foreach (string rowValue in table.Rows) {
        Assert.IsTrue(groups.Contains(rowValue));
    }

    // Counts in each cell and marginal totals should match
    foreach (string rowValue in table.Rows) {
        int rowTotal = 0;
        foreach (bool? columnValue in table.Columns) {
            FrameView matching = data.Where(
                r => ((string)r["Group"] == rowValue) && ((bool?)r["Outcome"] == columnValue)
            );
            int matchingCount = matching.Rows.Count;
            Assert.IsTrue(table[rowValue, columnValue] == matchingCount);
            rowTotal += matchingCount;
        }
        Assert.IsTrue(rowTotal == table.RowTotal(rowValue));
    }

    // Inferred probabilities should agree with model
    Assert.IsTrue(table.ProbabilityOfColumn(null).ConfidenceInterval(0.99).ClosedContains(pOutcomeNull));
    for (int groupIndex = 0; groupIndex < groups.Count; groupIndex++) {
        string group = groups[groupIndex];

        // Each of the four groups is drawn with equal probability.
        Assert.IsTrue(table.ProbabilityOfRow(group).ConfidenceInterval(0.99).ClosedContains(0.25));

        // A true outcome requires drawing true and then not being replaced by null.
        double expectedTrue = pOutcome(groupIndex) * (1.0 - pOutcomeNull);
        Assert.IsTrue(table.ProbabilityOfColumnConditionalOnRow(true, group).ConfidenceInterval(0.99).ClosedContains(expectedTrue));
    }
    // Same check as the one made before the loop.
    Assert.IsTrue(table.ProbabilityOfColumn(null).ConfidenceInterval(0.99).ClosedContains(pOutcomeNull));

    // Pearson test should catch that rows and columns are correlated
    Assert.IsTrue(table.PearsonChiSquaredTest().Probability < 0.05);

}