// Appends one synthetic subject row to the table for each entry in names.
// Height and BMI are each computed as mean + stddev * z, with z drawn from a default-constructed
// NormalDistribution; weight is (height / 100)^2 * bmi; the boolean result is a Bernoulli draw
// whose log-odds are -0.4 + 0.6 * zBmi + 0.8 * flag; the birthdate is the current date minus an
// age drawn uniformly from [15, 75] years.
// The order of draws from rng (height, BMI, outcome, age) fixes the generated values for a given seed.
private static void AddRows(FrameTable table, IReadOnlyList<string> names, string sex, double meanHeight, double stddevHeight, double meanBmi, double stddevBmi, int flag, Random rng)
{
    NormalDistribution standardNormal = new NormalDistribution();
    UniformDistribution ageInYears = new UniformDistribution(Interval.FromEndpoints(15.0, 75.0));

    for (int index = 0; index < names.Count; index++)
    {
        string name = names[index];

        // Height and BMI deviates.
        double heightDeviate = standardNormal.GetRandomValue(rng);
        double height = meanHeight + stddevHeight * heightDeviate;
        double bmiDeviate = standardNormal.GetRandomValue(rng);
        double bmi = meanBmi + stddevBmi * bmiDeviate;
        double weight = MoreMath.Sqr(height / 100.0) * bmi;

        // Logistic outcome driven by the BMI deviate and the group flag.
        double logOdds = -0.4 + 0.6 * bmiDeviate + 0.8 * flag;
        double probability = 1.0 / (1.0 + Math.Exp(-logOdds));
        bool outcome = rng.NextDouble() < probability;

        // The id is the zero-based position the new row will occupy.
        int id = table.Rows.Count;

        TimeSpan age = TimeSpan.FromDays(365.24 * ageInYears.GetRandomValue(rng));
        DateTime birthdate = DateTime.Now.Subtract(age).Date;

        table.AddRow(id, name, sex, birthdate, height, weight, outcome);
    }
}
// Draws 100 observations from a two-predictor logistic model, fits it through the
// FrameTable column API, and checks that the fitted parameters are compatible with
// the model and that in-sample predictions are right more often than not.
public void MultivariateLinearLogisticRegressionSimple()
{
    // define model: P(y = true) = 1 / (1 + exp(-(a + b0 * x0 + b1 * x1)))
    double a = 1.0;
    double b0 = -1.0 / 2.0;
    double b1 = 1.0 / 3.0;
    ContinuousDistribution x0distribution = new LaplaceDistribution();
    ContinuousDistribution x1distribution = new NormalDistribution();

    // draw a sample from the model, recording it both in the older sample type and in a frame table
    Random rng = new Random(1);
    MultivariateSample old = new MultivariateSample("y", "x0", "x1");
    FrameTable table = new FrameTable();
    table.AddColumn<double>("x0");
    table.AddColumn<double>("x1");
    table.AddColumn<bool>("y");

    for (int i = 0; i < 100; i++)
    {
        double x0 = x0distribution.GetRandomValue(rng);
        double x1 = x1distribution.GetRandomValue(rng);
        double t = a + b0 * x0 + b1 * x1;
        double p = 1.0 / (1.0 + Math.Exp(-t));
        bool y = (rng.NextDouble() < p);
        old.Add(y ? 1.0 : 0.0, x0, x1);
        table.AddRow(x0, x1, y);
    }

    // do a logistic regression fit on the sample
    // (oldResult is computed but not asserted on; the call itself is still exercised)
    MultiLinearLogisticRegressionResult oldResult = old.LogisticLinearRegression(0);
    MultiLinearLogisticRegressionResult newResult = table["y"].As<bool>().MultiLinearLogisticRegression(
        table["x0"].As<double>(), table["x1"].As<double>()
    );

    // the result should have the appropriate dimension
    Assert.IsTrue(newResult.Parameters.Count == 3);

    // The parameters should match the model
    Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
    Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
    Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

    // Our predictions should be better than chance.
    int correct = 0;
    for (int i = 0; i < table.Rows.Count; i++)
    {
        FrameRow row = table.Rows[i];
        double x0 = (double)row["x0"];
        double x1 = (double)row["x1"];
        double p = newResult.Predict(x0, x1).Value;
        bool y = (bool)row["y"];

        // Short-circuit && on both sides; the original mixed in a non-short-circuit &.
        if ((y && p > 0.5) || (!y && p < 0.5))
        {
            correct++;
        }
    }
    Assert.IsTrue(correct > 0.5 * table.Rows.Count);
}
// Reads the CSV file named by csvFileName into a frame, writes the frame back out
// to a temporary file, and requires the two files to have the same MD5 hash.
public void FrameTableCsvRoundtrip()
{
    FrameTable loaded;
    using (TextReader source = File.OpenText(csvFileName))
    {
        loaded = FrameTable.FromCsv(source);
    }

    // The import must have produced a non-empty table.
    Assert.IsTrue(loaded != null);
    Assert.IsTrue(loaded.Columns.Count > 0);
    Assert.IsTrue(loaded.Rows.Count > 0);

    string temporaryPath = Path.GetTempFileName();
    try
    {
        using (FileStream output = File.OpenWrite(temporaryPath))
        using (TextWriter sink = new StreamWriter(output))
        {
            loaded.ToCsv(sink);
        }

        // Hash the input first, then the output, and require them to be equal.
        Assert.IsTrue(ComputeMD5Hash(csvFileName) == ComputeMD5Hash(temporaryPath));
    }
    finally
    {
        // Always clean up the temporary file, even when an assertion fails.
        File.Delete(temporaryPath);
    }
}
// Repeats a small two-predictor linear regression 64 times and checks that the
// scatter of the fitted parameters (and of one prediction) across repetitions is
// compatible with the uncertainties that each fit reports.
public void MultivariateLinearRegressionVariances()
{
    // Model: y = a + b0 * x0 + b1 * x1 + noise
    double a = -3.0;
    double b0 = 2.0;
    double b1 = -1.0;
    ContinuousDistribution x0distribution = new LaplaceDistribution();
    ContinuousDistribution x1distribution = new CauchyDistribution();
    ContinuousDistribution eDistribution = new NormalDistribution(0.0, 4.0);

    // One row of fit results per repetition.
    FrameTable data = new FrameTable();
    data.AddColumns<double>("a", "da", "b0", "db0", "b1", "db1", "ab1Cov", "p", "dp");

    Random rng = new Random(4);
    for (int trial = 0; trial < 64; trial++)
    {
        // Draw a 16-point sample from the model.
        List<double> x0s = new List<double>();
        List<double> x1s = new List<double>();
        List<double> ys = new List<double>();
        for (int point = 0; point < 16; point++)
        {
            double x0 = x0distribution.GetRandomValue(rng);
            double x1 = x1distribution.GetRandomValue(rng);
            double e = eDistribution.GetRandomValue(rng);
            x0s.Add(x0);
            x1s.Add(x1);
            ys.Add(a + b0 * x0 + b1 * x1 + e);
        }

        // Fit, passing the predictors by name.
        Dictionary<string, IReadOnlyList<double>> predictors = new Dictionary<string, IReadOnlyList<double>>();
        predictors.Add("x0", x0s);
        predictors.Add("x1", x1s);
        MultiLinearRegressionResult result = ys.MultiLinearRegression(predictors);

        // Record the parameter estimates, their claimed uncertainties, and one prediction.
        UncertainValue intercept = result.Intercept;
        UncertainValue coefficient0 = result.CoefficientOf("x0");
        UncertainValue coefficient1 = result.CoefficientOf("x1");
        double interceptCoefficient1Covariance = result.Parameters.CovarianceOf("Intercept", "x1");
        UncertainValue prediction = result.Predict(-5.0, 6.0);

        data.AddRow(
            intercept.Value, intercept.Uncertainty,
            coefficient0.Value, coefficient0.Uncertainty,
            coefficient1.Value, coefficient1.Uncertainty,
            interceptCoefficient1Covariance,
            prediction.Value, prediction.Uncertainty
        );
    }

    // The observed spread of each estimate should agree with the claimed uncertainty.
    Assert.IsTrue(data["a"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["da"].As<double>().Mean()));
    Assert.IsTrue(data["b0"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db0"].As<double>().Mean()));
    Assert.IsTrue(data["b1"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db1"].As<double>().Mean()));

    // Likewise for the intercept/b1 covariance.
    Assert.IsTrue(data["a"].As<double>().PopulationCovariance(data["b1"].As<double>()).ConfidenceInterval(0.99).ClosedContains(data["ab1Cov"].As<double>().Mean()));

    // The prediction check compares against the median (not the mean) of the claimed uncertainties.
    Assert.IsTrue(data["p"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As<double>().Median()));
}
// Builds the example data set (synthetic male and female subjects plus two rows
// containing nulls) and writes it to example.csv and example.json in the current directory.
public static void ConstructExampleData()
{
    // Define the schema. Weight is nullable so that a row with a missing weight can be added below.
    FrameTable table = new FrameTable();
    table.AddColumn<int>("Id");
    table.AddColumn<string>("Name");
    table.AddColumn<string>("Sex");
    table.AddColumn<DateTime>("Birthdate");
    table.AddColumn<double>("Height");
    table.AddColumn<double?>("Weight");
    table.AddColumn<bool>("Result");

    // A fixed seed keeps the random draws reproducible.
    Random rng = new Random(1000001);

    string[] maleNames = new string[] { "Alex", "Chris", "David", "Eric", "Frederic", "George", "Hans", "Igor", "John", "Kevin", "Luke", "Mark", "Oscar", "Peter", "Richard", "Stephan", "Thomas", "Vincent" };
    AddRows(table, maleNames, "M", 175.0, 12.0, 24.0, 3.0, 1, rng);

    string[] femaleNames = new string[] { "Anne", "Belle", "Dorothy", "Elizabeth", "Fiona", "Helen", "Julia", "Kate", "Louise", "Mary", "Natalie", "Olivia", "Ruth", "Sarah", "Theresa", "Viola" };
    AddRows(table, femaleNames, "F", 160.0, 10.0, 24.0, 3.0, 0, rng);

    // add rows with nulls
    table.AddRow(table.Rows.Count, null, "M", DateTime.Parse("1970-07-27"), 183.0, 74.0, false);
    table.AddRow(table.Rows.Count, "Zoey", "F", DateTime.Parse("2007-09-17"), 138.0, null, false);

    // File.CreateText truncates an existing file. File.OpenWrite does not, so overwriting a
    // longer example.csv from an earlier run would leave stale bytes after the new content.
    string path = @"example.csv";
    using (StreamWriter writer = File.CreateText(path))
    {
        table.ToCsv(writer);
    }
    Console.WriteLine(File.Exists(path));

    string json = JsonConvert.SerializeObject(table.ToDictionaries(), Formatting.Indented);
    File.WriteAllText("example.json", json);
}
// Converts a small frame to a list of dictionaries and back, and checks that the
// row/column counts, column name, column storage type and a null cell survive the trip.
public void FrameTableDictionariesRoundtrip()
{
    FrameTable frame = new FrameTable();
    frame.AddColumn<string>("name");
    frame.AddColumn<double>("height");
    frame.AddColumn<bool?>("male");

    frame.AddRow("a", 5.0, false);
    frame.AddRow("b", 6.0, true);
    frame.AddRow("c", 5.5, null);

    List<Dictionary<string, object>> dictionaries = frame.ToDictionaries().ToList();

    // One dictionary per row, one entry per column.
    Assert.IsTrue(dictionaries.Count == frame.Rows.Count);
    Assert.IsTrue(dictionaries[0].Count == frame.Columns.Count);

    FrameTable frame2 = FrameTable.FromDictionaries(dictionaries);

    Assert.IsTrue(frame2.Rows.Count == frame.Rows.Count);
    Assert.IsTrue(frame2.Columns.Count == frame.Columns.Count);
    Assert.IsTrue(frame2.Columns[0].Name == frame.Columns[0].Name);
    Assert.IsTrue(frame2.Columns[1].StorageType == frame.Columns[1].StorageType);

    // Compare the round-tripped cell against the ORIGINAL frame; comparing frame2 with itself
    // could never fail. object.Equals is null-safe and compares boxed values by value.
    Assert.IsTrue(object.Equals(frame2.Rows[2]["male"], frame.Rows[2]["male"]));
}
// Generates 24 boolean outcomes from a one-predictor logistic model, stores them as
// strings, and checks that the fit recovers the intercept and slope at 99% confidence.
public void LinearLogisticRegressionSimple()
{
    // Log-odds as a function of x: -1 + 2 x.
    Polynomial logOdds = Polynomial.FromCoefficients(-1.0, 2.0);

    // Outcomes are stored as text so that the fit has to go through a converting column view.
    FrameTable table = new FrameTable();
    table.AddColumn<double>("x");
    table.AddColumn<string>("z");

    Random rng = new Random(2);
    ContinuousDistribution xDistribution = new CauchyDistribution(4.0, 2.0);
    for (int n = 0; n < 24; n++)
    {
        double x = xDistribution.GetRandomValue(rng);
        double t = logOdds.Evaluate(x);
        double probability = 1.0 / (1.0 + Math.Exp(-t));
        bool outcome = rng.NextDouble() < probability;
        table.AddRow(x, outcome.ToString());
    }

    var outcomes = table["z"].As((string s) => Boolean.Parse(s));
    var predictors = table["x"].As<double>();
    LinearLogisticRegressionResult fit = outcomes.LinearLogisticRegression(predictors);

    Assert.IsTrue(fit.Intercept.ConfidenceInterval(0.99).ClosedContains(logOdds.Coefficient(0)));
    Assert.IsTrue(fit.Slope.ConfidenceInterval(0.99).ClosedContains(logOdds.Coefficient(1)));
}
// Groups the test frame by its "male" column, summarizing height per group, and
// verifies each group's statistics against a direct Where-based computation.
public void FrameViewGroupBy()
{
    FrameView original = GetTestFrame();

    // The distinct group keys we expect to see (null is a legitimate key).
    HashSet<bool?> values = new HashSet<bool?>(original["male"].As<bool?>().Distinct());

    FrameTable grouped = original.GroupBy("male", v =>
    {
        SummaryStatistics summary = new SummaryStatistics(v["height"].As<double>());
        Dictionary<string, object> aggregates = new Dictionary<string, object>();
        aggregates.Add("count", summary.Count);
        aggregates.Add("heightMean", summary.Mean);
        aggregates.Add("heightStandardDeviation", summary.StandardDeviation);
        return aggregates;
    });

    // One row per distinct key; the key column plus three aggregate columns.
    Assert.IsTrue(grouped.Rows.Count == values.Count);
    Assert.IsTrue(grouped.Columns.Count == 4);

    for (int g = 0; g < grouped.Rows.Count; g++)
    {
        bool? value = grouped["male"].As<bool?>()[g];
        Assert.IsTrue(values.Contains(value));

        // Recompute the group's statistics directly from the matching rows.
        FrameView selected = original.Where(r => (bool?)r["male"] == value);
        Assert.IsTrue(selected.Rows.Count > 0);

        double mean = selected["height"].As<double>().Mean();
        double groupedMean = grouped["heightMean"].As<double>()[g];
        Assert.IsTrue(TestUtilities.IsNearlyEqual(groupedMean, mean));

        double standardDeviation = selected["height"].As<double>().StandardDeviation();
        double groupedStandardDeviation = grouped["heightStandardDeviation"].As<double>()[g];
        Assert.IsTrue(TestUtilities.IsNearlyEqual(groupedStandardDeviation, standardDeviation));
    }
}
// Checks that column views coerce between compatible storage types: a nullable double
// column read as double (failing on a null entry) and an int column read as double.
public void FrameViewColumnCoercion()
{
    // Create nullable double and integer columns.
    FrameTable table = new FrameTable();
    table.AddColumn<double?>("one");
    table.AddColumn<int>("two");

    table.AddRow(1.1, 2);
    table.AddRow(null, 3);

    // Coerce the nullable double into a non-nullable double.
    // Should work when the value is non-null, and fail when the value is null.
    IReadOnlyList<double> one = table["one"].As<double>();
    Assert.IsTrue(one[0] == 1.1);

    // Record whether the access threw and assert afterwards. Calling Assert.Fail() inside the
    // try block would be swallowed by catch (Exception), so the test could never fail.
    bool threw = false;
    try
    {
        double v = one[1];
    }
    catch (Exception)
    {
        threw = true;
    }
    Assert.IsTrue(threw);

    // Coerce the integer to a double.
    IReadOnlyList<double> two = table.Columns[1].As<double>();
    Assert.IsTrue(two[0] == 2.0);
}
// Parses a CSV whose cells are padded with spaces and tabs and checks that unquoted
// whitespace is dropped (empty cells read as null) while whitespace inside quotes is kept.
public void CsvWhitespaceParsing()
{
    // Header line followed by six data lines.
    StringBuilder text = new StringBuilder()
        .AppendLine(" c0 ,\tc1 ")
        .AppendLine(",")
        .AppendLine(" , ")
        .AppendLine(" ,\t")
        .AppendLine("\t\" \" , \"\t\" ")
        .AppendLine(" \" a\t\"\t,\t\"\t a \"")
        .AppendLine("\t \" \"\"\", \" , \" ");

    FrameTable table = FrameTable.FromCsv(new StringReader(text.ToString()));

    // Header names are trimmed.
    Assert.IsTrue(table.Columns[0].Name == "c0");
    Assert.IsTrue(table.Columns[1].Name == "c1");

    // Expected cell contents, row by row.
    string[][] expected = new string[][]
    {
        new string[] { null, null },
        new string[] { null, null },
        new string[] { null, null },
        new string[] { " ", "\t" },
        new string[] { " a\t", "\t a " },
        new string[] { " \"", " , " }
    };

    for (int row = 0; row < expected.Length; row++)
    {
        for (int column = 0; column < expected[row].Length; column++)
        {
            Assert.IsTrue((string)table.Rows[row][column] == expected[row][column]);
        }
    }
}
// Downloads a weight/height data set and runs a range of summaries, tests and
// regressions on it. Nothing is asserted; the test only requires that the calls complete.
public void InternetSampleDownload()
{
    Uri source = new Uri("https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_udemy/master/data/weight-height.csv");
    FrameTable table = DownloadFrameTable(source);

    // Drop rows with nulls, then add a computed Bmi column (weight over height squared).
    FrameView view = table.WhereNotNull();
    view.AddComputedColumn("Bmi", (FrameRow r) =>
    {
        double height = (double)r["Height"];
        double weight = (double)r["Weight"];
        return weight / (height * height);
    });

    FrameView males = view.Where("Gender", (string s) => s == "Male");
    FrameView females = view.Where("Gender", (string s) => s == "Female");

    // Per-sex height summaries.
    SummaryStatistics maleSummary = new SummaryStatistics(males["Height"].As<double>());
    SummaryStatistics femaleSummary = new SummaryStatistics(females["Height"].As<double>());

    // Normality tests on height: everyone, then each sex.
    TestResult allNormal = view["Height"].As<double>().ShapiroFranciaTest();
    TestResult maleNormal = males["Height"].As<double>().ShapiroFranciaTest();
    TestResult femaleNormal = females["Height"].As<double>().ShapiroFranciaTest();

    // Two-sample comparisons of male and female heights.
    TestResult tTest = Univariate.StudentTTest(
        males["Height"].As<double>(),
        females["Height"].As<double>());
    TestResult mwTest = Univariate.MannWhitneyTest(
        males["Height"].As<double>(),
        females["Height"].As<double>());

    // Regressions between male weight and height.
    LinearRegressionResult result0 = males["Weight"].As<double>().LinearRegression(males["Height"].As<double>());
    PolynomialRegressionResult result1 = males["Height"].As<double>().PolynomialRegression(males["Weight"].As<double>(), 1);
    PolynomialRegressionResult result2 = males["Height"].As<double>().PolynomialRegression(males["Weight"].As<double>(), 2);
    PolynomialRegressionResult result3 = males["Height"].As<double>().PolynomialRegression(males["Weight"].As<double>(), 3);

    //MultiLinearRegressionResult multi = view["Weight"].As<double>().MultiLinearRegression(view["Height"].As<double>(), view["Gender"].As<string>().Select(s => (s == "Male") ? 1.0 : 0.0).ToList());
}
// Fits a Wald distribution to 128 small samples and checks that (a) each result is
// internally consistent and (b) across fits, the estimates center on the true parameters
// and scatter in agreement with the reported variances and covariance.
public void WaldFit()
{
    WaldDistribution wald = new WaldDistribution(3.5, 2.5);

    FrameTable results = new FrameTable();
    results.AddColumns<double>("Mean", "Shape", "MeanVariance", "ShapeVariance", "MeanShapeCovariance");

    for (int seed = 0; seed < 128; seed++)
    {
        Sample sample = SampleTest.CreateSample(wald, 16, seed);
        WaldFitResult result = WaldDistribution.FitToSample(sample);

        // The named properties must agree exactly with the parameter collection entries.
        Assert.IsTrue(result.Mean.Value == result.Parameters.ValuesVector[result.Parameters.IndexOf("Mean")]);
        Assert.IsTrue(result.Shape.Value == result.Parameters.ValuesVector[result.Parameters.IndexOf("Shape")]);

        // The reported uncertainties must be the square roots of the reported variances.
        Assert.IsTrue(TestUtilities.IsNearlyEqual(result.Parameters.VarianceOf("Mean"), MoreMath.Sqr(result.Mean.Uncertainty)));
        Assert.IsTrue(TestUtilities.IsNearlyEqual(result.Parameters.VarianceOf("Shape"), MoreMath.Sqr(result.Shape.Uncertainty)));

        double meanVariance = result.Parameters.VarianceOf("Mean");
        double shapeVariance = result.Parameters.VarianceOf("Shape");
        double meanShapeCovariance = result.Parameters.CovarianceOf("Mean", "Shape");
        results.AddRow(result.Mean.Value, result.Shape.Value, meanVariance, shapeVariance, meanShapeCovariance);
    }

    // Estimates should center on the true parameter values.
    Assert.IsTrue(results["Mean"].As<double>().PopulationMean().ConfidenceInterval(0.99).ClosedContains(wald.Mean));
    Assert.IsTrue(results["Shape"].As<double>().PopulationMean().ConfidenceInterval(0.99).ClosedContains(wald.Shape));

    // Observed variances and covariance should be compatible with the median reported values.
    Assert.IsTrue(results["Mean"].As<double>().PopulationVariance().ConfidenceInterval(0.99).ClosedContains(results["MeanVariance"].As<double>().Median()));
    Assert.IsTrue(results["Shape"].As<double>().PopulationVariance().ConfidenceInterval(0.99).ClosedContains(results["ShapeVariance"].As<double>().Median()));
    Assert.IsTrue(results["Mean"].As<double>().PopulationCovariance(results["Shape"].As<double>()).ConfidenceInterval(0.99).ClosedContains(results["MeanShapeCovariance"].As<double>().Median()));
}
// Downloads a sea-ice time series and runs a power spectrum, a lag-12 autocovariance
// and a Ljung-Box test on its "Arctic" column. Nothing is asserted; the test only
// requires that the calls complete.
public void InternetTimeSeriesDownload()
{
    Uri source = new Uri("https://timeseries.weebly.com/uploads/2/1/0/8/21086414/sea_ice.csv");
    FrameTable table = DownloadFrameTable(source);

    double[] powerSpectrum = table["Arctic"].As<double>().PowerSpectrum();

    double v12 = table["Arctic"].As<double>().Autocovariance(12);

    TestResult lbTest = table["Arctic"].As<double>().LjungBoxTest();
}
// Fits AR(1) models to 128 short generated series and checks that the fitted parameters,
// their reported uncertainties, one reported covariance, and the goodness-of-fit
// probabilities are all statistically consistent with the generating model.
public void TimeSeriesFitAR1()
{
    double alpha = 0.3;
    double mu = 0.2;
    double sigma = 0.4;
    int n = 24;

    // For our fit to AR(1), we have incorporated bias correction (at least
    // for the most important parameter alpha), so we can do a small-n test.

    // Storing whole UncertainValue and SymmetricMatrix objects in columns is an onerous way to
    // keep values, but it lets us test how the data-frame machinery deals with
    // non-trivial storage types.
    FrameTable data = new FrameTable();
    data.AddColumn<UncertainValue>("mu");
    data.AddColumn<UncertainValue>("alpha");
    data.AddColumn<UncertainValue>("sigma");
    data.AddColumn<SymmetricMatrix>("covariance");
    data.AddColumn<double>("p");

    for (int trial = 0; trial < 128; trial++)
    {
        TimeSeries series = GenerateAR1TimeSeries(alpha, mu, sigma, n, n * trial + 271828);
        AR1FitResult fit = series.FitToAR1();
        data.AddRow(fit.Mu, fit.Alpha, fit.Sigma, fit.Parameters.CovarianceMatrix, fit.GoodnessOfFit.Probability);
    }

    data.AddComputedColumn("alphaValue", r => ((UncertainValue)r["alpha"]).Value);
    data.AddComputedColumn("muValue", r => ((UncertainValue)r["mu"]).Value);

    // Check that fit parameters agree with inputs
    Assert.IsTrue(data["mu"].As((UncertainValue v) => v.Value).PopulationMean().ConfidenceInterval(0.99).ClosedContains(mu));
    Assert.IsTrue(data["alpha"].As((UncertainValue v) => v.Value).PopulationMean().ConfidenceInterval(0.99).ClosedContains(alpha));
    Assert.IsTrue(data["sigma"].As((UncertainValue v) => v.Value).PopulationMean().ConfidenceInterval(0.99).ClosedContains(sigma));

    // Check that reported variances agree with actual variances.
    // For small n, the fitted alpha can vary considerably, and the formula for var(m) varies
    // quite strongly with alpha, so the computed var(m) have a very long tail. This pushes the
    // mean computed var(m) quite a bit higher than a typical value, so we use medians instead
    // of means for our best guess for the predicted variance.
    Assert.IsTrue(
        data["mu"].As((UncertainValue v) => v.Value).PopulationStandardDeviation().ConfidenceInterval(0.99)
            .ClosedContains(data["mu"].As((UncertainValue v) => v.Uncertainty).Median()));
    Assert.IsTrue(
        data["alpha"].As((UncertainValue v) => v.Value).PopulationStandardDeviation().ConfidenceInterval(0.99)
            .ClosedContains(data["alpha"].As((UncertainValue v) => v.Uncertainty).Median()));
    Assert.IsTrue(
        data["sigma"].As((UncertainValue v) => v.Value).PopulationStandardDeviation().ConfidenceInterval(0.99)
            .ClosedContains(data["sigma"].As((UncertainValue v) => v.Uncertainty).Median()));

    // Check that reported co-variances agree with actual co-variances
    Assert.IsTrue(
        data["mu"].As((UncertainValue v) => v.Value)
            .PopulationCovariance(data["alpha"].As((UncertainValue v) => v.Value))
            .ConfidenceInterval(0.99)
            .ClosedContains(data["covariance"].As((SymmetricMatrix c) => c[0, 1]).Median()));

    // The goodness-of-fit probabilities should be compatible with a uniform distribution.
    TestResult ks = data["p"].As<double>().KolmogorovSmirnovTest(new UniformDistribution());
    Assert.IsTrue(ks.Probability > 0.05);
}
// Example code showing the ways of getting data into a FrameTable: from a local CSV
// file, from a CSV web resource, from JSON via a list of dictionaries, and by defining
// a schema and adding rows by hand.
public static void ImportingData()
{
    // Import from a local CSV file.
    FrameTable data;
    using (TextReader reader = File.OpenText("test.csv"))
    {
        data = FrameTable.FromCsv(reader);
    }

    Console.WriteLine($"Imported CSV file with {data.Rows.Count} rows.");
    Console.WriteLine("The names and types of the columns are:");
    foreach (FrameColumn column in data.Columns)
    {
        Console.WriteLine($" {column.Name} of type {column.StorageType}");
    }

    // Import from a CSV file on the web.
    FrameTable titanic;
    Uri url = new Uri("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv");
    WebRequest request = WebRequest.Create(url);
    using (WebResponse response = request.GetResponse())
    {
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            titanic = FrameTable.FromCsv(reader);
        }
    }

    // Import from JSON by deserializing to a list of dictionaries.
    // WebClient is disposable, so it is released as soon as the download completes.
    Uri jsonUrl = new Uri("https://raw.githubusercontent.com/dcwuser/metanumerics/master/Examples/Data/example.json");
    string input;
    using (WebClient client = new WebClient())
    {
        input = client.DownloadString(jsonUrl);
    }
    List<Dictionary<string, object>> output = JsonConvert.DeserializeObject<List<Dictionary<string, object>>>(input);
    FrameTable jsonExample = FrameTable.FromDictionaries(output);

    // Define the schema.
    FrameTable table = new FrameTable();
    table.AddColumn<int>("Id");
    table.AddColumn<string>("Name");
    table.AddColumn<string>("Sex");
    table.AddColumn<DateTime>("Birthdate");
    table.AddColumn<double>("Height");
    table.AddColumn<double?>("Weight");
    table.AddColumn<bool>("Result");

    // Add rows as arrays of objects.
    table.AddRow(1, "John", "M", DateTime.Parse("1970-01-02"), 190.0, 75.0, true);
    table.AddRow(2, "Mary", "F", DateTime.Parse("1980-02-03"), 155.0, null, true);

    // Add a row using a dictionary. This is more verbose, but very clear.
    table.AddRow(new Dictionary<string, object>() {
        { "Id", 3 },
        { "Name", null },
        { "Sex", "M" },
        { "Birthdate", DateTime.Parse("1990-03-04") },
        { "Height", 180.0 },
        { "Weight", 60.0 },
        { "Result", false }
    });
}
// Example code showing how to manipulate a frame: selecting and discarding columns,
// adding a computed column, filtering, ordering and grouping.
public static void ManipulatingData()
{
    // Load the example table from the web.
    FrameTable table;
    Uri url = new Uri("https://raw.githubusercontent.com/dcwuser/metanumerics/master/Examples/Data/example.csv");
    WebRequest request = WebRequest.Create(url);
    using (WebResponse response = request.GetResponse())
    using (StreamReader reader = new StreamReader(response.GetResponseStream()))
    {
        table = FrameTable.FromCsv(reader);
    }

    // Choose columns to keep, or to drop.
    FrameView selected = table.Select("Height", "Weight", "Sex");
    FrameView discarded = table.Discard("Name");

    // Add a column computed from other columns of the same row.
    table.AddComputedColumn("Bmi", r => ((double)r["Weight"]) / MoreMath.Sqr((double)r["Height"] / 100.0));
    Console.WriteLine($"Bmi of first subject is {table["Bmi"][0]}.");

    // Remove rows with nulls: in any column, or only in the named columns.
    FrameView noNulls = table.WhereNotNull();
    FrameView noNullWeights = table.WhereNotNull("Weight");
    FrameView noNullWeightsOrHeights = table.WhereNotNull("Weight", "Height");

    double meanWeight = table.WhereNotNull("Weight").Columns["Weight"].As<double>().Mean();

    // Filter on one column's value, or on a whole-row predicate.
    FrameView men = table.Where<string>("Sex", s => s == "M");
    FrameView shortMen = table.Where(
        r => ((string)r["Sex"]) == "M" && ((double)r["Height"] < 175.0)
    );

    // Order by a column, optionally descending or with an explicit value comparison.
    FrameView ordered = table.OrderBy("Height");
    FrameView reversed = table.OrderBy("Height", SortOrder.Descending);
    FrameView alsoOrdered = table.OrderBy<double>("Height", (h1, h2) => h1.CompareTo(h2));

    // Order with a row comparison: by sex first, then by height within each sex.
    FrameView sorted = table.OrderBy((r1, r2) =>
    {
        int bySex = ((string)r1["Sex"]).CompareTo((string)r2["Sex"]);
        int byHeight = ((double)r1["Height"]).CompareTo((double)r2["Height"]);
        if (bySex != 0)
        {
            return bySex;
        }
        return byHeight;
    });

    List<string> sexes = table["Sex"].As<string>().Distinct().ToList();

    // Group with a single aggregate, or with several named aggregates.
    FrameTable counts = table.GroupBy("Sex", v => v.Rows.Count, "Count");
    FrameTable summarize = table.GroupBy("Sex", v =>
    {
        SummaryStatistics summary = new SummaryStatistics(v["Height"].As<double>());
        Dictionary<string, object> aggregates = new Dictionary<string, object>();
        aggregates.Add("Count", summary.Count);
        aggregates.Add("Mean", summary.Mean);
        aggregates.Add("StdDev", summary.StandardDeviation);
        return aggregates;
    });
}
// Draws 100 points from a two-predictor linear model, fits it with both the older
// MultivariateSample API and the FrameTable column API, and checks the parameter
// estimates and the residuals of the newer fit.
public void MultivariateLinearRegressionSimple()
{
    // Model: y = a + b0 * x0 + b1 * x1 + noise
    double a = 1.0;
    double b0 = -2.0;
    double b1 = 3.0;
    ContinuousDistribution x0distribution = new CauchyDistribution(10.0, 5.0);
    ContinuousDistribution x1distribution = new UniformDistribution(Interval.FromEndpoints(-10.0, 20.0));
    ContinuousDistribution noise = new NormalDistribution(0.0, 10.0);

    // Record the same draws in both containers.
    Random rng = new Random(1);
    MultivariateSample sample = new MultivariateSample("x0", "x1", "y");
    FrameTable table = new FrameTable();
    table.AddColumns<double>("x0", "x1", "y");

    for (int n = 0; n < 100; n++)
    {
        double x0 = x0distribution.GetRandomValue(rng);
        double x1 = x1distribution.GetRandomValue(rng);
        double eps = noise.GetRandomValue(rng);
        double y = a + b0 * x0 + b1 * x1 + eps;
        sample.Add(x0, x1, y);
        table.AddRow(x0, x1, y);
    }

    // Fit with the older API (output variable at index 2) and with the newer one.
    ParameterCollection oldResult = sample.LinearRegression(2).Parameters;
    MultiLinearRegressionResult newResult = table["y"].As<double>().MultiLinearRegression(
        table["x0"].As<double>(),
        table["x1"].As<double>()
    );

    // Both results carry two coefficients plus an intercept.
    Assert.IsTrue(oldResult.Count == 3);
    Assert.IsTrue(newResult.Parameters.Count == 3);

    // The older result's parameters should match the model.
    Assert.IsTrue(oldResult[0].Estimate.ConfidenceInterval(0.90).ClosedContains(b0));
    Assert.IsTrue(oldResult[1].Estimate.ConfidenceInterval(0.90).ClosedContains(b1));
    Assert.IsTrue(oldResult[2].Estimate.ConfidenceInterval(0.90).ClosedContains(a));

    // So should the newer result's, addressed by index or by name.
    Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
    Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
    Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

    // Each reported residual should equal the observed y minus the predicted y.
    for (int n = 0; n < table.Rows.Count; n++)
    {
        FrameRow row = table.Rows[n];
        double x0 = (double)row["x0"];
        double x1 = (double)row["x1"];
        double predicted = newResult.Predict(x0, x1).Value;
        double observed = (double)row["y"];
        Assert.IsTrue(TestUtilities.IsNearlyEqual(newResult.Residuals[n], observed - predicted));
    }
}
// Runs 128 small linear regression fits and checks that the scatter of the fitted
// intercept, slope and one prediction across fits agrees with the variances,
// covariance and uncertainty that each fit reports.
public void LinearRegressionVariances()
{
    Random rng = new Random(314159);

    // Line parameters: y = a0 + b0 * x + noise
    double a0 = 2.0;
    double b0 = -1.0;

    // One row of results per fit.
    FrameTable data = new FrameTable();
    data.AddColumns<double>("a", "va", "b", "vb", "abCov", "p", "dp");

    for (int fitIndex = 0; fitIndex < 128; fitIndex++)
    {
        // We should be able to draw x's from any distribution; noise is drawn from a normal distribution.
        ContinuousDistribution xDistribution = new LogisticDistribution();
        ContinuousDistribution noiseDistribution = new NormalDistribution(0.0, 2.0);

        // Generate a synthetic 12-point data set.
        BivariateSample sample = new BivariateSample();
        for (int pointIndex = 0; pointIndex < 12; pointIndex++)
        {
            double x = xDistribution.GetRandomValue(rng);
            double y = a0 + b0 * x + noiseDistribution.GetRandomValue(rng);
            sample.Add(x, y);
        }

        LinearRegressionResult result = sample.LinearRegression();

        // Record the estimates, their claimed (co)variances, and a prediction at x = 12.
        UncertainValue prediction = result.Predict(12.0);
        Dictionary<string, object> row = new Dictionary<string, object>();
        row.Add("a", result.Intercept.Value);
        row.Add("va", result.Parameters.VarianceOf("Intercept"));
        row.Add("b", result.Slope.Value);
        row.Add("vb", result.Parameters.VarianceOf("Slope"));
        row.Add("abCov", result.Parameters.CovarianceOf("Slope", "Intercept"));
        row.Add("p", prediction.Value);
        row.Add("dp", prediction.Uncertainty);
        data.AddRow(row);
    }

    // Variances and covariance of the parameters should agree with the median claimed values.
    Assert.IsTrue(data["a"].As<double>().PopulationVariance().ConfidenceInterval(0.99).ClosedContains(data["va"].As<double>().Median()));
    Assert.IsTrue(data["b"].As<double>().PopulationVariance().ConfidenceInterval(0.99).ClosedContains(data["vb"].As<double>().Median()));
    Assert.IsTrue(data["a"].As<double>().PopulationCovariance(data["b"].As<double>()).ConfidenceInterval(0.99).ClosedContains(data["abCov"].As<double>().Median()));

    // The spread of the prediction should agree with its claimed uncertainty.
    Assert.IsTrue(data["p"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As<double>().Median()));
}
// Smoke test: loads a large airline-delay CSV from a fixed local path and groups it by
// carrier. The data file lives outside the repository, so the test reports itself as
// inconclusive (rather than failing with a FileNotFoundException) when the file is absent.
public void SmokeTest2()
{
    string path = @"C:\Users\dcw-b\Desktop\DataSets\551184489_52017_210_airline_delay_causes\551184489_52017_210_airline_delay_causes.csv";

    // The path is specific to one machine; skip wherever the data set is not present.
    if (!File.Exists(path))
    {
        Assert.Inconclusive("Data file not found: " + path);
        return;
    }

    FrameTable frame;
    using (StreamReader stream = File.OpenText(path))
    {
        frame = FrameTable.FromCsv(stream);
    }

    // Count the rows for each carrier.
    FrameView view = frame.GroupBy("carrier", (FrameView q) => q.Rows.Count, "count");
}
// Fetches the resource at the given URL and parses the response body as CSV.
// Returns the resulting FrameTable; the response, stream and reader are all disposed before returning.
public FrameTable DownloadFrameTable(Uri url)
{
    WebRequest request = WebRequest.Create(url);

    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (TextReader text = new StreamReader(body))
    {
        return FrameTable.FromCsv(text);
    }
}
// Runs 32 one-predictor logistic regression fits on 32-point samples and checks that
// the scatter of the fitted intercept, slope and one prediction across fits agrees
// with the uncertainties and covariance that each fit reports.
public void LinearLogisticRegressionVariances()
{
    // Model: P(y = true) = 1 / (1 + exp(-(a + b * x)))
    double a = -2.0;
    double b = 1.0;
    ContinuousDistribution xDistribution = new StudentDistribution(2.0);

    // One row of results per fit.
    FrameTable data = new FrameTable();
    data.AddColumns<double>("a", "da", "b", "db", "abcov", "p", "dp");

    Random rng = new Random(3);
    for (int fitIndex = 0; fitIndex < 32; fitIndex++)
    {
        // Draw a sample from the model.
        List<double> xs = new List<double>();
        List<bool> ys = new List<bool>();
        for (int pointIndex = 0; pointIndex < 32; pointIndex++)
        {
            double x = xDistribution.GetRandomValue(rng);
            double logOdds = a + b * x;
            double probability = 1.0 / (1.0 + Math.Exp(-logOdds));
            bool outcome = rng.NextDouble() < probability;
            xs.Add(x);
            ys.Add(outcome);
        }

        // Fit, and record estimates, claimed uncertainties, and a prediction at x = 1.
        LinearLogisticRegressionResult result = ys.LinearLogisticRegression(xs);
        UncertainValue prediction = result.Predict(1.0);
        UncertainValue intercept = result.Intercept;
        UncertainValue slope = result.Slope;
        data.AddRow(
            intercept.Value, intercept.Uncertainty,
            slope.Value, slope.Uncertainty,
            result.Parameters.CovarianceMatrix[0, 1],
            prediction.Value, prediction.Uncertainty
        );
    }

    // The observed spread of the estimates should agree with the mean claimed uncertainties.
    Assert.IsTrue(data["a"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["da"].As<double>().Mean()));
    Assert.IsTrue(data["b"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db"].As<double>().Mean()));
    Assert.IsTrue(data["a"].As<double>().PopulationCovariance(data["b"].As<double>()).ConfidenceInterval(0.99).ClosedContains(data["abcov"].As<double>().Mean()));
    Assert.IsTrue(data["p"].As<double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As<double>().Mean()));
}
// Writes a frame containing every supported column type (including nullable ones with
// null entries) to CSV, reads it back, and checks that the column names, storage types
// and every cell value survive the round trip.
public void FrameTableCsvRoundtrip2()
{
    // Let's exercise all our data adaptors
    FrameTable original = new FrameTable();
    original.AddColumn<string>("String");
    original.AddColumn<double?>("Double?");
    original.AddColumn<int>("Int");
    original.AddColumn<DateTime?>("DateTime?");
    original.AddColumn<TimeSpan>("TimeSpan");
    original.AddColumn<Boolean?>("Boolean?");

    original.AddRow("z", null, 1, DateTime.Today, TimeSpan.FromMinutes(5.0), true);
    original.AddRow("y", 4.3, 2, null, TimeSpan.FromHours(4.0), null);
    original.AddRow("x", 2.0, 3, DateTime.UtcNow.Date, TimeSpan.FromDays(3.0), false);

    TextWriter storage = new StringWriter();
    original.ToCsv(storage);

    FrameTable copy = FrameTable.FromCsv(new StringReader(storage.ToString()));

    for (int i = 0; i < original.Columns.Count; i++)
    {
        Assert.IsTrue(original.Columns[i].Name == copy.Columns[i].Name);
        Assert.IsTrue(original.Columns[i].StorageType == copy.Columns[i].StorageType);
    }

    for (int i = 0; i < original.Rows.Count; i++)
    {
        for (int j = 0; j < original.Columns.Count; j++)
        {
            // object == object is a reference check that fails for equal boxed structures, and
            // instance Equals fails on a null receiver. The static object.Equals handles both:
            // it is true when both sides are null, false when only one is, and otherwise defers
            // to the virtual Equals. The original null branch re-checked the original cell, so a
            // null that did not round-trip went undetected; this compares against the copy.
            Assert.IsTrue(object.Equals(original.Rows[i][j], copy.Rows[i][j]));
        }
    }
}
// Generates 100 bivariate samples with independently drawn x and y, runs the Pearson,
// Spearman and Kendall tests on each, and checks that each statistic's observed values
// are compatible with the distribution the test claims for it.
public void BivariateNullAssociation()
{
    Random rng = new Random(31415926);

    // Holds the three test statistics for every sample.
    FrameTable data = new FrameTable();
    data.AddColumn<double>("r");
    data.AddColumn<double>("ρ");
    data.AddColumn<double>("τ");

    // The claimed distribution of each statistic, as reported by the most recent test run.
    ContinuousDistribution PRD = null;
    ContinuousDistribution SRD = null;
    ContinuousDistribution KTD = null;

    ContinuousDistribution xDistribution = new LognormalDistribution();
    ContinuousDistribution yDistribution = new CauchyDistribution();

    for (int sampleIndex = 0; sampleIndex < 100; sampleIndex++)
    {
        // Draw a 100-point sample; x and y are drawn separately from unrelated distributions.
        List<double> x = new List<double>();
        List<double> y = new List<double>();
        for (int pointIndex = 0; pointIndex < 100; pointIndex++)
        {
            x.Add(xDistribution.GetRandomValue(rng));
            y.Add(yDistribution.GetRandomValue(rng));
        }

        TestResult PR = Bivariate.PearsonRTest(x, y);
        TestResult SR = Bivariate.SpearmanRhoTest(x, y);
        TestResult KT = Bivariate.KendallTauTest(x, y);

        PRD = PR.Statistic.Distribution;
        SRD = SR.Statistic.Distribution;
        KTD = KT.Statistic.Distribution;

        Dictionary<string, object> row = new Dictionary<string, object>();
        row.Add("r", PR.Statistic.Value);
        row.Add("ρ", SR.Statistic.Value);
        row.Add("τ", KT.Statistic.Value);
        data.AddRow(row);
    }

    // Each column of observed statistics should pass a KS test against its claimed distribution.
    Assert.IsTrue(data["r"].As<double>().KolmogorovSmirnovTest(PRD).Probability > 0.05);
    Assert.IsTrue(data["ρ"].As<double>().KolmogorovSmirnovTest(SRD).Probability > 0.05);
    Assert.IsTrue(data["τ"].As<double>().KolmogorovSmirnovTest(KTD).Probability > 0.05);
}
// Adds and removes columns on an empty table and checks that the column count
// follows along and that a column looked up by name matches the one at its index.
public void FrameTableColumnManipulations() {

    FrameTable table = new FrameTable();
    table.AddColumn<int>("Integer");
    table.AddColumn<double>("Double");
    Assert.IsTrue(table.Columns.Count == 2);

    // Look the first column up by its own name and compare against the positional view.
    string firstName = table.Columns[0].Name;
    Assert.IsTrue(table.Columns[0].Name == table[firstName].Name);
    Assert.IsTrue(table.Columns[0].StorageType == table[table.Columns[0].Name].StorageType);

    // Adding a third column grows the count...
    table.AddColumn<DateTime>("Timestamp");
    Assert.IsTrue(table.Columns.Count == 3);

    // ...and removing the first one shrinks it again.
    table.RemoveColumn(table.Columns[0].Name);
    Assert.IsTrue(table.Columns.Count == 2);
}
// Fits y = a * x^b to many small noisy samples and verifies that
//   (1) the fitted parameters cluster around the true values, and
//   (2) the covariances reported by the fits agree with the scatter
//       actually observed across the fitted parameters.
public void BivariateNonlinearFitVariances() {

    // True model parameters.
    double a = 2.7;
    double b = 3.1;

    ContinuousDistribution xSource = new ExponentialDistribution(2.0);
    ContinuousDistribution noise = new NormalDistribution(0.0, 4.0);

    // Fitted (a, b) of every trial, and the (var a, var b, cov ab) each fit reported.
    FrameTable fitted = new FrameTable();
    fitted.AddColumns<double>("a", "b");
    MultivariateSample reported = new MultivariateSample(3);

    for (int seed = 0; seed < 64; seed++) {

        // Each trial uses its own generator, seeded by the trial number.
        Random rng = new Random(seed);
        BivariateSample points = new BivariateSample();
        for (int k = 0; k < 8; k++) {
            double x = xSource.GetRandomValue(rng);
            double y = a * Math.Pow(x, b) + noise.GetRandomValue(rng);
            points.Add(x, y);
        }

        NonlinearRegressionResult fit = points.NonlinearRegression(
            (IReadOnlyList<double> c, double u) => c[0] * Math.Pow(u, c[1]),
            new double[] { 1.0, 1.0 }
        );

        fitted.AddRow(fit.Parameters.ValuesVector);

        double varA = fit.Parameters.CovarianceMatrix[0, 0];
        double varB = fit.Parameters.CovarianceMatrix[1, 1];
        double covAB = fit.Parameters.CovarianceMatrix[0, 1];
        reported.Add(varA, varB, covAB);
    }

    var aFits = fitted["a"].As<double>();
    var bFits = fitted["b"].As<double>();

    // Fitted values should be centered on the truth.
    Assert.IsTrue(aFits.PopulationMean().ConfidenceInterval(0.99).ClosedContains(a));
    Assert.IsTrue(bFits.PopulationMean().ConfidenceInterval(0.99).ClosedContains(b));

    // Observed scatter should be consistent with the average reported (co)variances.
    double meanVarA = reported.Column(0).Mean;
    double meanVarB = reported.Column(1).Mean;
    double meanCovAB = reported.Column(2).Mean;
    Assert.IsTrue(aFits.PopulationVariance().ConfidenceInterval(0.99).ClosedContains(meanVarA));
    Assert.IsTrue(bFits.PopulationVariance().ConfidenceInterval(0.99).ClosedContains(meanVarB));

    // The covariance is checked through both the instance-style and the static entry point.
    Assert.IsTrue(aFits.PopulationCovariance(bFits).ConfidenceInterval(0.99).ClosedContains(meanCovAB));
    Assert.IsTrue(Bivariate.PopulationCovariance(aFits, bFits).ConfidenceInterval(0.99).ClosedContains(meanCovAB));
}
// Downloads a public CSV file, loads it into a table, adds a computed column,
// and runs two group-by queries. There are no assertions; the test only
// verifies that none of these steps throws.
public void Smoketest() {

    string url = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/tips.csv";

    FrameTable tips;
    WebRequest request = WebRequest.Create(url);
    using (WebResponse response = request.GetResponse())
    using (Stream body = response.GetResponseStream())
    using (TextReader reader = new StreamReader(body)) {
        tips = FrameTable.FromCsv(reader);
    }

    // Tip as a fraction of the bill, computed per row.
    tips.AddComputedColumn("tip_fraction", row => ((double)row["tip"]) / ((double)row["total_bill"]));

    // Number of rows per day, ordered by day.
    FrameView counts = tips.GroupBy("day", g => g.Rows.Count, "total").OrderBy("day");

    // Mean tip fraction per value of the "sex" column.
    FrameView means = tips.GroupBy("sex", g => g["tip_fraction"].As<double>().Mean(), "mean_tip_fraction");
}
// Verifies that a row supplied as a name-to-value dictionary is accepted when it
// provides exactly the table's columns, and rejected with an exception when it
// has a missing column or an extra one.
public void FrameTableRowManipulations() {

    // In future, test with computed columns
    FrameTable frame = new FrameTable();
    frame.AddColumn<double>("Height");
    frame.AddColumn<string>("Name");

    Assert.IsTrue(frame.Columns.Count == 2);
    Assert.IsTrue(frame.Rows.Count == 0);

    // Insert a row
    Dictionary<string, object> row = new Dictionary<string, object>() {
        { "Name", "John" }, { "Height", 1.1 }
    };
    frame.AddRow(row);
    Assert.IsTrue(frame.Rows.Count == 1);

    // Try to insert a row with missing values.
    // The outcome is recorded in a flag and asserted outside the try block:
    // calling Assert.Fail() inside the try would raise an exception that the
    // catch (Exception) clause swallows, so the test could never fail.
    Dictionary<string, object> smallRow = new Dictionary<string, object>() {
        { "Name", "Mark" }
    };
    bool smallRowRejected = false;
    try {
        frame.AddRow(smallRow);
    } catch (Exception) {
        smallRowRejected = true;
    }
    Assert.IsTrue(smallRowRejected);

    // Try to insert a row with too many values
    Dictionary<string, object> bigRow = new Dictionary<string, object>() {
        { "Name", "Luke" }, { "Height", 1.2 }, { "Weight", 60.0 }
    };
    bool bigRowRejected = false;
    try {
        frame.AddRow(bigRow);
    } catch (Exception) {
        bigRowRejected = true;
    }
    Assert.IsTrue(bigRowRejected);
}
// Builds the small fixture table used by the view tests: seven rows with columns
// name (string, two nulls), height, weight, and male (bool?, one null), plus a
// computed "bmi" column defined as weight / height^2.
private FrameTable GetTestFrame() {

    FrameTable people = new FrameTable();

    people.AddColumn<string>("name");
    people.AddColumn<double>("height");
    people.AddColumn<double>("weight");
    people.AddColumn<bool?>("male");

    //            name  height weight male
    people.AddRow("a",  7.0,   10.0,  false);
    people.AddRow(null, 6.5,   11.0,  true);
    people.AddRow("c",  6.0,   12.0,  false);
    people.AddRow("d",  5.5,   11.0,  true);
    people.AddRow("e",  5.0,   12.0,  null);
    people.AddRow("f",  4.5,   13.0,  true);
    people.AddRow(null, 4.0,   12.0,  false);

    people.AddComputedColumn(
        "bmi",
        row => ((double)row["weight"]) / MoreMath.Sqr((double)row["height"])
    );

    return people;
}
// Groups the fixture table by the "male" column, aggregating mean height, and
// checks that there is exactly one output row per distinct key and that each
// aggregate equals the mean computed directly over the matching rows.
public void FrameViewGroupByClause() {

    FrameView source = GetTestFrame();

    // Distinct values present in the grouping column.
    HashSet<bool?> expectedKeys = new HashSet<bool?>(source["male"].As<bool?>().Distinct());

    FrameTable byMale = source.GroupBy("male", g => g["height"].As<double>().Mean(), "meanHeight");
    Assert.IsTrue(byMale.Rows.Count == expectedKeys.Count);

    foreach (FrameRow groupRow in byMale.Rows) {

        bool? key = (bool?)groupRow["male"];
        Assert.IsTrue(expectedKeys.Contains(key));

        // The cell is cast to bool? before comparing: comparing the raw cell
        // (an object) with == would be a reference comparison, and two equal
        // values boxed separately are not reference-equal.
        FrameView members = source.Where(r => (bool?)r["male"] == key);

        double expectedMean = members["height"].As<double>().Mean();
        double reportedMean = (double)groupRow["meanHeight"];
        Assert.IsTrue(reportedMean == expectedMean);
    }
}
// Generates 100 three-dimensional points scattered around four known centers,
// clusters them via the table's columns, and checks that every found centroid
// lies within distance 1 of some true center and that at least half of the
// points are classified into the cluster matching the center they came from.
public void MeansClustering2() {

    ColumnVector[] trueCenters = new ColumnVector[] {
        new ColumnVector(0.0, 0.0, 0.0),
        new ColumnVector(2.0, 0.0, 0.0),
        new ColumnVector(0.0, 2.0, 0.0),
        new ColumnVector(0.0, 0.0, 2.0)
    };

    // One double column per coordinate, named "a", "b", "c".
    FrameTable table = new FrameTable();
    foreach (string columnName in new string[] { "a", "b", "c" }) {
        table.AddColumn<double>(columnName);
    }

    // Generate the points: pick a center at random, then add unit normal noise
    // to each coordinate.
    List<int> trueLabels = new List<int>();
    List<ColumnVector> points = new List<ColumnVector>();
    Random rng = new Random(2);
    ContinuousDistribution noise = new NormalDistribution(0.0, 1.0);
    for (int n = 0; n < 100; n++) {
        int label = rng.Next(0, trueCenters.Length);
        trueLabels.Add(label);
        ColumnVector point = trueCenters[label].Copy();
        for (int d = 0; d < point.Dimension; d++) {
            point[d] += noise.GetRandomValue(rng);
        }
        points.Add(point);
        table.AddRow<double>(point);
    }

    MeansClusteringResult clustering = table.AsColumns<double>().MeansClustering(trueCenters.Length);

    // Classify every generated point with the fitted clustering.
    List<int> foundLabels = new List<int>();
    foreach (ColumnVector point in points) {
        foundLabels.Add(clustering.Classify(point));
    }

    // Map each output centroid to the nearest true center.
    Dictionary<int, int> outputToTrue = new Dictionary<int, int>();
    for (int cluster = 0; cluster < clustering.Count; cluster++) {
        ColumnVector centroid = clustering.Centroid(cluster);
        int nearest = -1;
        double nearestDistance = Double.MaxValue;
        for (int c = 0; c < trueCenters.Length; c++) {
            double separation = (centroid - trueCenters[c]).Norm();
            if (separation < nearestDistance) {
                nearest = c;
                nearestDistance = separation;
            }
        }
        Assert.IsTrue(nearest >= 0);
        Assert.IsTrue(nearestDistance < 1.0);
        outputToTrue.Add(cluster, nearest);
    }

    // Count the points whose mapped cluster matches the center they were drawn from.
    int matches = 0;
    for (int n = 0; n < foundLabels.Count; n++) {
        int mapped = outputToTrue[foundLabels[n]];
        if (mapped == trueLabels[n]) {
            matches++;
        }
    }
    Assert.IsTrue(matches >= 0.50 * foundLabels.Count);
}