示例#1
0
        // Appends one synthetic subject row to the table for each entry in names.
        // Height and BMI are each computed as mean + stddev * (a draw from a default NormalDistribution),
        // weight is derived from height and BMI, and the boolean result is drawn with a logistic
        // probability that depends on the BMI deviate and on flag. The birthdate is today's date minus
        // an age drawn uniformly between 15 and 75 (in units of 365.24 days).
        // The order of the random draws (height, BMI, result, age) determines the generated data.
        private static void AddRows(FrameTable table, IReadOnlyList<string> names, string sex, double meanHeight, double stddevHeight, double meanBmi, double stddevBmi, int flag, Random rng)
        {
            NormalDistribution standardNormal = new NormalDistribution();
            UniformDistribution ageRange = new UniformDistribution(Interval.FromEndpoints(15.0, 75.0));

            foreach (string subject in names)
            {
                // Height first, then BMI: the draw order is part of the generated sequence.
                double heightDeviate = standardNormal.GetRandomValue(rng);
                double height = meanHeight + stddevHeight * heightDeviate;

                double bmiDeviate = standardNormal.GetRandomValue(rng);
                double bmi = meanBmi + stddevBmi * bmiDeviate;

                // weight = (height / 100)^2 * bmi; presumably height is in cm and weight in kg.
                double scaledHeight = height / 100.0;
                double weight = MoreMath.Sqr(scaledHeight) * bmi;

                // Logistic model for the boolean outcome.
                double logit = -0.4 + 0.6 * bmiDeviate + 0.8 * flag;
                double probability = 1.0 / (1.0 + Math.Exp(-logit));
                bool outcome = rng.NextDouble() < probability;

                // The row id is the number of rows already present.
                int rowId = table.Rows.Count;

                double ageDraw = ageRange.GetRandomValue(rng);
                TimeSpan age = TimeSpan.FromDays(365.24 * ageDraw);
                DateTime birthdate = (DateTime.Now - age).Date;

                table.AddRow(rowId, subject, sex, birthdate, height, weight, outcome);
            }
        }
        // Fits a two-regressor logistic model to synthetic data and checks the fit.
        // Outcomes are generated with probability 1 / (1 + exp(-(a + b0 * x0 + b1 * x1))). The test
        // verifies the number of fitted parameters, that each 99% confidence interval contains the
        // generating value, and that the fitted model classifies more than half of the rows correctly.
        public void MultivariateLinearLogisticRegressionSimple()
        {
            // define model logit(p) = a + b0 * x0 + b1 * x1
            double a  = 1.0;
            double b0 = -1.0 / 2.0;
            double b1 = 1.0 / 3.0;
            ContinuousDistribution x0distribution = new LaplaceDistribution();
            ContinuousDistribution x1distribution = new NormalDistribution();

            // draw a sample from the model
            Random             rng   = new Random(1);
            MultivariateSample old   = new MultivariateSample("y", "x0", "x1");
            FrameTable         table = new FrameTable();

            table.AddColumn<double>("x0");
            table.AddColumn<double>("x1");
            table.AddColumn<bool>("y");

            for (int i = 0; i < 100; i++)
            {
                double x0 = x0distribution.GetRandomValue(rng);
                double x1 = x1distribution.GetRandomValue(rng);
                double t  = a + b0 * x0 + b1 * x1;
                double p  = 1.0 / (1.0 + Math.Exp(-t));
                bool   y  = (rng.NextDouble() < p);
                // The same observation goes into both containers; the sample stores y as 1.0 / 0.0.
                old.Add(y ? 1.0 : 0.0, x0, x1);
                table.AddRow(x0, x1, y);
            }

            // do a logistic regression fit on the model
            // The MultivariateSample fit is still invoked, but only the frame-based result is asserted on below.
            MultiLinearLogisticRegressionResult oldResult = old.LogisticLinearRegression(0);
            MultiLinearLogisticRegressionResult newResult = table["y"].As<bool>().MultiLinearLogisticRegression(
                table["x0"].As<double>(), table["x1"].As<double>()
                );

            // the result should have the appropriate dimension
            Assert.IsTrue(newResult.Parameters.Count == 3);

            // The parameters should match the model
            Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
            Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
            Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

            // Our predictions should be better than chance.
            int correct = 0;

            for (int i = 0; i < table.Rows.Count; i++)
            {
                FrameRow row = table.Rows[i];
                double   x0  = (double)row["x0"];
                double   x1  = (double)row["x1"];
                double   p   = newResult.Predict(x0, x1).Value;
                bool     y   = (bool)row["y"];
                // A prediction counts as correct when the predicted probability falls on the observed side of 0.5.
                if ((y && p > 0.5) || (!y && p < 0.5))
                {
                    correct++;
                }
            }
            Assert.IsTrue(correct > 0.5 * table.Rows.Count);
        }
示例#3
0
        // Reads the CSV file named by csvFileName into a frame table, writes the table back out to a
        // temporary file, and checks that the MD5 hashes of the input and output files are equal.
        // The temporary file is deleted whether or not the assertions pass.
        public void FrameTableCsvRoundtrip()
        {
            FrameTable frame;
            using (TextReader source = File.OpenText(csvFileName))
            {
                frame = FrameTable.FromCsv(source);
            }

            // Sanity checks: something was actually read.
            Assert.IsTrue(frame != null);
            Assert.IsTrue(frame.Columns.Count > 0);
            Assert.IsTrue(frame.Rows.Count > 0);

            string roundtripPath = Path.GetTempFileName();
            try
            {
                using (FileStream target = File.OpenWrite(roundtripPath))
                using (TextWriter sink = new StreamWriter(target))
                {
                    frame.ToCsv(sink);
                }

                // The written file should hash identically to the file that was read.
                Guid expectedHash = ComputeMD5Hash(csvFileName);
                Guid actualHash   = ComputeMD5Hash(roundtripPath);
                Assert.IsTrue(expectedHash == actualHash);
            }
            finally
            {
                File.Delete(roundtripPath);
            }
        }
        // Repeats a two-regressor linear fit on 64 independent synthetic data sets of 16 points each
        // and checks that the spread of the fitted values across data sets is compatible with the
        // uncertainties and covariance that the individual fits report.
        public void MultivariateLinearRegressionVariances()
        {
            // Model: y = a + b0 * x0 + b1 * x1 + noise
            double a  = -3.0;
            double b0 = 2.0;
            double b1 = -1.0;
            ContinuousDistribution x0distribution = new LaplaceDistribution();
            ContinuousDistribution x1distribution = new CauchyDistribution();
            ContinuousDistribution eDistribution  = new NormalDistribution(0.0, 4.0);

            // One row per fit: fitted values, their reported uncertainties, one reported covariance,
            // and a prediction with its reported uncertainty.
            FrameTable data = new FrameTable();
            data.AddColumns<double>("a", "da", "b0", "db0", "b1", "db1", "ab1Cov", "p", "dp");

            Random rng = new Random(4);

            for (int trial = 0; trial < 64; trial++)
            {
                List<double> x0s = new List<double>();
                List<double> x1s = new List<double>();
                List<double> ys  = new List<double>();

                // Draw a sample from the model; the draw order (x0, x1, noise) is fixed.
                for (int point = 0; point < 16; point++)
                {
                    double x0 = x0distribution.GetRandomValue(rng);
                    double x1 = x1distribution.GetRandomValue(rng);
                    double e  = eDistribution.GetRandomValue(rng);
                    x0s.Add(x0);
                    x1s.Add(x1);
                    ys.Add(a + b0 * x0 + b1 * x1 + e);
                }

                // Fit the model, naming the regressors via a dictionary.
                Dictionary<string, IReadOnlyList<double>> regressors = new Dictionary<string, IReadOnlyList<double>>();
                regressors.Add("x0", x0s);
                regressors.Add("x1", x1s);
                MultiLinearRegressionResult fit = ys.MultiLinearRegression(regressors);

                UncertainValue prediction = fit.Predict(-5.0, 6.0);
                UncertainValue intercept  = fit.Intercept;
                UncertainValue slope0     = fit.CoefficientOf("x0");
                UncertainValue slope1     = fit.CoefficientOf("x1");

                data.AddRow(
                    intercept.Value, intercept.Uncertainty,
                    slope0.Value, slope0.Uncertainty,
                    slope1.Value, slope1.Uncertainty,
                    fit.Parameters.CovarianceOf("Intercept", "x1"),
                    prediction.Value, prediction.Uncertainty
                    );
            }

            IReadOnlyList<double> aValues  = data["a"].As<double>();
            IReadOnlyList<double> b0Values = data["b0"].As<double>();
            IReadOnlyList<double> b1Values = data["b1"].As<double>();
            IReadOnlyList<double> pValues  = data["p"].As<double>();

            // NOTE(review): nothing here asserts that the estimates themselves agree with a, b0, b1.

            // The variances of the estimates should agree with the claimed variances
            Assert.IsTrue(aValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["da"].As<double>().Mean()));
            Assert.IsTrue(b0Values.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db0"].As<double>().Mean()));
            Assert.IsTrue(b1Values.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db1"].As<double>().Mean()));
            Assert.IsTrue(aValues.PopulationCovariance(b1Values).ConfidenceInterval(0.99).ClosedContains(data["ab1Cov"].As<double>().Mean()));
            // The prediction uncertainty is summarized by its median rather than its mean.
            Assert.IsTrue(pValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As<double>().Median()));
        }
示例#5
0
        // Builds the example data set and writes it to "example.csv" and "example.json"
        // (relative paths, resolved against the current directory).
        // The table has a fixed schema (Id, Name, Sex, Birthdate, Height, Weight, Result), is filled
        // with synthetic rows for a list of male and female names via AddRows, and ends with two
        // hand-written rows that contain nulls (a null Name and a null Weight).
        public static void ConstructExampleData()
        {
            FrameTable table = new FrameTable();

            table.AddColumn<int>("Id");
            table.AddColumn<string>("Name");
            table.AddColumn<string>("Sex");
            table.AddColumn<DateTime>("Birthdate");
            table.AddColumn<double>("Height");
            // Weight is nullable so that the null-weight row below can be stored.
            table.AddColumn<double?>("Weight");
            table.AddColumn<bool>("Result");

            // Fixed seed; the male rows are generated before the female rows from the same generator.
            Random rng = new Random(1000001);

            string[] maleNames = new string[] { "Alex", "Chris", "David", "Eric", "Frederic", "George", "Hans", "Igor", "John", "Kevin", "Luke", "Mark", "Oscar", "Peter", "Richard", "Stephan", "Thomas", "Vincent" };
            AddRows(table, maleNames, "M", 175.0, 12.0, 24.0, 3.0, 1, rng);

            string[] femaleNames = new string[] { "Anne", "Belle", "Dorothy", "Elizabeth", "Fiona", "Helen", "Julia", "Kate", "Louise", "Mary", "Natalie", "Olivia", "Ruth", "Sarah", "Theresa", "Viola" };
            AddRows(table, femaleNames, "F", 160.0, 10.0, 24.0, 3.0, 0, rng);

            // add rows with nulls
            table.AddRow(table.Rows.Count, null, "M", DateTime.Parse("1970-07-27"), 183.0, 74.0, false);
            table.AddRow(table.Rows.Count, "Zoey", "F", DateTime.Parse("2007-09-17"), 138.0, null, false);

            string path = @"example.csv";

            // File.Create truncates an existing file. File.OpenWrite would overwrite in place and
            // leave stale trailing bytes if a longer example.csv were already present.
            using (StreamWriter writer = new StreamWriter(File.Create(path))) {
                table.ToCsv(writer);
            }
            Console.WriteLine(File.Exists(path));

            string json = JsonConvert.SerializeObject(table.ToDictionaries(), Formatting.Indented);

            File.WriteAllText("example.json", json);
        }
示例#6
0
        // Converts a small frame table to a list of dictionaries and back, then checks that the
        // reconstructed table has the same shape, the same first column name, the same storage type
        // for the second column, and the same (null) value in the nullable column of the last row.
        public void FrameTableDictionariesRoundtrip()
        {
            FrameTable frame = new FrameTable();

            frame.AddColumn<string>("name");
            frame.AddColumn<double>("height");
            frame.AddColumn<bool?>("male");

            frame.AddRow("a", 5.0, false);
            frame.AddRow("b", 6.0, true);
            frame.AddRow("c", 5.5, null);

            List<Dictionary<string, object>> dictionaries = frame.ToDictionaries().ToList();

            // One dictionary per row, one entry per column.
            Assert.IsTrue(dictionaries.Count == frame.Rows.Count);
            Assert.IsTrue(dictionaries[0].Count == frame.Columns.Count);

            FrameTable frame2 = FrameTable.FromDictionaries(dictionaries);

            Assert.IsTrue(frame2.Rows.Count == frame.Rows.Count);
            Assert.IsTrue(frame2.Columns.Count == frame.Columns.Count);
            Assert.IsTrue(frame2.Columns[0].Name == frame.Columns[0].Name);
            Assert.IsTrue(frame2.Columns[1].StorageType == frame.Columns[1].StorageType);
            // Compare the round-tripped value against the ORIGINAL frame (not against itself), and
            // use Object.Equals so that the comparison is by value rather than by boxed reference.
            Assert.IsTrue(Object.Equals(frame2.Rows[2]["male"], frame.Rows[2]["male"]));
        }
示例#7
0
        // Fits a one-regressor logistic model to 24 synthetic points and checks that the 99%
        // confidence intervals of the fitted intercept and slope contain the generating coefficients.
        // The boolean outcome is stored as a string column and parsed back on the fly, which also
        // exercises converting column access.
        public void LinearLogisticRegressionSimple()
        {
            // Generating model: logit(p) = -1 + 2 x
            Polynomial logit = Polynomial.FromCoefficients(-1.0, 2.0);

            FrameTable table = new FrameTable();
            table.AddColumn<double>("x");
            table.AddColumn<string>("z");

            Random rng = new Random(2);
            ContinuousDistribution xDistribution = new CauchyDistribution(4.0, 2.0);

            int remaining = 24;
            while (remaining > 0)
            {
                // Draw x first, then the uniform deviate that decides the outcome.
                double x = xDistribution.GetRandomValue(rng);
                double probability = 1.0 / (1.0 + Math.Exp(-logit.Evaluate(x)));
                bool outcome = rng.NextDouble() < probability;
                table.AddRow(x, outcome.ToString());
                remaining--;
            }

            IReadOnlyList<double> xs = table["x"].As<double>();
            LinearLogisticRegressionResult fit = table["z"].As((string text) => Boolean.Parse(text)).LinearLogisticRegression(xs);

            Assert.IsTrue(fit.Intercept.ConfidenceInterval(0.99).ClosedContains(logit.Coefficient(0)));
            Assert.IsTrue(fit.Slope.ConfidenceInterval(0.99).ClosedContains(logit.Coefficient(1)));
        }
示例#8
0
        // Groups the test frame by its "male" column, computing a count, mean and standard deviation
        // of "height" per group, and checks each aggregate against the same statistic computed
        // directly on the rows selected for that group's key.
        public void FrameViewGroupBy()
        {
            FrameView original = GetTestFrame();

            // The distinct keys (including null, if present) determine the expected number of groups.
            HashSet<bool?> distinctKeys = new HashSet<bool?>(original["male"].As<bool?>().Distinct());

            FrameTable grouped = original.GroupBy("male", group => {
                SummaryStatistics heights = new SummaryStatistics(group["height"].As<double>());
                Dictionary<string, object> aggregates = new Dictionary<string, object>();
                aggregates.Add("count", heights.Count);
                aggregates.Add("heightMean", heights.Mean);
                aggregates.Add("heightStandardDeviation", heights.StandardDeviation);
                return aggregates;
            });

            // One row per distinct key; the key column plus the three aggregate columns.
            Assert.IsTrue(grouped.Rows.Count == distinctKeys.Count);
            Assert.IsTrue(grouped.Columns.Count == 4);

            IReadOnlyList<bool?>  groupKeys  = grouped["male"].As<bool?>();
            IReadOnlyList<double> groupMeans = grouped["heightMean"].As<double>();
            IReadOnlyList<double> groupSigmas = grouped["heightStandardDeviation"].As<double>();

            for (int g = 0; g < grouped.Rows.Count; g++)
            {
                bool? key = groupKeys[g];
                Assert.IsTrue(distinctKeys.Contains(key));

                // Recompute the statistics from the rows of the original frame having this key.
                FrameView members = original.Where(r => (bool?)r["male"] == key);
                Assert.IsTrue(members.Rows.Count > 0);

                IReadOnlyList<double> memberHeights = members["height"].As<double>();

                double expectedMean = memberHeights.Mean();
                Assert.IsTrue(TestUtilities.IsNearlyEqual(groupMeans[g], expectedMean));

                double expectedSigma = memberHeights.StandardDeviation();
                Assert.IsTrue(TestUtilities.IsNearlyEqual(groupSigmas[g], expectedSigma));
            }
        }
示例#9
0
        // Checks type coercion when reading frame columns: a nullable double column read as
        // non-nullable double must return non-null values and throw on null entries, and an
        // integer column must be readable as double.
        public void FrameViewColumnCoercion()
        {
            // Create nullable double and integer columns.
            FrameTable table = new FrameTable();

            table.AddColumn<double?>("one");
            table.AddColumn<int>("two");

            table.AddRow(1.1, 2);
            table.AddRow(null, 3);

            // Coerce the nullable double into a non-nullable double
            // Should work when value is non-null, and fail when value is null
            IReadOnlyList<double> one = table["one"].As<double>();

            Assert.IsTrue(one[0] == 1.1);

            // Record whether the access threw and assert outside the try block. Calling Assert.Fail()
            // inside the try would be swallowed by the catch, so the check could never fail.
            bool nullCoercionThrew = false;
            try
            {
                double v = one[1];
            }
            catch (Exception)
            {
                // The exact exception type is not pinned down here; any exception counts.
                nullCoercionThrew = true;
            }
            Assert.IsTrue(nullCoercionThrew);

            // Coerce the integer to a double.
            IReadOnlyList<double> two = table.Columns[1].As<double>();

            Assert.IsTrue(two[0] == 2.0);
        }
示例#10
0
        // Parses a small CSV text whose cells are padded with spaces and tabs and checks that
        // whitespace outside quotes is ignored (yielding trimmed headers and null cells), while
        // whitespace, commas and escaped quotes inside quoted cells are preserved.
        public void CsvWhitespaceParsing()
        {
            // First line is the header; each subsequent line is one data row.
            string[] lines = new string[]
            {
                " c0 ,\tc1  ",
                ",",
                " , ",
                "  ,\t",
                "\t\" \" ,  \"\t\" ",
                " \" a\t\"\t,\t\"\t a \"",
                "\t \" \"\"\",  \" , \" "
            };

            StringBuilder text = new StringBuilder();
            foreach (string line in lines)
            {
                text.AppendLine(line);
            }

            FrameTable table = FrameTable.FromCsv(new StringReader(text.ToString()));

            Assert.IsTrue(table.Columns[0].Name == "c0");
            Assert.IsTrue(table.Columns[1].Name == "c1");

            // Expected cell contents, row by row; null means the cell is empty.
            string[][] expected = new string[][]
            {
                new string[] { null, null },
                new string[] { null, null },
                new string[] { null, null },
                new string[] { " ", "\t" },
                new string[] { " a\t", "\t a " },
                new string[] { " \"", " , " }
            };

            for (int r = 0; r < expected.Length; r++)
            {
                for (int c = 0; c < expected[r].Length; c++)
                {
                    Assert.IsTrue((string)table.Rows[r][c] == expected[r][c]);
                }
            }
        }
示例#11
0
        // Smoke test: downloads a height/weight CSV, adds a computed column, splits the rows by
        // gender, and runs summary statistics, normality tests, two-sample tests and several
        // regressions on the result. Nothing is asserted; the test only checks that the calls complete.
        public void InternetSampleDownload()
        {
            Uri source = new Uri("https://raw.githubusercontent.com/Dataweekends/zero_to_deep_learning_udemy/master/data/weight-height.csv");
            FrameTable table = DownloadFrameTable(source);
            FrameView view = table.WhereNotNull();

            // Bmi = weight / height^2, in whatever units the file uses.
            view.AddComputedColumn("Bmi", (FrameRow row) => {
                double height = (double)row["Height"];
                double weight = (double)row["Weight"];
                return weight / (height * height);
            });

            FrameView males   = view.Where("Gender", (string g) => (g == "Male"));
            FrameView females = view.Where("Gender", (string g) => (g == "Female"));

            IReadOnlyList<double> allHeights    = view["Height"].As<double>();
            IReadOnlyList<double> maleHeights   = males["Height"].As<double>();
            IReadOnlyList<double> femaleHeights = females["Height"].As<double>();
            IReadOnlyList<double> maleWeights   = males["Weight"].As<double>();

            SummaryStatistics maleSummary   = new SummaryStatistics(maleHeights);
            SummaryStatistics femaleSummary = new SummaryStatistics(femaleHeights);

            // Normality of heights, overall and per gender.
            TestResult allNormal    = allHeights.ShapiroFranciaTest();
            TestResult maleNormal   = maleHeights.ShapiroFranciaTest();
            TestResult femaleNormal = femaleHeights.ShapiroFranciaTest();

            // Do male and female heights differ?
            TestResult tTest  = Univariate.StudentTTest(maleHeights, femaleHeights);
            TestResult mwTest = Univariate.MannWhitneyTest(maleHeights, femaleHeights);

            // Regressions relating male height and weight.
            LinearRegressionResult     result0 = maleWeights.LinearRegression(maleHeights);
            PolynomialRegressionResult result1 = maleHeights.PolynomialRegression(maleWeights, 1);
            PolynomialRegressionResult result2 = maleHeights.PolynomialRegression(maleWeights, 2);
            PolynomialRegressionResult result3 = maleHeights.PolynomialRegression(maleWeights, 3);

            //MultiLinearRegressionResult multi = view["Weight"].As<double>().MultiLinearRegression(view["Height"].As<double>(), view["Gender"].As<string>().Select(s => (s == "Male") ? 1.0 : 0.0).ToList());
        }
示例#12
0
        // Fits a Wald distribution to 128 samples of 16 points each and checks that (1) each fit
        // result is internally consistent, (2) the fitted parameters average to the true parameters,
        // and (3) the scatter of the fitted parameters matches the variances and covariance that the
        // fits report.
        public void WaldFit()
        {
            WaldDistribution wald = new WaldDistribution(3.5, 2.5);

            FrameTable results = new FrameTable();
            results.AddColumns<double>("Mean", "Shape", "MeanVariance", "ShapeVariance", "MeanShapeCovariance");

            for (int trial = 0; trial < 128; trial++)
            {
                Sample sample = SampleTest.CreateSample(wald, 16, trial);
                WaldFitResult fit = WaldDistribution.FitToSample(sample);

                // The named properties must agree with the entries of the parameter collection.
                Assert.IsTrue(fit.Mean.Value == fit.Parameters.ValuesVector[fit.Parameters.IndexOf("Mean")]);
                Assert.IsTrue(fit.Shape.Value == fit.Parameters.ValuesVector[fit.Parameters.IndexOf("Shape")]);

                double meanVariance  = fit.Parameters.VarianceOf("Mean");
                double shapeVariance = fit.Parameters.VarianceOf("Shape");
                Assert.IsTrue(TestUtilities.IsNearlyEqual(meanVariance, MoreMath.Sqr(fit.Mean.Uncertainty)));
                Assert.IsTrue(TestUtilities.IsNearlyEqual(shapeVariance, MoreMath.Sqr(fit.Shape.Uncertainty)));

                double meanShapeCovariance = fit.Parameters.CovarianceOf("Mean", "Shape");
                results.AddRow(fit.Mean.Value, fit.Shape.Value, meanVariance, shapeVariance, meanShapeCovariance);
            }

            IReadOnlyList<double> means  = results["Mean"].As<double>();
            IReadOnlyList<double> shapes = results["Shape"].As<double>();

            // Fitted parameters should be compatible with the true ones.
            Assert.IsTrue(means.PopulationMean().ConfidenceInterval(0.99).ClosedContains(wald.Mean));
            Assert.IsTrue(shapes.PopulationMean().ConfidenceInterval(0.99).ClosedContains(wald.Shape));

            // Observed scatter should be compatible with the median reported variance / covariance.
            Assert.IsTrue(means.PopulationVariance().ConfidenceInterval(0.99).ClosedContains(results["MeanVariance"].As<double>().Median()));
            Assert.IsTrue(shapes.PopulationVariance().ConfidenceInterval(0.99).ClosedContains(results["ShapeVariance"].As<double>().Median()));
            Assert.IsTrue(means.PopulationCovariance(shapes).ConfidenceInterval(0.99).ClosedContains(results["MeanShapeCovariance"].As<double>().Median()));
        }
示例#13
0
        // Smoke test: downloads a sea-ice CSV and runs the power spectrum, lag-12 autocovariance and
        // Ljung-Box test on its "Arctic" column. Nothing is asserted about the results.
        public void InternetTimeSeriesDownload()
        {
            Uri source = new Uri("https://timeseries.weebly.com/uploads/2/1/0/8/21086414/sea_ice.csv");
            FrameTable table = DownloadFrameTable(source);

            IReadOnlyList<double> arctic = table["Arctic"].As<double>();

            double[] powerSpectrum = arctic.PowerSpectrum();
            double v12 = arctic.Autocovariance(12);
            TestResult lbTest = arctic.LjungBoxTest();
        }
示例#14
0
        // Fits an AR(1) model to 128 generated series of length 24 and checks that the fitted
        // parameters are compatible with the generating values, that the reported uncertainties and
        // covariance match the observed scatter, and that the goodness-of-fit probabilities are
        // compatible with a uniform distribution.
        public void TimeSeriesFitAR1()
        {
            double alpha = 0.3;
            double mu    = 0.2;
            double sigma = 0.4;
            int    n     = 24;

            // For our fit to AR(1), we have incorporated bias correction (at least
            // for the most important parameter alpha), so we can do a small-n test.

            // Storing UncertainValue and SymmetricMatrix objects in columns is an onerous way to keep
            // the results, but it lets us test how the data-frame machinery deals with non-trivial
            // storage types.
            FrameTable data = new FrameTable();
            data.AddColumn<UncertainValue>("mu");
            data.AddColumn<UncertainValue>("alpha");
            data.AddColumn<UncertainValue>("sigma");
            data.AddColumn<SymmetricMatrix>("covariance");
            data.AddColumn<double>("p");

            for (int trial = 0; trial < 128; trial++)
            {
                TimeSeries series = GenerateAR1TimeSeries(alpha, mu, sigma, n, n * trial + 271828);
                AR1FitResult fit = series.FitToAR1();

                data.AddRow(fit.Mu, fit.Alpha, fit.Sigma, fit.Parameters.CovarianceMatrix, fit.GoodnessOfFit.Probability);
            }

            data.AddComputedColumn("alphaValue", r => ((UncertainValue)r["alpha"]).Value);
            data.AddComputedColumn("muValue", r => ((UncertainValue)r["mu"]).Value);

            // Best-fit values of each parameter across all fits.
            IReadOnlyList<double> muValues    = data["mu"].As((UncertainValue u) => u.Value);
            IReadOnlyList<double> alphaValues = data["alpha"].As((UncertainValue u) => u.Value);
            IReadOnlyList<double> sigmaValues = data["sigma"].As((UncertainValue u) => u.Value);

            // Check that fit parameters agree with inputs
            Assert.IsTrue(muValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(mu));
            Assert.IsTrue(alphaValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(alpha));
            Assert.IsTrue(sigmaValues.PopulationMean().ConfidenceInterval(0.99).ClosedContains(sigma));

            // For small n, the fitted alpha can vary considerably, and the formula for var(m) varies
            // quite strongly with alpha, so the computed var(m) have a very long tail. This pushes the
            // mean computed var(m) quite a bit higher than a typical value, so we use medians instead
            // of means for our best guess for the predicted variance.

            // Check that reported variances agree with actual variances
            double muUncertainty    = data["mu"].As((UncertainValue u) => u.Uncertainty).Median();
            Assert.IsTrue(muValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(muUncertainty));
            double alphaUncertainty = data["alpha"].As((UncertainValue u) => u.Uncertainty).Median();
            Assert.IsTrue(alphaValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(alphaUncertainty));
            double sigmaUncertainty = data["sigma"].As((UncertainValue u) => u.Uncertainty).Median();
            Assert.IsTrue(sigmaValues.PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(sigmaUncertainty));

            // Check that reported co-variances agree with actual co-variances
            double muAlphaCovariance = data["covariance"].As((SymmetricMatrix c) => c[0, 1]).Median();
            Assert.IsTrue(muValues.PopulationCovariance(alphaValues).ConfidenceInterval(0.99).ClosedContains(muAlphaCovariance));

            // The goodness-of-fit probabilities should not be rejected as uniform.
            TestResult ks = data["p"].As<double>().KolmogorovSmirnovTest(new UniformDistribution());
            Assert.IsTrue(ks.Probability > 0.05);
        }
示例#15
0
        // Demonstrates the ways of getting data into a FrameTable: from a local CSV file, from a CSV
        // file fetched over HTTP, from JSON deserialized into a list of dictionaries, and by defining
        // a schema and adding rows by hand (positionally and via a dictionary).
        public static void ImportingData()
        {
            // 1. Read a CSV file from disk and report what was inferred.
            FrameTable data;

            using (TextReader reader = File.OpenText("test.csv")) {
                data = FrameTable.FromCsv(reader);
            }

            Console.WriteLine($"Imported CSV file with {data.Rows.Count} rows.");
            Console.WriteLine("The names and types of the columns are:");
            foreach (FrameColumn column in data.Columns)
            {
                Console.WriteLine($"  {column.Name} of type {column.StorageType}");
            }

            // 2. Read a CSV file straight from a web response stream.
            FrameTable titanic;
            Uri        url     = new Uri("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv");
            WebRequest request = WebRequest.Create(url);

            using (WebResponse response = request.GetResponse()) {
                using (StreamReader reader = new StreamReader(response.GetResponseStream())) {
                    titanic = FrameTable.FromCsv(reader);
                }
            }

            // 3. Download JSON, deserialize it to dictionaries, and build a table from those.
            Uri    jsonUrl = new Uri("https://raw.githubusercontent.com/dcwuser/metanumerics/master/Examples/Data/example.json");
            string input;

            // WebClient is IDisposable; dispose it as soon as the download has completed.
            using (WebClient client = new WebClient()) {
                input = client.DownloadString(jsonUrl);
            }

            List<Dictionary<string, object>> output = JsonConvert.DeserializeObject<List<Dictionary<string, object>>>(input);
            FrameTable jsonExample = FrameTable.FromDictionaries(output);

            // 4. Build a table by hand.
            // Define the schema.
            FrameTable table = new FrameTable();

            table.AddColumn<int>("Id");
            table.AddColumn<string>("Name");
            table.AddColumn<string>("Sex");
            table.AddColumn<DateTime>("Birthdate");
            table.AddColumn<double>("Height");
            table.AddColumn<double?>("Weight");
            table.AddColumn<bool>("Result");

            // Add rows as arrays of objects, in column order.
            table.AddRow(1, "John", "M", DateTime.Parse("1970-01-02"), 190.0, 75.0, true);
            table.AddRow(2, "Mary", "F", DateTime.Parse("1980-02-03"), 155.0, null, true);

            // Add a row using a dictionary. This is more verbose, but very clear.
            table.AddRow(new Dictionary<string, object>()
            {
                { "Id", 3 },
                { "Name", null },
                { "Sex", "M" },
                { "Birthdate", DateTime.Parse("1990-03-04") },
                { "Height", 180.0 },
                { "Weight", 60.0 },
                { "Result", false }
            });
        }
示例#16
0
        // Demonstrates the frame-manipulation API on the example CSV data set: selecting and
        // discarding columns, adding a computed column, filtering out nulls, filtering by predicate,
        // ordering, and grouping with aggregates.
        public static void ManipulatingData()
        {
            Uri url = new Uri("https://raw.githubusercontent.com/dcwuser/metanumerics/master/Examples/Data/example.csv");
            WebRequest request = WebRequest.Create(url);

            // Load the table from the response stream.
            FrameTable table;
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                table = FrameTable.FromCsv(reader);
            }

            // Column selection: keep only some columns, or drop some.
            FrameView selected  = table.Select("Height", "Weight", "Sex");
            FrameView discarded = table.Discard("Name");

            // A computed column derived from other columns of the same row.
            table.AddComputedColumn("Bmi", row => ((double)row["Weight"]) / MoreMath.Sqr((double)row["Height"] / 100.0));
            Console.WriteLine($"Bmi of first subject is {table["Bmi"][0]}.");

            // Null filtering: across all columns, or only for the named columns.
            FrameView noNulls                = table.WhereNotNull();
            FrameView noNullWeights          = table.WhereNotNull("Weight");
            FrameView noNullWeightsOrHeights = table.WhereNotNull("Weight", "Height");

            FrameView weighed = table.WhereNotNull("Weight");
            double meanWeight = weighed.Columns["Weight"].As<double>().Mean();

            // Row filtering: by a single typed column, or by an arbitrary row predicate.
            FrameView men = table.Where<string>("Sex", sex => sex == "M");

            FrameView shortMen = table.Where(
                row => ((string)row["Sex"]) == "M" && ((double)row["Height"] < 175.0)
                );

            // Ordering: by column, by column descending, by typed comparison, or by row comparison.
            FrameView ordered     = table.OrderBy("Height");
            FrameView reversed    = table.OrderBy("Height", SortOrder.Descending);
            FrameView alsoOrdered = table.OrderBy<double>("Height", (left, right) => left.CompareTo(right));

            FrameView sorted = table.OrderBy((r1, r2) => {
                // Sort by sex first; break ties by height.
                int bySex    = ((string)r1["Sex"]).CompareTo((string)r2["Sex"]);
                int byHeight = ((double)r1["Height"]).CompareTo((double)r2["Height"]);
                if (bySex != 0)
                {
                    return bySex;
                }
                return byHeight;
            });

            // Distinct values and grouping.
            List<string> sexes = table["Sex"].As<string>().Distinct().ToList();

            FrameTable counts = table.GroupBy("Sex", group => group.Rows.Count, "Count");

            FrameTable summarize = table.GroupBy("Sex", group => {
                SummaryStatistics heights = new SummaryStatistics(group["Height"].As<double>());
                Dictionary<string, object> aggregates = new Dictionary<string, object>();
                aggregates.Add("Count", heights.Count);
                aggregates.Add("Mean", heights.Mean);
                aggregates.Add("StdDev", heights.StandardDeviation);
                return aggregates;
            });
        }
        // Fits a two-regressor linear model to 100 synthetic points through both the
        // MultivariateSample API and the frame-column API, and checks the parameter count, that the
        // confidence intervals of the fitted parameters contain the generating values, and that the
        // reported residuals equal observed minus predicted values.
        public void MultivariateLinearRegressionSimple()
        {
            // Model: y = a + b0 * x0 + b1 * x1 + noise
            double a  = 1.0;
            double b0 = -2.0;
            double b1 = 3.0;
            ContinuousDistribution x0distribution = new CauchyDistribution(10.0, 5.0);
            ContinuousDistribution x1distribution = new UniformDistribution(Interval.FromEndpoints(-10.0, 20.0));
            ContinuousDistribution noise          = new NormalDistribution(0.0, 10.0);

            // The same observations are stored in both containers.
            Random rng = new Random(1);
            MultivariateSample sample = new MultivariateSample("x0", "x1", "y");
            FrameTable table = new FrameTable();
            table.AddColumns<double>("x0", "x1", "y");

            for (int draw = 0; draw < 100; draw++)
            {
                // Draw order (x0, x1, noise) is fixed.
                double x0  = x0distribution.GetRandomValue(rng);
                double x1  = x1distribution.GetRandomValue(rng);
                double eps = noise.GetRandomValue(rng);
                double y   = a + b0 * x0 + b1 * x1 + eps;

                sample.Add(x0, x1, y);
                table.AddRow(x0, x1, y);
            }

            // Fit through the sample API (output variable at index 2) ...
            ParameterCollection oldResult = sample.LinearRegression(2).Parameters;

            // ... and through the frame-column API.
            IReadOnlyList<double> x0Column = table["x0"].As<double>();
            IReadOnlyList<double> x1Column = table["x1"].As<double>();
            MultiLinearRegressionResult newResult = table["y"].As<double>().MultiLinearRegression(x0Column, x1Column);

            // Both results have three parameters: two coefficients and an intercept.
            Assert.IsTrue(oldResult.Count == 3);
            Assert.IsTrue(newResult.Parameters.Count == 3);

            // The parameters should match the model
            Assert.IsTrue(oldResult[0].Estimate.ConfidenceInterval(0.90).ClosedContains(b0));
            Assert.IsTrue(oldResult[1].Estimate.ConfidenceInterval(0.90).ClosedContains(b1));
            Assert.IsTrue(oldResult[2].Estimate.ConfidenceInterval(0.90).ClosedContains(a));

            Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
            Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
            Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

            // The residuals should be compatible with the model predictions
            for (int index = 0; index < table.Rows.Count; index++)
            {
                FrameRow row = table.Rows[index];
                double predicted = newResult.Predict((double)row["x0"], (double)row["x1"]).Value;
                double observed  = (double)row["y"];
                Assert.IsTrue(TestUtilities.IsNearlyEqual(newResult.Residuals[index], observed - predicted));
            }
        }
示例#18
0
        public void LinearRegressionVariances()
        {
            // Run many independent linear regression fits on synthetic data and verify that
            // the observed scatter of the fitted values across fits is compatible with the
            // variances, covariance, and prediction uncertainty that the fits themselves report.

            Random rng = new Random(314159);

            // True line used to generate the data: y = trueIntercept + trueSlope * x + noise
            double trueIntercept = 2.0;
            double trueSlope     = -1.0;

            // One row per fit: fitted values alongside their claimed (co)variances
            FrameTable fits = new FrameTable();

            fits.AddColumns <double>("a", "va", "b", "vb", "abCov", "p", "dp");

            for (int fitIndex = 0; fitIndex < 128; fitIndex++)
            {
                // x's may be drawn from any distribution; the noise is drawn from a normal distribution
                ContinuousDistribution xDistribution     = new LogisticDistribution();
                ContinuousDistribution noiseDistribution = new NormalDistribution(0.0, 2.0);

                // Build a small synthetic data set scattered around the true line
                BivariateSample points = new BivariateSample();
                for (int pointIndex = 0; pointIndex < 12; pointIndex++)
                {
                    double x     = xDistribution.GetRandomValue(rng);
                    double noise = noiseDistribution.GetRandomValue(rng);
                    points.Add(x, trueIntercept + trueSlope * x + noise);
                }

                // Fit the line, and ask for a prediction at a fixed x
                LinearRegressionResult fit        = points.LinearRegression();
                UncertainValue         prediction = fit.Predict(12.0);

                // Record the fitted values and the uncertainties the fit claims for them
                Dictionary <string, object> record = new Dictionary <string, object>();
                record.Add("a", fit.Intercept.Value);
                record.Add("va", fit.Parameters.VarianceOf("Intercept"));
                record.Add("b", fit.Slope.Value);
                record.Add("vb", fit.Parameters.VarianceOf("Slope"));
                record.Add("abCov", fit.Parameters.CovarianceOf("Slope", "Intercept"));
                record.Add("p", prediction.Value);
                record.Add("dp", prediction.Uncertainty);
                fits.AddRow(record);
            }

            // Observed variance of each parameter should be compatible with the median claimed variance
            Assert.IsTrue(
                fits["a"].As <double>().PopulationVariance().ConfidenceInterval(0.99)
                    .ClosedContains(fits["va"].As <double>().Median())
                );
            Assert.IsTrue(
                fits["b"].As <double>().PopulationVariance().ConfidenceInterval(0.99)
                    .ClosedContains(fits["vb"].As <double>().Median())
                );

            // Same for the intercept-slope covariance
            Assert.IsTrue(
                fits["a"].As <double>().PopulationCovariance(fits["b"].As <double>()).ConfidenceInterval(0.99)
                    .ClosedContains(fits["abCov"].As <double>().Median())
                );

            // Observed scatter of the predictions should be compatible with the median claimed uncertainty
            Assert.IsTrue(
                fits["p"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99)
                    .ClosedContains(fits["dp"].As <double>().Median())
                );
        }
示例#19
0
        public void SmokeTest2()
        {
            // Smoke test: load a CSV file from disk and group its rows by the "carrier" column.
            //
            // The data file lives at a location specific to one developer's machine. When it
            // is not present, report the test as inconclusive instead of failing with a
            // file-not-found exception that says nothing about the code under test.
            string path = @"C:\Users\dcw-b\Desktop\DataSets\551184489_52017_210_airline_delay_causes\551184489_52017_210_airline_delay_causes.csv";

            if (!File.Exists(path))
            {
                Assert.Inconclusive("Test data file not found: " + path);
                return;
            }

            FrameTable frame;

            using (StreamReader stream = File.OpenText(path)) {
                frame = FrameTable.FromCsv(stream);
            }

            FrameView view = frame.GroupBy("carrier", (FrameView q) => q.Rows.Count, "count");

            // Grouping can never produce more groups than there are rows to group.
            Assert.IsTrue(view.Rows.Count <= frame.Rows.Count);
        }
示例#20
0
        /// <summary>
        /// Requests the given URL and parses the response body as CSV into a frame table.
        /// </summary>
        /// <param name="url">The location of the CSV resource to fetch.</param>
        /// <returns>The table produced by <c>FrameTable.FromCsv</c> from the response body.</returns>
        public FrameTable DownloadFrameTable(Uri url)
        {
            WebRequest webRequest = WebRequest.Create(url);

            // The response, its stream, and the reader are all disposed when the using block exits,
            // including when it exits via the return statement.
            using (WebResponse webResponse = webRequest.GetResponse())
            using (Stream body = webResponse.GetResponseStream())
            using (TextReader csvReader = new StreamReader(body))
            {
                return FrameTable.FromCsv(csvReader);
            }
        }
示例#21
0
        public void LinearLogisticRegressionVariances()
        {
            // Run many independent logistic regression fits on synthetic data and verify that the
            // observed scatter of the fitted values is compatible with the uncertainties the fits report.

            // Logistic model used to generate outcomes: P(y = true | x) = 1 / (1 + exp(-(intercept + slope * x)))
            double intercept = -2.0;
            double slope     = 1.0;
            ContinuousDistribution xDistribution = new StudentDistribution(2.0);

            // One row per fit: fitted values alongside their claimed uncertainties
            FrameTable fits = new FrameTable();

            fits.AddColumns <double>("a", "da", "b", "db", "abcov", "p", "dp");

            Random rng = new Random(3);

            for (int fitIndex = 0; fitIndex < 32; fitIndex++)
            {
                // Draw one synthetic sample from the model
                List <double> regressors = new List <double>();
                List <bool>   outcomes   = new List <bool>();

                for (int pointIndex = 0; pointIndex < 32; pointIndex++)
                {
                    double x           = xDistribution.GetRandomValue(rng);
                    double probability = 1.0 / (1.0 + Math.Exp(-(intercept + slope * x)));
                    bool   outcome     = rng.NextDouble() < probability;
                    regressors.Add(x);
                    outcomes.Add(outcome);
                }

                // Fit the logistic model, and ask for a predicted probability at a fixed x
                LinearLogisticRegressionResult fit = outcomes.LinearLogisticRegression(regressors);
                UncertainValue prediction = fit.Predict(1.0);

                fits.AddRow(
                    fit.Intercept.Value, fit.Intercept.Uncertainty,
                    fit.Slope.Value, fit.Slope.Uncertainty,
                    fit.Parameters.CovarianceMatrix[0, 1],
                    prediction.Value, prediction.Uncertainty
                    );
            }

            // Observed standard deviations of the estimates should be compatible with the mean claimed uncertainties
            Assert.IsTrue(
                fits["a"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99)
                    .ClosedContains(fits["da"].As <double>().Mean())
                );
            Assert.IsTrue(
                fits["b"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99)
                    .ClosedContains(fits["db"].As <double>().Mean())
                );

            // Same for the intercept-slope covariance
            Assert.IsTrue(
                fits["a"].As <double>().PopulationCovariance(fits["b"].As <double>()).ConfidenceInterval(0.99)
                    .ClosedContains(fits["abcov"].As <double>().Mean())
                );

            // Same for the predicted probability
            Assert.IsTrue(
                fits["p"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99)
                    .ClosedContains(fits["dp"].As <double>().Mean())
                );
        }
示例#22
0
        public void FrameTableCsvRoundtrip2()
        {
            // Write a table containing a variety of column types (including nullable ones
            // holding nulls) to CSV, read it back, and verify that the copy has the same
            // column names, storage types, and cell values as the original.

            // Let's exercise all our data adaptors
            FrameTable original = new FrameTable();

            original.AddColumn <string>("String");
            original.AddColumn <double?>("Double?");
            original.AddColumn <int>("Int");
            original.AddColumn <DateTime?>("DateTime?");
            original.AddColumn <TimeSpan>("TimeSpan");
            original.AddColumn <Boolean?>("Boolean?");

            original.AddRow("z", null, 1, DateTime.Today, TimeSpan.FromMinutes(5.0), true);
            original.AddRow("y", 4.3, 2, null, TimeSpan.FromHours(4.0), null);
            original.AddRow("x", 2.0, 3, DateTime.UtcNow.Date, TimeSpan.FromDays(3.0), false);

            // Serialize to an in-memory CSV document
            string csv;
            using (TextWriter storage = new StringWriter())
            {
                original.ToCsv(storage);
                csv = storage.ToString();
            }

            // Deserialize it again
            FrameTable copy;
            using (TextReader reader = new StringReader(csv))
            {
                copy = FrameTable.FromCsv(reader);
            }

            // The copy must have the same shape before we compare element by element
            Assert.IsTrue(copy.Columns.Count == original.Columns.Count);
            Assert.IsTrue(copy.Rows.Count == original.Rows.Count);

            for (int i = 0; i < original.Columns.Count; i++)
            {
                Assert.IsTrue(original.Columns[i].Name == copy.Columns[i].Name);
                Assert.IsTrue(original.Columns[i].StorageType == copy.Columns[i].StorageType);
            }

            for (int i = 0; i < original.Rows.Count; i++)
            {
                for (int j = 0; j < original.Columns.Count; j++)
                {
                    // Cell values come back as boxed objects, so == would be a reference
                    // comparison that fails even for equal values, and the instance Equals
                    // method cannot be called on a null cell. The static object.Equals
                    // handles both: it is true when both sides are null, false when only
                    // one is, and otherwise defers to the virtual Equals of the value.
                    // (The previous null branch re-checked the original cell against null
                    // and so never looked at the copy at all.)
                    Assert.IsTrue(object.Equals(original.Rows[i][j], copy.Rows[i][j]));
                }
            }
        }
        public void BivariateNullAssociation()
        {
            // Draw many bivariate samples in which x and y are generated independently, run the
            // Pearson, Spearman, and Kendall tests on each, and verify that the observed test
            // statistics follow the distributions that the tests claim for them.

            Random rng = new Random(31415926);

            // One row per sample, one column per test statistic
            FrameTable statistics = new FrameTable();

            statistics.AddColumn <double>("r");
            statistics.AddColumn <double>("ρ");
            statistics.AddColumn <double>("τ");

            // The claimed distribution of each statistic, overwritten on every iteration
            // so the values from the final sample are the ones used below
            ContinuousDistribution pearsonDistribution  = null;
            ContinuousDistribution spearmanDistribution = null;
            ContinuousDistribution kendallDistribution  = null;

            // x and y are drawn separately from these two distributions
            ContinuousDistribution xDistribution = new LognormalDistribution();
            ContinuousDistribution yDistribution = new CauchyDistribution();

            for (int sampleIndex = 0; sampleIndex < 100; sampleIndex++)
            {
                List <double> xs = new List <double>();
                List <double> ys = new List <double>();
                for (int pointIndex = 0; pointIndex < 100; pointIndex++)
                {
                    xs.Add(xDistribution.GetRandomValue(rng));
                    ys.Add(yDistribution.GetRandomValue(rng));
                }

                TestResult pearson  = Bivariate.PearsonRTest(xs, ys);
                TestResult spearman = Bivariate.SpearmanRhoTest(xs, ys);
                TestResult kendall  = Bivariate.KendallTauTest(xs, ys);

                pearsonDistribution  = pearson.Statistic.Distribution;
                spearmanDistribution = spearman.Statistic.Distribution;
                kendallDistribution  = kendall.Statistic.Distribution;

                Dictionary <string, object> record = new Dictionary <string, object>();
                record.Add("r", pearson.Statistic.Value);
                record.Add("ρ", spearman.Statistic.Value);
                record.Add("τ", kendall.Statistic.Value);
                statistics.AddRow(record);
            }

            // Each column of observed statistics should be compatible with its claimed distribution
            Assert.IsTrue(statistics["r"].As <double>().KolmogorovSmirnovTest(pearsonDistribution).Probability > 0.05);
            Assert.IsTrue(statistics["ρ"].As <double>().KolmogorovSmirnovTest(spearmanDistribution).Probability > 0.05);
            Assert.IsTrue(statistics["τ"].As <double>().KolmogorovSmirnovTest(kendallDistribution).Probability > 0.05);
        }
示例#24
0
        public void FrameTableColumnManipulations()
        {
            // Verify that columns can be added, looked up by name, and removed,
            // and that the column count tracks each change.
            FrameTable table = new FrameTable();

            table.AddColumn <int>("Integer");
            table.AddColumn <double>("Double");
            Assert.IsTrue(table.Columns.Count == 2);

            // Looking a column up by its name should give a column with the same name and storage type
            Assert.IsTrue(
                table.Columns[0].Name == table[table.Columns[0].Name].Name
                );
            Assert.IsTrue(
                table.Columns[0].StorageType == table[table.Columns[0].Name].StorageType
                );

            // Adding a column grows the table
            table.AddColumn <DateTime>("Timestamp");
            Assert.IsTrue(table.Columns.Count == 3);

            // Removing the first column by name shrinks it again
            table.RemoveColumn(table.Columns[0].Name);
            Assert.IsTrue(table.Columns.Count == 2);
        }
示例#25
0
        public void BivariateNonlinearFitVariances()
        {
            // Fit the power law y = a * x^b to many noisy synthetic data sets and verify that
            //   - the fitted parameters cluster around the true values, and
            //   - the covariances claimed by the fits reflect the covariances actually observed.

            double trueA = 2.7;
            double trueB = 3.1;

            ContinuousDistribution xDistribution     = new ExponentialDistribution(2.0);
            ContinuousDistribution errorDistribution = new NormalDistribution(0.0, 4.0);

            // Fitted (a, b) from each fit
            FrameTable fitted = new FrameTable();

            fitted.AddColumns <double>("a", "b");

            // Claimed var(a), var(b), cov(a, b) from each fit, in that column order
            MultivariateSample claimed = new MultivariateSample(3);

            for (int fitIndex = 0; fitIndex < 64; fitIndex++)
            {
                // Each fit gets its own generator, seeded by the fit index
                Random          rng    = new Random(fitIndex);
                BivariateSample points = new BivariateSample();
                for (int pointIndex = 0; pointIndex < 8; pointIndex++)
                {
                    double x     = xDistribution.GetRandomValue(rng);
                    double error = errorDistribution.GetRandomValue(rng);
                    points.Add(x, trueA * Math.Pow(x, trueB) + error);
                }

                // Fit the power law, starting from (1, 1)
                NonlinearRegressionResult fit = points.NonlinearRegression(
                    (IReadOnlyList <double> c, double t) => c[0] * Math.Pow(t, c[1]),
                    new double[] { 1.0, 1.0 }
                    );

                fitted.AddRow(fit.Parameters.ValuesVector);
                claimed.Add(
                    fit.Parameters.CovarianceMatrix[0, 0],
                    fit.Parameters.CovarianceMatrix[1, 1],
                    fit.Parameters.CovarianceMatrix[0, 1]
                    );
            }

            // Fitted parameters should be compatible with the true values
            Assert.IsTrue(fitted["a"].As <double>().PopulationMean().ConfidenceInterval(0.99).ClosedContains(trueA));
            Assert.IsTrue(fitted["b"].As <double>().PopulationMean().ConfidenceInterval(0.99).ClosedContains(trueB));

            // Observed variances should be compatible with the mean claimed variances
            Assert.IsTrue(
                fitted["a"].As <double>().PopulationVariance().ConfidenceInterval(0.99)
                    .ClosedContains(claimed.Column(0).Mean)
                );
            Assert.IsTrue(
                fitted["b"].As <double>().PopulationVariance().ConfidenceInterval(0.99)
                    .ClosedContains(claimed.Column(1).Mean)
                );

            // Observed covariance should be compatible with the mean claimed covariance,
            // checked through both the extension-method and the static call forms
            Assert.IsTrue(
                fitted["a"].As <double>().PopulationCovariance(fitted["b"].As <double>()).ConfidenceInterval(0.99)
                    .ClosedContains(claimed.Column(2).Mean)
                );
            Assert.IsTrue(
                Bivariate.PopulationCovariance(fitted["a"].As <double>(), fitted["b"].As <double>()).ConfidenceInterval(0.99)
                    .ClosedContains(claimed.Column(2).Mean)
                );
        }
示例#26
0
        public void Smoketest()
        {
            // Smoke test: download a public CSV data set, add a computed column,
            // and run a couple of group-by aggregations over it. Nothing is asserted;
            // the test passes as long as no step throws.
            string     url        = "https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/tips.csv";
            WebRequest webRequest = WebRequest.Create(url);

            FrameTable tips;

            using (WebResponse webResponse = webRequest.GetResponse())
            using (Stream body = webResponse.GetResponseStream())
            using (TextReader csvReader = new StreamReader(body))
            {
                tips = FrameTable.FromCsv(csvReader);
            }

            // Tip as a fraction of the total bill, computed per row
            tips.AddComputedColumn("tip_fraction", row => ((double)row["tip"]) / ((double)row["total_bill"]));

            // Number of rows per day, ordered by day
            FrameView counts = tips.GroupBy("day", g => g.Rows.Count, "total").OrderBy("day");

            // Mean tip fraction per sex
            FrameView means = tips.GroupBy("sex", g => g["tip_fraction"].As <double>().Mean(), "mean_tip_fraction");
        }
示例#27
0
        public void FrameTableRowManipulations()
        {
            // Verify that a row supplied as a dictionary is accepted when its keys match the
            // table's columns, and rejected with an exception when it has too few or too many entries.

            // In future, test with computed columns
            FrameTable frame = new FrameTable();

            frame.AddColumn <double>("Height");
            frame.AddColumn <string>("Name");

            Assert.IsTrue(frame.Columns.Count == 2);
            Assert.IsTrue(frame.Rows.Count == 0);

            // Insert a row
            Dictionary <string, object> row = new Dictionary <string, object>()
            {
                { "Name", "John" }, { "Height", 1.1 }
            };

            frame.AddRow(row);
            Assert.IsTrue(frame.Rows.Count == 1);

            // Try to insert a row with missing values
            Dictionary <string, object> smallRow = new Dictionary <string, object>()
            {
                { "Name", "Mark" }
            };

            // The check must happen outside the try block: a failing assertion signals
            // failure by throwing, and a catch (Exception) around it would swallow that
            // exception too, so the test could never fail.
            bool smallRowRejected = false;
            try {
                frame.AddRow(smallRow);
            } catch (Exception) {
                smallRowRejected = true;
            }
            Assert.IsTrue(smallRowRejected);

            // Try to insert a row with too many values
            Dictionary <string, object> bigRow = new Dictionary <string, object>()
            {
                { "Name", "Luke" }, { "Height", 1.2 }, { "Weight", 60.0 }
            };

            bool bigRowRejected = false;
            try {
                frame.AddRow(bigRow);
            } catch (Exception) {
                bigRowRejected = true;
            }
            Assert.IsTrue(bigRowRejected);
        }
示例#28
0
        // Builds a small table used as a fixture by other tests: seven rows with
        // name, height, weight, and male columns (some names and one male value
        // are null), plus a computed bmi column equal to weight / height^2.
        private FrameTable GetTestFrame()
        {
            FrameTable fixture = new FrameTable();

            fixture.AddColumn <string>("name");
            fixture.AddColumn <double>("height");
            fixture.AddColumn <double>("weight");
            fixture.AddColumn <bool?>("male");

            // Row values, in column order: name, height, weight, male
            object[][] rows = new object[][] {
                new object[] { "a", 7.0, 10.0, false },
                new object[] { null, 6.5, 11.0, true },
                new object[] { "c", 6.0, 12.0, false },
                new object[] { "d", 5.5, 11.0, true },
                new object[] { "e", 5.0, 12.0, null },
                new object[] { "f", 4.5, 13.0, true },
                new object[] { null, 4.0, 12.0, false }
            };

            foreach (object[] values in rows)
            {
                fixture.AddRow(values);
            }

            fixture.AddComputedColumn("bmi", row => ((double)row["weight"]) / MoreMath.Sqr((double)row["height"]));

            return fixture;
        }
示例#29
0
        public void FrameViewGroupByClause()
        {
            // Group the test frame by the "male" column, aggregating mean height, and verify
            // that there is one group per distinct key and that each group's aggregate equals
            // the mean height computed directly over the rows having that key.
            FrameView source = GetTestFrame();

            // The distinct values present in the grouping column
            HashSet <bool?> distinctKeys = new HashSet <bool?>(source["male"].As <bool?>().Distinct());

            FrameTable groups = source.GroupBy("male", g => g["height"].As <double>().Mean(), "meanHeight");

            Assert.IsTrue(groups.Rows.Count == distinctKeys.Count);

            foreach (FrameRow groupRow in groups.Rows)
            {
                bool? key = (bool?)groupRow["male"];
                Assert.IsTrue(distinctKeys.Contains(key));

                // Compare as bool? rather than as object: == on two boxed objects is a
                // reference comparison, which is false even when the boxed values are equal.
                FrameView members        = source.Where(r => (bool?)r["male"] == key);
                double    expectedHeight = members["height"].As <double>().Mean();

                Assert.IsTrue((double)groupRow["meanHeight"] == expectedHeight);
            }
        }
        public void MeansClustering2()
        {
            // Generate points scattered around four known centers, cluster them, and verify that
            //   - every computed centroid lies near one of the known centers, and
            //   - at least half of the points are classified into the cluster they were generated from.

            // The known centers: the origin, and a point two units along each axis
            ColumnVector[] centers = new ColumnVector[] {
                new ColumnVector(0.0, 0.0, 0.0),
                new ColumnVector(2.0, 0.0, 0.0),
                new ColumnVector(0.0, 2.0, 0.0),
                new ColumnVector(0.0, 0.0, 2.0)
            };

            // A table with one double column per coordinate
            FrameTable table = new FrameTable();

            foreach (string columnName in new string[] { "a", "b", "c" })
            {
                table.AddColumn <double>(columnName);
            }

            // Generate the points, remembering which center each one came from
            List <int>          trueClusters = new List <int>();
            List <ColumnVector> points       = new List <ColumnVector>();
            Random rng = new Random(2);
            ContinuousDistribution scatter = new NormalDistribution(0.0, 1.0);

            for (int pointIndex = 0; pointIndex < 100; pointIndex++)
            {
                // Pick a center at random, then displace each coordinate by normal noise
                int trueCluster = rng.Next(0, centers.Length);
                trueClusters.Add(trueCluster);

                ColumnVector point = centers[trueCluster].Copy();
                for (int coordinate = 0; coordinate < point.Dimension; coordinate++)
                {
                    point[coordinate] += scatter.GetRandomValue(rng);
                }

                points.Add(point);
                table.AddRow <double>(point);
            }

            // Cluster the table's columns into as many clusters as there are centers
            MeansClusteringResult clustering = table.AsColumns <double>().MeansClustering(centers.Length);

            // Classify every generated point with the computed clustering
            List <int> assignedClusters = new List <int>();

            foreach (ColumnVector point in points)
            {
                assignedClusters.Add(clustering.Classify(point));
            }

            // The clustering numbers its clusters independently of our centers, so map each
            // computed centroid to the nearest known center
            Dictionary <int, int> clusterToCenter = new Dictionary <int, int>();

            for (int clusterIndex = 0; clusterIndex < clustering.Count; clusterIndex++)
            {
                ColumnVector centroid        = clustering.Centroid(clusterIndex);
                int          nearestCenter   = -1;
                double       nearestDistance = double.MaxValue;

                for (int centerIndex = 0; centerIndex < centers.Length; centerIndex++)
                {
                    double distance = (centroid - centers[centerIndex]).Norm();
                    if (distance < nearestDistance)
                    {
                        nearestCenter   = centerIndex;
                        nearestDistance = distance;
                    }
                }

                // A nearest center must exist, and the centroid must be close to it
                Assert.IsTrue(nearestCenter >= 0);
                Assert.IsTrue(nearestDistance < 1.0);
                clusterToCenter.Add(clusterIndex, nearestCenter);
            }

            // Count the points whose assigned cluster maps back to the center they came from
            int correctCount = 0;

            for (int pointIndex = 0; pointIndex < assignedClusters.Count; pointIndex++)
            {
                int recoveredCenter = clusterToCenter[assignedClusters[pointIndex]];
                if (recoveredCenter == trueClusters[pointIndex])
                {
                    correctCount++;
                }
            }

            Assert.IsTrue(correctCount >= 0.50 * assignedClusters.Count);
        }