Exemple #1
0
        /// <summary>
        /// Calls a broad set of members on the <c>_df</c> fixture, one overload per line.
        /// Only the schema null check and the column-count check assert on a result; every
        /// other return value is discarded, so those calls are verified only by completing
        /// without throwing.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the method name suggests these are the signatures available as of
        /// Spark 2.3.x; confirm against the enclosing test class.
        /// </remarks>
        public void TestSignaturesV2_3_X()
        {
            // Column lookup through the indexer; the local is reassigned only to exercise
            // the call a second time and is never read.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF without arguments, then with two new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            // Explain: parameterless form plus both values of the boolean argument.
            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The fixture is expected to expose exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // The temporary directory lives only for the duration of the checkpoint calls.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            // NOTE(review): "time" is not among the columns referenced elsewhere in this
            // test ("name", "age"); presumably only the signature is being exercised - confirm.
            _df.WithWatermark("time", "10 minutes");

            // Show: no arguments, then one, two and three arguments.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: no condition, single column name, column-name array
            // (with and without a join type), and Column expression (with and without a join type).
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting members: each is called with one and two string names, with no
            // arguments, and with one and two Column arguments.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hint: name only, then name plus an array of parameters.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Projection: string-name and Column forms of Select, then SelectExpr
            // with zero, one and two expression strings.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filter and Where are each called with a Column condition and a string condition.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping members: same argument pattern as the sorting members above
            // (string names, no arguments, Column arguments).
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Agg with one and with two aggregate expressions.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations, each applied to the frame and itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sample: fraction only, fraction plus a boolean, and fraction plus a boolean and a seed value.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            // RandomSplit: weights only, then weights plus a seed value.
            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            // Drop: zero, one and two column names, then a Column argument.
            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row retrieval members; the ToArray() calls enumerate the returned sequences.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Repartition: count only, count plus Column, Column only, and no arguments.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            // Persist and Cache are called before Unpersist.
            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // The non-replacing Create* variant runs first, followed by the CreateOrReplace*
            // variant with the same view name.
            // NOTE(review): presumably the non-replacing variants fail if the view already
            // exists, which would make this order significant - confirm.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
        /// <summary>
        /// Builds a seven-row frame with five columns, calls <c>Describe</c> for the
        /// <c>product_id</c>, <c>quantity</c> and <c>state</c> columns, and checks the
        /// resulting 11 x 3 frame row by row: Count, Unique, Top, Freq, Mean, Std, Min,
        /// 25%, Median, 75% and Max.
        /// </summary>
        /// <remarks>
        /// The expected literals keep their exact types (<c>int</c>, <c>double</c>,
        /// <c>string</c>) because the cell indexer result is compared as-is.
        /// NOTE(review): the meaning of the leading <c>false</c> argument to
        /// <c>Describe</c> is not visible from this test; confirm against its definition.
        /// </remarks>
        public void Describe_Test01()
        {
            // Source data: one list per column, seven values each.
            var dict = new Dictionary<string, List<object>>
            {
                { "product_id", new List<object> { 1, 1, 2, 2, 2, 2, 2 } },
                { "retail_price", new List<object> { 2, 2, 5, 5, 5, 5, 5 } },
                { "quantity", new List<object> { 1, 2, 4, 8, 16, 32, 64 } },
                { "city", new List<object> { "SF", "SJ", "SF", "SJ", "Miami", "Orlando", "SJ" } },
                { "state", new List<object> { "CA", "CA", "CA", "CA", "FL", "FL", "PR" } },
            };

            var df = new DataFrame(dict);
            var descDf = df.Describe(false, "product_id", "quantity", "state");

            // Assert.Equal reports expected and actual values on failure,
            // which Assert.True(a == b) does not.
            Assert.Equal(11, descDf.RowCount());
            Assert.Equal(3, descDf.ColCount());

            // The result is not inspected; the call is kept so that rendering the
            // described frame is still exercised.
            descDf.ToStringBuilder();

            Assert.Equal("Count", descDf.Index[0]);
            Assert.Equal(7, descDf["product_id", 0]);
            Assert.Equal(7, descDf["quantity", 0]);
            Assert.Equal(7, descDf["state", 0]);

            Assert.Equal("Unique", descDf.Index[1]);
            Assert.Equal(2, descDf["product_id", 1]);
            Assert.Equal(7, descDf["quantity", 1]);
            Assert.Equal(3, descDf["state", 1]);

            Assert.Equal("Top", descDf.Index[2]);
            Assert.Equal(2, descDf["product_id", 2]);
            Assert.Equal(1, descDf["quantity", 2]);
            Assert.Equal("CA", descDf["state", 2]);

            Assert.Equal("Freq", descDf.Index[3]);
            Assert.Equal(5, descDf["product_id", 3]);
            Assert.Equal(1, descDf["quantity", 3]);
            Assert.Equal(4, descDf["state", 3]);

            // From the Mean row down, the "state" column is expected to hold DataFrame.NAN.
            Assert.Equal("Mean", descDf.Index[4]);
            Assert.Equal(1.714286, descDf["product_id", 4]);
            Assert.Equal(18.142857, descDf["quantity", 4]);
            Assert.Equal(DataFrame.NAN, descDf["state", 4]);

            Assert.Equal("Std", descDf.Index[5]);
            Assert.Equal(0.487950, descDf["product_id", 5]);
            Assert.Equal(22.937804, descDf["quantity", 5]);
            Assert.Equal(DataFrame.NAN, descDf["state", 5]);

            Assert.Equal("Min", descDf.Index[6]);
            Assert.Equal(1, descDf["product_id", 6]);
            Assert.Equal(1, descDf["quantity", 6]);
            Assert.Equal(DataFrame.NAN, descDf["state", 6]);

            Assert.Equal("25%", descDf.Index[7]);
            Assert.Equal(1.5, descDf["product_id", 7]);
            Assert.Equal(3.0, descDf["quantity", 7]);
            Assert.Equal(DataFrame.NAN, descDf["state", 7]);

            Assert.Equal("Median", descDf.Index[8]);
            Assert.Equal(2.0, descDf["product_id", 8]);
            Assert.Equal(8.0, descDf["quantity", 8]);
            Assert.Equal(DataFrame.NAN, descDf["state", 8]);

            Assert.Equal("75%", descDf.Index[9]);
            Assert.Equal(2d, descDf["product_id", 9]);
            Assert.Equal(24d, descDf["quantity", 9]);
            Assert.Equal(DataFrame.NAN, descDf["state", 9]);

            Assert.Equal("Max", descDf.Index[10]);
            Assert.Equal(2, descDf["product_id", 10]);
            Assert.Equal(64, descDf["quantity", 10]);
            Assert.Equal(DataFrame.NAN, descDf["state", 10]);
        }