Esempio n. 1
0
        /// <summary>
        /// Runs the grouped-map vector UDF example: reads the JSON file named by the
        /// single command-line argument, prints its schema and rows, then shows the
        /// result of applying <c>CountCharacters</c> to each "age" group.
        /// </summary>
        /// <param name="args">
        /// Expected to hold exactly one element, the path to the input JSON file.
        /// Any other length writes the usage text to stderr and exits with code 1.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            // A small shuffle partition count keeps the groupBy() below quick.
            SparkSession session = SparkSession
                .Builder()
                .Config("spark.sql.shuffle.partitions", "3")
                .AppName("SQL VectorUdfs example using .NET for Apache Spark")
                .GetOrCreate();

            string inputPath = args[0];
            DataFrame people = session.Read().Schema("age INT, name STRING").Json(inputPath);

            // Dump the schema (compact form), the rows, and the schema tree.
            Console.WriteLine(people.Schema().SimpleString);
            people.Show();
            people.PrintSchema();

            // Grouped Map Vector UDF: the returned batches may differ from the
            // input in both shape and number of records.
            var groupedByAge = people.GroupBy("age");
            var outputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            });

            groupedByAge
                .Apply(outputSchema, batch => CountCharacters(batch))
                .Show();

            session.Stop();
        }
Esempio n. 2
0
        /// <summary>
        /// Walks the DataFrame API exposed by <c>_df</c> one overload at a time and
        /// asserts the runtime type of every returned value. Members that return
        /// nothing (PrintSchema, Explain, Show, the *TempView methods) are only invoked.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Column indexer.
            Assert.IsType<Column>(_df["name"]);
            Assert.IsType<Column>(_df["age"]);

            // ToDF with and without new column names.
            Assert.IsType<DataFrame>(_df.ToDF());
            Assert.IsType<DataFrame>(_df.ToDF("name2", "age2"));

            // Schema inspection.
            Assert.NotNull(_df.Schema());

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            var columnNames = _df.Columns().ToArray();
            Assert.Equal(2, columnNames.Length);

            var expectedDTypes = new List<Tuple<string, string>>
            {
                Tuple.Create("age", "integer"),
                Tuple.Create("name", "string")
            };
            Assert.Equal(expectedDTypes, _df.DTypes());

            Assert.IsType<bool>(_df.IsLocal());
            Assert.IsType<bool>(_df.IsStreaming());

            // Checkpointing; the directory is disposed as soon as this block ends.
            using (var checkpointDir = new TemporaryDirectory())
            {
                // A checkpoint directory has to be set before calling *Checkpoint().
                _spark.SparkContext.SetCheckpointDir(checkpointDir.Path);

                Assert.IsType<DataFrame>(_df.Checkpoint());
                Assert.IsType<DataFrame>(_df.Checkpoint(false));

                Assert.IsType<DataFrame>(_df.LocalCheckpoint());
                Assert.IsType<DataFrame>(_df.LocalCheckpoint(false));
            }

            Assert.IsType<DataFrame>(_df.WithWatermark("time", "10 minutes"));

            // Show overloads.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Joins.
            Assert.IsType<DataFrame>(_df.Join(_df));
            Assert.IsType<DataFrame>(_df.Join(_df, "name"));
            Assert.IsType<DataFrame>(_df.Join(_df, new[] { "name" }));
            Assert.IsType<DataFrame>(_df.Join(_df, new[] { "name" }, "outer"));
            Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
            Assert.IsType<DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));

            Assert.IsType<DataFrame>(_df.CrossJoin(_df));

            // Sorting: SortWithinPartitions, Sort, OrderBy.
            Assert.IsType<DataFrame>(_df.SortWithinPartitions("age"));
            Assert.IsType<DataFrame>(_df.SortWithinPartitions("age", "name"));
            Assert.IsType<DataFrame>(_df.SortWithinPartitions());
            Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"]));
            Assert.IsType<DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

            Assert.IsType<DataFrame>(_df.Sort("age"));
            Assert.IsType<DataFrame>(_df.Sort("age", "name"));
            Assert.IsType<DataFrame>(_df.Sort());
            Assert.IsType<DataFrame>(_df.Sort(_df["age"]));
            Assert.IsType<DataFrame>(_df.Sort(_df["age"], _df["name"]));

            Assert.IsType<DataFrame>(_df.OrderBy("age"));
            Assert.IsType<DataFrame>(_df.OrderBy("age", "name"));
            Assert.IsType<DataFrame>(_df.OrderBy());
            Assert.IsType<DataFrame>(_df.OrderBy(_df["age"]));
            Assert.IsType<DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

            // Hints, column lookup and aliasing.
            Assert.IsType<DataFrame>(_df.Hint("broadcast"));
            Assert.IsType<DataFrame>(_df.Hint("broadcast", new[] { "hello", "world" }));

            Assert.IsType<Column>(_df.Col("age"));
            Assert.IsType<Column>(_df.ColRegex("age"));

            Assert.IsType<DataFrame>(_df.As("alias"));
            Assert.IsType<DataFrame>(_df.Alias("alias"));

            // Projection.
            Assert.IsType<DataFrame>(_df.Select("age"));
            Assert.IsType<DataFrame>(_df.Select("age", "name"));
            Assert.IsType<DataFrame>(_df.Select());
            Assert.IsType<DataFrame>(_df.Select(_df["age"]));
            Assert.IsType<DataFrame>(_df.Select(_df["age"], _df["name"]));

            Assert.IsType<DataFrame>(_df.SelectExpr());
            Assert.IsType<DataFrame>(_df.SelectExpr("age * 2"));
            Assert.IsType<DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

            // Filtering.
            Assert.IsType<DataFrame>(_df.Filter(_df["age"] > 21));
            Assert.IsType<DataFrame>(_df.Filter("age > 21"));

            Assert.IsType<DataFrame>(_df.Where(_df["age"] > 21));
            Assert.IsType<DataFrame>(_df.Where("age > 21"));

            // Grouping.
            Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age"));
            Assert.IsType<RelationalGroupedDataset>(_df.GroupBy("age", "name"));
            Assert.IsType<RelationalGroupedDataset>(_df.GroupBy());
            Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
            Assert.IsType<RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

            // Aggregations on a grouped dataset; "tempAge" is a copy of "age" so the
            // two-column overloads have a second column to work with.
            RelationalGroupedDataset groupedByName =
                _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

            Assert.IsType<DataFrame>(groupedByName.Mean("age"));
            Assert.IsType<DataFrame>(groupedByName.Mean("age", "tempAge"));

            Assert.IsType<DataFrame>(groupedByName.Max("age"));
            Assert.IsType<DataFrame>(groupedByName.Max("age", "tempAge"));

            Assert.IsType<DataFrame>(groupedByName.Avg("age"));
            Assert.IsType<DataFrame>(groupedByName.Avg("age", "tempAge"));

            Assert.IsType<DataFrame>(groupedByName.Min("age"));
            Assert.IsType<DataFrame>(groupedByName.Min("age", "tempAge"));

            Assert.IsType<DataFrame>(groupedByName.Sum("age"));
            Assert.IsType<DataFrame>(groupedByName.Sum("age", "tempAge"));

            // Rollup and Cube.
            Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age"));
            Assert.IsType<RelationalGroupedDataset>(_df.Rollup("age", "name"));
            Assert.IsType<RelationalGroupedDataset>(_df.Rollup());
            Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"]));
            Assert.IsType<RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

            Assert.IsType<RelationalGroupedDataset>(_df.Cube("age"));
            Assert.IsType<RelationalGroupedDataset>(_df.Cube("age", "name"));
            Assert.IsType<RelationalGroupedDataset>(_df.Cube());
            Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"]));
            Assert.IsType<RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

            // Whole-frame aggregation.
            Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"])));
            Assert.IsType<DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

            // Limit and set operations.
            Assert.IsType<DataFrame>(_df.Limit(10));
            Assert.IsType<DataFrame>(_df.Union(_df));
            Assert.IsType<DataFrame>(_df.UnionByName(_df));
            Assert.IsType<DataFrame>(_df.Intersect(_df));
            Assert.IsType<DataFrame>(_df.Except(_df));

            // Sampling and splitting.
            Assert.IsType<DataFrame>(_df.Sample(0.5));
            Assert.IsType<DataFrame>(_df.Sample(0.5, true));
            Assert.IsType<DataFrame>(_df.Sample(0.5, false, 12345));

            double[] splitWeights = { 0.2, 0.8 };
            Assert.IsType<DataFrame[]>(_df.RandomSplit(splitWeights));
            Assert.IsType<DataFrame[]>(_df.RandomSplit(splitWeights, 12345));

            // Adding, renaming and dropping columns.
            Assert.IsType<DataFrame>(_df.WithColumn("age2", _df["age"]));
            Assert.IsType<DataFrame>(_df.WithColumnRenamed("age", "age2"));

            Assert.IsType<DataFrame>(_df.Drop());
            Assert.IsType<DataFrame>(_df.Drop("age"));
            Assert.IsType<DataFrame>(_df.Drop("age", "name"));
            Assert.IsType<DataFrame>(_df.Drop(_df["age"]));

            Assert.IsType<DataFrame>(_df.DropDuplicates());
            Assert.IsType<DataFrame>(_df.DropDuplicates("age"));
            Assert.IsType<DataFrame>(_df.DropDuplicates("age", "name"));

            // Statistics.
            Assert.IsType<DataFrame>(_df.Describe());
            Assert.IsType<DataFrame>(_df.Describe("age"));
            Assert.IsType<DataFrame>(_df.Describe("age", "name"));

            Assert.IsType<DataFrame>(_df.Summary());
            Assert.IsType<DataFrame>(_df.Summary("count"));
            Assert.IsType<DataFrame>(_df.Summary("count", "mean"));

            // Fetching rows back to the caller.
            Assert.IsType<Row[]>(_df.Head(2).ToArray());
            Assert.IsType<Row>(_df.Head());
            Assert.IsType<Row>(_df.First());

            Assert.IsType<DataFrame>(_df.Transform(frame => frame.Drop("age")));

            Assert.IsType<Row[]>(_df.Take(3).ToArray());
            Assert.IsType<Row[]>(_df.Collect().ToArray());
            Assert.IsType<Row[]>(_df.ToLocalIterator().ToArray());

            Assert.IsType<long>(_df.Count());

            // Partitioning.
            Assert.IsType<DataFrame>(_df.Repartition(2));
            Assert.IsType<DataFrame>(_df.Repartition(2, _df["age"]));
            Assert.IsType<DataFrame>(_df.Repartition(_df["age"]));
            Assert.IsType<DataFrame>(_df.Repartition());

            Assert.IsType<DataFrame>(_df.RepartitionByRange(2, _df["age"]));
            Assert.IsType<DataFrame>(_df.RepartitionByRange(_df["age"]));

            Assert.IsType<DataFrame>(_df.Coalesce(1));

            Assert.IsType<DataFrame>(_df.Distinct());

            // Persistence; Unpersist is called last, after Persist and Cache.
            Assert.IsType<DataFrame>(_df.Persist());
            Assert.IsType<DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));
            Assert.IsType<DataFrame>(_df.Cache());
            Assert.IsType<StorageLevel>(_df.StorageLevel());
            Assert.IsType<DataFrame>(_df.Unpersist());

            // Temp views: the Create* call comes first, then CreateOrReplace* on the same name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }