Пример #1
0
        /// <summary>
        /// Runs query Q21 and prints the result with <c>Show()</c>.
        /// For suppliers whose nation is "SAUDI ARABIA", counts (as "numwait") the late line items
        /// (receipt date after commit date) on orders with status "F" where the order involves
        /// more than one distinct supplier overall, but exactly one distinct late supplier, and
        /// that late supplier is the row's supplier. Output is ordered by "numwait" descending,
        /// then by supplier name.
        /// </summary>
        internal void Q21()
        {
            DataFrame suppliers = _supplier.Select(Col("s_suppkey"), Col("s_nationkey"), Col("s_name"));

            DataFrame lineItems = _lineitem.Select(
                Col("l_suppkey"),
                Col("l_orderkey"),
                Col("l_receiptdate"),
                Col("l_commitdate"));

            // Line items that were received after their commit date.
            DataFrame lateLineItems = lineItems.Filter(Col("l_receiptdate") > Col("l_commitdate"));

            DataFrame finishedOrders = _orders
                .Select(Col("o_orderkey"), Col("o_orderstatus"))
                .Filter(Col("o_orderstatus") == "F");

            // Per order: number of distinct suppliers and the largest supplier key, over all line items.
            DataFrame suppliersPerOrder = lineItems
                .GroupBy(Col("l_orderkey"))
                .Agg(
                    CountDistinct(Col("l_suppkey")).As("suppkey_count"),
                    Max(Col("l_suppkey")).As("suppkey_max"))
                .Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));

            // Same statistics, restricted to the late line items.
            DataFrame lateSuppliersPerOrder = lateLineItems
                .GroupBy(Col("l_orderkey"))
                .Agg(
                    CountDistinct(Col("l_suppkey")).As("suppkey_count"),
                    Max(Col("l_suppkey")).As("suppkey_max"))
                .Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));

            DataFrame saudiLateLines = _nation
                .Filter(Col("n_name") == "SAUDI ARABIA")
                .Join(suppliers, Col("n_nationkey") == suppliers["s_nationkey"])
                .Join(lateLineItems, Col("s_suppkey") == lateLineItems["l_suppkey"])
                .Join(finishedOrders, Col("l_orderkey") == finishedOrders["o_orderkey"]);

            // Keep only orders served by more than one supplier; the projection drops the
            // all-line-items statistics so the names can be reused by the next join.
            DataFrame multiSupplierLines = saudiLateLines
                .Join(suppliersPerOrder, Col("l_orderkey") == suppliersPerOrder["key"])
                .Filter(Col("suppkey_count") > 1)
                .Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"));

            // Keep only rows where this supplier is the single late supplier of the order.
            DataFrame soleLateSupplierLines = multiSupplierLines
                .Join(
                    lateSuppliersPerOrder,
                    Col("l_orderkey") == lateSuppliersPerOrder["key"],
                    "left_outer")
                .Select(
                    Col("s_name"),
                    Col("l_orderkey"),
                    Col("l_suppkey"),
                    Col("suppkey_count"),
                    Col("suppkey_max"))
                .Filter((Col("suppkey_count") == 1) & (Col("l_suppkey") == Col("suppkey_max")));

            soleLateSupplierLines
                .GroupBy(Col("s_name"))
                .Agg(Count(Col("l_suppkey")).As("numwait"))
                .Sort(Col("numwait").Desc(), Col("s_name"))
                .Show();
        }
Пример #2
0
        /// <summary>
        /// Groups the people in "more_people.json" by age, applies <c>CountCharacters</c> to each
        /// group through the grouped-map <c>Apply</c> API, and verifies the character count
        /// reported for every age group.
        /// </summary>
        public void TestDataFrameGroupedMapUdf()
        {
            DataFrame people = _spark
                .Read()
                .Schema("age INT, name STRING")
                .Json($"{TestEnvironment.ResourceDirectory}more_people.json");

            // Contents of the input file:
            // { "name":"Michael"}
            // { "name":"Andy", "age":30}
            // { "name":"Seth", "age":30}
            // { "name":"Justin", "age":19}
            // { "name":"Kathy", "age":19}

            // Shape of the rows produced for each group.
            var resultSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            });

            Row[] results = people
                .GroupBy("age")
                .Apply(resultSchema, records => CountCharacters(records))
                .Collect()
                .ToArray();

            // One row per distinct age: null, 19 and 30.
            Assert.Equal(3, results.Length);

            foreach (Row result in results)
            {
                int? age = result.GetAs<int?>("age");
                int charCount = result.GetAs<int>("nameCharCount");

                // Expected value is the summed name length of the group.
                int expectedCharCount;
                if (age == null)
                {
                    // "Michael"
                    expectedCharCount = 7;
                }
                else if (age == 19)
                {
                    // "Justin" + "Kathy"
                    expectedCharCount = 11;
                }
                else if (age == 30)
                {
                    // "Andy" + "Seth"
                    expectedCharCount = 8;
                }
                else
                {
                    throw new Exception($"Unexpected age: {age}.");
                }

                Assert.Equal(expectedCharCount, charCount);
            }
        }
Пример #3
0
        /// <summary>
        /// Runs query Q13 and prints the result with <c>Show()</c>.
        /// Counts, per customer, the orders whose comment does not match
        /// <c>s_q13SpecialRegex</c> (customers are kept via a left outer join), then reports how
        /// many customers ("custdist") share each order count ("c_count"), ordered by "custdist"
        /// descending and then "c_count" descending.
        /// </summary>
        internal void Q13()
        {
            // UDF flagging order comments that match the "special" pattern.
            Func<Column, Column> isSpecial =
                Udf<string, bool>(comment => s_q13SpecialRegex.IsMatch(comment));

            DataFrame customersWithOrders = _customer.Join(
                _orders,
                (Col("c_custkey") == _orders["o_custkey"]) & !isSpecial(_orders["o_comment"]),
                "left_outer");

            DataFrame orderCountPerCustomer = customersWithOrders
                .GroupBy(Col("c_custkey"))
                .Agg(Count(Col("o_orderkey")).As("c_count"));

            DataFrame customerDistribution = orderCountPerCustomer
                .GroupBy(Col("c_count"))
                .Agg(Count(Col("*")).As("custdist"));

            customerDistribution
                .Sort(Col("custdist").Desc(), Col("c_count").Desc())
                .Show();
        }
Пример #4
0
        /// <summary>
        /// Runs query Q17 and prints the result with <c>Show()</c>.
        /// For parts of brand "Brand#23" in a "MED BOX" container, computes 0.2 times the average
        /// line item quantity per part, keeps the line items whose quantity is below that
        /// threshold, and reports the sum of their extended price divided by 7.0 as "avg_yearly".
        /// </summary>
        internal void Q17()
        {
            // UDF scaling the per-part average quantity down to 20%.
            Func<Column, Column> mul02 = Udf<double, double>((x) => x * 0.2);

            // Only these three line item columns are referenced by the rest of the query.
            DataFrame flineitem = _lineitem.Select(Col("l_partkey"), Col("l_quantity"), Col("l_extendedprice"));

            // Join against the projected line items rather than the full table: the projection
            // above was previously built but never used.
            DataFrame fpart = _part.Filter(Col("p_brand") == "Brand#23" & Col("p_container") == "MED BOX")
                              .Select(Col("p_partkey"))
                              .Join(flineitem, Col("p_partkey") == flineitem["l_partkey"], "left_outer");

            fpart.GroupBy("p_partkey")
            .Agg(mul02(Avg(Col("l_quantity"))).As("avg_quantity"))
            // Rename the grouping key so it does not clash with fpart's p_partkey in the self-join.
            .Select(Col("p_partkey").As("key"), Col("avg_quantity"))
            .Join(fpart, Col("key") == fpart["p_partkey"])
            .Filter(Col("l_quantity") < Col("avg_quantity"))
            .Agg((Sum(Col("l_extendedprice")) / 7.0).As("avg_yearly"))
            .Show();
        }
Пример #5
0
        /// <summary>
        /// Runs query Q11 and prints the result with <c>Show()</c>.
        /// For suppliers whose nation is "GERMANY", computes the stock value of each part
        /// (sum of supply cost times available quantity) and lists the parts whose value exceeds
        /// 0.0001 of the total value across all those parts, ordered by value descending.
        /// </summary>
        internal void Q11()
        {
            Func<Column, Column, Column> costTimesQuantity =
                Udf<double, int, double>((cost, quantity) => cost * quantity);
            Func<Column, Column> toThreshold = Udf<double, double>(total => total * 0.0001);

            DataFrame germanSupplierKeys = _nation
                .Filter(Col("n_name") == "GERMANY")
                .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                .Select(Col("s_suppkey"));

            // One row per (supplier, part) pair carrying the value of that stock.
            DataFrame stockValues = germanSupplierKeys
                .Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
                .Select(
                    Col("ps_partkey"),
                    costTimesQuantity(Col("ps_supplycost"), Col("ps_availqty")).As("value"));

            // Single-row frame holding the grand total.
            DataFrame totalValue = stockValues.Agg(Sum("value").As("total_value"));

            DataFrame valuePerPart = stockValues
                .GroupBy(Col("ps_partkey"))
                .Agg(Sum("value").As("part_value"));

            valuePerPart
                .Join(totalValue, Col("part_value") > toThreshold(Col("total_value")))
                .Sort(Col("part_value").Desc())
                .Show();
        }
Пример #6
0
        /// <summary>
        /// Runs query Q2 and prints the result with <c>Show()</c>.
        /// For parts of size 15 whose type ends with "BRASS", finds the suppliers in the "EUROPE"
        /// region offering the minimum supply cost for the part, and lists supplier and part
        /// details ordered by account balance descending, then nation, supplier name and part key.
        /// </summary>
        internal void Q2()
        {
            // Part/supplier offers from suppliers located in the EUROPE region.
            DataFrame europeanOffers = _region
                .Filter(Col("r_name") == "EUROPE")
                .Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
                .Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
                .Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]);

            DataFrame brassParts = _part.Filter(
                (_part["p_size"] == 15) & _part["p_type"].EndsWith("BRASS"));

            DataFrame brassOffers = brassParts.Join(
                europeanOffers,
                europeanOffers["ps_partkey"] == Col("p_partkey"));

            // Cheapest supply cost per part among the offers above.
            DataFrame cheapestPerPart = brassOffers
                .GroupBy(brassOffers["ps_partkey"])
                .Agg(Min("ps_supplycost").As("min"));

            DataFrame cheapestOffers = brassOffers
                .Join(cheapestPerPart, brassOffers["ps_partkey"] == cheapestPerPart["ps_partkey"])
                .Filter(brassOffers["ps_supplycost"] == cheapestPerPart["min"]);

            cheapestOffers
                .Select(
                    "s_acctbal",
                    "s_name",
                    "n_name",
                    "p_partkey",
                    "p_mfgr",
                    "s_address",
                    "s_phone",
                    "s_comment")
                .Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey"))
                .Show();
        }
Пример #7
0
        /// <summary>
        /// Entry point of the example: loads the JSON file given as the single command-line
        /// argument, prints its schema and contents, and then shows the result of applying
        /// <c>CountCharacters</c> to the rows grouped by age.
        /// </summary>
        /// <param name="args">
        /// Command-line arguments; exactly one is expected, the path to people.json. With any
        /// other count a usage message is written to standard error and the process exits with
        /// code 1.
        /// </param>
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            // A small number of shuffle partitions keeps the groupBy() below fast.
            SparkSession session = SparkSession.Builder()
                .Config("spark.sql.shuffle.partitions", "3")
                .AppName("SQL VectorUdfs example using .NET for Apache Spark")
                .GetOrCreate();

            DataFrame people = session.Read().Schema("age INT, name STRING").Json(args[0]);

            Console.WriteLine(people.Schema().SimpleString);
            people.Show();
            people.PrintSchema();

            // Schema of the rows returned by the grouped map UDF; a grouped map vector UDF
            // is able to return different shapes and record lengths.
            StructType charCountSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            });

            people
                .GroupBy("age")
                .Apply(charCountSchema, batch => CountCharacters(batch))
                .Show();

            session.Stop();
        }
Пример #8
0
        /// <summary>
        /// Smoke-tests the public DataFrame API surface (plus a few RelationalGroupedDataset
        /// aggregations) against the shared <c>_df</c> frame. Each call is checked only for the
        /// runtime type of its return value; calls returning nothing are simply invoked so that
        /// a thrown exception fails the test.
        /// NOTE(review): the "V2_3_X" suffix presumably refers to the Spark 2.3.x API level - confirm.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Column access through the indexer.
            Assert.IsType <Column>(_df["name"]);
            Assert.IsType <Column>(_df["age"]);

            // ToDF() with and without new column names.
            Assert.IsType <DataFrame>(_df.ToDF());
            Assert.IsType <DataFrame>(_df.ToDF("name2", "age2"));

            // Schema inspection and plan printing.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The test frame is expected to have exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            // Expected (column name, type name) pairs, in the order DTypes() must return them.
            var expected = new List <Tuple <string, string> >
            {
                new Tuple <string, string>("age", "integer"),
                new Tuple <string, string>("name", "string")
            };

            Assert.Equal(expected, _df.DTypes());

            Assert.IsType <bool>(_df.IsLocal());

            Assert.IsType <bool>(_df.IsStreaming());

            // Checkpointing; the temporary directory is disposed when the using block ends.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                Assert.IsType <DataFrame>(_df.Checkpoint());
                Assert.IsType <DataFrame>(_df.Checkpoint(false));

                Assert.IsType <DataFrame>(_df.LocalCheckpoint());
                Assert.IsType <DataFrame>(_df.LocalCheckpoint(false));
            }

            // NOTE(review): "time" is not one of the two columns asserted above (age, name);
            // presumably this passes only because the column is not resolved eagerly - confirm.
            Assert.IsType <DataFrame>(_df.WithWatermark("time", "10 minutes"));

            // Show() overloads.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: no condition, column name(s), and Column expression,
            // each optionally with a join type.
            Assert.IsType <DataFrame>(_df.Join(_df));
            Assert.IsType <DataFrame>(_df.Join(_df, "name"));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }, "outer"));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));

            Assert.IsType <DataFrame>(_df.CrossJoin(_df));

            // Sorting: SortWithinPartitions / Sort / OrderBy, by name(s), by Column(s),
            // and with no arguments.
            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age", "name"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions());
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"]));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Sort("age"));
            Assert.IsType <DataFrame>(_df.Sort("age", "name"));
            Assert.IsType <DataFrame>(_df.Sort());
            Assert.IsType <DataFrame>(_df.Sort(_df["age"]));
            Assert.IsType <DataFrame>(_df.Sort(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.OrderBy("age"));
            Assert.IsType <DataFrame>(_df.OrderBy("age", "name"));
            Assert.IsType <DataFrame>(_df.OrderBy());
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"]));
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

            // Hints, with and without parameters.
            Assert.IsType <DataFrame>(_df.Hint("broadcast"));
            Assert.IsType <DataFrame>(_df.Hint("broadcast", new[] { "hello", "world" }));

            // Column lookup by name and by regex.
            Assert.IsType <Column>(_df.Col("age"));

            Assert.IsType <Column>(_df.ColRegex("age"));

            // Aliasing.
            Assert.IsType <DataFrame>(_df.As("alias"));

            Assert.IsType <DataFrame>(_df.Alias("alias"));

            // Projection: Select by name(s), Column(s) or nothing, and SelectExpr.
            Assert.IsType <DataFrame>(_df.Select("age"));
            Assert.IsType <DataFrame>(_df.Select("age", "name"));
            Assert.IsType <DataFrame>(_df.Select());
            Assert.IsType <DataFrame>(_df.Select(_df["age"]));
            Assert.IsType <DataFrame>(_df.Select(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.SelectExpr());
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2"));
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

            // Filtering: Filter / Where with a Column condition or a SQL expression string.
            Assert.IsType <DataFrame>(_df.Filter(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Filter("age > 21"));

            Assert.IsType <DataFrame>(_df.Where(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Where("age > 21"));

            // Grouping returns a RelationalGroupedDataset rather than a DataFrame.
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy());
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

            // Aggregations on a grouped dataset. "tempAge" duplicates "age" so that the
            // two-column overloads have a second column to aggregate.
            {
                RelationalGroupedDataset df =
                    _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

                Assert.IsType <DataFrame>(df.Mean("age"));
                Assert.IsType <DataFrame>(df.Mean("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Max("age"));
                Assert.IsType <DataFrame>(df.Max("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Avg("age"));
                Assert.IsType <DataFrame>(df.Avg("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Min("age"));
                Assert.IsType <DataFrame>(df.Min("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Sum("age"));
                Assert.IsType <DataFrame>(df.Sum("age", "tempAge"));
            }

            // Rollup and Cube also return a RelationalGroupedDataset.
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup());
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube());
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

            // Aggregation over the whole frame.
            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"])));
            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

            Assert.IsType <DataFrame>(_df.Limit(10));

            // Set operations against the frame itself.
            Assert.IsType <DataFrame>(_df.Union(_df));

            Assert.IsType <DataFrame>(_df.UnionByName(_df));

            Assert.IsType <DataFrame>(_df.Intersect(_df));

            Assert.IsType <DataFrame>(_df.Except(_df));

            // Sampling and random splitting, with and without an explicit seed.
            Assert.IsType <DataFrame>(_df.Sample(0.5));
            Assert.IsType <DataFrame>(_df.Sample(0.5, true));
            Assert.IsType <DataFrame>(_df.Sample(0.5, false, 12345));

            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }));
            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }, 12345));

            // Adding, renaming and dropping columns.
            Assert.IsType <DataFrame>(_df.WithColumn("age2", _df["age"]));

            Assert.IsType <DataFrame>(_df.WithColumnRenamed("age", "age2"));

            Assert.IsType <DataFrame>(_df.Drop());
            Assert.IsType <DataFrame>(_df.Drop("age"));
            Assert.IsType <DataFrame>(_df.Drop("age", "name"));

            Assert.IsType <DataFrame>(_df.Drop(_df["age"]));

            Assert.IsType <DataFrame>(_df.DropDuplicates());
            Assert.IsType <DataFrame>(_df.DropDuplicates("age"));
            Assert.IsType <DataFrame>(_df.DropDuplicates("age", "name"));

            // Statistics: Describe and Summary.
            Assert.IsType <DataFrame>(_df.Describe());
            Assert.IsType <DataFrame>(_df.Describe("age"));
            Assert.IsType <DataFrame>(_df.Describe("age", "name"));

            Assert.IsType <DataFrame>(_df.Summary());
            Assert.IsType <DataFrame>(_df.Summary("count"));
            Assert.IsType <DataFrame>(_df.Summary("count", "mean"));

            // Row retrieval: Head / First / Take / Collect / ToLocalIterator.
            Assert.IsType <Row[]>(_df.Head(2).ToArray());
            Assert.IsType <Row>(_df.Head());

            Assert.IsType <Row>(_df.First());

            Assert.IsType <DataFrame>(_df.Transform(df => df.Drop("age")));

            Assert.IsType <Row[]>(_df.Take(3).ToArray());

            Assert.IsType <Row[]>(_df.Collect().ToArray());

            Assert.IsType <Row[]>(_df.ToLocalIterator().ToArray());

            Assert.IsType <long>(_df.Count());

            // Partitioning: Repartition / RepartitionByRange / Coalesce.
            Assert.IsType <DataFrame>(_df.Repartition(2));
            Assert.IsType <DataFrame>(_df.Repartition(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition(_df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition());

            Assert.IsType <DataFrame>(_df.RepartitionByRange(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.RepartitionByRange(_df["age"]));

            Assert.IsType <DataFrame>(_df.Coalesce(1));

            Assert.IsType <DataFrame>(_df.Distinct());

            // Persistence: persist/cache first, query the storage level, then unpersist.
            Assert.IsType <DataFrame>(_df.Persist());

            Assert.IsType <DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));

            Assert.IsType <DataFrame>(_df.Cache());

            Assert.IsType <StorageLevel>(_df.StorageLevel());

            Assert.IsType <DataFrame>(_df.Unpersist());

            // Views: each name is created first and then replaced under the same name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }