Пример #1
0
        /// <summary>
        /// Runs query Q21 and prints the result with <c>Show()</c>.
        /// For suppliers in the nation named "SAUDI ARABIA", counts per supplier name the
        /// late line items (receipt date after commit date) on orders with status "F" where
        /// the order has more than one distinct supplier overall, but exactly one distinct
        /// late supplier and that supplier is the one on the line item. The output is sorted
        /// by the count descending, then by supplier name.
        /// </summary>
        internal void Q21()
        {
            DataFrame suppliers =
                _supplier.Select(Col("s_suppkey"), Col("s_nationkey"), Col("s_name"));

            DataFrame lineItems = _lineitem.Select(
                Col("l_suppkey"),
                Col("l_orderkey"),
                Col("l_receiptdate"),
                Col("l_commitdate"));

            // Line items that were received after their commit date.
            DataFrame lateLineItems =
                lineItems.Filter(Col("l_receiptdate") > Col("l_commitdate"));

            // Per order: distinct supplier count and highest supplier key over ALL line items.
            DataFrame allSupplierStats = lineItems
                .GroupBy(Col("l_orderkey"))
                .Agg(
                    CountDistinct(Col("l_suppkey")).As("suppkey_count"),
                    Max(Col("l_suppkey")).As("suppkey_max"))
                .Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));

            // Same statistics, restricted to the LATE line items.
            DataFrame lateSupplierStats = lateLineItems
                .GroupBy(Col("l_orderkey"))
                .Agg(
                    CountDistinct(Col("l_suppkey")).As("suppkey_count"),
                    Max(Col("l_suppkey")).As("suppkey_max"))
                .Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));

            DataFrame finishedOrders = _orders
                .Select(Col("o_orderkey"), Col("o_orderstatus"))
                .Filter(Col("o_orderstatus") == "F");

            // Suppliers of the selected nation.
            DataFrame nationSuppliers = _nation
                .Filter(Col("n_name") == "SAUDI ARABIA")
                .Join(suppliers, Col("n_nationkey") == suppliers["s_nationkey"]);

            // Their late line items that belong to orders with status "F".
            DataFrame lateItemsOnFinishedOrders = nationSuppliers
                .Join(lateLineItems, Col("s_suppkey") == lateLineItems["l_suppkey"])
                .Join(finishedOrders, Col("l_orderkey") == finishedOrders["o_orderkey"]);

            // Keep only orders with more than one distinct supplier. The projection drops the
            // statistics columns so the next join can bring in the late-only ones under the
            // same names.
            DataFrame multiSupplierItems = lateItemsOnFinishedOrders
                .Join(allSupplierStats, Col("l_orderkey") == allSupplierStats["key"])
                .Filter(Col("suppkey_count") > 1)
                .Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"));

            // Keep rows where the order has exactly one late supplier and it is this row's supplier.
            DataFrame soleLateSupplierItems = multiSupplierItems
                .Join(lateSupplierStats, Col("l_orderkey") == lateSupplierStats["key"], "left_outer")
                .Select(
                    Col("s_name"),
                    Col("l_orderkey"),
                    Col("l_suppkey"),
                    Col("suppkey_count"),
                    Col("suppkey_max"))
                .Filter((Col("suppkey_count") == 1) & (Col("l_suppkey") == Col("suppkey_max")));

            soleLateSupplierItems
                .GroupBy(Col("s_name"))
                .Agg(Count(Col("l_suppkey")).As("numwait"))
                .Sort(Col("numwait").Desc(), Col("s_name"))
                .Show();
        }
Пример #2
0
        /// <summary>
        /// Runs query Q22 and prints the result with <c>Show()</c>.
        /// Takes the first two characters of each customer's phone as a country code, keeps the
        /// customers whose code matches <c>s_q22PhoneRegex</c>, and then, among those with no
        /// order and an account balance above the average positive balance, reports the
        /// customer count and balance total per country code, sorted by country code.
        /// </summary>
        internal void Q22()
        {
            // UDF: first two characters of the input string.
            Func<Column, Column> firstTwoChars =
                Udf<string, string>(text => text.Substring(0, 2));

            // UDF: whether the input string matches s_q22PhoneRegex.
            Func<Column, Column> matchesPhoneRegex =
                Udf<string, bool>(code => s_q22PhoneRegex.IsMatch(code));

            DataFrame candidates = _customer
                .Select(
                    Col("c_acctbal"),
                    Col("c_custkey"),
                    firstTwoChars(Col("c_phone")).As("cntrycode"))
                .Filter(matchesPhoneRegex(Col("cntrycode")));

            // Single-row frame holding the average of the positive balances of the candidates.
            DataFrame averageBalance = candidates
                .Filter(Col("c_acctbal") > 0.0)
                .Agg(Avg(Col("c_acctbal")).As("avg_acctbal"));

            // One row per customer key that appears in the orders table.
            DataFrame orderingCustomerKeys = _orders
                .GroupBy(Col("o_custkey"))
                .Agg(Col("o_custkey"))
                .Select(Col("o_custkey"));

            // Right outer join keeps every candidate; a null o_custkey means no order matched.
            DataFrame candidatesWithoutOrders = orderingCustomerKeys
                .Join(candidates, Col("o_custkey") == candidates["c_custkey"], "right_outer")
                .Filter(Col("o_custkey").IsNull());

            // Join without a condition attaches avg_acctbal to every remaining row.
            DataFrame aboveAverage = candidatesWithoutOrders
                .Join(averageBalance)
                .Filter(Col("c_acctbal") > Col("avg_acctbal"));

            aboveAverage
                .GroupBy(Col("cntrycode"))
                .Agg(
                    Count(Col("c_acctbal")).As("numcust"),
                    Sum(Col("c_acctbal")).As("totacctbal"))
                .Sort(Col("cntrycode"))
                .Show();
        }
Пример #3
0
        /// <summary>
        /// Signature smoke test: calls the DataFrame members below (and a few
        /// RelationalGroupedDataset members) on <c>_df</c> in their various overload shapes and,
        /// where a value is returned, asserts its runtime type. Apart from the column count and
        /// the DTypes list, the contents of the results are not checked.
        /// NOTE(review): the "V2_3_X" suffix presumably refers to the Spark 2.3.x API surface - confirm.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Column indexer.
            Assert.IsType <Column>(_df["name"]);
            Assert.IsType <Column>(_df["age"]);

            Assert.IsType <DataFrame>(_df.ToDF());
            Assert.IsType <DataFrame>(_df.ToDF("name2", "age2"));

            // Schema inspection and plan printing.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The test frame is expected to have exactly two columns: age (integer) and name (string).
            Assert.Equal(2, _df.Columns().ToArray().Length);

            var expected = new List <Tuple <string, string> >
            {
                new Tuple <string, string>("age", "integer"),
                new Tuple <string, string>("name", "string")
            };

            Assert.Equal(expected, _df.DTypes());

            Assert.IsType <bool>(_df.IsLocal());

            Assert.IsType <bool>(_df.IsStreaming());

            // Checkpointing; the temporary directory is disposed when the using block ends.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                Assert.IsType <DataFrame>(_df.Checkpoint());
                Assert.IsType <DataFrame>(_df.Checkpoint(false));

                Assert.IsType <DataFrame>(_df.LocalCheckpoint());
                Assert.IsType <DataFrame>(_df.LocalCheckpoint(false));
            }

            // NOTE(review): "time" is not one of the two columns asserted above (age, name);
            // presumably only the returned type matters here - confirm.
            Assert.IsType <DataFrame>(_df.WithWatermark("time", "10 minutes"));

            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Joins: no condition, single column name, column-name array (with and without a
            // join type), and Column expression (with and without a join type).
            Assert.IsType <DataFrame>(_df.Join(_df));
            Assert.IsType <DataFrame>(_df.Join(_df, "name"));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }));
            Assert.IsType <DataFrame>(_df.Join(_df, new[] { "name" }, "outer"));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"]));
            Assert.IsType <DataFrame>(_df.Join(_df, _df["age"] == _df["age"], "outer"));

            Assert.IsType <DataFrame>(_df.CrossJoin(_df));

            // Ordering members, each called with string names, with no arguments, and with Column arguments.
            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions("age", "name"));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions());
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"]));
            Assert.IsType <DataFrame>(_df.SortWithinPartitions(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Sort("age"));
            Assert.IsType <DataFrame>(_df.Sort("age", "name"));
            Assert.IsType <DataFrame>(_df.Sort());
            Assert.IsType <DataFrame>(_df.Sort(_df["age"]));
            Assert.IsType <DataFrame>(_df.Sort(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.OrderBy("age"));
            Assert.IsType <DataFrame>(_df.OrderBy("age", "name"));
            Assert.IsType <DataFrame>(_df.OrderBy());
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"]));
            Assert.IsType <DataFrame>(_df.OrderBy(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.Hint("broadcast"));
            Assert.IsType <DataFrame>(_df.Hint("broadcast", new[] { "hello", "world" }));

            // Column lookup and aliasing.
            Assert.IsType <Column>(_df.Col("age"));

            Assert.IsType <Column>(_df.ColRegex("age"));

            Assert.IsType <DataFrame>(_df.As("alias"));

            Assert.IsType <DataFrame>(_df.Alias("alias"));

            // Projection and filtering.
            Assert.IsType <DataFrame>(_df.Select("age"));
            Assert.IsType <DataFrame>(_df.Select("age", "name"));
            Assert.IsType <DataFrame>(_df.Select());
            Assert.IsType <DataFrame>(_df.Select(_df["age"]));
            Assert.IsType <DataFrame>(_df.Select(_df["age"], _df["name"]));

            Assert.IsType <DataFrame>(_df.SelectExpr());
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2"));
            Assert.IsType <DataFrame>(_df.SelectExpr("age * 2", "abs(age)"));

            Assert.IsType <DataFrame>(_df.Filter(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Filter("age > 21"));

            Assert.IsType <DataFrame>(_df.Where(_df["age"] > 21));
            Assert.IsType <DataFrame>(_df.Where("age > 21"));

            // Grouping returns a RelationalGroupedDataset rather than a DataFrame.
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy());
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.GroupBy(_df["age"], _df["name"]));

            // Aggregations on a grouped dataset. "tempAge" is a copy of "age", which gives the
            // two-column overloads a second column to name. The local "df" lives in its own
            // block; the same identifier is used again as a lambda parameter further down.
            {
                RelationalGroupedDataset df =
                    _df.WithColumn("tempAge", _df["age"]).GroupBy("name");

                Assert.IsType <DataFrame>(df.Mean("age"));
                Assert.IsType <DataFrame>(df.Mean("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Max("age"));
                Assert.IsType <DataFrame>(df.Max("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Avg("age"));
                Assert.IsType <DataFrame>(df.Avg("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Min("age"));
                Assert.IsType <DataFrame>(df.Min("age", "tempAge"));

                Assert.IsType <DataFrame>(df.Sum("age"));
                Assert.IsType <DataFrame>(df.Sum("age", "tempAge"));
            }

            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup());
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Rollup(_df["age"], _df["name"]));

            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube("age", "name"));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube());
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"]));
            Assert.IsType <RelationalGroupedDataset>(_df.Cube(_df["age"], _df["name"]));

            // Whole-frame aggregation. The second call averages the string column "name";
            // only the returned type is asserted.
            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"])));
            Assert.IsType <DataFrame>(_df.Agg(Avg(_df["age"]), Avg(_df["name"])));

            Assert.IsType <DataFrame>(_df.Limit(10));

            // Set-style operations of the frame with itself.
            Assert.IsType <DataFrame>(_df.Union(_df));

            Assert.IsType <DataFrame>(_df.UnionByName(_df));

            Assert.IsType <DataFrame>(_df.Intersect(_df));

            Assert.IsType <DataFrame>(_df.Except(_df));

            // Sampling and splitting.
            Assert.IsType <DataFrame>(_df.Sample(0.5));
            Assert.IsType <DataFrame>(_df.Sample(0.5, true));
            Assert.IsType <DataFrame>(_df.Sample(0.5, false, 12345));

            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }));
            Assert.IsType <DataFrame[]>(_df.RandomSplit(new[] { 0.2, 0.8 }, 12345));

            // Adding, renaming and dropping columns.
            Assert.IsType <DataFrame>(_df.WithColumn("age2", _df["age"]));

            Assert.IsType <DataFrame>(_df.WithColumnRenamed("age", "age2"));

            Assert.IsType <DataFrame>(_df.Drop());
            Assert.IsType <DataFrame>(_df.Drop("age"));
            Assert.IsType <DataFrame>(_df.Drop("age", "name"));

            Assert.IsType <DataFrame>(_df.Drop(_df["age"]));

            Assert.IsType <DataFrame>(_df.DropDuplicates());
            Assert.IsType <DataFrame>(_df.DropDuplicates("age"));
            Assert.IsType <DataFrame>(_df.DropDuplicates("age", "name"));

            // Descriptive statistics.
            Assert.IsType <DataFrame>(_df.Describe());
            Assert.IsType <DataFrame>(_df.Describe("age"));
            Assert.IsType <DataFrame>(_df.Describe("age", "name"));

            Assert.IsType <DataFrame>(_df.Summary());
            Assert.IsType <DataFrame>(_df.Summary("count"));
            Assert.IsType <DataFrame>(_df.Summary("count", "mean"));

            // Members that return Row values; sequences are converted with ToArray() before the type check.
            Assert.IsType <Row[]>(_df.Head(2).ToArray());
            Assert.IsType <Row>(_df.Head());

            Assert.IsType <Row>(_df.First());

            Assert.IsType <DataFrame>(_df.Transform(df => df.Drop("age")));

            Assert.IsType <Row[]>(_df.Take(3).ToArray());

            Assert.IsType <Row[]>(_df.Collect().ToArray());

            Assert.IsType <Row[]>(_df.ToLocalIterator().ToArray());

            Assert.IsType <long>(_df.Count());

            // Partitioning.
            Assert.IsType <DataFrame>(_df.Repartition(2));
            Assert.IsType <DataFrame>(_df.Repartition(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition(_df["age"]));
            Assert.IsType <DataFrame>(_df.Repartition());

            Assert.IsType <DataFrame>(_df.RepartitionByRange(2, _df["age"]));
            Assert.IsType <DataFrame>(_df.RepartitionByRange(_df["age"]));

            Assert.IsType <DataFrame>(_df.Coalesce(1));

            Assert.IsType <DataFrame>(_df.Distinct());

            // Persistence members, called in the order Persist, Cache, StorageLevel, Unpersist.
            Assert.IsType <DataFrame>(_df.Persist());

            Assert.IsType <DataFrame>(_df.Persist(StorageLevel.DISK_ONLY));

            Assert.IsType <DataFrame>(_df.Cache());

            Assert.IsType <StorageLevel>(_df.StorageLevel());

            Assert.IsType <DataFrame>(_df.Unpersist());

            // Temp views: each Create* call is followed by its CreateOrReplace* counterpart
            // using the same view name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }