Exemplo n.º 1
0
        /// <inheritdoc cref="State{T}.Sum"/>
        /// <remarks>
        /// Outer-joins this state's frequency table with <paramref name="other"/>'s on every column
        /// except the count column (matching is delegated to the <c>NullSafeEq</c> helper), keeps the
        /// grouping value from whichever side has it, and adds the two counts (a missing count is
        /// treated as zero via <c>AnalyzersExt.ZeroIfNull</c>). The row totals are added as well.
        /// </remarks>
        /// <param name="other">The state to merge into this one.</param>
        /// <returns>A new state holding the merged frequencies and the summed row count.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown by <c>First()</c> when the frequency schema has no column other than the count column.
        /// </exception>
        public override FrequenciesAndNumRows Sum(FrequenciesAndNumRows other)
        {
            // Materialize once: the names are used for the projection and for the join condition,
            // so a deferred query would be re-evaluated on every pass.
            string[] columns = Frequencies.Schema().Fields
                               .Select(field => field.Name)
                               .Where(name => name != AnalyzersExt.COUNT_COL)
                               .ToArray();

            // Grouping columns: take the value from "this" and fall back to "other" for rows that
            // only exist on the other side of the outer join. The count column is appended last.
            Column[] projectionAfterMerge = columns
                                            .Select(col =>
                                                    Coalesce(Col($"this.{col}"), Col($"other.{col}")).As(col))
                                            .Append(
                (AnalyzersExt.ZeroIfNull($"this.{AnalyzersExt.COUNT_COL}") +
                 AnalyzersExt.ZeroIfNull($"other.{AnalyzersExt.COUNT_COL}")).As(AnalyzersExt.COUNT_COL))
                                            .ToArray();

            // Seed with the first column and fold in only the REMAINING ones; folding over the full
            // list would AND the first column's predicate into the condition twice.
            Column joinCondition = columns
                                   .Skip(1)
                                   .Aggregate(NullSafeEq(columns.First()),
                                              (condition, column) => condition.And(NullSafeEq(column)));

            DataFrame frequenciesSum = Frequencies
                                       .Alias("this")
                                       .Join(other.Frequencies.Alias("other"), joinCondition, "outer")
                                       .Select(projectionAfterMerge);

            return new FrequenciesAndNumRows(frequenciesSum, NumRows + other.NumRows);
        }
Exemplo n.º 2
0
        public void TestAlias()
        {
            // Arrange: make the mocked proxy hand back a known proxy instance for any alias.
            const string aliasName = "alias1";
            IDataFrameProxy aliasedProxy = new Mock<IDataFrameProxy>().Object;
            mockDataFrameProxy
                .Setup(proxy => proxy.Alias(It.IsAny<string>()))
                .Returns(aliasedProxy);
            var sparkContext = new SparkContext(null);
            var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);

            // Act
            var aliasedDataFrame = dataFrame.Alias(aliasName);

            // Assert: Alias was forwarded to the proxy with the given name, and the resulting
            // DataFrame wraps the proxy that the mock returned.
            mockDataFrameProxy.Verify(proxy => proxy.Alias(aliasName));
            Assert.AreEqual(aliasedProxy, aliasedDataFrame.DataFrameProxy);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Smoke test that invokes the DataFrame API surface on the shared <c>_df</c> fixture, one call
        /// per overload. Most return values are discarded, so the test mainly checks that each call
        /// completes without throwing; only the schema and the column count are asserted.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the method name suggests these are the signatures available in Spark 2.3.x;
        /// confirm against the version-specific tests elsewhere in the suite.
        /// </remarks>
        public void TestSignaturesV2_3_X()
        {
            // Column access through the indexer.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF without and with new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            // Schema and plan inspection.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The fixture is expected to expose exactly two columns ("name" and "age" are the ones used below).
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // Checkpointing; the temporary directory is disposed when the using block ends.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            // Show overloads.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads (self-joins), followed by CrossJoin.
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting: each method is called with column names, with no arguments, and with Column objects.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hints, column lookup and aliasing.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Projection.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filtering with a Column condition and with a SQL expression string.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping: GroupBy, Rollup and Cube are each called with names, no arguments, and Column objects.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Aggregation without grouping.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations against the same DataFrame.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sampling and splitting, without and with an explicit seed.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            // Adding, renaming and dropping columns.
            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            // Statistics.
            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row retrieval; ToArray() forces the returned sequences to be enumerated.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Partitioning.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            // Caching.
            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // Temp views: each Create* call is followed by the CreateOrReplace* variant with the same name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }