Пример #1
0
        /// <summary>
        /// Checks that <c>DataFrame.Cache()</c> calls <c>Persist</c> on the underlying
        /// proxy exactly once, passing <c>StorageLevelType.MEMORY_AND_DISK</c>.
        /// </summary>
        public void TestCache()
        {
            // Arrange: accept a Persist call with any storage level on the mocked proxy.
            mockDataFrameProxy.Setup(proxy => proxy.Persist(It.IsAny<StorageLevelType>()));

            var sparkContext = new SparkContext(null);
            var frameUnderTest = new DataFrame(mockDataFrameProxy.Object, sparkContext);

            // Act
            frameUnderTest.Cache();

            // Assert: exactly one Persist call, with the MEMORY_AND_DISK level.
            mockDataFrameProxy.Verify(
                proxy => proxy.Persist(StorageLevelType.MEMORY_AND_DISK),
                Times.Once());
        }
Пример #2
0
        /// <summary>
        /// Filters and renames columns of the reviews frame, caches the result, registers it
        /// as the "ElectronicsReviews" temp view and prints its row count.
        /// </summary>
        /// <param name="dataFrame">
        /// Frame expected to expose the "reviewerID", "asin", "reviewText" and
        /// "unixReviewTime" columns referenced below.
        /// </param>
        private static void ReviewsCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Ratings Clean-up");

            // A row is kept only when all three of these columns are non-null.
            var hasRequiredFields = dataFrame["reviewerID"].IsNotNull()
                .And(dataFrame["asin"].IsNotNull())
                .And(dataFrame["reviewText"].IsNotNull());

            var reviews = dataFrame
                .Filter(hasRequiredFields)
                .WithColumnRenamed("reviewerID", "rid")
                .WithColumnRenamed("reviewText", "review_text")
                .WithColumnRenamed("unixReviewTime", "unix_time");

            reviews.Cache();
            reviews.CreateOrReplaceTempView("ElectronicsReviews");

            var reviewCount = reviews.Count();
            Console.WriteLine($"Reviews Count: {reviewCount}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }
Пример #3
0
        /// <summary>
        /// Calls a broad set of <c>DataFrame</c> members on the shared <c>_df</c> fixture.
        /// Only two results are asserted (the schema is non-null and there are two columns);
        /// every other return value is discarded, so the remaining calls are checked only
        /// for completing without throwing.
        /// </summary>
        /// <remarks>
        /// NOTE(review): judging by the method name this targets the Spark 2.3.x API surface;
        /// the fixture setup for <c>_df</c> and <c>_spark</c> is outside this block - confirm there.
        /// </remarks>
        public void TestSignaturesV2_3_X()
        {
            // Column access through the indexer; the first value is overwritten immediately.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF() with no arguments and with two new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            // Explain: parameterless overload and both values of the bool argument.
            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The fixture is expected to have exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // The temporary directory is disposed when the using block ends.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            // Show: zero to three arguments.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: frame only, one column name, an array of names (optionally with
            // a trailing "outer" string), and a Column expression (optionally with "outer").
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting members: each is called with string names, with no arguments,
            // and with Column arguments.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hint: name only, and name plus an array of parameters.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Select / SelectExpr: string, Column and empty argument lists.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filter / Where: each accepts a Column condition or a string condition.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // GroupBy / Rollup / Cube: same five call shapes for each member.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Agg with one and with two aggregate columns.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations of the frame with itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sample: fraction only, plus a bool, plus a bool and a numeric third argument.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            // RandomSplit: weights only, and weights plus a numeric second argument.
            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            // Drop: no arguments, string names, and a Column.
            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row-retrieval members; results that support ToArray() are enumerated.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Repartition / RepartitionByRange: numeric and Column argument combinations.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // Each Create*View call is followed by its CreateOrReplace* counterpart
            // using the same view name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
Пример #4
0
        /// <summary>
        /// Filters the product-metadata frame, replaces its raw "price", "date" and "rank"
        /// columns with parsed values, caches the result, registers it as the
        /// "ElectronicsMetadata" temp view and prints its row count.
        /// </summary>
        /// <remarks>
        /// The replacement columns are "price" (float), "unixTime" (double, seconds since
        /// 1970-01-01 UTC) and "rank" (long). Each UDF yields -1 when its input is null,
        /// empty or cannot be parsed.
        /// All parsing uses the invariant culture, and dates without an explicit offset are
        /// interpreted as UTC, so the output does not depend on the culture or time zone of
        /// the machine that executes the UDF.
        /// </remarks>
        /// <param name="dataFrame">
        /// Frame expected to expose the "asin", "title", "main_cat", "brand", "price",
        /// "date" and "rank" columns referenced below.
        /// </param>
        private static void MetadataCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Metadata Clean-up");

            // Parses the number that starts at the first digit of the raw price text.
            // Globalization types are referenced inline (fully qualified) so the lambdas
            // capture nothing and no extra using directive is needed.
            var priceCleanup = Udf<string, float>(
                p =>
                {
                    if (string.IsNullOrEmpty(p))
                    {
                        return -1f;
                    }

                    var firstDigit = -1;
                    for (var i = 0; i < p.Length; i++)
                    {
                        if (char.IsDigit(p[i]))
                        {
                            firstDigit = i;
                            break;
                        }
                    }

                    // No digit means there is no price to read. Without this guard the whole
                    // text would be parsed, and values such as "NaN" or "Infinity" would be
                    // accepted as prices.
                    if (firstDigit < 0)
                    {
                        return -1f;
                    }

                    // Same number styles as float.TryParse(string, out float), but with a
                    // fixed culture so '.' and ',' always mean the same thing.
                    return float.TryParse(
                        p.Substring(firstDigit),
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out var price)
                        ? price
                        : -1f;
                });

            // Converts the raw date text to seconds since the Unix epoch.
            var dateCleanup = Udf<string, double>(
                d =>
                {
                    // AssumeUniversal treats text without an offset as UTC; AdjustToUniversal
                    // makes the parsed value UTC in every case, so no local-time conversion
                    // is involved.
                    if (!string.IsNullOrEmpty(d)
                        && DateTime.TryParse(
                            d,
                            System.Globalization.CultureInfo.InvariantCulture,
                            System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
                            out var parsed))
                    {
                        var epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
                        return (parsed - epoch).TotalSeconds;
                    }

                    return -1d;
                });

            // Extracts the first run of digits (with optional comma groups) from the rank text.
            var rankCleanup = Udf<string, long>(
                r =>
                {
                    if (string.IsNullOrEmpty(r))
                    {
                        return -1L;
                    }

                    // The static Regex.Match overload reuses the framework's cached regex
                    // instead of constructing a new Regex for every row.
                    var match = Regex.Match(r, @"\d+(,\d+)*", RegexOptions.Singleline);
                    if (match.Success
                        && long.TryParse(
                            match.Value.Replace(",", string.Empty),
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out var rank))
                    {
                        return rank;
                    }

                    return -1L;
                });

            // Keep rows with all key columns present and a main category outside the listed ones.
            dataFrame = dataFrame
                .Filter(
                    dataFrame["asin"].IsNotNull()
                        .And(dataFrame["title"].IsNotNull())
                        .And(dataFrame["main_cat"].IsNotNull())
                        .And(dataFrame["brand"].IsNotNull())
                        .And(Not(dataFrame["main_cat"].IsIn("Grocery", "Pet Supplies", "Baby", "Books", "Appstore for Android", "Gift Cards"))));

            // Add the parsed columns under temporary names, drop the raw columns, then
            // rename the parsed columns to their final names.
            dataFrame = dataFrame
                .WithColumn("clean_price", priceCleanup(dataFrame["price"]))
                .WithColumn("clean-date", dateCleanup(dataFrame["date"]))
                .WithColumn("clean-rank", rankCleanup(dataFrame["rank"]))
                .Drop(dataFrame["price"])
                .Drop(dataFrame["date"])
                .Drop(dataFrame["rank"])
                .WithColumnRenamed("clean_price", "price")
                .WithColumnRenamed("clean-date", "unixTime")
                .WithColumnRenamed("clean-rank", "rank");

            dataFrame.Cache();
            dataFrame.CreateOrReplaceTempView("ElectronicsMetadata");

            Console.WriteLine($"Metadata Count: {dataFrame.Count()}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }