コード例 #1
0
        /// <summary>
        /// Packs all columns of the test frame into a struct column, feeds that struct to a
        /// Row-based UDF, and checks both the struct and the UDF output for every row.
        /// </summary>
        public void TestWithColumn()
        {
            // Renders the struct as "<Size()>,<name>,<age>"; the age slot stays empty when age is null.
            Func<Column, Column> sizeNameAgeUdf = Udf<Row, string>(
                r =>
                {
                    string name = r.GetAs<string>("name");
                    int? age = r.GetAs<int?>("age");

                    return age.HasValue
                        ? $"{r.Size()},{name},{age.Value}"
                        : $"{r.Size()},{name},{string.Empty}";
                });

            // Build the struct column from every existing column, then apply the UDF to it.
            string[] columnNames = _df.Columns().ToArray();
            Column packedColumns = Struct(columnNames[0], columnNames.Skip(1).ToArray());
            DataFrame withStruct = _df.WithColumn("NameAgeCol", packedColumns);
            DataFrame withUdfOutput =
                withStruct.WithColumn("SizeNameAgeCol", sizeNameAgeUdf(withStruct["NameAgeCol"]));

            Row[] sourceRows = _df.Collect().ToArray();
            Assert.Equal(3, sourceRows.Length);

            Row[] resultRows = withUdfOutput.Collect().ToArray();
            Assert.Equal(3, resultRows.Length);

            // Expected content per row, index-aligned with the collected rows.
            string[] expectedNames = { "Michael", "Andy", "Justin" };
            int?[] expectedAges = { null, 30, 19 };
            string[] expectedUdfOutputs = { "2,Michael,", "2,Andy,30", "2,Justin,19" };

            for (int i = 0; i < expectedNames.Length; ++i)
            {
                Row current = resultRows[i];
                Assert.Equal(expectedNames[i], current.GetAs<string>("name"));

                if (expectedAges[i].HasValue)
                {
                    Assert.Equal(expectedAges[i].Value, current.GetAs<int>("age"));
                }
                else
                {
                    Assert.Null(current.Get("age"));
                }

                // The struct column must round-trip as a Row equal to the source row.
                Assert.IsType<Row>(current.Get("NameAgeCol"));
                Assert.Equal(sourceRows[i], current.GetAs<Row>("NameAgeCol"));
                Assert.Equal(expectedUdfOutputs[i], current.GetAs<string>("SizeNameAgeCol"));
            }
        }
コード例 #2
0
        /// <summary>
        /// Asserts that every row of <paramref name="dataFrameA"/> also occurs in
        /// <paramref name="dataFrameB"/>. Both frames are first projected onto their columns
        /// sorted by name in descending order, and rows are compared by their rendered cell values.
        /// </summary>
        /// <param name="dataFrameA">Frame whose rows are logged and then looked up.</param>
        /// <param name="dataFrameB">Frame the rows of <paramref name="dataFrameA"/> are looked up in.</param>
        /// <param name="helper">Optional test output helper; when it has a value, every row of A is written to it.</param>
        public static void AssertSameRows(DataFrame dataFrameA, DataFrame dataFrameB, Option<ITestOutputHelper> helper)
        {
            Column[] dfAOrderedColumns = dataFrameA
                .Columns()
                .OrderByDescending(val => val)
                .Select(Column)
                .ToArray();
            Column[] dfBOrderedColumns = dataFrameB
                .Columns()
                .OrderByDescending(val => val)
                .Select(Column)
                .ToArray();

            // Materialize each frame exactly once. Collect() returns an IEnumerable<Row>; if it is
            // lazily evaluated (NOTE(review): it is in Microsoft.Spark - confirm for the version in
            // use), every additional enumeration would repeat the whole collect.
            Row[] rowsA = dataFrameA.Select(dfAOrderedColumns).Collect().ToArray();

            if (helper.HasValue)
            {
                foreach (Row rowA in rowsA)
                {
                    helper.Value.WriteLine($"Computed - {rowA}");
                }
            }

            // NOTE(review): Row.Values is presumably an object[]; ToString() on an array yields only
            // its type name, which is the same for every row, so the cells are joined explicitly.
            // Nulls are rendered as "null" so they stay distinguishable from empty strings.
            string[] renderedRowsB = dataFrameB
                .Select(dfBOrderedColumns)
                .Collect()
                .Select(row => string.Join(
                    ",",
                    row.Values.Select(value => value == null ? "null" : value.ToString())))
                .ToArray();

            foreach (Row rowA in rowsA)
            {
                string renderedRowA = string.Join(
                    ",",
                    rowA.Values.Select(value => value == null ? "null" : value.ToString()));

                renderedRowsB.ShouldContain(renderedRowA);
            }
        }
コード例 #3
0
        /// <summary>
        /// Derives a column name from a tag name: characters outside <c>[A-Za-z0-9_]</c> are
        /// removed, the result is lower-cased, and <c>_2</c> is appended when
        /// <paramref name="dataFrame"/> already contains a column with that name.
        /// </summary>
        /// <param name="tagName">Raw tag name to convert.</param>
        /// <param name="dataFrame">Frame whose existing column names are checked for a clash.</param>
        /// <returns>The sanitized column name, suffixed with <c>_2</c> on a clash.</returns>
        private static string FormatTagColumnNameInDataFrame(string tagName, DataFrame dataFrame)
        {
            // string.Replace treats its first argument as literal text, so the character class
            // has to go through Regex.Replace for unsupported characters to actually be removed.
            string tagColumnName = System.Text.RegularExpressions.Regex
                .Replace(tagName, "[^A-Za-z0-9_]", "")
                .ToLowerInvariant();

            if (dataFrame.Columns().Contains(tagColumnName))
            {
                tagColumnName += "_2";
            }

            return tagColumnName;
        }
コード例 #4
0
        /// <summary>
        /// Unions two data frames after projecting each of them through
        /// <c>WithAllColumns</c> over the distinct combined column names of both frames.
        /// </summary>
        /// <param name="dataFrameOne">Left-hand frame of the union.</param>
        /// <param name="dataFrameTwo">Right-hand frame of the union.</param>
        /// <returns>The union of the two projected frames.</returns>
        private DataFrame DataFrameUnion(DataFrame dataFrameOne, DataFrame dataFrameTwo)
        {
            string[] firstNames = dataFrameOne.Columns().ToArray();
            string[] secondNames = dataFrameTwo.Columns().ToArray();

            // Every column name that appears in either frame, without duplicates.
            IEnumerable<string> combinedNames = firstNames.Concat(secondNames).Distinct();

            DataFrame firstProjected = dataFrameOne.Select(
                WithAllColumns(firstNames, combinedNames.ToArray()).ToArray());
            DataFrame secondProjected = dataFrameTwo.Select(
                WithAllColumns(secondNames, combinedNames));

            return firstProjected.Union(secondProjected);
        }
コード例 #5
0
        /// <summary>
        /// Fits an IDF estimator on hashed term frequencies and checks the resulting model:
        /// presence of the output column, the parameter getters, and a save/load round trip.
        /// </summary>
        public void TestIDFModel()
        {
            const int minDocFreq = 1980;
            const string rawFeaturesCol = "rawFeatures";
            const string featuresCol = "features";

            // sentence -> words
            DataFrame sentences =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");
            Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words");
            DataFrame tokenized = tokenizer.Transform(sentences);

            // words -> raw term-frequency features
            HashingTF hashingTF = new HashingTF()
                .SetInputCol("words")
                .SetOutputCol(rawFeaturesCol)
                .SetNumFeatures(20);
            DataFrame featurized = hashingTF.Transform(tokenized);

            // raw features -> fitted IDF model -> rescaled features
            IDF idf = new IDF()
                .SetInputCol(rawFeaturesCol)
                .SetOutputCol(featuresCol)
                .SetMinDocFreq(minDocFreq);
            IDFModel idfModel = idf.Fit(featurized);
            DataFrame rescaled = idfModel.Transform(featurized);

            Assert.Contains(featuresCol, rescaled.Columns());

            // The fitted model must report the parameters set on the estimator.
            Assert.Equal(rawFeaturesCol, idfModel.GetInputCol());
            Assert.Equal(featuresCol, idfModel.GetOutputCol());
            Assert.Equal(minDocFreq, idfModel.GetMinDocFreq());

            // Saving and reloading must preserve the model's uid.
            using (var scratchDirectory = new TemporaryDirectory())
            {
                string savedModelPath = Path.Join(scratchDirectory.Path, "idfModel");
                idfModel.Save(savedModelPath);

                IDFModel reloaded = IDFModel.Load(savedModelPath);
                Assert.Equal(idfModel.Uid(), reloaded.Uid());
            }

            TestFeatureBase(idfModel, "minDocFreq", 1000);
        }
コード例 #6
0
        /// <summary>
        /// Exercises HashingTF: parameter setters/getters, a transform that must produce the
        /// configured output column, a save/load round trip, and the binary flag.
        /// </summary>
        public void TestHashingTF()
        {
            const string inputCol = "input_col";
            const string outputCol = "output_col";
            const int numFeatures = 10;
            const string inputQuery = "SELECT array('this', 'is', 'a', 'string', 'a', 'a')" +
                                      " as input_col";

            // The parameterless constructor must be usable as well.
            Assert.IsType<HashingTF>(new HashingTF());

            HashingTF hashingTf = new HashingTF("my-unique-id")
                .SetNumFeatures(numFeatures)
                .SetInputCol(inputCol)
                .SetOutputCol(outputCol);

            // Getters must echo what the setters received.
            Assert.Equal(numFeatures, hashingTf.GetNumFeatures());
            Assert.Equal(inputCol, hashingTf.GetInputCol());
            Assert.Equal(outputCol, hashingTf.GetOutputCol());

            DataFrame source = _spark.Sql(inputQuery);
            DataFrame transformed = hashingTf.Transform(source);
            DataFrame outputOnly = transformed.Select(outputCol);

            Assert.Contains(outputCol, outputOnly.Columns());

            // Saving and reloading must preserve the transformer's uid.
            using (var scratchDirectory = new TemporaryDirectory())
            {
                string savedPath = Path.Join(scratchDirectory.Path, "hashingTF");
                hashingTf.Save(savedPath);

                HashingTF reloaded = HashingTF.Load(savedPath);
                Assert.Equal(hashingTf.Uid(), reloaded.Uid());
            }

            hashingTf.SetBinary(true);
            Assert.True(hashingTf.GetBinary());

            TestFeatureBase(hashingTf, "numFeatures", 1000);
        }
コード例 #7
0
        /// <summary>
        /// Verifies that <c>DataFrame.Columns()</c> returns the field names exposed by the
        /// schema of the underlying (mocked) data frame proxy.
        /// </summary>
        public void TestColumns()
        {
            // Arrange: a schema with a single field named "column1".
            const string columnName = "column1";
            var mockSchemaProxy = new Mock<IStructTypeProxy>();
            var mockFieldProxy = new Mock<IStructFieldProxy>();

            mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
            mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> {
                mockFieldProxy.Object
            });
            mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
            var sc = new SparkContext(null);

            // Act
            var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
            var actualColumns = originalDataFrame.Columns();

            // Assert
            CollectionAssert.AreEqual(new[] { columnName }, actualColumns.ToArray());
        }
コード例 #8
0
        /// <summary>
        /// Smoke test that invokes a long list of DataFrame members and overloads on the shared
        /// test frame <c>_df</c>. Only two results are asserted (the schema is non-null and the
        /// frame has two columns); every other call merely has to complete without throwing.
        /// Presumably this covers the API surface available as of Spark 2.3.x, per the method
        /// name - confirm against the project's versioning convention.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Column lookup through the indexer.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF without arguments and with replacement column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            // Schema retrieval - one of the two values this test asserts on.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            // Explain: default overload plus both boolean arguments.
            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // The frame under test is expected to expose exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            // Checkpointing runs against a temporary directory that is disposed afterwards.
            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            // NOTE(review): no "time" column is referenced anywhere else in this test; presumably
            // this call is not validated eagerly - confirm.
            _df.WithWatermark("time", "10 minutes");

            // Show overloads: no arguments up to three arguments.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: bare, by column name(s), by column expression, each with and
            // without an explicit join type. The frame is joined with itself.
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting family: each method is called with string names, with no arguments,
            // and with Column arguments.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            // Hint with and without parameters.
            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            // Column access by method rather than by indexer.
            _df.Col("age");

            _df.ColRegex("age");

            // Aliasing through both As and Alias.
            _df.As("alias");

            _df.Alias("alias");

            // Projection: string names, no arguments, Column arguments, and SQL expressions.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filtering: Column condition and string condition, via both Filter and Where.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping family: each method is called with string names, with no arguments,
            // and with Column arguments.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            // Aggregation with one and with two aggregate columns.
            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set-style operations, each applied to the frame and itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sampling and splitting, without and with the extra trailing arguments.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            // Adding, renaming and dropping columns.
            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            // Describe and Summary with zero, one and two arguments.
            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row retrieval; the enumerable results are forced with ToArray().
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Repartitioning: by count, by count and column, by column only, and with no arguments.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            // Persist / Cache followed by Unpersist.
            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // View registration; each create-or-replace call reuses the name registered just before it.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }