Exemple #1
0
        // Walks the sample index through delete -> restore -> delete + vacuum and checks
        // what the index listing reports after every step.
        public void TestIndexCreateAndDelete()
        {
            // Exactly one index should be listed: the sample index, in the active state.
            DataFrame initialListing = _hyperspace.Indexes();
            Assert.Equal(1, initialListing.Count());
            Assert.Equal(_sampleIndexName, initialListing.SelectExpr("name").First()[0]);
            Assert.Equal(States.Active, initialListing.SelectExpr("state").First()[0]);

            // After a delete the index is still listed, but reported as deleted.
            _hyperspace.DeleteIndex(_sampleIndexName);
            DataFrame listingAfterDelete = _hyperspace.Indexes();
            Assert.Equal(1, listingAfterDelete.Count());
            Assert.Equal(States.Deleted, listingAfterDelete.SelectExpr("state").First()[0]);

            // Restoring flips the state back to active.
            _hyperspace.RestoreIndex(_sampleIndexName);
            DataFrame listingAfterRestore = _hyperspace.Indexes();
            Assert.Equal(1, listingAfterRestore.Count());
            Assert.Equal(States.Active, listingAfterRestore.SelectExpr("state").First()[0]);

            // Delete followed by vacuum removes the index from the listing altogether.
            _hyperspace.DeleteIndex(_sampleIndexName);
            _hyperspace.VacuumIndex(_sampleIndexName);
            DataFrame listingAfterVacuum = _hyperspace.Indexes();
            Assert.Equal(0, listingAfterVacuum.Count());
        }
        // Fits a Word2Vec model on a one-row DataFrame, queries synonyms, then saves the
        // model to a temporary directory and checks the reloaded model keeps the same Uid.
        public void TestWord2VecModel()
        {
            const int expectedSynonyms = 2;

            // One row whose "text" column holds the sentence split on spaces.
            DataFrame sentences =
                _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

            Word2Vec estimator = new Word2Vec()
                .SetInputCol("text")
                .SetOutputCol("result")
                .SetMinCount(1);
            Word2VecModel fitted = estimator.Fit(sentences);

            DataFrame nearestWords = fitted.FindSynonyms("Hi", expectedSynonyms);
            Assert.Equal(expectedSynonyms, nearestWords.Count());
            nearestWords.Show();

            // Round-trip the model through disk.
            using (var scratch = new TemporaryDirectory())
            {
                string modelPath = Path.Join(scratch.Path, "word2vecModel");
                fitted.Save(modelPath);

                Word2VecModel reloaded = Word2VecModel.Load(modelPath);
                Assert.Equal(fitted.Uid(), reloaded.Uid());
            }
        }
Exemple #3
0
        /// <summary>
        /// Prints a sample, the schema and a few aggregate facts about the movie DataFrame:
        /// total rows, distinct directors, the most frequent value of "actor_1_name" and the
        /// title with the highest "movie_facebook_likes".
        /// </summary>
        /// <param name="df">DataFrame to describe.</param>
        private static void ShowDatasetInfo(DataFrame df)
        {
            // Sample rows first, then the schema.
            df.Show();
            df.PrintSchema();

            var movieTotal = df.Count();
            Console.WriteLine($"Found {movieTotal} movies in movie dataset");

            var distinctDirectors = df
                .Select("director_name")
                .Distinct()
                .Count();
            Console.WriteLine($"Found {distinctDirectors} diferent Directors in movie dataset");

            // Appearances per lead actor, most frequent first.
            var appearancesByActor = df
                .GroupBy("actor_1_name")
                .Count()
                .OrderBy(Desc("count"));
            appearancesByActor.Explain();
            appearancesByActor.Show();

            var topActor = appearancesByActor.First().GetAs<string>(0);
            Console.WriteLine($"Actor with most appearances is {topActor}");

            // Titles ranked by their facebook-likes column, highest first.
            var moviesByLikes = df
                .Select("movie_title", "movie_facebook_likes")
                .OrderBy(Desc("movie_facebook_likes"));
            moviesByLikes.Show();

            var topMovie = moviesByLikes.First();
            var topTitle = topMovie.GetAs<string>("movie_title");
            var topLikes = topMovie.GetAs<int>("movie_facebook_likes");
            Console.WriteLine($"Most rated movie is {topTitle} " +
                              $"with {topLikes} votes");
        }
Exemple #4
0
        // Reads neighbors.json, strips the "sip:" prefix from IMAddress, groups the rows by
        // (puser, ptenant), joins every collected list with ';' and checks the two resulting rows.
        public void TestEmailSearchTopNReducerBasics()
        {
            // Read the sample data.
            DataFrame df = _spark
                           .Read()
                           .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

            // Trim the IMAddress column. The prefix is a protocol token, so it is matched
            // ordinally rather than with the current culture; null values pass through unchanged
            // instead of throwing inside the UDF.
            Func <Column, Column> trimIMAddress = Udf <string, string>(
                (str) => str != null && str.StartsWith("sip:", StringComparison.Ordinal)
                    ? str.Substring(4)
                    : str);

            df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

            // Reduce
            df = df.GroupBy("puser", "ptenant").Agg(CollectList("GivenName").Alias("GivenNames"),
                                                    CollectList("Surname").Alias("Surnames"),
                                                    CollectList("DisplayName").Alias("DisplayNames"),
                                                    CollectList("EmailAddress").Alias("EmailAddresses"),
                                                    CollectList("RelevanceScore").Alias("RelevanceScores"),
                                                    CollectList("IMAddress").Alias("IMAddresses"));
            // Format the output.
            df = df.Select(df["puser"],
                           df["ptenant"],
                           ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
                           ConcatWs(";", df["Surnames"]).Alias("Surnames"),
                           ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
                           ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
                           ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
                           ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

            Assert.Equal(2, df.Count());
            foreach (Row row in df.Collect())
            {
                string puser = row.GetAs <string>("puser");
                Assert.Equal("MSFT", row.GetAs <string>("ptenant"));
                Assert.Equal("1101.0;900.0;857.0", row.GetAs <string>("RelevanceScores"));

                // NOTE(review): the e-mail literals below look like placeholder text — confirm
                // them against the contents of neighbors.json.
                switch (puser)
                {
                case "ruih":
                    Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                case "rui":
                    Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                default:
                    // The unexpected value is the group key, so name it correctly in the failure.
                    throw new Exception($"Unexpected puser: {puser}.");
                }
            }
        }
Exemple #5
0
        /// <inheritdoc cref="Analyzer{S,M}.ComputeStateFrom"/>
        public override Option<FrequenciesAndNumRows> ComputeStateFrom(DataFrame dataFrame)
        {
            // The row total is taken from the incoming DataFrame, before the optional
            // filter and binning steps are applied.
            long totalCount = dataFrame.Count();

            // Filter, bin, stringify the column, substitute nulls, then count per value.
            DataFrame frequencies = BinOptional(BinningUdf, FilterOptional(Where, dataFrame))
                .Select(Col(Column).Cast("string"))
                .Na()
                .Fill(NULL_FIELD_REPLACEMENT)
                .GroupBy(Column)
                .Count()
                .WithColumnRenamed("count", AnalyzersExt.COUNT_COL);

            var state = new FrequenciesAndNumRows(frequencies, totalCount);
            return new Option<FrequenciesAndNumRows>(state);
        }
Exemple #6
0
        /// <summary>
        /// Validate that a range DataFrame contains only the expected values.
        /// The comparison is order-independent: both sides are sorted before being compared.
        /// </summary>
        /// <param name="expectedValues">Values expected in the "id" column, in any order.</param>
        /// <param name="dataFrame">DataFrame whose "id" column is checked.</param>
        private void ValidateRangeDataFrame(IEnumerable <int> expectedValues, DataFrame dataFrame)
        {
            // Materialize once so a lazy sequence is enumerated a single time, then sort.
            var sortedExpectedValues = new List <int>(expectedValues);

            sortedExpectedValues.Sort();

            Assert.Equal(sortedExpectedValues.Count, dataFrame.Count());

            var sortedValues = new List <int>(
                dataFrame
                // We need to select the "id" column, otherwise Collect() won't show the
                // updates made to the DeltaTable.
                .Select("id")
                .Sort("id")
                .Collect()
                .Select(row => Convert.ToInt32(row.Get("id"))));

            // Compare against the sorted copy; comparing against the raw input made the
            // check fail whenever the caller supplied the values out of order.
            Assert.True(sortedValues.SequenceEqual(sortedExpectedValues));
        }
Exemple #7
0
        /// <summary>
        /// Keeps only reviews that have a reviewer id, an asin and review text, renames three
        /// columns, caches the result, registers it as the "ElectronicsReviews" temp view and
        /// prints its row count.
        /// </summary>
        /// <param name="dataFrame">Raw reviews DataFrame.</param>
        private static void ReviewsCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Ratings Clean-up");

            // A row survives only when all three required fields are present.
            var hasRequiredFields = dataFrame["reviewerID"].IsNotNull()
                .And(dataFrame["asin"].IsNotNull())
                .And(dataFrame["reviewText"].IsNotNull());

            DataFrame cleaned = dataFrame
                .Filter(hasRequiredFields)
                .WithColumnRenamed("reviewerID", "rid")
                .WithColumnRenamed("reviewText", "review_text")
                .WithColumnRenamed("unixReviewTime", "unix_time");

            cleaned.Cache();
            cleaned.CreateOrReplaceTempView("ElectronicsReviews");

            var reviewTotal = cleaned.Count();
            Console.WriteLine($"Reviews Count: {reviewTotal}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }
        /// <summary>
        /// Runs four checks against <paramref name="data"/> — schema match, at least one
        /// non-null "Date", non-empty, and no supplier whose summed "Amount" is below 25000 —
        /// printing a message for each failed check.
        /// </summary>
        /// <param name="data">DataFrame to validate.</param>
        /// <returns><c>true</c> when every check passes; otherwise <c>false</c>.</returns>
        private static bool ValidateEntity(DataFrame data)
        {
            bool isValid = true;

            // 1. Schema must match the expected one exactly (compared via its JSON form).
            bool schemaMatches = data.Schema().Json == _expectedSchema.Json;
            if (!schemaMatches)
            {
                Console.WriteLine("Expected Schema Does NOT Match");
                Console.WriteLine($"Actual Schema: {data.Schema().SimpleString}");
                Console.WriteLine($"Expected Schema: {_expectedSchema.SimpleString}");
                isValid = false;
            }

            // 2. At least one row must carry a non-null Date.
            var datedRows = data.Filter(Col("Date").IsNotNull()).Count();
            if (datedRows == 0)
            {
                Console.WriteLine("Date Parsing resulted in all NULL's");
                isValid = false;
            }

            // 3. The DataFrame must not be empty.
            var totalRows = data.Count();
            if (totalRows == 0)
            {
                Console.WriteLine("DataFrame is empty");
                isValid = false;
            }

            // 4. No supplier may have a summed amount under 25000.
            var underThreshold = data
                .GroupBy(Col("Supplier"))
                .Sum("Amount")
                .Filter(Col("Sum(Amount)") < 25000);
            if (underThreshold.Count() > 0)
            {
                Console.WriteLine("Amounts should only ever be over 25k");
                underThreshold.Show();
                isValid = false;
            }

            return isValid;
        }
Exemple #9
0
        // Signature smoke test: calls a long list of DataFrame members, one statement per
        // overload, on the shared _df field. Apart from the two assertions below, results are
        // discarded, so the test only verifies that each call binds and completes without throwing.
        // NOTE(review): presumably this is the API surface available on Spark 2.3.x, per the
        // method name — confirm.
        public void TestSignaturesV2_3_X()
        {
            // Column indexer.
            Column col = _df["name"];

            col = _df["age"];

            // ToDF, with and without new column names.
            DataFrame df = _df.ToDF();

            df = df.ToDF("name2", "age2");

            // Schema and plan inspection.
            StructType schema = _df.Schema();

            Assert.NotNull(schema);

            _df.PrintSchema();

            _df.Explain();
            _df.Explain(true);
            _df.Explain(false);

            // _df is expected to have exactly two columns.
            Assert.Equal(2, _df.Columns().ToArray().Length);

            _df.IsLocal();

            _df.IsStreaming();

            using (var tempDir = new TemporaryDirectory())
            {
                // The following is required for *CheckPoint().
                _spark.SparkContext.SetCheckpointDir(tempDir.Path);

                _df.Checkpoint();
                _df.Checkpoint(false);

                _df.LocalCheckpoint();
                _df.LocalCheckpoint(false);
            }

            _df.WithWatermark("time", "10 minutes");

            // Show overloads.
            _df.Show();
            _df.Show(10);
            _df.Show(10, 10);
            _df.Show(10, 10, true);

            // Join overloads: no condition, column name(s), join type, Column expression.
            _df.Join(_df);
            _df.Join(_df, "name");
            _df.Join(_df, new[] { "name" });
            _df.Join(_df, new[] { "name" }, "outer");
            _df.Join(_df, _df["age"] == _df["age"]);
            _df.Join(_df, _df["age"] == _df["age"], "outer");

            _df.CrossJoin(_df);

            // Sorting: each method is called with string names, no arguments and Column arguments.
            _df.SortWithinPartitions("age");
            _df.SortWithinPartitions("age", "name");
            _df.SortWithinPartitions();
            _df.SortWithinPartitions(_df["age"]);
            _df.SortWithinPartitions(_df["age"], _df["name"]);

            _df.Sort("age");
            _df.Sort("age", "name");
            _df.Sort();
            _df.Sort(_df["age"]);
            _df.Sort(_df["age"], _df["name"]);

            _df.OrderBy("age");
            _df.OrderBy("age", "name");
            _df.OrderBy();
            _df.OrderBy(_df["age"]);
            _df.OrderBy(_df["age"], _df["name"]);

            _df.Hint("broadcast");
            _df.Hint("broadcast", new[] { "hello", "world" });

            // Column lookup and aliasing.
            _df.Col("age");

            _df.ColRegex("age");

            _df.As("alias");

            _df.Alias("alias");

            // Projection.
            _df.Select("age");
            _df.Select("age", "name");
            _df.Select();
            _df.Select(_df["age"]);
            _df.Select(_df["age"], _df["name"]);

            _df.SelectExpr();
            _df.SelectExpr("age * 2");
            _df.SelectExpr("age * 2", "abs(age)");

            // Filtering, by Column expression and by SQL string.
            _df.Filter(_df["age"] > 21);
            _df.Filter("age > 21");

            _df.Where(_df["age"] > 21);
            _df.Where("age > 21");

            // Grouping: GroupBy / Rollup / Cube share the same five call shapes.
            _df.GroupBy("age");
            _df.GroupBy("age", "name");
            _df.GroupBy();
            _df.GroupBy(_df["age"]);
            _df.GroupBy(_df["age"], _df["name"]);

            _df.Rollup("age");
            _df.Rollup("age", "name");
            _df.Rollup();
            _df.Rollup(_df["age"]);
            _df.Rollup(_df["age"], _df["name"]);

            _df.Cube("age");
            _df.Cube("age", "name");
            _df.Cube();
            _df.Cube(_df["age"]);
            _df.Cube(_df["age"], _df["name"]);

            _df.Agg(Avg(_df["age"]));
            _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

            _df.Limit(10);

            // Set operations against itself.
            _df.Union(_df);

            _df.UnionByName(_df);

            _df.Intersect(_df);

            _df.Except(_df);

            // Sampling and splitting, with and without an explicit seed.
            _df.Sample(0.5);
            _df.Sample(0.5, true);
            _df.Sample(0.5, false, 12345);

            _df.RandomSplit(new[] { 0.2, 0.8 });
            _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

            // Adding, renaming and dropping columns.
            _df.WithColumn("age2", _df["age"]);

            _df.WithColumnRenamed("age", "age2");

            _df.Drop();
            _df.Drop("age");
            _df.Drop("age", "name");

            _df.Drop(_df["age"]);

            _df.DropDuplicates();
            _df.DropDuplicates("age");
            _df.DropDuplicates("age", "name");

            // Statistics.
            _df.Describe();
            _df.Describe("age");
            _df.Describe("age", "name");

            _df.Summary();
            _df.Summary("count");
            _df.Summary("count", "mean");

            // Row retrieval; ToArray() forces the returned sequences to be enumerated.
            _df.Head(2);
            _df.Head();

            _df.First();

            _df.Take(3).ToArray();

            _df.Collect().ToArray();

            _df.ToLocalIterator().ToArray();

            _df.Count();

            // Partitioning.
            _df.Repartition(2);
            _df.Repartition(2, _df["age"]);
            _df.Repartition(_df["age"]);
            _df.Repartition();

            _df.RepartitionByRange(2, _df["age"]);
            _df.RepartitionByRange(_df["age"]);

            _df.Coalesce(1);

            _df.Distinct();

            // Persistence.
            _df.Persist();

            _df.Cache();

            _df.Unpersist();

            // Temp views: each Create* call is followed by CreateOrReplace* on the same view name.
            _df.CreateTempView("view");
            _df.CreateOrReplaceTempView("view");

            _df.CreateGlobalTempView("global_view");
            _df.CreateOrReplaceGlobalTempView("global_view");
        }
Exemple #10
0
        // Reads search_actions.json, splits four ';'-separated id columns into arrays, zips and
        // explodes them into one row per position, then checks the three resulting rows in order.
        public void TestEmailSearchSuccessActionReducerBasics()
        {
            // Load the sample data and keep only the columns the test works with.
            DataFrame selected = _spark
                .Read()
                .Json($"{TestEnvironment.ResourceDirectory}search_actions.json")
                .Select("ImpressionId", "ConversationId", "EntityType", "FolderIdList", "ReferenceIdList", "ItemIdList", "ItemImmutableIdList");

            // Each *List column arrives as one ';'-joined string; turn it into a string array.
            Func<Column, Column> toStringArrayUdf = Udf<string, string[]>((str) => str.Split(';'));

            DataFrame split = selected
                .WithColumn("FolderIdList", toStringArrayUdf(selected["FolderIdList"]))
                .WithColumn("ReferenceIdList", toStringArrayUdf(selected["ReferenceIdList"]))
                .WithColumn("ItemIdList", toStringArrayUdf(selected["ItemIdList"]))
                .WithColumn("ItemImmutableIdList", toStringArrayUdf(selected["ItemImmutableIdList"]));

            // Zip the four arrays so that element i of each one ends up in the same struct.
            DataFrame zipped = split.Select(
                split["ConversationId"],
                split["ImpressionId"],
                split["EntityType"],
                ArraysZip(split["FolderIdList"], split["ReferenceIdList"], split["ItemIdList"], split["ItemImmutableIdList"]).Alias("ConcatedColumn"));

            // One output row per zipped struct.
            DataFrame exploded = zipped.Select(
                zipped["ConversationId"],
                zipped["ImpressionId"],
                zipped["EntityType"],
                Explode(zipped["ConcatedColumn"]).Alias("NewColumn"));

            // Pull the struct fields out into their own columns.
            DataFrame flattened = exploded
                .WithColumn("FolderId", exploded["NewColumn"].GetField("FolderIdList"))
                .WithColumn("ReferenceId", exploded["NewColumn"].GetField("ReferenceIdList"))
                .WithColumn("ItemId", exploded["NewColumn"].GetField("ItemIdList"))
                .WithColumn("ItemImmutableId", exploded["NewColumn"].GetField("ItemImmutableIdList"))
                .Select("ConversationId", "ImpressionId", "EntityType", "FolderId", "ItemId", "ReferenceId", "ItemImmutableId");

            // Verify the outcome row by row.
            Assert.Equal(3, flattened.Count());

            int rowIndex = 0;
            foreach (Row row in flattened.Collect())
            {
                // These three values are identical on every row.
                string impressionId = row.GetAs<string>("ImpressionId");
                string conversationId = row.GetAs<string>("ConversationId");
                string entityType = row.GetAs<string>("EntityType");
                Assert.Equal("Imp1", impressionId);
                Assert.Equal("DD8A6B40-B4C9-426F-8194-895E9053077C", conversationId);
                Assert.Equal("Message", entityType);

                // These four vary with the row's position.
                string folderId = row.GetAs<string>("FolderId");
                string itemId = row.GetAs<string>("ItemId");
                string referenceId = row.GetAs<string>("ReferenceId");
                string itemImmutableId = row.GetAs<string>("ItemImmutableId");

                switch (rowIndex)
                {
                    case 0:
                        Assert.Equal("F1", folderId);
                        Assert.Equal("ItemId1", itemId);
                        Assert.Equal("R1", referenceId);
                        Assert.Equal("ItemImmutableId1", itemImmutableId);
                        break;

                    case 1:
                        Assert.Equal("F2", folderId);
                        Assert.Equal("ItemId2", itemId);
                        Assert.Equal("R2", referenceId);
                        Assert.Equal("ItemImmutableId2", itemImmutableId);
                        break;

                    case 2:
                        Assert.Equal("F3", folderId);
                        Assert.Equal("ItemId3", itemId);
                        Assert.Equal("R3", referenceId);
                        Assert.Equal("ItemImmutableId3", itemImmutableId);
                        break;

                    default:
                        throw new Exception(string.Format("Unexpected row: ConversationId={0}, ImpressionId={1}", conversationId, impressionId));
                }

                rowIndex++;
            }
        }
Exemple #11
0
        /// <summary>
        /// Filters the metadata DataFrame to rows with the required fields (excluding a fixed
        /// list of categories), replaces the raw "price", "date" and "rank" string columns with
        /// parsed values ("price", "unixTime", "rank"), caches the result, registers it as the
        /// "ElectronicsMetadata" temp view and prints its row count.
        /// </summary>
        /// <remarks>
        /// Each parsing UDF returns -1 when its input is null/empty or cannot be parsed.
        /// Parsing uses the invariant culture so results do not depend on the locale of the
        /// machine running the UDF. The lambdas deliberately capture no locals.
        /// </remarks>
        /// <param name="dataFrame">Raw metadata DataFrame.</param>
        private static void MetadataCleanup(DataFrame dataFrame)
        {
            Console.WriteLine("Metadata Clean-up");

            var priceCleanup = Udf <string, float>(
                p =>
            {
                if (string.IsNullOrEmpty(p))
                {
                    return(-1f);
                }

                // Skip everything before the first digit; if there is no digit the whole
                // string is handed to the parser, which then fails and yields -1.
                var firstDigit = 0;

                for (var i = 0; i < p.Length; i++)
                {
                    if (char.IsDigit(p[i]))
                    {
                        firstDigit = i;
                        break;
                    }
                }

                // Float | AllowThousands is what the two-argument TryParse overload uses;
                // only the culture is pinned here.
                if (float.TryParse(
                        p.Substring(firstDigit),
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out var price))
                {
                    return(price);
                }

                return(-1f);
            });

            var dateCleanup = Udf <string, double>(
                d =>
            {
                // NOTE(review): a parsed value without an explicit offset is treated as local
                // time by ToUniversalTime(), so the result depends on the machine's time zone —
                // confirm whether that is intended.
                if (!string.IsNullOrEmpty(d) &&
                    DateTime.TryParse(
                        d,
                        System.Globalization.CultureInfo.InvariantCulture,
                        System.Globalization.DateTimeStyles.None,
                        out var parsed))
                {
                    return((parsed.ToUniversalTime() - new DateTime(1970, 1, 1)).TotalSeconds);
                }

                return(-1d);
            });

            var rankCleanup = Udf <string, long>(
                r =>
            {
                if (!string.IsNullOrEmpty(r))
                {
                    // First run of digits, optionally comma-grouped. The static Match overload
                    // uses the regex cache instead of constructing a Regex for every row.
                    var match = Regex.Match(r, @"\d+(,\d+)*", RegexOptions.Singleline);
                    if (match.Success &&
                        long.TryParse(
                            match.Value.Replace(",", string.Empty),
                            System.Globalization.NumberStyles.Integer,
                            System.Globalization.CultureInfo.InvariantCulture,
                            out var rank))
                    {
                        return(rank);
                    }
                }

                return(-1L);
            });

            // Keep rows that have all required fields and are not in an excluded category.
            dataFrame = dataFrame
                        .Filter(
                dataFrame["asin"].IsNotNull()
                .And(dataFrame["title"].IsNotNull())
                .And(dataFrame["main_cat"].IsNotNull())
                .And(dataFrame["brand"].IsNotNull())
                .And(Not(dataFrame["main_cat"].IsIn("Grocery", "Pet Supplies", "Baby", "Books", "Appstore for Android", "Gift Cards"))));

            // Add the parsed columns under temporary names, drop the raw columns, then
            // rename the parsed columns into place.
            dataFrame = dataFrame
                        .WithColumn("clean_price", priceCleanup(dataFrame["price"]))
                        .WithColumn("clean-date", dateCleanup(dataFrame["date"]))
                        .WithColumn("clean-rank", rankCleanup(dataFrame["rank"]))
                        .Drop(dataFrame["price"])
                        .Drop(dataFrame["date"])
                        .Drop(dataFrame["rank"])
                        .WithColumnRenamed("clean_price", "price")
                        .WithColumnRenamed("clean-date", "unixTime")
                        .WithColumnRenamed("clean-rank", "rank");

            dataFrame.Cache();
            dataFrame.CreateOrReplaceTempView("ElectronicsMetadata");

            Console.WriteLine($"Metadata Count: {dataFrame.Count()}");
            Console.WriteLine("Done");
            Console.WriteLine();
        }