public void TestIndexCreateAndDelete()
{
    // Takes the sample index through delete -> restore -> delete + vacuum and checks
    // what _hyperspace.Indexes() reports after each step.

    // Before any change: exactly one index, carrying the sample name, in the active state.
    DataFrame initialListing = _hyperspace.Indexes();
    Assert.Equal(1, initialListing.Count());
    var reportedName = initialListing.SelectExpr("name").First()[0];
    Assert.Equal(_sampleIndexName, reportedName);
    var initialState = initialListing.SelectExpr("state").First()[0];
    Assert.Equal(States.Active, initialState);

    // After a delete the index is still listed, now in the deleted state.
    _hyperspace.DeleteIndex(_sampleIndexName);
    DataFrame listingAfterDelete = _hyperspace.Indexes();
    Assert.Equal(1, listingAfterDelete.Count());
    var stateAfterDelete = listingAfterDelete.SelectExpr("state").First()[0];
    Assert.Equal(States.Deleted, stateAfterDelete);

    // Restoring brings the same index back to the active state.
    _hyperspace.RestoreIndex(_sampleIndexName);
    DataFrame listingAfterRestore = _hyperspace.Indexes();
    Assert.Equal(1, listingAfterRestore.Count());
    var stateAfterRestore = listingAfterRestore.SelectExpr("state").First()[0];
    Assert.Equal(States.Active, stateAfterRestore);

    // Delete followed by vacuum leaves nothing in the listing.
    _hyperspace.DeleteIndex(_sampleIndexName);
    _hyperspace.VacuumIndex(_sampleIndexName);
    Assert.Equal(0, _hyperspace.Indexes().Count());
}
public void TestWord2VecModel()
{
    // Fits a Word2Vec model on a one-sentence frame, queries synonyms, then checks
    // that a saved and re-loaded model keeps the same Uid.
    const int expectedSynonyms = 2;

    DataFrame documentDataFrame =
        _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

    Word2Vec estimator = new Word2Vec()
        .SetInputCol("text")
        .SetOutputCol("result")
        .SetMinCount(1);
    Word2VecModel fitted = estimator.Fit(documentDataFrame);

    // The number of returned synonyms must equal the number requested.
    DataFrame synonyms = fitted.FindSynonyms("Hi", expectedSynonyms);
    Assert.Equal(expectedSynonyms, synonyms.Count());
    synonyms.Show();

    // Save / load round trip inside a temporary directory.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string modelPath = Path.Join(tempDirectory.Path, "word2vecModel");
        fitted.Save(modelPath);

        Word2VecModel reloaded = Word2VecModel.Load(modelPath);
        Assert.Equal(fitted.Uid(), reloaded.Uid());
    }
}
/// <summary>
/// Prints an overview of the movie dataset to the console: sample rows and schema,
/// row count, distinct director count, the first-lead actor with the most rows,
/// and the movie with the highest <c>movie_facebook_likes</c> value.
/// </summary>
/// <param name="df">Movie dataset to summarize.</param>
private static void ShowDatasetInfo(DataFrame df)
{
    // Sample rows followed by the schema.
    df.Show();
    df.PrintSchema();

    var numRows = df.Count();
    Console.WriteLine($"Found {numRows} movies in movie dataset");

    var directorsCount = df
        .Select("director_name")
        .Distinct()
        .Count();
    Console.WriteLine($"Found {directorsCount} diferent Directors in movie dataset");

    // Rows per "actor_1_name", largest group first.
    var appearancesByActor = df
        .GroupBy("actor_1_name")
        .Count()
        .OrderBy(Desc("count"));
    appearancesByActor.Explain();
    appearancesByActor.Show();

    // Column 0 of the top row is the actor name.
    var actorWithMostAppearances = appearancesByActor.First().GetAs<string>(0);
    Console.WriteLine($"Actor with most appearances is {actorWithMostAppearances}");

    // Titles ordered by facebook likes, highest first.
    var moviesByLikes = df
        .Select("movie_title", "movie_facebook_likes")
        .OrderBy(Desc("movie_facebook_likes"));
    moviesByLikes.Show();

    var mostVotedMovieRow = moviesByLikes.First();
    Console.WriteLine($"Most rated movie is {mostVotedMovieRow.GetAs<String>("movie_title")} " + $"with {mostVotedMovieRow.GetAs<Int32>("movie_facebook_likes")} votes");
}
public void TestEmailSearchTopNReducerBasics()
{
    // Groups the neighbor records by (puser, ptenant), collects each attribute into a
    // list, joins every list with ';' and checks the resulting row for each user.

    // Read the sample data.
    DataFrame df = _spark
        .Read()
        .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING")
        .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

    // Trim the IMAddress column. "sip:" is a protocol prefix, not linguistic text,
    // so it is matched ordinally rather than with the current culture.
    Func<Column, Column> trimIMAddress = Udf<string, string>(
        (str) => str.StartsWith("sip:", StringComparison.Ordinal) ? str.Substring(4) : str);
    df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

    // Reduce
    df = df.GroupBy("puser", "ptenant").Agg(
        CollectList("GivenName").Alias("GivenNames"),
        CollectList("Surname").Alias("Surnames"),
        CollectList("DisplayName").Alias("DisplayNames"),
        CollectList("EmailAddress").Alias("EmailAddresses"),
        CollectList("RelevanceScore").Alias("RelevanceScores"),
        CollectList("IMAddress").Alias("IMAddresses"));

    // Format the output.
    df = df.Select(
        df["puser"],
        df["ptenant"],
        ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
        ConcatWs(";", df["Surnames"]).Alias("Surnames"),
        ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
        ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
        ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
        ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

    Assert.Equal(2, df.Count());

    // NOTE(review): the expected e-mail and IM address literals below are all the same
    // placeholder text; confirm they match the values stored in neighbors.json.
    foreach (Row row in df.Collect())
    {
        string puser = row.GetAs<string>("puser");
        Assert.Equal("MSFT", row.GetAs<string>("ptenant"));
        Assert.Equal("1101.0;900.0;857.0", row.GetAs<string>("RelevanceScores"));

        switch (puser)
        {
            case "ruih":
                Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs<string>("Surnames"));
                Assert.Equal("AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN", row.GetAs<string>("DisplayNames"));
                Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs<string>("EmailAddresses"));
                Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs<string>("IMAddresses"));
                break;
            case "rui":
                Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs<string>("Surnames"));
                Assert.Equal("DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN", row.GetAs<string>("DisplayNames"));
                Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs<string>("EmailAddresses"));
                Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs<string>("IMAddresses"));
                break;
            default:
                // The value being reported is the puser key, not an age.
                throw new Exception($"Unexpected puser: {puser}.");
        }
    }
}
/// <inheritdoc cref="Analyzer{S,M}.ComputeStateFrom"/>
public override Option<FrequenciesAndNumRows> ComputeStateFrom(DataFrame dataFrame)
{
    // The row total is taken from the incoming frame, before the optional filter runs.
    long totalCount = dataFrame.Count();

    // Optional filter first, optional binning second.
    DataFrame binned = BinOptional(BinningUdf, FilterOptional(Where, dataFrame));

    // Cast the column to string, replace nulls with the placeholder value, and
    // count the rows per distinct value.
    DataFrame frequencies = binned
        .Select(Col(Column).Cast("string"))
        .Na()
        .Fill(NULL_FIELD_REPLACEMENT)
        .GroupBy(Column)
        .Count()
        .WithColumnRenamed("count", AnalyzersExt.COUNT_COL);

    var state = new FrequenciesAndNumRows(frequencies, totalCount);
    return new Option<FrequenciesAndNumRows>(state);
}
/// <summary>
/// Validate that a range DataFrame contains only the expected values.
/// The comparison ignores order: both sides are sorted before being compared.
/// </summary>
/// <param name="expectedValues">
/// Values the "id" column is expected to hold, in any order. The sequence is
/// enumerated exactly once.
/// </param>
/// <param name="dataFrame">DataFrame whose "id" column is checked.</param>
private void ValidateRangeDataFrame(IEnumerable<int> expectedValues, DataFrame dataFrame)
{
    // Materialize and sort the expected values up front so that a lazy or unsorted
    // input is enumerated once and compared in a well-defined order.
    var sortedExpectedValues = new List<int>(expectedValues);
    sortedExpectedValues.Sort();

    Assert.Equal(sortedExpectedValues.Count, dataFrame.Count());

    var sortedValues = new List<int>(
        dataFrame
            // We need to select the "id" column, otherwise Collect() won't show the
            // updates made to the DeltaTable.
            .Select("id")
            .Sort("id")
            .Collect()
            .Select(row => Convert.ToInt32(row.Get("id"))));

    // Compare against the sorted copy; the unsorted input would only match when the
    // caller happened to pass the values already in ascending order.
    Assert.True(sortedValues.SequenceEqual(sortedExpectedValues));
}
/// <summary>
/// Drops review rows with a null <c>reviewerID</c>, <c>asin</c> or <c>reviewText</c>,
/// renames three columns, caches the result, registers it as the
/// "ElectronicsReviews" temp view and prints the resulting row count.
/// </summary>
/// <param name="dataFrame">Raw reviews DataFrame.</param>
private static void ReviewsCleanup(DataFrame dataFrame)
{
    Console.WriteLine("Ratings Clean-up");

    // A row is kept only when all three key fields are present.
    var hasRequiredFields = dataFrame["reviewerID"].IsNotNull()
        .And(dataFrame["asin"].IsNotNull())
        .And(dataFrame["reviewText"].IsNotNull());
    var filtered = dataFrame.Filter(hasRequiredFields);

    var reviews = filtered
        .WithColumnRenamed("reviewerID", "rid")
        .WithColumnRenamed("reviewText", "review_text")
        .WithColumnRenamed("unixReviewTime", "unix_time");

    reviews.Cache();
    reviews.CreateOrReplaceTempView("ElectronicsReviews");

    Console.WriteLine($"Reviews Count: {reviews.Count()}");
    Console.WriteLine("Done");
    Console.WriteLine();
}
/// <summary>
/// Runs four checks against <paramref name="data"/>: its schema equals the expected
/// schema, at least one row has a non-null "Date", the frame is not empty, and no
/// supplier's summed "Amount" is below 25000. Every failed check prints a message;
/// all checks are always evaluated.
/// </summary>
/// <param name="data">DataFrame to validate.</param>
/// <returns><c>true</c> when every check passes; otherwise <c>false</c>.</returns>
private static bool ValidateEntity(DataFrame data)
{
    var isValid = true;

    // Schema check: compare the JSON forms of the actual and expected schemas.
    var actualSchema = data.Schema();
    if (actualSchema.Json != _expectedSchema.Json)
    {
        Console.WriteLine("Expected Schema Does NOT Match");
        Console.WriteLine("Actual Schema: " + actualSchema.SimpleString);
        Console.WriteLine("Expected Schema: " + _expectedSchema.SimpleString);
        isValid = false;
    }

    // Date check: at least one row must have a non-null Date.
    var rowsWithDate = data.Filter(Col("Date").IsNotNull()).Count();
    if (rowsWithDate == 0)
    {
        Console.WriteLine("Date Parsing resulted in all NULL's");
        isValid = false;
    }

    // Emptiness check.
    var totalRows = data.Count();
    if (totalRows == 0)
    {
        Console.WriteLine("DataFrame is empty");
        isValid = false;
    }

    // Amount check: list the suppliers whose summed Amount is under the threshold.
    var suppliersUnderThreshold = data
        .GroupBy(Col("Supplier"))
        .Sum("Amount")
        .Filter(Col("Sum(Amount)") < 25000);
    if (suppliersUnderThreshold.Count() > 0)
    {
        Console.WriteLine("Amounts should only ever be over 25k");
        suppliersUnderThreshold.Show();
        isValid = false;
    }

    return isValid;
}
public void TestSignaturesV2_3_X()
{
    // Signature smoke test: invokes each DataFrame member listed below once per
    // overload. Apart from the two asserts, return values are not inspected.

    // Column indexer.
    Column nameColumn = _df["name"];
    Column ageColumn = _df["age"];

    // ToDF, with and without new column names.
    DataFrame copied = _df.ToDF();
    DataFrame renamed = copied.ToDF("name2", "age2");

    // Schema and plan inspection.
    StructType dfSchema = _df.Schema();
    Assert.NotNull(dfSchema);

    _df.PrintSchema();

    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);

    var columnNames = _df.Columns().ToArray();
    Assert.Equal(2, columnNames.Length);

    _df.IsLocal();

    _df.IsStreaming();

    // Checkpointing.
    using (var checkpointDirectory = new TemporaryDirectory())
    {
        // The following is required for *CheckPoint().
        _spark.SparkContext.SetCheckpointDir(checkpointDirectory.Path);

        _df.Checkpoint();
        _df.Checkpoint(false);

        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    // Show.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Joins.
    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");

    _df.CrossJoin(_df);

    // Sorting.
    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);

    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);

    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    // Hints, column lookup and aliasing.
    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });

    _df.Col("age");

    _df.ColRegex("age");

    _df.As("alias");

    _df.Alias("alias");

    // Projection.
    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);

    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    // Filtering.
    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");

    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    // Grouping and aggregation.
    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);

    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);

    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));

    _df.Limit(10);

    // Set operations.
    _df.Union(_df);

    _df.UnionByName(_df);

    _df.Intersect(_df);

    _df.Except(_df);

    // Sampling and splitting.
    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);

    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    // Column add / rename / drop.
    _df.WithColumn("age2", _df["age"]);

    _df.WithColumnRenamed("age", "age2");

    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");

    _df.Drop(_df["age"]);

    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    // Statistics.
    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");

    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    // Fetching rows.
    _df.Head(2);
    _df.Head();

    _df.First();

    _df.Take(3).ToArray();

    _df.Collect().ToArray();

    _df.ToLocalIterator().ToArray();

    _df.Count();

    // Partitioning.
    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();

    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);

    _df.Coalesce(1);

    _df.Distinct();

    // Persistence.
    _df.Persist();

    _df.Cache();

    _df.Unpersist();

    // Temp views.
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");

    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}
public void TestEmailSearchSuccessActionReducerBasics()
{
    // Splits four ';'-joined id columns into arrays, zips them element-wise, explodes
    // the zipped array into one row per element and checks the three resulting rows.

    // Load the sample data.
    DataFrame rawActions = _spark.Read().Json($"{TestEnvironment.ResourceDirectory}search_actions.json");

    // Keep only the columns the reducer needs.
    DataFrame selected = rawActions.Select(
        "ImpressionId",
        "ConversationId",
        "EntityType",
        "FolderIdList",
        "ReferenceIdList",
        "ItemIdList",
        "ItemImmutableIdList");

    // Turn each ';'-joined string column into an array of strings.
    Func<Column, Column> toStringArrayUdf = Udf<string, string[]>((str) => str.Split(';'));
    DataFrame withArrays = selected
        .WithColumn("FolderIdList", toStringArrayUdf(selected["FolderIdList"]))
        .WithColumn("ReferenceIdList", toStringArrayUdf(selected["ReferenceIdList"]))
        .WithColumn("ItemIdList", toStringArrayUdf(selected["ItemIdList"]))
        .WithColumn("ItemImmutableIdList", toStringArrayUdf(selected["ItemImmutableIdList"]));

    // Zip the arrays so the i-th elements of each end up in the same struct.
    DataFrame zipped = withArrays.Select(
        withArrays["ConversationId"],
        withArrays["ImpressionId"],
        withArrays["EntityType"],
        ArraysZip(
            withArrays["FolderIdList"],
            withArrays["ReferenceIdList"],
            withArrays["ItemIdList"],
            withArrays["ItemImmutableIdList"]).Alias("ConcatedColumn"));

    // One output row per zipped element.
    DataFrame exploded = zipped.Select(
        zipped["ConversationId"],
        zipped["ImpressionId"],
        zipped["EntityType"],
        Explode(zipped["ConcatedColumn"]).Alias("NewColumn"));

    // Pull the struct fields back out into top-level columns.
    DataFrame flattened = exploded
        .WithColumn("FolderId", exploded["NewColumn"].GetField("FolderIdList"))
        .WithColumn("ReferenceId", exploded["NewColumn"].GetField("ReferenceIdList"))
        .WithColumn("ItemId", exploded["NewColumn"].GetField("ItemIdList"))
        .WithColumn("ItemImmutableId", exploded["NewColumn"].GetField("ItemImmutableIdList"))
        .Select("ConversationId", "ImpressionId", "EntityType", "FolderId", "ItemId", "ReferenceId", "ItemImmutableId");

    // Verify the result: three rows, checked in the order Collect() returns them.
    Assert.Equal(3, flattened.Count());

    int rowIndex = 0;
    foreach (Row row in flattened.Collect())
    {
        // Fields that are the same on every row.
        string impressionId = row.GetAs<string>("ImpressionId");
        string conversationId = row.GetAs<string>("ConversationId");
        string entityType = row.GetAs<string>("EntityType");
        Assert.Equal("Imp1", impressionId);
        Assert.Equal("DD8A6B40-B4C9-426F-8194-895E9053077C", conversationId);
        Assert.Equal("Message", entityType);

        // Fields that vary with the row position.
        string folderId = row.GetAs<string>("FolderId");
        string itemId = row.GetAs<string>("ItemId");
        string referenceId = row.GetAs<string>("ReferenceId");
        string itemImmutableId = row.GetAs<string>("ItemImmutableId");

        switch (rowIndex)
        {
            case 0:
                Assert.Equal("F1", folderId);
                Assert.Equal("ItemId1", itemId);
                Assert.Equal("R1", referenceId);
                Assert.Equal("ItemImmutableId1", itemImmutableId);
                break;
            case 1:
                Assert.Equal("F2", folderId);
                Assert.Equal("ItemId2", itemId);
                Assert.Equal("R2", referenceId);
                Assert.Equal("ItemImmutableId2", itemImmutableId);
                break;
            case 2:
                Assert.Equal("F3", folderId);
                Assert.Equal("ItemId3", itemId);
                Assert.Equal("R3", referenceId);
                Assert.Equal("ItemImmutableId3", itemImmutableId);
                break;
            default:
                throw new Exception(string.Format("Unexpected row: ConversationId={0}, ImpressionId={1}", conversationId, impressionId));
        }

        rowIndex++;
    }
}
/// <summary>
/// Filters the product metadata, replaces the raw <c>price</c>, <c>date</c> and
/// <c>rank</c> string columns with parsed numeric columns (<c>price</c>,
/// <c>unixTime</c>, <c>rank</c>), caches the result, registers it as the
/// "ElectronicsMetadata" temp view and prints the resulting row count.
/// Values that cannot be parsed become -1.
/// </summary>
/// <param name="dataFrame">Raw metadata DataFrame.</param>
private static void MetadataCleanup(DataFrame dataFrame)
{
    Console.WriteLine("Metadata Clean-up");

    // Price: parse the text starting at the first digit (or the whole string when
    // it has no digit). Parsing is culture-invariant because the input is dataset
    // text, not user-facing text; the number styles are the ones the two-argument
    // float.TryParse overload uses by default.
    var priceCleanup = Udf<string, float>(
        p =>
        {
            if (!string.IsNullOrEmpty(p))
            {
                var index = 0;
                for (var i = 0; i < p.Length; i++)
                {
                    if (char.IsDigit(p[i]))
                    {
                        index = i;
                        break;
                    }
                }

                if (float.TryParse(
                        p.Substring(index),
                        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out var result))
                {
                    return result;
                }
            }

            return -1f;
        });

    // Date: seconds between 1970-01-01 and the parsed date after conversion to
    // universal time. The text is parsed with the invariant culture.
    var dateCleanup = Udf<string, double>(
        d =>
        {
            if (!string.IsNullOrEmpty(d) &&
                DateTime.TryParse(
                    d,
                    System.Globalization.CultureInfo.InvariantCulture,
                    System.Globalization.DateTimeStyles.None,
                    out var result))
            {
                return (result.ToUniversalTime() - new DateTime(1970, 1, 1)).TotalSeconds;
            }

            return -1d;
        });

    // Rank: first run of digits (commas allowed as group separators) in the text.
    // The static Regex.Match is used so the parsed pattern comes from the regex
    // cache instead of a new Regex being constructed for every row.
    var rankCleanup = Udf<string, long>(
        r =>
        {
            if (!string.IsNullOrEmpty(r))
            {
                var match = Regex.Match(r, @"\d+(,\d+)*", RegexOptions.Singleline);
                if (match.Success &&
                    long.TryParse(
                        match.Value.Replace(",", string.Empty),
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out var result))
                {
                    return result;
                }
            }

            return -1L;
        });

    // Keep rows that have all key fields and are outside the excluded categories.
    dataFrame = dataFrame
        .Filter(
            dataFrame["asin"].IsNotNull()
                .And(dataFrame["title"].IsNotNull())
                .And(dataFrame["main_cat"].IsNotNull())
                .And(dataFrame["brand"].IsNotNull())
                .And(Not(dataFrame["main_cat"].IsIn("Grocery", "Pet Supplies", "Baby", "Books", "Appstore for Android", "Gift Cards"))));

    // Add the cleaned columns under temporary names, drop the raw columns, then
    // rename the cleaned columns to their final names.
    dataFrame = dataFrame
        .WithColumn("clean_price", priceCleanup(dataFrame["price"]))
        .WithColumn("clean-date", dateCleanup(dataFrame["date"]))
        .WithColumn("clean-rank", rankCleanup(dataFrame["rank"]))
        .Drop(dataFrame["price"])
        .Drop(dataFrame["date"])
        .Drop(dataFrame["rank"])
        .WithColumnRenamed("clean_price", "price")
        .WithColumnRenamed("clean-date", "unixTime")
        .WithColumnRenamed("clean-rank", "rank");

    dataFrame.Cache();
    dataFrame.CreateOrReplaceTempView("ElectronicsMetadata");

    Console.WriteLine($"Metadata Count: {dataFrame.Count()}");
    Console.WriteLine("Done");
    Console.WriteLine();
}