Example #1
        public void TestWithDuplicateDates()
        {
            var date   = new Date(2020, 1, 1);
            var schema = new StructType(new StructField[]
            {
                new StructField("date", new DateType())
            });
            var data = new GenericRow[]
            {
                new GenericRow(new object[] { date }),
                new GenericRow(new object[] { date }),
                new GenericRow(new object[] { date })
            };

            DataFrame df = _spark.CreateDataFrame(data, schema);

            Row[] rows = df.Collect().ToArray();

            Assert.Equal(3, rows.Length);
            foreach (Row row in rows)
            {
                Assert.Single(row.Values);
                Assert.Equal(date, row.GetAs<Date>(0));
            }
        }
Example #2
        public void TestWithDuplicateTimestamps()
        {
            var timestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
            var schema    = new StructType(new StructField[]
            {
                new StructField("ts", new TimestampType())
            });
            var data = new GenericRow[]
            {
                new GenericRow(new object[] { timestamp }),
                new GenericRow(new object[] { timestamp }),
                new GenericRow(new object[] { timestamp })
            };

            DataFrame df = _spark.CreateDataFrame(data, schema);

            Row[] rows = df.Collect().ToArray();

            Assert.Equal(3, rows.Length);
            foreach (Row row in rows)
            {
                Assert.Single(row.Values);
                Assert.Equal(timestamp, row.GetAs<Timestamp>(0));
            }
        }
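These tests reference a `_spark` field holding an active SparkSession, normally supplied by a shared test fixture. A minimal sketch of how such a session could be obtained (the app name below is purely illustrative):

        // Sketch only: real test suites typically get this from a shared fixture class.
        private readonly SparkSession _spark = SparkSession
            .Builder()
            .AppName("dotnet-spark-doc-examples")
            .GetOrCreate();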
Example #3
        public void TestEmailSearchTopNReducerBasics()
        {
            // Read the sample data.
            DataFrame df = _spark
                           .Read()
                           .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

            // Trim the IMAddress column.
            Func<Column, Column> trimIMAddress = Udf<string, string>((str) => str.StartsWith("sip:") ? str.Substring(4) : str);

            df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

            // Reduce
            df = df.GroupBy("puser", "ptenant").Agg(CollectList("GivenName").Alias("GivenNames"),
                                                    CollectList("Surname").Alias("Surnames"),
                                                    CollectList("DisplayName").Alias("DisplayNames"),
                                                    CollectList("EmailAddress").Alias("EmailAddresses"),
                                                    CollectList("RelevanceScore").Alias("RelevanceScores"),
                                                    CollectList("IMAddress").Alias("IMAddresses"));
            // Format the output.
            df = df.Select(df["puser"],
                           df["ptenant"],
                           ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
                           ConcatWs(";", df["Surnames"]).Alias("Surnames"),
                           ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
                           ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
                           ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
                           ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

            Assert.Equal(2, df.Count());
            foreach (Row row in df.Collect())
            {
                string puser = row.GetAs<string>("puser");
                Assert.Equal("MSFT", row.GetAs<string>("ptenant"));
                Assert.Equal("1101.0;900.0;857.0", row.GetAs<string>("RelevanceScores"));
                switch (puser)
                {
                case "ruih":
                    Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                case "rui":
                    Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                default:
                    throw new Exception($"Unexpected age: {puser}.");
                }
            }
        }
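Helpers such as Udf, CollectList, ConcatWs, ArraysZip and Explode used throughout these examples come from the Functions class; the snippets assume static imports along these lines:

using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;   // Udf, CollectList, ConcatWs, ArraysZip, Explode, ...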
Example #4
 private void ValidateDataFrame(
     DataFrame actual,
     IEnumerable<object[]> expectedRows,
     StructType expectedSchema)
 {
     Assert.Equal(expectedSchema, actual.Schema());
     Assert.Equal(expectedRows, actual.Collect().Select(r => r.Values));
 }
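A hypothetical call site for this helper, assuming a single-column integer frame (names, schema, and rows below are illustrative, not taken from the test suite):

 // Sketch only: "actualDf" and the expected values are placeholders.
 var expectedSchema = new StructType(new[] { new StructField("id", new IntegerType()) });
 var expectedRows = new[] { new object[] { 1 }, new object[] { 2 } };
 ValidateDataFrame(actualDf, expectedRows, expectedSchema);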
Example #5
        public void TestCollect()
        {
            Row[] rows = _df.Collect().ToArray();
            Assert.Equal(3, rows.Length);

            Row row1 = rows[0];

            Assert.Equal("Michael", row1.GetAs <string>("name"));
            Assert.Null(row1.Get("age"));

            Row row2 = rows[1];

            Assert.Equal("Andy", row2.GetAs <string>("name"));
            Assert.Equal(30, row2.GetAs <int>("age"));

            Row row3 = rows[2];

            Assert.Equal("Justin", row3.GetAs <string>("name"));
            Assert.Equal(19, row3.GetAs <int>("age"));
        }
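The `_df` frame here matches Spark's stock people.json sample (Michael with no age, Andy 30, Justin 19). A plausible fixture, assuming that file is available under the test resource directory used elsewhere in these examples:

        // Sketch only: the actual resource path and fixture wiring are not shown in this listing.
        private DataFrame _df => _spark
            .Read()
            .Schema("age INT, name STRING")
            .Json($"{TestEnvironment.ResourceDirectory}people.json");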
Example #6
        public void TestWithColumn()
        {
            Func<Column, Column> sizeNameAgeUdf = Udf<Row, string>(
                r =>
            {
                string name = r.GetAs<string>("name");
                int? age = r.GetAs<int?>("age");
                if (age.HasValue)
                {
                    return($"{r.Size()},{name},{age.Value}");
                }

                return($"{r.Size()},{name},{string.Empty}");
            });

            string[]  allCols      = _df.Columns().ToArray();
            DataFrame nameAgeColDF =
                _df.WithColumn("NameAgeCol", Struct(allCols[0], allCols.Skip(1).ToArray()));
            DataFrame sizeNameAgeColDF =
                nameAgeColDF.WithColumn("SizeNameAgeCol", sizeNameAgeUdf(nameAgeColDF["NameAgeCol"]));

            Row[] originalDFRows = _df.Collect().ToArray();
            Assert.Equal(3, originalDFRows.Length);

            Row[] sizeNameAgeColDFRows = sizeNameAgeColDF.Collect().ToArray();
            Assert.Equal(3, sizeNameAgeColDFRows.Length);

            {
                Row row = sizeNameAgeColDFRows[0];
                Assert.Equal("Michael", row.GetAs <string>("name"));
                Assert.Null(row.Get("age"));
                Assert.IsType <Row>(row.Get("NameAgeCol"));
                Assert.Equal(originalDFRows[0], row.GetAs <Row>("NameAgeCol"));
                Assert.Equal("2,Michael,", row.GetAs <string>("SizeNameAgeCol"));
            }

            {
                Row row = sizeNameAgeColDFRows[1];
                Assert.Equal("Andy", row.GetAs <string>("name"));
                Assert.Equal(30, row.GetAs <int>("age"));
                Assert.IsType <Row>(row.Get("NameAgeCol"));
                Assert.Equal(originalDFRows[1], row.GetAs <Row>("NameAgeCol"));
                Assert.Equal("2,Andy,30", row.GetAs <string>("SizeNameAgeCol"));
            }

            {
                Row row = sizeNameAgeColDFRows[2];
                Assert.Equal("Justin", row.GetAs <string>("name"));
                Assert.Equal(19, row.GetAs <int>("age"));
                Assert.IsType <Row>(row.Get("NameAgeCol"));
                Assert.Equal(originalDFRows[2], row.GetAs <Row>("NameAgeCol"));
                Assert.Equal("2,Justin,19", row.GetAs <string>("SizeNameAgeCol"));
            }
        }
Example #7
        public void TestVersion()
        {
            DataFrame versionDf = _spark.GetAssemblyInfo();

            Row[] versionRows = versionDf.Collect().ToArray();
            Assert.Equal(2, versionRows.Length);

            Assert.Equal(
                new string[] { "Microsoft.Spark", "Microsoft.Spark.Worker" },
                versionRows.Select(r => r.GetAs<string>("AssemblyName")));
            for (int i = 0; i < 2; ++i)
            {
                Assert.False(
                    string.IsNullOrWhiteSpace(versionRows[i].GetAs<string>("AssemblyVersion")));
                Assert.False(
                    string.IsNullOrWhiteSpace(versionRows[i].GetAs<string>("HostName")));
            }
        }
Example #8
        public void TestCollect()
        {
            Console.WriteLine(">>>>>>>>>>>>>>>>>> Dataframe unit testing started >>>>>>>>>>> ");
            _df.Show(10);
            Row[] rows = _df.Collect().ToArray();
            Assert.Equal(3, rows.Length);

            Row row1 = rows[0];

            Assert.Equal("Michael", row1.GetAs <string>("name"));
            Assert.Null(row1.Get("age"));

            Row row2 = rows[1];

            Assert.Equal("Andy", row2.GetAs <string>("name"));
            Assert.Equal(30, row2.GetAs <int>("age"));

            Row row3 = rows[2];

            Assert.Equal("Justin", row3.GetAs <string>("name"));
            Assert.Equal(19, row3.GetAs <int>("age"));
            Console.WriteLine(">>>>>>>>>>>>>>>>>> Dataframe unit testing end >>>>>>>>>>> ");
        }
        private void AssertSameRows(DataFrame dataFrameA, DataFrame dataFrameB)
        {
            IEnumerable<Row> dfASeq = dataFrameA.Collect();
            IEnumerable<Row> dfBSeq = dataFrameB.Collect();

            int i = 0;

            foreach (Row rowA in dfASeq)
            {
                Row rowB = dfBSeq.Skip(i).First();

                _helper.WriteLine($"Computed - {rowA}");
                _helper.WriteLine($"Expected - {rowB}");

                int columnSize = rowA.Size();

                for (int j = 0; j < columnSize; j++)
                {
                    rowA[j].ShouldBe(rowB[j]);
                }

                i++;
            }
        }
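Because `dfBSeq.Skip(i).First()` re-enumerates the collected rows on every iteration, a variant that zips the two sequences performs the same comparison in a single pass. A sketch, assuming both frames fit in memory:

        // Sketch only: same per-cell checks as AssertSameRows, without repeated Skip/First enumeration.
        private void AssertSameRowsZipped(DataFrame dataFrameA, DataFrame dataFrameB)
        {
            IEnumerable<(Row A, Row B)> pairs =
                dataFrameA.Collect().Zip(dataFrameB.Collect(), (a, b) => (A: a, B: b));

            foreach ((Row rowA, Row rowB) in pairs)
            {
                for (int j = 0; j < rowA.Size(); j++)
                {
                    rowA[j].ShouldBe(rowB[j]);
                }
            }
        }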
Example #10
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET Spark SQL basic example")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Need to explicitly specify the schema since pickling vs. arrow formatting
            // will return different types. Pickling will turn longs into ints if the values fit.
            // Same as the "age INT, name STRING" DDL-format string.
            var inputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

            Spark.Sql.Types.StructType schema = df.Schema();
            Console.WriteLine(schema.SimpleString);

            IEnumerable<Row> rows = df.Collect();

            foreach (Row row in rows)
            {
                Console.WriteLine(row);
            }

            df.Show();

            df.PrintSchema();

            df.Select("name", "age", "age", "name").Show();

            df.Select(df["name"], df["age"] + 1).Show();

            df.Filter(df["age"] > 21).Show();

            df.GroupBy("age")
            .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
            .Show();

            df.CreateOrReplaceTempView("people");

            // Registering Udf for SQL expression.
            DataFrame sqlDf = spark.Sql("SELECT * FROM people");

            sqlDf.Show();

            spark.Udf().Register<int?, string, string>(
                "my_udf",
                (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

            sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
            sqlDf.Show();

            // Using UDF via data frames.
            Func<Column, Column, Column> addition = Udf<int?, string, string>(
                (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));

            df.Select(addition(df["age"], df["name"])).Show();

            // Chaining example:
            Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");

            df.Select(addition2(addition(df["age"], df["name"]))).Show();

            // Multiple UDF example:
            df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

            // UDF return type as array.
            Func<Column, Column> udfArray =
                Udf<string, string[]>((str) => new string[] { str, str + str });

            df.Select(Explode(udfArray(df["name"]))).Show();

            // UDF return type as map.
            Func<Column, Column> udfMap =
                Udf<string, IDictionary<string, string[]>>(
                    (str) => new Dictionary<string, string[]>
                    {
                        { str, new[] { str, str } }
                    });

            df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

            // Joins.
            DataFrame joinedDf = df.Join(df, "name");

            joinedDf.Show();

            DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });

            joinedDf2.Show();

            DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");

            joinedDf3.Show();

            spark.Stop();
        }
 private string[] ToStringArray(DataFrame df)
 {
     Row[] rows = df.Collect().ToArray();
     return rows.Select(s => s[0].ToString()).ToArray();
 }
        private void TestAndValidateForeach(
            string streamInputPath,
            TestForeachWriter foreachWriter,
            int expectedCSVFiles,
            int expectedExceptionFiles,
            IEnumerable<int> expectedOutput)
        {
            // Temporary folder the TestForeachWriter will write to.
            using var dstTempDirectory = new TemporaryDirectory();
            foreachWriter.WritePath    = dstTempDirectory.Path;

            // Read streamInputPath, repartition data, then
            // call TestForeachWriter on the data.
            DataStreamWriter dsw = _spark
                                   .ReadStream()
                                   .Schema(new StructType(new[]
            {
                new StructField("id", new IntegerType()),
                new StructField("idStr", new StringType()),
                new StructField("idAndIdStr", new StructType(new[]
                {
                    new StructField("id", new IntegerType()),
                    new StructField("idStr", new StringType())
                }))
            }))
                                   .Json(streamInputPath)
                                   .Repartition(expectedCSVFiles)
                                   .WriteStream()
                                   .Foreach(foreachWriter);

            // Trigger the stream batch once.
            if (expectedExceptionFiles > 0)
            {
                Assert.Throws<Exception>(
                    () => dsw.Trigger(Trigger.Once()).Start().AwaitTermination());
            }
            else
            {
                dsw.Trigger(Trigger.Once()).Start().AwaitTermination();
            }

            // Verify that TestForeachWriter created a unique .csv when
            // ForeachWriter.Open was called on each partitionId.
            Assert.Equal(
                expectedCSVFiles,
                Directory.GetFiles(dstTempDirectory.Path, "*.csv").Length);

            // Only if ForeachWriter.Process(Row) throws an exception, will
            // ForeachWriter.Close(Exception) create a file with the
            // .exception extension.
            Assert.Equal(
                expectedExceptionFiles,
                Directory.GetFiles(dstTempDirectory.Path, "*.exception").Length);

            // Read in the *.csv file(s) generated by the TestForeachWriter.
            // If there are multiple input files, sorting by "id" will make
            // validation simpler. Contents of the *.csv will only be populated
            // on successful calls to the ForeachWriter.Process method.
            DataFrame foreachWriterOutputDF = _spark
                                              .Read()
                                              .Schema("id INT")
                                              .Csv(dstTempDirectory.Path)
                                              .Sort("id");

            // Validate expected *.csv data.
            Assert.Equal(
                expectedOutput.Select(i => new object[] { i }),
                foreachWriterOutputDF.Collect().Select(r => r.Values));
        }
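The TestForeachWriter used above is not part of this excerpt. A rough sketch of what such a writer might look like, assuming the IForeachWriter contract (Open/Process/Close) from Microsoft.Spark and inventing the file-naming and serialization details that the assertions above rely on (one *.csv per opened partition, one *.exception written from Close when Process threw):

        // Rough sketch only: the real TestForeachWriter is not shown in this listing, and its
        // exact behavior is assumed here for illustration.
        [Serializable]
        private class TestForeachWriter : IForeachWriter
        {
            internal string WritePath { get; set; }

            [NonSerialized]
            private StreamWriter _writer;
            private long _partitionId;

            public bool Open(long partitionId, long epochId)
            {
                _partitionId = partitionId;
                _writer = new StreamWriter(
                    Path.Combine(WritePath, $"sink-foreachWriter-{partitionId}.csv"));
                return true;
            }

            public void Process(Row value) => _writer.WriteLine(value.GetAs<int>("id"));

            public void Close(Exception errorOrNull)
            {
                _writer?.Dispose();
                if (errorOrNull != null)
                {
                    File.Create(
                        Path.Combine(WritePath, $"sink-foreachWriter-{_partitionId}.exception"))
                        .Dispose();
                }
            }
        }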
Example #13
        public void TestForeachBatch()
        {
            // Temporary folder to put our test stream input.
            using var srcTempDirectory = new TemporaryDirectory();
            // Temporary folder to write ForeachBatch output.
            using var dstTempDirectory = new TemporaryDirectory();

            Func<Column, Column> outerUdf = Udf<int, int>(i => i + 100);

            // id column: [0, 1, ..., 9]
            WriteCsv(0, 10, Path.Combine(srcTempDirectory.Path, "input1.csv"));

            DataStreamWriter dsw = _spark
                                   .ReadStream()
                                   .Schema("id INT")
                                   .Csv(srcTempDirectory.Path)
                                   .WriteStream()
                                   .ForeachBatch((df, id) =>
            {
                Func<Column, Column> innerUdf = Udf<int, int>(i => i + 200);
                df.Select(outerUdf(innerUdf(Col("id"))))
                .Write()
                .Csv(Path.Combine(dstTempDirectory.Path, id.ToString()));
            });

            StreamingQuery sq = dsw.Start();

            // Process until all available data in the source has been processed and committed
            // to the ForeachBatch sink.
            sq.ProcessAllAvailable();

            // Add new file to the source path. The spark stream will read any new files
            // added to the source path.
            // id column: [10, 11, ..., 19]
            WriteCsv(10, 10, Path.Combine(srcTempDirectory.Path, "input2.csv"));

            // Process until all available data in the source has been processed and committed
            // to the ForeachBatch sink.
            sq.ProcessAllAvailable();
            sq.Stop();

            // Verify folders in the destination path.
            string[] csvPaths =
                Directory.GetDirectories(dstTempDirectory.Path).OrderBy(s => s).ToArray();
            var expectedPaths = new string[]
            {
                Path.Combine(dstTempDirectory.Path, "0"),
                Path.Combine(dstTempDirectory.Path, "1"),
            };

            Assert.True(expectedPaths.SequenceEqual(csvPaths));

            // Read the generated csv paths and verify contents.
            DataFrame df = _spark
                           .Read()
                           .Schema("id INT")
                           .Csv(csvPaths[0], csvPaths[1])
                           .Sort("id");

            IEnumerable<int> actualIds = df.Collect().Select(r => r.GetAs<int>("id"));

            Assert.True(Enumerable.Range(300, 20).SequenceEqual(actualIds));
        }
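The WriteCsv helper used above is not shown. A minimal sketch, assuming it writes `count` consecutive integers starting at `start` as a header-less single-column file, which is how the `id INT` stream schema consumes it:

        // Sketch only: the real helper may differ; this matches how the test reads the files back.
        private static void WriteCsv(int start, int count, string path)
        {
            using var writer = new StreamWriter(path);
            foreach (int i in Enumerable.Range(start, count))
            {
                writer.WriteLine(i);
            }
        }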
        public void TestDataFrameCollect()
        {
            string jsonSchema = @"
                {
                  ""type"" : ""struct"",
                  ""fields"" : [ {
                    ""name"" : ""address"",
                    ""type"" : {
                      ""type"" : ""struct"",
                      ""fields"" : [ {
                        ""name"" : ""city"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      }, {
                        ""name"" : ""state"",
                        ""type"" : ""string"",
                        ""nullable"" : true,
                        ""metadata"" : { }
                      } ]
                    },
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""age"",
                    ""type"" : ""long"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""id"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  }, {
                    ""name"" : ""name"",
                    ""type"" : ""string"",
                    ""nullable"" : true,
                    ""metadata"" : { }
                  } ]
                }";

            int    localPort = 4000;
            object row1      = new object[] {
                new object[] { "Columbus", "Ohio" },
                34,
                "123",
                "Bill"
            };

            object row2 = new object[] {
                new object[] { "Seattle", "Washington" },
                43,
                "789",
                "Bill"
            };

            IStructTypeProxy structTypeProxy = new MockStructTypeProxy(jsonSchema);
            IDataFrameProxy dataFrameProxy = new MockDataFrameProxy(
                localPort,
                new List<object>() { row1, row2 },
                structTypeProxy);
            DataFrame dataFrame = new DataFrame(dataFrameProxy, null);

            List<Row> rows = new List<Row>();

            foreach (var row in dataFrame.Collect())
            {
                rows.Add(row);
                Console.WriteLine("{0}", row);
            }

            Assert.AreEqual(rows.Count, 2);
            Row firstRow = rows[0];

            string id = firstRow.GetAs<string>("id");

            Assert.IsTrue(id.Equals("123"));
            string name = firstRow.GetAs<string>("name");

            Assert.IsTrue(name.Equals("Bill"));
            int age = firstRow.GetAs<int>("age");

            Assert.AreEqual(age, 34);

            Row address = firstRow.GetAs<Row>("address");

            Assert.AreNotEqual(address, null);
            string city = address.GetAs<string>("city");

            Assert.IsTrue(city.Equals("Columbus"));
            string state = address.GetAs<string>("state");

            Assert.IsTrue(state.Equals("Ohio"));
        }
Example #15
        public void TestEmailSearchSuccessActionReducerBasics()
        {
            // Read the sample data.
            DataFrame df = _spark.Read().Json($"{TestEnvironment.ResourceDirectory}search_actions.json");

            // Select the required columns.
            df = df.Select("ImpressionId", "ConversationId", "EntityType", "FolderIdList", "ReferenceIdList", "ItemIdList", "ItemImmutableIdList");

            // Convert columns of concatenated string to array of strings.
            Func<Column, Column> toStringArrayUdf = Udf<string, string[]>((str) => str.Split(';'));

            df = df.WithColumn("FolderIdList", toStringArrayUdf(df["FolderIdList"]))
                 .WithColumn("ReferenceIdList", toStringArrayUdf(df["ReferenceIdList"]))
                 .WithColumn("ItemIdList", toStringArrayUdf(df["ItemIdList"]))
                 .WithColumn("ItemImmutableIdList", toStringArrayUdf(df["ItemImmutableIdList"]));

            // Apply the ArrayZip function to combine the i-th element of each array.
            df = df.Select(df["ConversationId"], df["ImpressionId"], df["EntityType"], ArraysZip(df["FolderIdList"], df["ReferenceIdList"], df["ItemIdList"], df["ItemImmutableIdList"]).Alias("ConcatedColumn"));

            // Apply the Explode function to split into multiple rows.
            df = df.Select(df["ConversationId"], df["ImpressionId"], df["EntityType"], Explode(df["ConcatedColumn"]).Alias("NewColumn"));

            // Create multiple columns.
            df = df.WithColumn("FolderId", df["NewColumn"].GetField("FolderIdList"))
                 .WithColumn("ReferenceId", df["NewColumn"].GetField("ReferenceIdList"))
                 .WithColumn("ItemId", df["NewColumn"].GetField("ItemIdList"))
                 .WithColumn("ItemImmutableId", df["NewColumn"].GetField("ItemImmutableIdList"))
                 .Select("ConversationId", "ImpressionId", "EntityType", "FolderId", "ItemId", "ReferenceId", "ItemImmutableId");

            // Check the results.
            Assert.Equal(3, df.Count());
            int i = 0;

            foreach (Row row in df.Collect())
            {
                string impressionId   = row.GetAs<string>("ImpressionId");
                string conversationId = row.GetAs<string>("ConversationId");
                string entityType     = row.GetAs<string>("EntityType");
                Assert.Equal("Imp1", impressionId);
                Assert.Equal("DD8A6B40-B4C9-426F-8194-895E9053077C", conversationId);
                Assert.Equal("Message", entityType);
                string folderId        = row.GetAs<string>("FolderId");
                string itemId          = row.GetAs<string>("ItemId");
                string referenceId     = row.GetAs<string>("ReferenceId");
                string itemImmutableId = row.GetAs<string>("ItemImmutableId");
                if (i == 0)
                {
                    Assert.Equal("F1", folderId);
                    Assert.Equal("ItemId1", itemId);
                    Assert.Equal("R1", referenceId);
                    Assert.Equal("ItemImmutableId1", itemImmutableId);
                }
                else if (i == 1)
                {
                    Assert.Equal("F2", folderId);
                    Assert.Equal("ItemId2", itemId);
                    Assert.Equal("R2", referenceId);
                    Assert.Equal("ItemImmutableId2", itemImmutableId);
                }
                else if (i == 2)
                {
                    Assert.Equal("F3", folderId);
                    Assert.Equal("ItemId3", itemId);
                    Assert.Equal("R3", referenceId);
                    Assert.Equal("ItemImmutableId3", itemImmutableId);
                }
                else
                {
                    throw new Exception(string.Format("Unexpected row: ConversationId={0}, ImpressionId={1}", conversationId, impressionId));
                }

                i++;
            }
        }
Example #16
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Logging <path to Apache User Logs>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Apache User Log Processing")
                                 .GetOrCreate();

            // Read input log file and display it
            DataFrame df = spark.Read().Text(args[0]);

            df.Show();

            // Step 1: UDF to determine if each line is a valid log entry
            // Remove any invalid entries before further filtering
            spark.Udf().Register<string, bool>(
                "GeneralReg",
                log => Regex.IsMatch(log, s_apacheRx));

            df.CreateOrReplaceTempView("Logs");

            // Apply the UDF to get valid log entries
            DataFrame generalDf = spark.Sql(
                "SELECT logs.value, GeneralReg(logs.value) FROM Logs");

            // Only keep log entries that matched the reg ex
            generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
            generalDf.Show();

            // View the resulting schema
            // Notice we created a new column "GeneralReg(value)"
            generalDf.PrintSchema();

            // Step 2: Choose valid log entries that start with 10
            spark.Udf().Register<string, bool>(
                "IPReg",
                log => Regex.IsMatch(log, "^(?=10)"));

            generalDf.CreateOrReplaceTempView("IPLogs");

            // Apply UDF to get valid log entries starting with 10
            // Use SQL "WHERE" rather than doing ipDf.Filter(),
            // which avoids creating an extra column "IPReg(value)"
            DataFrame ipDf = spark.Sql(
                "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");

            ipDf.Show();

            // Step 3: Choose valid log entries that start
            // with 10 and deal with spam
            spark.Udf().Register<string, bool>(
                "SpamRegEx",
                log => Regex.IsMatch(log, "\\b(?=spam)\\b"));

            ipDf.CreateOrReplaceTempView("SpamLogs");

            // Apply UDF to get valid, start with 10, spam entries
            DataFrame spamDF = spark.Sql(
                "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

            // Let's explore the columns in the data we have filtered
            // Use LINQ to count the number of GET requests
            int numGetRequests = spamDF
                                 .Collect()
                                 .Where(r => ContainsGet(r.GetAs<string>("value")))
                                 .Count();

            Console.WriteLine("Number of GET requests: " + numGetRequests);

            spark.Stop();
        }
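Both `s_apacheRx` and `ContainsGet` are referenced but not defined in this excerpt. Hedged stand-ins, assuming a standard Apache common-log pattern and a simple substring check for GET requests:

        // Sketch only: the shipped sample's regex and helper may be stricter than these stand-ins.
        private static readonly string s_apacheRx =
            "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\S+)";

        private static bool ContainsGet(string logLine) => logLine.Contains("GET");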