Beispiel #1
0
        /// <summary>
        /// Wraps the search term in a one-row DataFrame and runs it through the supplied
        /// tokenizer, hashing TF and fitted IDF model.
        /// </summary>
        /// <param name="spark">Session used to create the one-row DataFrame.</param>
        /// <param name="searchTerm">Text to be featurized.</param>
        /// <param name="tokenizer">Tokenizer applied to the "Content" column frame.</param>
        /// <param name="hashingTF">Hashing TF stage applied after tokenizing.</param>
        /// <param name="idfModel">Fitted IDF model applied after hashing.</param>
        /// <returns>
        /// The transformed frame with "features" renamed to "features2" and an extra
        /// "norm2" column produced by <c>udfCalcNorm</c> over "features2".
        /// </returns>
        private static DataFrame GetSearchTermTFIDF(SparkSession spark, string searchTerm,
                                                    Tokenizer tokenizer, HashingTF hashingTF, IDFModel idfModel)
        {
            // The single string column is renamed from "_1" to "Content" before tokenizing.
            var termRows = new List<string> { searchTerm };
            DataFrame termFrame = spark.CreateDataFrame(termRows).WithColumnRenamed("_1", "Content");

            DataFrame tokenizedTerm = tokenizer.Transform(termFrame);
            DataFrame hashedTerm    = hashingTF.Transform(tokenizedTerm);
            DataFrame weightedTerm  = idfModel.Transform(hashedTerm);

            // Rename the feature column (presumably to avoid clashing with the document
            // frame's "features" column when joined - confirm against the caller).
            return weightedTerm
                   .WithColumnRenamed("features", "features2")
                   .WithColumn("norm2", udfCalcNorm(Col("features2")));
        }
Beispiel #2
0
        // Fits an IDF on a one-row sentence frame, checks the fitted model's column and
        // minDocFreq getters, round-trips the model through Save/Load, and finally hands
        // the model to TestFeatureBase.
        public void TestIDFModel()
        {
            const int    minDocFreq     = 1980;
            const string rawFeaturesCol = "rawFeatures";
            const string featuresCol    = "features";

            // Arrange: sentence -> words -> raw term frequencies.
            DataFrame sentences =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

            var tokenizer = new Tokenizer()
                            .SetInputCol("sentence")
                            .SetOutputCol("words");
            DataFrame words = tokenizer.Transform(sentences);

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol(rawFeaturesCol)
                            .SetNumFeatures(20);
            DataFrame rawFeatures = hashingTF.Transform(words);

            // Act: fit the IDF and apply the resulting model to the same data.
            var idf = new IDF()
                      .SetInputCol(rawFeaturesCol)
                      .SetOutputCol(featuresCol)
                      .SetMinDocFreq(minDocFreq);
            IDFModel idfModel = idf.Fit(rawFeatures);
            DataFrame rescaled = idfModel.Transform(rawFeatures);

            // Assert: output column exists and the model reports the configured values.
            Assert.Contains(featuresCol, rescaled.Columns());
            Assert.Equal(rawFeaturesCol, idfModel.GetInputCol());
            Assert.Equal(featuresCol, idfModel.GetOutputCol());
            Assert.Equal(minDocFreq, idfModel.GetMinDocFreq());

            // Save/Load round trip must preserve the model's Uid.
            using (var scratchDir = new TemporaryDirectory())
            {
                string savedModelPath = Path.Join(scratchDir.Path, "idfModel");
                idfModel.Save(savedModelPath);

                IDFModel reloaded = IDFModel.Load(savedModelPath);
                Assert.Equal(idfModel.Uid(), reloaded.Uid());
            }

            TestFeatureBase(idfModel, "minDocFreq", 1000);
        }
Beispiel #3
0
        /// <summary>
        /// Entry point: fits a TF-IDF model over the documents found under the source
        /// directory and shows the ten documents most similar to the search term.
        /// </summary>
        /// <param name="args">
        /// Exactly two values: the source directory and the search term. Any other count
        /// prints "Args!" and returns.
        /// </param>
        private static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                Console.WriteLine("Args!");
                return;
            }

            string sourceDir  = args[0];
            string searchTerm = args[1];

            var spark = SparkSession.Builder().GetOrCreate();

            // Step one: set up the feature stages and train the model.
            var tokenizer = new Tokenizer()
                            .SetInputCol("Content")
                            .SetOutputCol("words");

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol("rawFeatures")
                            .SetNumFeatures(100000);

            var (idfModel, normalizedDocs) = GetModelAndNormalizedDataFrame(sourceDir, tokenizer, hashingTF);

            // Step two: featurize the search term with the same stages and pair it with
            // every document row.
            var termFrame = GetSearchTermTFIDF(spark, searchTerm, tokenizer, hashingTF, idfModel);
            var pairs     = termFrame.CrossJoin(normalizedDocs);

            // Step three: score each pair, keep positive scores, and show the best ten.
            var scored = pairs
                         .WithColumn("similarity",
                                     udfCosineSimilarity(Col("features"), Col("features2"), Col("norm"), Col("norm2")))
                         .Select("path", "similarity");

            var topMatches = scored
                             .Filter("similarity > 0.0")
                             .OrderBy(Desc("similarity"))
                             .Limit(10);

            topMatches
            .WithColumn("Search Term", Lit(searchTerm))
            .Show(10, 100000);
        }
Beispiel #4
0
        // Checks HashingTF's fluent setters/getters, that Transform yields the configured
        // output column, that Save/Load preserves the Uid, and that SetBinary(true) is
        // reflected by GetBinary().
        public void TestHashingTF()
        {
            const string inputCol    = "input_col";
            const string outputCol   = "output_col";
            const int    numFeatures = 10;

            Assert.IsType<HashingTF>(new HashingTF());

            // Configure an instance with an explicit uid and verify the getters.
            HashingTF transformer = new HashingTF("my-unique-id")
                                    .SetNumFeatures(numFeatures)
                                    .SetInputCol(inputCol)
                                    .SetOutputCol(outputCol);

            Assert.Equal(numFeatures, transformer.GetNumFeatures());
            Assert.Equal(inputCol, transformer.GetInputCol());
            Assert.Equal(outputCol, transformer.GetOutputCol());

            // Transform a one-row array column and check the output column is present.
            const string query = "SELECT array('this', 'is', 'a', 'string', 'a', 'a')" +
                                 " as input_col";
            DataFrame source = _spark.Sql(query);

            DataFrame hashedOnly = transformer.Transform(source).Select(outputCol);

            Assert.Contains(outputCol, hashedOnly.Columns());

            // Save/Load round trip must preserve the Uid.
            using (var scratchDir = new TemporaryDirectory())
            {
                string target = Path.Join(scratchDir.Path, "hashingTF");
                transformer.Save(target);

                HashingTF reloaded = HashingTF.Load(target);
                Assert.Equal(transformer.Uid(), reloaded.Uid());
            }

            transformer.SetBinary(true);
            Assert.True(transformer.GetBinary());
        }
        /// <summary>
        /// Entry point: loads documents from a path, fits a TF-IDF model over their
        /// content, and shows every document's similarity score against the search
        /// term, highest first.
        /// </summary>
        /// <param name="args">
        /// At least two values: <c>args[0]</c> is the document path and <c>args[1]</c>
        /// is the search term. With fewer, a usage line is written to standard error
        /// and the method returns without starting Spark.
        /// </param>
        private static void Main(string[] args)
        {
            // Validate before touching Spark: indexing args blindly would otherwise fail
            // with an IndexOutOfRangeException after the session has been started.
            if (args.Length < 2)
            {
                System.Console.Error.WriteLine("Usage: <documentPath> <searchTerm>");
                return;
            }

            var documentPath = args[0];
            var search       = args[1];

            var spark = SparkSession
                        .Builder()
                        .AppName("TF-IDF Application")
                        .GetOrCreate();

            var documentData = GetDocuments(documentPath);

            var documents = spark.CreateDataFrame(documentData, new StructType(
                                                      new List <StructField>
            {
                new StructField("title", new StringType()),
                new StructField("content", new StringType())
            }));

            // Feature stages shared by the documents and the search term.
            var tokenizer = new Tokenizer()
                            .SetInputCol("content")
                            .SetOutputCol("words");

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol("rawFeatures")
                            .SetNumFeatures(1000000);

            var idf = new IDF()
                      .SetInputCol("rawFeatures")
                      .SetOutputCol("features");

            var tokenizedDocuments  = tokenizer.Transform(documents);
            var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);

            // The IDF model is fitted on the documents only and reused for the search term.
            var idfModel = idf.Fit(featurizedDocuments);

            var transformedDocuments =
                idfModel.Transform(featurizedDocuments).Select("title", "features");
            var normalizedDocuments = transformedDocuments.Select(Col("features"),
                                                                  udfCalcNorm(transformedDocuments["features"]).Alias("norm"), Col("title"));

            // One-row frame holding the search term in a "content" column so the same
            // tokenizer can process it.
            var searchTerm = spark.CreateDataFrame(
                new List <GenericRow> {
                new GenericRow(new[] { search })
            },
                new StructType(new[] { new StructField("content", new StringType()) }));

            var tokenizedSearchTerm = tokenizer.Transform(searchTerm);

            var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);

            // Rename the search term's feature column so it does not collide with the
            // documents' "features" column in the cross join below.
            var normalizedSearchTerm = idfModel
                                       .Transform(featurizedSearchTerm)
                                       .WithColumnRenamed("features", "searchTermFeatures")
                                       .WithColumn("searchTermNorm", udfCalcNorm(Column("searchTermFeatures")));

            // Pair every document with the single search-term row.
            var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);

            results
            .WithColumn("similarity",
                        udfCosineSimilarity(Column("features"), Column("searchTermFeatures"),
                                            Col("norm"), Col("searchTermNorm")))
            .OrderBy(Desc("similarity")).Select("title", "similarity")
            .Show(10000, 100);
        }
Beispiel #6
0
        /// <summary>
        /// Loads the source documents, fits an IDF model on their hashed term
        /// frequencies, and returns the model together with the per-document features.
        /// </summary>
        /// <param name="sourceDir">Directory passed to <c>GetSourceFiles</c>.</param>
        /// <param name="tokenizer">Tokenizer applied to the source document frame.</param>
        /// <param name="hashingTF">Hashing TF stage applied to the tokenized frame.</param>
        /// <returns>
        /// The fitted IDF model and a frame with the "Path" and "features" columns plus
        /// a "norm" column produced by <c>udfCalcNorm</c> over "features".
        /// </returns>
        private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(string sourceDir,
                                                                            Tokenizer tokenizer, HashingTF hashingTF)
        {
            var corpus       = toDF(GetSourceFiles(sourceDir));
            var hashedCorpus = hashingTF.Transform(tokenizer.Transform(corpus));

            // Fit the IDF weights on the hashed term frequencies of the whole corpus.
            var model = new IDF()
                        .SetInputCol("rawFeatures")
                        .SetOutputCol("features")
                        .Fit(hashedCorpus);

            // Keep only the columns needed downstream and attach the vector norm.
            var normalized = model
                             .Transform(hashedCorpus)
                             .Select("Path", "features")
                             .WithColumn("norm", udfCalcNorm(Col("features")));

            return (model, normalized);
        }