Example #1

A unit test for IDFModel: it builds a Tokenizer, HashingTF, and IDF pipeline over a single-sentence DataFrame, fits an IDFModel, verifies the configured input/output columns and minimum document frequency, and round-trips the fitted model through Save and Load.
        public void TestIDFModel()
        {
            int    expectedDocFrequency = 1980;
            string expectedInputCol     = "rawFeatures";
            string expectedOutputCol    = "features";

            DataFrame sentenceData =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

            // Split each sentence into an array of words.
            Tokenizer tokenizer = new Tokenizer()
                                  .SetInputCol("sentence")
                                  .SetOutputCol("words");

            DataFrame wordsData = tokenizer.Transform(sentenceData);

            // Map the words to term-frequency vectors using 20 hash buckets.
            HashingTF hashingTF = new HashingTF()
                                  .SetInputCol("words")
                                  .SetOutputCol(expectedInputCol)
                                  .SetNumFeatures(20);

            DataFrame featurizedData = hashingTF.Transform(wordsData);

            // IDF is an estimator: Fit produces an IDFModel that rescales the raw term frequencies.
            IDF idf = new IDF()
                      .SetInputCol(expectedInputCol)
                      .SetOutputCol(expectedOutputCol)
                      .SetMinDocFreq(expectedDocFrequency);

            IDFModel idfModel = idf.Fit(featurizedData);

            DataFrame rescaledData = idfModel.Transform(featurizedData);

            Assert.Contains(expectedOutputCol, rescaledData.Columns());

            Assert.Equal(expectedInputCol, idfModel.GetInputCol());
            Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

            // Round-trip the fitted model through Save/Load and check that the Uid survives.
            using (var tempDirectory = new TemporaryDirectory())
            {
                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
                idfModel.Save(modelPath);

                IDFModel loadedModel = IDFModel.Load(modelPath);
                Assert.Equal(idfModel.Uid(), loadedModel.Uid());
            }

            TestFeatureBase(idfModel, "minDocFreq", 1000);
        }
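
The method above is only the test body; a minimal sketch of the scaffolding it assumes is shown below. The class name, the [Fact] attribute, and the way _spark is obtained are illustrative guesses, and TemporaryDirectory and TestFeatureBase are helpers from the test project's own utilities rather than the Microsoft.Spark API.

    using System.IO;
    using Microsoft.Spark.ML.Feature;
    using Microsoft.Spark.Sql;
    using Xunit;

    public class IDFModelTests
    {
        // In the real test suite _spark would come from a shared test fixture.
        private readonly SparkSession _spark = SparkSession
            .Builder()
            .AppName("IDFModel tests")
            .GetOrCreate();

        [Fact]
        public void TestIDFModel()
        {
            // ... body as listed in Example #1 ...
        }

        // TestFeatureBase and TemporaryDirectory are assumed to come from the
        // surrounding test project (parameter checks and temp-folder cleanup).
    }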
Example #2

A TF-IDF document-search application: it fits an IDFModel over a collection of documents, transforms both the documents and a search phrase into TF-IDF vectors with their L2 norms, and ranks the documents by cosine similarity to the search phrase.
        // Col, Desc and the UDF calls below assume 'using static Microsoft.Spark.Sql.Functions;'.
        private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(
            string sourceDir, Tokenizer tokenizer, HashingTF hashingTF)
        {
            // Tokenize the source documents and hash the words into raw term-frequency vectors.
            var sourceDocuments = toDF(GetSourceFiles(sourceDir));
            var words           = tokenizer.Transform(sourceDocuments);
            var featurizedData  = hashingTF.Transform(words);

            // Fit the IDF estimator and rescale the raw term frequencies.
            var idf = new IDF()
                      .SetInputCol("rawFeatures")
                      .SetOutputCol("features");
            var idfModel = idf.Fit(featurizedData);

            var rescaled = idfModel.Transform(featurizedData);
            var filtered = rescaled.Select("Path", "features");

            // udfCalcNorm (defined elsewhere) attaches the L2 norm of each feature vector.
            return (idfModel, filtered.WithColumn("norm", udfCalcNorm(Col("features"))));
        }
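
udfCalcNorm and udfCosineSimilarity are user-defined functions that are not part of this listing. Conceptually, the first computes the L2 norm of a sparse TF-IDF vector and the second computes the cosine similarity between two such vectors using those precomputed norms. The sketch below shows only that arithmetic in plain C#, with sparse vectors represented as index/value dictionaries; the real UDFs would be built with Microsoft.Spark's Udf helpers and receive the ML vector type instead.

    using System;
    using System.Collections.Generic;

    static class SimilarityMath
    {
        // L2 norm of a sparse vector, given its non-zero values.
        public static double CalcNorm(IEnumerable<double> values)
        {
            var sumOfSquares = 0.0;
            foreach (var v in values)
            {
                sumOfSquares += v * v;
            }
            return Math.Sqrt(sumOfSquares);
        }

        // Cosine similarity between two sparse vectors (index -> value),
        // using their precomputed L2 norms.
        public static double CosineSimilarity(
            IReadOnlyDictionary<int, double> a,
            IReadOnlyDictionary<int, double> b,
            double normA,
            double normB)
        {
            if (normA == 0 || normB == 0)
            {
                return 0;
            }

            var dotProduct = 0.0;
            foreach (var pair in a)
            {
                if (b.TryGetValue(pair.Key, out var valueB))
                {
                    dotProduct += pair.Value * valueB;
                }
            }
            return dotProduct / (normA * normB);
        }
    }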
        private static void Main(string[] args)
        {
            var spark = SparkSession
                        .Builder()
                        .AppName("TF-IDF Application")
                        .GetOrCreate();

            // args[0]: path to the documents to index; args[1]: the search phrase.
            var documentPath = args[0];
            var search       = args[1];

            var documentData = GetDocuments(documentPath);

            var documents = spark.CreateDataFrame(
                documentData,
                new StructType(new List<StructField>
                {
                    new StructField("title", new StringType()),
                    new StructField("content", new StringType())
                }));

            var tokenizer = new Tokenizer()
                            .SetInputCol("content")
                            .SetOutputCol("words");

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol("rawFeatures")
                            .SetNumFeatures(1000000);

            var idf = new IDF()
                      .SetInputCol("rawFeatures")
                      .SetOutputCol("features");

            var tokenizedDocuments  = tokenizer.Transform(documents);
            var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);

            var idfModel = idf.Fit(featurizedDocuments);

            var transformedDocuments =
                idfModel.Transform(featurizedDocuments).Select("title", "features");
            var normalizedDocuments = transformedDocuments.Select(
                Col("features"),
                udfCalcNorm(transformedDocuments["features"]).Alias("norm"),
                Col("title"));

            var searchTerm = spark.CreateDataFrame(
                new List<GenericRow> { new GenericRow(new[] { search }) },
                new StructType(new[] { new StructField("content", new StringType()) }));

            var tokenizedSearchTerm = tokenizer.Transform(searchTerm);

            var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);

            var normalizedSearchTerm = idfModel
                                       .Transform(featurizedSearchTerm)
                                       .WithColumnRenamed("features", "searchTermFeatures")
                                       .WithColumn("searchTermNorm", udfCalcNorm(Col("searchTermFeatures")));

            // Cross join each document with the single search-term row, then rank by cosine similarity.
            var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);

            results
                .WithColumn("similarity",
                            udfCosineSimilarity(Col("features"), Col("searchTermFeatures"),
                                                Col("norm"), Col("searchTermNorm")))
                .OrderBy(Desc("similarity"))
                .Select("title", "similarity")
                .Show(10000, 100);
        }
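
An application like this is normally launched through spark-submit with the .NET for Apache Spark runner. The command below is a sketch: the jar file name depends on the Spark, Scala, and Microsoft.Spark versions installed, and TfIdfApplication.dll is a placeholder for the compiled assembly.

    spark-submit \
        --class org.apache.spark.deploy.dotnet.DotnetRunner \
        --master local \
        microsoft-spark-<spark-version>_<scala-version>-<package-version>.jar \
        dotnet TfIdfApplication.dll /path/to/documents "search phrase"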