Example #1

A unit test for IDFModel: it builds a Tokenizer, HashingTF, and IDF pipeline over a single-sentence DataFrame, fits an IDFModel, verifies the configured input/output columns and minimum document frequency, and round-trips the fitted model through Save and Load.
        public void TestIDFModel()
        {
            int    expectedDocFrequency = 1980;
            string expectedInputCol     = "rawFeatures";
            string expectedOutputCol    = "features";

            DataFrame sentenceData =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

            // Split each sentence into an array of words.
            Tokenizer tokenizer = new Tokenizer()
                                  .SetInputCol("sentence")
                                  .SetOutputCol("words");

            DataFrame wordsData = tokenizer.Transform(sentenceData);

            // Map the words to term-frequency vectors using 20 hash buckets.
            HashingTF hashingTF = new HashingTF()
                                  .SetInputCol("words")
                                  .SetOutputCol(expectedInputCol)
                                  .SetNumFeatures(20);

            DataFrame featurizedData = hashingTF.Transform(wordsData);

            // IDF is an estimator: Fit produces an IDFModel that rescales the raw term frequencies.
            IDF idf = new IDF()
                      .SetInputCol(expectedInputCol)
                      .SetOutputCol(expectedOutputCol)
                      .SetMinDocFreq(expectedDocFrequency);

            IDFModel idfModel = idf.Fit(featurizedData);

            DataFrame rescaledData = idfModel.Transform(featurizedData);

            Assert.Contains(expectedOutputCol, rescaledData.Columns());

            Assert.Equal(expectedInputCol, idfModel.GetInputCol());
            Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

            // Round-trip the fitted model through Save/Load and check that the Uid survives.
            using (var tempDirectory = new TemporaryDirectory())
            {
                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
                idfModel.Save(modelPath);

                IDFModel loadedModel = IDFModel.Load(modelPath);
                Assert.Equal(idfModel.Uid(), loadedModel.Uid());
            }

            TestFeatureBase(idfModel, "minDocFreq", 1000);
        }
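
The method above is only the test body; a minimal sketch of the scaffolding it assumes is shown below. The class name, the [Fact] attribute, and the way _spark is obtained are illustrative guesses, and TemporaryDirectory and TestFeatureBase are helpers from the test project's own utilities rather than the Microsoft.Spark API.

    using System.IO;
    using Microsoft.Spark.ML.Feature;
    using Microsoft.Spark.Sql;
    using Xunit;

    public class IDFModelTests
    {
        // In the real test suite _spark would come from a shared test fixture.
        private readonly SparkSession _spark = SparkSession
            .Builder()
            .AppName("IDFModel tests")
            .GetOrCreate();

        [Fact]
        public void TestIDFModel()
        {
            // ... body as listed in Example #1 ...
        }

        // TestFeatureBase and TemporaryDirectory are assumed to come from the
        // surrounding test project (parameter checks and temp-folder cleanup).
    }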
Example #2

A TF-IDF document-search application: it fits an IDFModel over a collection of documents, transforms both the documents and a search phrase into TF-IDF vectors with their L2 norms, and ranks the documents by cosine similarity to the search phrase.
        // Col, Desc and the UDF calls below assume 'using static Microsoft.Spark.Sql.Functions;'.
        private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(
            string sourceDir, Tokenizer tokenizer, HashingTF hashingTF)
        {
            // Tokenize the source documents and hash the words into raw term-frequency vectors.
            var sourceDocuments = toDF(GetSourceFiles(sourceDir));
            var words           = tokenizer.Transform(sourceDocuments);
            var featurizedData  = hashingTF.Transform(words);

            // Fit the IDF estimator and rescale the raw term frequencies.
            var idf = new IDF()
                      .SetInputCol("rawFeatures")
                      .SetOutputCol("features");
            var idfModel = idf.Fit(featurizedData);

            var rescaled = idfModel.Transform(featurizedData);
            var filtered = rescaled.Select("Path", "features");

            // udfCalcNorm (defined elsewhere) attaches the L2 norm of each feature vector.
            return (idfModel, filtered.WithColumn("norm", udfCalcNorm(Col("features"))));
        }
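
udfCalcNorm and udfCosineSimilarity are user-defined functions that are not part of this listing. Conceptually, the first computes the L2 norm of a sparse TF-IDF vector and the second computes the cosine similarity between two such vectors using those precomputed norms. The sketch below shows only that arithmetic in plain C#, with sparse vectors represented as index/value dictionaries; the real UDFs would be built with Microsoft.Spark's Udf helpers and receive the ML vector type instead.

    using System;
    using System.Collections.Generic;

    static class SimilarityMath
    {
        // L2 norm of a sparse vector, given its non-zero values.
        public static double CalcNorm(IEnumerable<double> values)
        {
            var sumOfSquares = 0.0;
            foreach (var v in values)
            {
                sumOfSquares += v * v;
            }
            return Math.Sqrt(sumOfSquares);
        }

        // Cosine similarity between two sparse vectors (index -> value),
        // using their precomputed L2 norms.
        public static double CosineSimilarity(
            IReadOnlyDictionary<int, double> a,
            IReadOnlyDictionary<int, double> b,
            double normA,
            double normB)
        {
            if (normA == 0 || normB == 0)
            {
                return 0;
            }

            var dotProduct = 0.0;
            foreach (var pair in a)
            {
                if (b.TryGetValue(pair.Key, out var valueB))
                {
                    dotProduct += pair.Value * valueB;
                }
            }
            return dotProduct / (normA * normB);
        }
    }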
        private static void Main(string[] args)
        {
            var spark = SparkSession
                        .Builder()
                        .AppName("TF-IDF Application")
                        .GetOrCreate();

            // args[0]: path to the documents to index; args[1]: the search phrase.
            var documentPath = args[0];
            var search       = args[1];

            var documentData = GetDocuments(documentPath);

            var documents = spark.CreateDataFrame(
                documentData,
                new StructType(new List<StructField>
                {
                    new StructField("title", new StringType()),
                    new StructField("content", new StringType())
                }));

            var tokenizer = new Tokenizer()
                            .SetInputCol("content")
                            .SetOutputCol("words");

            var hashingTF = new HashingTF()
                            .SetInputCol("words")
                            .SetOutputCol("rawFeatures")
                            .SetNumFeatures(1000000);

            var idf = new IDF()
                      .SetInputCol("rawFeatures")
                      .SetOutputCol("features");

            var tokenizedDocuments  = tokenizer.Transform(documents);
            var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);

            var idfModel = idf.Fit(featurizedDocuments);

            var transformedDocuments =
                idfModel.Transform(featurizedDocuments).Select("title", "features");
            var normalizedDocuments = transformedDocuments.Select(
                Col("features"),
                udfCalcNorm(transformedDocuments["features"]).Alias("norm"),
                Col("title"));

            var searchTerm = spark.CreateDataFrame(
                new List<GenericRow> { new GenericRow(new[] { search }) },
                new StructType(new[] { new StructField("content", new StringType()) }));

            var tokenizedSearchTerm = tokenizer.Transform(searchTerm);

            var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);

            var normalizedSearchTerm = idfModel
                                       .Transform(featurizedSearchTerm)
                                       .WithColumnRenamed("features", "searchTermFeatures")
                                       .WithColumn("searchTermNorm", udfCalcNorm(Col("searchTermFeatures")));

            // Cross join each document with the single search-term row, then rank by cosine similarity.
            var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);

            results
                .WithColumn("similarity",
                            udfCosineSimilarity(Col("features"), Col("searchTermFeatures"),
                                                Col("norm"), Col("searchTermNorm")))
                .OrderBy(Desc("similarity"))
                .Select("title", "similarity")
                .Show(10000, 100);
        }
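
An application like this is normally launched through spark-submit with the .NET for Apache Spark runner. The command below is a sketch: the jar file name depends on the Spark, Scala, and Microsoft.Spark versions installed, and TfIdfApplication.dll is a placeholder for the compiled assembly.

    spark-submit \
        --class org.apache.spark.deploy.dotnet.DotnetRunner \
        --master local \
        microsoft-spark-<spark-version>_<scala-version>-<package-version>.jar \
        dotnet TfIdfApplication.dll /path/to/documents "search phrase"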