public void TestIDFModel()
{
    // Values the fitted model is expected to echo back below.
    const int expectedDocFrequency = 1980;
    const string expectedInputCol = "rawFeatures";
    const string expectedOutputCol = "features";

    // Single-row input: tokenize the sentence, then hash the tokens into a
    // 20-dimensional term-frequency vector that feeds the IDF estimator.
    DataFrame sentenceData =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    var tokenizer = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words");
    DataFrame wordsData = tokenizer.Transform(sentenceData);

    var hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol(expectedInputCol)
        .SetNumFeatures(20);
    DataFrame featurizedData = hashingTF.Transform(wordsData);

    // Fit the estimator and rescale the hashed features.
    var idf = new IDF()
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol)
        .SetMinDocFreq(expectedDocFrequency);
    IDFModel idfModel = idf.Fit(featurizedData);
    DataFrame rescaledData = idfModel.Transform(featurizedData);

    // Transform must add the output column; the model must report exactly
    // the parameters it was configured with.
    Assert.Contains(expectedOutputCol, rescaledData.Columns());
    Assert.Equal(expectedInputCol, idfModel.GetInputCol());
    Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
    Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

    // Round-trip the model through Save/Load; Uid identifies the same stage.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string modelPath = Path.Join(tempDirectory.Path, "idfModel");
        idfModel.Save(modelPath);

        IDFModel loadedModel = IDFModel.Load(modelPath);
        Assert.Equal(idfModel.Uid(), loadedModel.Uid());
    }

    // Shared feature-parameter checks (sets "minDocFreq" to 1000).
    TestFeatureBase(idfModel, "minDocFreq", 1000);
}
/// <summary>
/// Builds an IDF model over the documents found under <paramref name="sourceDir"/>
/// and returns it together with a DataFrame of (Path, features, norm), where
/// "norm" is the vector norm of each document's TF-IDF features.
/// </summary>
/// <param name="sourceDir">Directory to read source documents from.</param>
/// <param name="tokenizer">Tokenizer producing the "words" column.</param>
/// <param name="hashingTF">Hasher producing the "rawFeatures" column.</param>
/// <returns>The fitted model and the normalized feature DataFrame.</returns>
private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(
    string sourceDir, Tokenizer tokenizer, HashingTF hashingTF)
{
    // Read the corpus and push it through the tokenize -> hash stages.
    var sourceDocuments = toDF(GetSourceFiles(sourceDir));
    var tokenized = tokenizer.Transform(sourceDocuments);
    var hashed = hashingTF.Transform(tokenized);

    // Fit IDF on the hashed term frequencies, then rescale them.
    var idfModel = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features")
        .Fit(hashed);
    var rescaled = idfModel.Transform(hashed);

    // Keep only the path and feature vector; attach the precomputed norm
    // so downstream similarity code does not recompute it per comparison.
    var projected = rescaled.Select("Path", "features");
    var withNorm = projected.WithColumn("norm", udfCalcNorm(Col("features")));

    return (idfModel, withNorm);
}
/// <summary>
/// TF-IDF document search: builds TF-IDF vectors for a document corpus, runs
/// the search term through the same fitted pipeline, and prints documents
/// ranked by cosine similarity to the search term.
/// </summary>
/// <param name="args">args[0] = path to the documents; args[1] = search term.</param>
private static void Main(string[] args)
{
    // Fail fast with a usage message instead of throwing an unhelpful
    // IndexOutOfRangeException when the two required arguments are missing.
    if (args.Length < 2)
    {
        Console.Error.WriteLine("Usage: <documentPath> <searchTerm>");
        return;
    }

    var documentPath = args[0];
    var search = args[1];

    var spark = SparkSession
        .Builder()
        .AppName("TF-IDF Application")
        .GetOrCreate();

    // Load the corpus into a (title, content) DataFrame.
    var documentData = GetDocuments(documentPath);
    var documents = spark.CreateDataFrame(
        documentData,
        new StructType(new List<StructField>
        {
            new StructField("title", new StringType()),
            new StructField("content", new StringType())
        }));

    // Pipeline stages shared by the corpus and the search term:
    // tokenize -> hash to term frequencies -> IDF weighting.
    var tokenizer = new Tokenizer()
        .SetInputCol("content")
        .SetOutputCol("words");
    var hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(1000000);
    var idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");

    var tokenizedDocuments = tokenizer.Transform(documents);
    var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);
    var idfModel = idf.Fit(featurizedDocuments);
    var transformedDocuments =
        idfModel.Transform(featurizedDocuments).Select("title", "features");

    // Precompute each document's vector norm once for the similarity UDF.
    var normalizedDocuments = transformedDocuments.Select(
        Col("features"),
        udfCalcNorm(transformedDocuments["features"]).Alias("norm"),
        Col("title"));

    // Run the search term through the same, already-fitted pipeline.
    // (Col(...) is used consistently below; the original mixed the
    // equivalent Col/Column aliases within this one method.)
    var searchTerm = spark.CreateDataFrame(
        new List<GenericRow> { new GenericRow(new[] { search }) },
        new StructType(new[] { new StructField("content", new StringType()) }));

    var tokenizedSearchTerm = tokenizer.Transform(searchTerm);
    var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);
    var normalizedSearchTerm = idfModel
        .Transform(featurizedSearchTerm)
        .WithColumnRenamed("features", "searchTermFeatures")
        .WithColumn("searchTermNorm", udfCalcNorm(Col("searchTermFeatures")));

    // Pair every document with the single search row and rank by similarity.
    var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);
    results
        .WithColumn(
            "similarity",
            udfCosineSimilarity(
                Col("features"),
                Col("searchTermFeatures"),
                Col("norm"),
                Col("searchTermNorm")))
        .OrderBy(Desc("similarity"))
        .Select("title", "similarity")
        .Show(10000, 100);

    // Release the Spark session's resources before exiting.
    spark.Stop();
}