private static DataFrame GetSearchTermTFIDF(SparkSession spark, string searchTerm,
    Tokenizer tokenizer, HashingTF hashingTF, IDFModel idfModel)
{
    var searchTermDataFrame = spark.CreateDataFrame(new List<string>() { searchTerm })
        .WithColumnRenamed("_1", "Content");

    var searchWords = tokenizer.Transform(searchTermDataFrame);
    var featurizedSearchTerm = hashingTF.Transform(searchWords);
    var search = idfModel.Transform(featurizedSearchTerm)
        .WithColumnRenamed("features", "features2")
        .WithColumn("norm2", udfCalcNorm(Col("features2")));

    return search;
}
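GetSearchTermTFIDF leans on a udfCalcNorm user-defined function that is not part of this listing. The sketch below is one way it could be written, assuming the Spark ML SparseVector reaches the UDF as a Row whose last field holds the non-zero values (how the vector struct is surfaced, and whether those values arrive as an ArrayList, can vary between Microsoft.Spark versions), so treat it as illustrative rather than the original helper.

// Hypothetical sketch of the udfCalcNorm helper referenced above.
// Requires: using System; using System.Collections;
//           using Microsoft.Spark.Sql; using static Microsoft.Spark.Sql.Functions;
private static readonly Func<Column, Column> udfCalcNorm = Udf<Row, double>(
    vector =>
    {
        // Assumption: the SparseVector struct arrives as a Row whose last
        // field is the collection of non-zero values.
        var values = (ArrayList)vector.Values[vector.Values.Length - 1];

        var sumOfSquares = 0.0;
        foreach (var value in values)
        {
            var d = Convert.ToDouble(value);
            sumOfSquares += d * d;
        }

        // L2 (Euclidean) norm, pre-computed so the cosine similarity UDF
        // only has to calculate a dot product later.
        return Math.Sqrt(sumOfSquares);
    });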
public void TestIDFModel()
{
    int expectedDocFrequency = 1980;
    string expectedInputCol = "rawFeatures";
    string expectedOutputCol = "features";

    DataFrame sentenceData =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    Tokenizer tokenizer = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words");

    DataFrame wordsData = tokenizer.Transform(sentenceData);

    HashingTF hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol(expectedInputCol)
        .SetNumFeatures(20);

    DataFrame featurizedData = hashingTF.Transform(wordsData);

    IDF idf = new IDF()
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol)
        .SetMinDocFreq(expectedDocFrequency);

    IDFModel idfModel = idf.Fit(featurizedData);

    DataFrame rescaledData = idfModel.Transform(featurizedData);
    Assert.Contains(expectedOutputCol, rescaledData.Columns());

    Assert.Equal(expectedInputCol, idfModel.GetInputCol());
    Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
    Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

    using (var tempDirectory = new TemporaryDirectory())
    {
        string modelPath = Path.Join(tempDirectory.Path, "idfModel");
        idfModel.Save(modelPath);

        IDFModel loadedModel = IDFModel.Load(modelPath);
        Assert.Equal(idfModel.Uid(), loadedModel.Uid());
    }

    TestFeatureBase(idfModel, "minDocFreq", 1000);
}
private static void Main(string[] args)
{
    if (args.Length != 2)
    {
        Console.WriteLine("Usage: <sourceDir> <searchTerm>");
        return;
    }

    var sourceDir = args[0];
    var searchTerm = args[1];

    var spark = SparkSession
        .Builder()
        .GetOrCreate();

    // step one: train the model
    var hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(100000);

    var tokenizer = new Tokenizer()
        .SetInputCol("Content")
        .SetOutputCol("words");

    var (idfModel, normalized) = GetModelAndNormalizedDataFrame(sourceDir, tokenizer, hashingTF);

    // step two: convert the search term into the same TF-IDF vector space
    var searchTermTfIdf = GetSearchTermTFIDF(spark, searchTerm, tokenizer, hashingTF, idfModel);

    // step three: score every document against the search term and show the top matches
    var results = searchTermTfIdf.CrossJoin(normalized);

    results
        .WithColumn("similarity",
            udfCosineSimilarity(Col("features"), Col("features2"), Col("norm"), Col("norm2")))
        .Select("path", "similarity")
        .Filter("similarity > 0.0")
        .OrderBy(Desc("similarity"))
        .Limit(10)
        .WithColumn("Search Term", Lit(searchTerm))
        .Show(10, 100000);
}
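The similarity scoring above also calls a udfCosineSimilarity user-defined function that is defined elsewhere. A sketch under the same caveats about the SparseVector Row layout could look like the following; the positions of the indices and values fields are assumptions.

// Hypothetical sketch of the udfCosineSimilarity helper referenced above.
// Requires: using System; using System.Collections; using System.Collections.Generic;
//           using Microsoft.Spark.Sql; using static Microsoft.Spark.Sql.Functions;
private static readonly Func<Column, Column, Column, Column, Column> udfCosineSimilarity =
    Udf<Row, Row, double, double, double>(
        (vectorA, vectorB, normA, normB) =>
        {
            // Assumption: fields 2 and 3 of the SparseVector Row hold the
            // indices and values of the non-zero entries respectively.
            var indicesA = (ArrayList)vectorA.Values[2];
            var valuesA = (ArrayList)vectorA.Values[3];
            var indicesB = (ArrayList)vectorB.Values[2];
            var valuesB = (ArrayList)vectorB.Values[3];

            // Index vector B's entries so each lookup during the dot product is O(1).
            var lookupB = new Dictionary<int, double>();
            for (var i = 0; i < indicesB.Count; i++)
            {
                lookupB[Convert.ToInt32(indicesB[i])] = Convert.ToDouble(valuesB[i]);
            }

            var dotProduct = 0.0;
            for (var i = 0; i < indicesA.Count; i++)
            {
                if (lookupB.TryGetValue(Convert.ToInt32(indicesA[i]), out var valueB))
                {
                    dotProduct += Convert.ToDouble(valuesA[i]) * valueB;
                }
            }

            // Cosine similarity = dot product / (|A| * |B|), using the norms
            // already attached to each row by udfCalcNorm.
            var divisor = normA * normB;
            return divisor == 0.0 ? 0.0 : dotProduct / divisor;
        });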
public void TestHashingTF()
{
    string expectedInputCol = "input_col";
    string expectedOutputCol = "output_col";
    int expectedFeatures = 10;

    Assert.IsType<HashingTF>(new HashingTF());

    HashingTF hashingTf = new HashingTF("my-unique-id")
        .SetNumFeatures(expectedFeatures)
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol);

    Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures());
    Assert.Equal(expectedInputCol, hashingTf.GetInputCol());
    Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol());

    DataFrame input = _spark.Sql(
        "SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col");

    DataFrame output = hashingTf.Transform(input);
    DataFrame outputVector = output.Select(expectedOutputCol);
    Assert.Contains(expectedOutputCol, outputVector.Columns());

    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "hashingTF");
        hashingTf.Save(savePath);

        HashingTF loadedHashingTf = HashingTF.Load(savePath);
        Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
    }

    hashingTf.SetBinary(true);
    Assert.True(hashingTf.GetBinary());
}
private static void Main(string[] args)
{
    var spark = SparkSession
        .Builder()
        .AppName("TF-IDF Application")
        .GetOrCreate();

    var documentPath = args[0];
    var search = args[1];

    var documentData = GetDocuments(documentPath);
    var documents = spark.CreateDataFrame(documentData,
        new StructType(new List<StructField>
        {
            new StructField("title", new StringType()),
            new StructField("content", new StringType())
        }));

    var tokenizer = new Tokenizer()
        .SetInputCol("content")
        .SetOutputCol("words");

    var hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(1000000);

    var idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");

    var tokenizedDocuments = tokenizer.Transform(documents);
    var featurizedDocuments = hashingTF.Transform(tokenizedDocuments);
    var idfModel = idf.Fit(featurizedDocuments);
    var transformedDocuments =
        idfModel.Transform(featurizedDocuments).Select("title", "features");
    var normalizedDocuments = transformedDocuments.Select(
        Col("features"),
        udfCalcNorm(transformedDocuments["features"]).Alias("norm"),
        Col("title"));

    var searchTerm = spark.CreateDataFrame(
        new List<GenericRow> { new GenericRow(new[] { search }) },
        new StructType(new[] { new StructField("content", new StringType()) }));

    var tokenizedSearchTerm = tokenizer.Transform(searchTerm);
    var featurizedSearchTerm = hashingTF.Transform(tokenizedSearchTerm);
    var normalizedSearchTerm = idfModel
        .Transform(featurizedSearchTerm)
        .WithColumnRenamed("features", "searchTermFeatures")
        .WithColumn("searchTermNorm", udfCalcNorm(Column("searchTermFeatures")));

    var results = normalizedDocuments.CrossJoin(normalizedSearchTerm);
    results
        .WithColumn("similarity",
            udfCosineSimilarity(Column("features"), Column("searchTermFeatures"),
                Col("norm"), Col("searchTermNorm")))
        .OrderBy(Desc("similarity"))
        .Select("title", "similarity")
        .Show(10000, 100);
}
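GetDocuments is the one helper this program assumes but does not show. Since the DataFrame built from its result expects a title string and a content string per row, a plausible sketch reads every file under the supplied path; the *.txt filter and the use of the file name as the title are assumptions.

// Hypothetical sketch of the GetDocuments helper used above: one GenericRow
// per file, matching the (title, content) schema passed to CreateDataFrame.
// Requires: using System.IO; using System.Collections.Generic; using Microsoft.Spark.Sql;
private static List<GenericRow> GetDocuments(string documentPath)
{
    var documents = new List<GenericRow>();
    foreach (var file in Directory.EnumerateFiles(
        documentPath, "*.txt", SearchOption.AllDirectories))
    {
        documents.Add(new GenericRow(new object[]
        {
            Path.GetFileNameWithoutExtension(file), // title
            File.ReadAllText(file)                  // content
        }));
    }

    return documents;
}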
private static (IDFModel, DataFrame) GetModelAndNormalizedDataFrame(string sourceDir,
    Tokenizer tokenizer, HashingTF hashingTF)
{
    var sourceDocuments = toDF(GetSourceFiles(sourceDir));
    var words = tokenizer.Transform(sourceDocuments);
    var featurizedData = hashingTF.Transform(words);

    var idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");

    var idfModel = idf.Fit(featurizedData);
    var rescaled = idfModel.Transform(featurizedData);
    var filtered = rescaled.Select("Path", "features");

    return (idfModel, filtered.WithColumn("norm", udfCalcNorm(Col("features"))));
}
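GetSourceFiles and toDF are likewise assumed by this method rather than shown. A sketch that reads each file into a (Path, Content) pair and turns the pairs into a DataFrame might look like the following; the file enumeration details and the SparkSession lookup via Builder().GetOrCreate() are assumptions.

// Hypothetical sketches of the GetSourceFiles and toDF helpers referenced above.
// Requires: using System.Collections.Generic; using System.IO; using System.Linq;
//           using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Types;
private static IEnumerable<(string Path, string Content)> GetSourceFiles(string sourceDir)
{
    // Read every file under the source directory into a (Path, Content) pair.
    foreach (var file in Directory.EnumerateFiles(sourceDir, "*", SearchOption.AllDirectories))
    {
        yield return (file, File.ReadAllText(file));
    }
}

private static DataFrame toDF(IEnumerable<(string Path, string Content)> files)
{
    // GetOrCreate() returns the SparkSession already started in Main.
    var spark = SparkSession.Builder().GetOrCreate();

    var rows = files
        .Select(f => new GenericRow(new object[] { f.Path, f.Content }))
        .ToList();

    // The column names must match the "Content" input column of the Tokenizer
    // and the "Path" column selected later in the pipeline.
    return spark.CreateDataFrame(rows, new StructType(new[]
    {
        new StructField("Path", new StringType()),
        new StructField("Content", new StringType())
    }));
}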