public void SortFeaturesByCountWhenApplyingCountLimit()
{
    // arrange: one vectorizer capped at MaxFeatures, one uncapped to derive the baseline
    var settings = new CountVectorizerSettings { MaxFeatures = 5 };
    var target = new CountVectorizer(settings);
    var trainingData = GetTrainingData();
    target.Fit(trainingData);
    var vectorizerWithoutLimit = GetFittedVectorizer(trainingData);

    // Collects the MaxFeatures largest token counts across all documents.
    IEnumerable<uint> TopCounts(IEnumerable<IDictionary<string, uint>> tokenCounts)
    {
        return tokenCounts
            .SelectMany(document => document.Values)
            .OrderByDescending(count => count)
            .Take((int)settings.MaxFeatures);
    }

    var expectedTopCounts = TopCounts(vectorizerWithoutLimit.Transform(trainingData));

    // act
    var result = target.Transform(trainingData);

    // assert: the capped vectorizer kept the most frequent features
    Assert.Equal(expectedTopCounts, TopCounts(result));
}
public void TestSignaturesV2_4_X()
{
    // SetMaxDF/GetMaxDF must round-trip (API surface added in Spark 2.4).
    const double maxDf = 100;
    var vectorizer = new CountVectorizer().SetMaxDF(maxDf);

    Assert.Equal(maxDf, vectorizer.GetMaxDF());
}
public void ThrowExceptionOnTransformIfIsNotFitted()
{
    // arrange: a vectorizer that has never been fitted
    var target = new CountVectorizer();

    // act & assert: transforming before Fit must fail with NotFittedException
    Assert.Throws<NotFittedException>(() => target.Transform("some test text"));
}
public void CountVectorizerBiGram()
{
    // arrange: analyzer configured to emit unigrams and adjacent bigrams
    var bigramVectorizer = new CountVectorizer();
    var analyzer = bigramVectorizer.build_analyzer();

    // act
    var tokens = analyzer.analyze("Bi-grams are cool!");

    // assert: unigrams first, then the bigrams.
    // xUnit's Assert.Equal performs sequence comparison on IEnumerable and
    // reports the first mismatching element; the previous
    // Assert.IsTrue(Enumerable.SequenceEqual(...)) is not an xUnit API (the
    // rest of this file uses xUnit asserts) and only reported true/false.
    var expected = new string[]
    {
        "bi", "grams", "are", "cool",
        "bi grams", "grams are", "are cool"
    };
    Assert.Equal(expected, tokens);
}
public void CanFitTransformData()
{
    // arrange
    var target = new CountVectorizer();
    var trainingData = GetTrainingData();

    // Expected per-document term counts: every vocabulary token appears in each
    // document's map, with 0 for tokens absent from that particular document.
    var expected = new List<IDictionary<string, uint>>
    {
        new Dictionary<string, uint>
        {
            ["somebody"] = 1, ["once"] = 1, ["told"] = 1, ["me"] = 2,
            ["the"] = 1, ["world"] = 1, ["is"] = 1, ["gonna"] = 1,
            ["roll"] = 1, ["i"] = 0, ["ain't"] = 0, ["sharpest"] = 0,
            ["tool"] = 0, ["in"] = 0, ["shed"] = 0,
        },
        new Dictionary<string, uint>
        {
            ["i"] = 1, ["ain't"] = 1, ["the"] = 2, ["sharpest"] = 1,
            ["tool"] = 1, ["in"] = 1, ["shed"] = 1, ["somebody"] = 0,
            ["once"] = 0, ["told"] = 0, ["me"] = 0, ["world"] = 0,
            ["is"] = 0, ["gonna"] = 0, ["roll"] = 0,
        },
    };

    // act
    var result = target.FitTransform(trainingData);

    // assert
    Assert.NotNull(result);
    Assert.Equal(expected, result);
}
public void TestPipelineFit()
{
    DataFrame input = _spark.Sql(
        "SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
        "'TOKENIZE') as input from range(100)");

    const string inputColumn = "input";
    const string outputColumn = "output";
    const double minDf = 1;
    const double minTf = 10;
    const int vocabSize = 10000;

    // Build a single-stage pipeline around a fully configured CountVectorizer.
    CountVectorizer vectorizer = new CountVectorizer()
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetMinDF(minDf)
        .SetMinTF(minTf)
        .SetVocabSize(vocabSize);
    var stages = new JavaPipelineStage[] { vectorizer };
    Pipeline pipeline = new Pipeline().SetStages(stages);

    // Fitting and transforming must yield the expected wrapper types.
    PipelineModel pipelineModel = pipeline.Fit(input);
    DataFrame output = pipelineModel.Transform(input);
    Assert.IsType<StructType>(pipelineModel.TransformSchema(input.Schema()));
    Assert.IsType<DataFrame>(output);

    // Round-trip the pipeline through both Save/Load and Write/Read,
    // checking the uid survives each persistence path.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "pipeline");
        pipeline.Save(savePath);
        Pipeline loadedPipeline = Pipeline.Load(savePath);
        Assert.Equal(pipeline.Uid(), loadedPipeline.Uid());

        string writePath = Path.Join(tempDirectory.Path, "pipelineWithWrite");
        pipeline.Write().Save(writePath);
        Pipeline loadedPipelineWithRead = pipeline.Read().Load(writePath);
        Assert.Equal(pipeline.Uid(), loadedPipelineWithRead.Uid());
    }

    TestFeatureBase(pipeline, "stages", stages);
}
public void CanLimitFeaturesCount()
{
    // arrange
    var settings = new CountVectorizerSettings { MaxFeatures = 5 };
    var target = new CountVectorizer(settings);
    var trainingData = GetTrainingData();

    // act
    target.Fit(trainingData);

    // assert: the vocabulary is truncated to exactly MaxFeatures entries.
    // Assert.Equal (rather than Assert.True on a boolean comparison) reports
    // the actual count on failure, and reusing settings.MaxFeatures removes
    // the magic number that had to stay in sync with the arrange step.
    Assert.Equal((int)settings.MaxFeatures, target.Vocabulary.Count());
}
public void CanFit()
{
    // arrange
    var target = new CountVectorizer();
    var trainingData = new string[] { "Some cool text", "Another cool text" };

    // act
    var fitted = target.Fit(trainingData);

    // assert: Fit is fluent (returns the same instance) and produces a vocabulary
    Assert.Same(target, fitted);
    Assert.NotNull(fitted.Vocabulary);
}
public void TestCountVectorizer()
{
    DataFrame input = _spark.Sql(
        "SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
        "'TOKENIZE') as input from range(100)");

    const string inputColumn = "input";
    const string outputColumn = "output";
    const double minDf = 1;
    const double minTf = 10;
    const int vocabSize = 10000;
    const bool binary = false;

    var countVectorizer = new CountVectorizer();
    countVectorizer
        .SetInputCol(inputColumn)
        .SetOutputCol(outputColumn)
        .SetMinDF(minDf)
        .SetMinTF(minTf)
        .SetVocabSize(vocabSize);

    // Fitting yields a model, and every setter round-trips through its getter.
    Assert.IsType<CountVectorizerModel>(countVectorizer.Fit(input));
    Assert.Equal(inputColumn, countVectorizer.GetInputCol());
    Assert.Equal(outputColumn, countVectorizer.GetOutputCol());
    Assert.Equal(minDf, countVectorizer.GetMinDF());
    Assert.Equal(minTf, countVectorizer.GetMinTF());
    Assert.Equal(vocabSize, countVectorizer.GetVocabSize());
    Assert.Equal(binary, countVectorizer.GetBinary());

    // Save/Load round-trip preserves the estimator's uid.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "countVectorizer");
        countVectorizer.Save(savePath);
        CountVectorizer loadedVectorizer = CountVectorizer.Load(savePath);
        Assert.Equal(countVectorizer.Uid(), loadedVectorizer.Uid());
    }

    Assert.NotEmpty(countVectorizer.ExplainParams());
    Assert.NotEmpty(countVectorizer.ToString());
}