// Checks that a vectorizer fitted with MaxFeatures = 5 produces the same five largest
// token counts as a vectorizer obtained from GetFittedVectorizer for the same data.
public void SortFeaturesByCountWhenApplyingCountLimit()
        {
            // arrange
            var settings = new CountVectorizerSettings { MaxFeatures = 5 };

            var target       = new CountVectorizer(settings);
            var trainingData = GetTrainingData();

            target.Fit(trainingData);

            var unlimitedVectorizer = GetFittedVectorizer(trainingData);

            // Flattens the per-document count maps into a single sequence of counts and keeps
            // the MaxFeatures largest ones, in descending order.
            IEnumerable<uint> TopCounts(IEnumerable<IDictionary<string, uint>> documents)
            {
                return documents
                    .SelectMany(document => document.Values)
                    .OrderByDescending(count => count)
                    .Take((int)settings.MaxFeatures);
            }

            var expectedTopCounts = TopCounts(unlimitedVectorizer.Transform(trainingData));

            // act
            var limitedResult = target.Transform(trainingData);

            // assert
            var actualTopCounts = TopCounts(limitedResult);

            Assert.Equal(expectedTopCounts, actualTopCounts);
        }
Example #2
0
        // Sets MaxDF through the fluent setter and checks that GetMaxDF reports the same value.
        public void TestSignaturesV2_4_X()
        {
            const double expectedMaxDf = 100;

            CountVectorizer vectorizer = new CountVectorizer().SetMaxDF(expectedMaxDf);

            Assert.Equal(expectedMaxDf, vectorizer.GetMaxDF());
        }
        // Calling Transform on a vectorizer that has never been fitted must raise
        // NotFittedException.
        public void ThrowExceptionOnTransformIfIsNotFitted()
        {
            // arrange: a fresh instance, Fit is deliberately not called
            var unfittedVectorizer = new CountVectorizer();

            // act & assert
            Assert.Throws<NotFittedException>(
                () => unfittedVectorizer.Transform("some test text"));
        }
Example #4
0
        // Runs the analyzer built by a default-constructed CountVectorizer over a short
        // sentence and checks the exact token sequence it produces.
        // NOTE(review): the expected sequence contains two-word tokens although no n-gram
        // option is passed to the constructor here - confirm against CountVectorizer's defaults.
        public void CountVectorizerBiGram()
        {
            var expectedTokens = new string[]
            {
                "bi", "grams", "are", "cool", "bi grams", "grams are", "are cool"
            };

            var vectorizer   = new CountVectorizer();
            var analyzer     = vectorizer.build_analyzer();
            var actualTokens = analyzer.analyze("Bi-grams are cool!");

            Assert.IsTrue(Enumerable.SequenceEqual(actualTokens, expectedTokens));
        }
        // Fits and transforms the shared training data in one call and compares the result
        // with hand-written per-document token counts. Each expected dictionary lists every
        // vocabulary token, using 0 for tokens that do not occur in that document.
        public void CanFitTransformData()
        {
            // arrange
            var target       = new CountVectorizer();
            var trainingData = GetTrainingData();

            // Expected counts for the first document.
            var firstDocument = new Dictionary<string, uint>
            {
                ["somebody"] = 1,
                ["once"]     = 1,
                ["told"]     = 1,
                ["me"]       = 2,
                ["the"]      = 1,
                ["world"]    = 1,
                ["is"]       = 1,
                ["gonna"]    = 1,
                ["roll"]     = 1,
                ["i"]        = 0,
                ["ain't"]    = 0,
                ["sharpest"] = 0,
                ["tool"]     = 0,
                ["in"]       = 0,
                ["shed"]     = 0
            };

            // Expected counts for the second document.
            var secondDocument = new Dictionary<string, uint>
            {
                ["i"]        = 1,
                ["ain't"]    = 1,
                ["the"]      = 2,
                ["sharpest"] = 1,
                ["tool"]     = 1,
                ["in"]       = 1,
                ["shed"]     = 1,
                ["somebody"] = 0,
                ["once"]     = 0,
                ["told"]     = 0,
                ["me"]       = 0,
                ["world"]    = 0,
                ["is"]       = 0,
                ["gonna"]    = 0,
                ["roll"]     = 0
            };

            var expected = new List<IDictionary<string, uint>>
            {
                firstDocument,
                secondDocument
            };

            // act
            var actual = target.FitTransform(trainingData);

            // assert
            Assert.NotNull(actual);
            Assert.Equal(expected, actual);
        }
Example #6
0
        // Wraps a configured CountVectorizer in a single-stage Pipeline, fits and applies it
        // to a DataFrame, then saves and reloads the pipeline through both Save/Load and
        // Write/Read, checking that the Uid is the same after each reload.
        public void TestPipelineFit()
        {
            const string inputColumn  = "input";
            const string outputColumn = "output";
            const double minDf        = 1;
            const double minTf        = 10;
            const int    vocabSize    = 10000;

            DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
                                         "'TOKENIZE') as input from range(100)");

            CountVectorizer vectorizer = new CountVectorizer()
                .SetInputCol(inputColumn)
                .SetOutputCol(outputColumn)
                .SetMinDF(minDf)
                .SetMinTF(minTf)
                .SetVocabSize(vocabSize);

            var stages = new JavaPipelineStage[] { vectorizer };

            Pipeline      pipeline    = new Pipeline().SetStages(stages);
            PipelineModel fittedModel = pipeline.Fit(input);

            DataFrame transformed = fittedModel.Transform(input);

            Assert.IsType<StructType>(fittedModel.TransformSchema(input.Schema()));
            Assert.IsType<DataFrame>(transformed);

            using (var tempDir = new TemporaryDirectory())
            {
                // Round trip through Save / static Load.
                string directSavePath = Path.Join(tempDir.Path, "pipeline");
                pipeline.Save(directSavePath);

                Pipeline reloaded = Pipeline.Load(directSavePath);
                Assert.Equal(pipeline.Uid(), reloaded.Uid());

                // Round trip through the writer / reader objects.
                string writerSavePath = Path.Join(tempDir.Path, "pipelineWithWrite");
                pipeline.Write().Save(writerSavePath);

                Pipeline reloadedViaReader = pipeline.Read().Load(writerSavePath);
                Assert.Equal(pipeline.Uid(), reloadedViaReader.Uid());
            }

            TestFeatureBase(pipeline, "stages", stages);
        }
        // Checks that fitting with MaxFeatures = 5 leaves exactly five entries in the
        // vectorizer's vocabulary.
        public void CanLimitFeaturesCount()
        {
            // arrange
            var settings = new CountVectorizerSettings
            {
                MaxFeatures = 5
            };

            var target       = new CountVectorizer(settings);
            var trainingData = GetTrainingData();

            // act
            target.Fit(trainingData);

            // assert
            // Assert.Equal reports the expected and actual counts on failure, whereas
            // Assert.True over a comparison only reports "False".
            Assert.Equal(5, target.Vocabulary.Count());
        }
        // Fit must return the very instance it was called on, and that instance must expose
        // a non-null Vocabulary afterwards.
        public void CanFit()
        {
            // arrange
            var target = new CountVectorizer();
            var corpus = new[]
            {
                "Some cool text",
                "Another cool text"
            };

            // act
            var fitted = target.Fit(corpus);

            // assert
            Assert.Same(target, fitted);
            Assert.NotNull(fitted.Vocabulary);
        }
Example #9
0
        // Configures a CountVectorizer through its fluent setters, fits it on a DataFrame,
        // reads every parameter back through its getter, and checks that the Uid is the same
        // after a Save/Load round trip.
        public void TestCountVectorizer()
        {
            const string inputColumn    = "input";
            const string outputColumn   = "output";
            const double minDf          = 1;
            const double minTf          = 10;
            const int    vocabSize      = 10000;
            const bool   expectedBinary = false;

            DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
                                         "'TOKENIZE') as input from range(100)");

            var vectorizer = new CountVectorizer();

            // The value returned by the setter chain is discarded; the assertions below read
            // the parameters back from the original instance.
            vectorizer.SetInputCol(inputColumn)
                      .SetOutputCol(outputColumn)
                      .SetMinDF(minDf)
                      .SetMinTF(minTf)
                      .SetVocabSize(vocabSize);

            Assert.IsType<CountVectorizerModel>(vectorizer.Fit(input));
            Assert.Equal(inputColumn, vectorizer.GetInputCol());
            Assert.Equal(outputColumn, vectorizer.GetOutputCol());
            Assert.Equal(minDf, vectorizer.GetMinDF());
            Assert.Equal(minTf, vectorizer.GetMinTF());
            Assert.Equal(vocabSize, vectorizer.GetVocabSize());
            // Binary is never set in this test, so this checks the value reported without
            // explicit configuration.
            Assert.Equal(expectedBinary, vectorizer.GetBinary());

            using (var tempDir = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDir.Path, "countVectorizer");
                vectorizer.Save(savePath);

                CountVectorizer reloaded = CountVectorizer.Load(savePath);
                Assert.Equal(vectorizer.Uid(), reloaded.Uid());
            }

            Assert.NotEmpty(vectorizer.ExplainParams());
            Assert.NotEmpty(vectorizer.ToString());
        }