示例#1
0
        public void TestCountVectorizer()
        {
            // Exercises CountVectorizer end to end: configure it through the fluent
            // setters, fit it on a small DataFrame, read every setting back through
            // its getter, round-trip it through Save/Load, and finally check that the
            // descriptive helpers produce some output.

            // Values pushed through the setters and later compared with the getters.
            const string expectedInputCol  = "input";
            const string expectedOutputCol = "output";
            const double expectedMinDf     = 1;
            const double expectedMinTf     = 10;
            const int    expectedVocabSize = 10000;

            // No setter is called for "binary" in this test, so the assertion on it
            // checks the value the estimator reports without explicit configuration.
            const bool expectedBinary = false;

            // 100 rows, each holding the same seven-token string array in column "input".
            DataFrame tokens = _spark.Sql(
                "SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
                "'TOKENIZE') as input from range(100)");

            var vectorizer = new CountVectorizer();

            vectorizer
                .SetInputCol(expectedInputCol)
                .SetOutputCol(expectedOutputCol)
                .SetMinDF(expectedMinDf)
                .SetMinTF(expectedMinTf)
                .SetVocabSize(expectedVocabSize);

            // Fitting must produce a CountVectorizerModel.
            var fitted = vectorizer.Fit(tokens);
            Assert.IsType<CountVectorizerModel>(fitted);

            // Each getter must echo what was configured above.
            Assert.Equal(expectedInputCol, vectorizer.GetInputCol());
            Assert.Equal(expectedOutputCol, vectorizer.GetOutputCol());
            Assert.Equal(expectedMinDf, vectorizer.GetMinDF());
            Assert.Equal(expectedMinTf, vectorizer.GetMinTF());
            Assert.Equal(expectedVocabSize, vectorizer.GetVocabSize());
            Assert.Equal(expectedBinary, vectorizer.GetBinary());

            // Persist into a throw-away directory and reload; the reloaded instance
            // must report the same Uid as the one that was saved.
            using (var scratchDir = new TemporaryDirectory())
            {
                string target = Path.Join(scratchDir.Path, "countVectorizer");
                vectorizer.Save(target);

                CountVectorizer restored = CountVectorizer.Load(target);
                Assert.Equal(vectorizer.Uid(), restored.Uid());
            }

            Assert.NotEmpty(vectorizer.ExplainParams());
            Assert.NotEmpty(vectorizer.ToString());
        }