public ProcessedDataset ProcessDataset(List<string> dataset, int ngramSize = default)
        {
            // Builds a ProcessedDataset from the raw text datapoints in `dataset`.
            // `ngramSize` is forwarded unchanged to TFIDFController; when omitted it is
            // default(int), and how that value is interpreted is decided by the callee.

            // Step 1: n-gram vocabulary, IDF vector and TF-IDF matrix for the dataset.
            TFIDFController.CalculateDatasetTFIDFValues(
                dataset,
                out string[] ngramVocabulary,
                out float[] idfVector,
                out float[][] tfidfMatrix,
                ngramSize: ngramSize);

            // Step 2: absolute (scalar) value of each TF-IDF row vector.
            var tfidfRowMagnitudes = ScalarValueCalculator.CalculateVectorAbsoluteValueBatch(tfidfMatrix);

            // Step 3: largest word count found among the datapoints.
            var longestDatapointWordCount = StringTokenizer.FindMaxWordCount(dataset);

            // Step 4: bundle everything into the result object.
            var processed = new ProcessedDataset
            {
                TFIDFMatrixAbsoluteValues = tfidfRowMagnitudes,
                TFIDFMatrix = tfidfMatrix,
                IDFVector = idfVector,
                UniqueNGramsVector = ngramVocabulary,
                MaximumWordCount = longestDatapointWordCount,
            };

            return processed;
        }
Esempio n. 2
0
        // Verifies that StringTokenizer.FindMaxWordCount returns `expected` for `testset`.
        // NOTE(review): the (testset, expected) pairs presumably come from a test data
        // source declared outside this snippet; confirm against the test class.
        public void MaxWordCountTest(List<string> testset, int expected)
            => Assert.Equal(expected, StringTokenizer.FindMaxWordCount(testset));