/// <summary>
/// Runs the TF-IDF preprocessing pipeline over <paramref name="dataset"/> and
/// bundles the results into a <see cref="ProcessedDataset"/>.
/// </summary>
/// <param name="dataset">The datapoints (strings) to process.</param>
/// <param name="ngramSize">
/// N-gram size forwarded unchanged to
/// <c>TFIDFController.CalculateDatasetTFIDFValues</c>. Defaults to
/// <c>default(int)</c>, i.e. 0; how that value is interpreted is decided by the
/// controller (not visible here).
/// </param>
/// <returns>
/// A <see cref="ProcessedDataset"/> holding the unique n-gram vector, the IDF
/// vector, the TF-IDF matrix, the per-row scalar values derived from that
/// matrix, and the maximum word count found in the dataset.
/// </returns>
public ProcessedDataset ProcessDataset(List<string> dataset, int ngramSize = default)
{
    // Step 1: let the controller compute the n-gram vocabulary, IDF vector and TF-IDF matrix.
    TFIDFController.CalculateDatasetTFIDFValues(
        dataset,
        out string[] ngrams,
        out float[] idfVector,
        out float[][] tfidfMatrix,
        ngramSize: ngramSize);

    // Step 2: reduce the TF-IDF matrix to scalar values in one batch call
    // (presumably one magnitude per row -- confirm against ScalarValueCalculator).
    var tfidfAbsoluteValues = ScalarValueCalculator.CalculateVectorAbsoluteValueBatch(tfidfMatrix);

    // Step 3: largest word count of any single datapoint in the dataset.
    var longestWordCount = StringTokenizer.FindMaxWordCount(dataset);

    // Step 4: package everything for the caller.
    var result = new ProcessedDataset
    {
        TFIDFMatrixAbsoluteValues = tfidfAbsoluteValues,
        TFIDFMatrix = tfidfMatrix,
        IDFVector = idfVector,
        UniqueNGramsVector = ngrams,
        MaximumWordCount = longestWordCount,
    };

    return result;
}
/// <summary>
/// Checks that <c>StringTokenizer.FindMaxWordCount</c> returns
/// <paramref name="expected"/> for the supplied <paramref name="testset"/>.
/// </summary>
/// <param name="testset">Input strings handed to the tokenizer.</param>
/// <param name="expected">The maximum word count the tokenizer should report.</param>
public void MaxWordCountTest(List<string> testset, int expected)
{
    // Act
    int actual = StringTokenizer.FindMaxWordCount(testset);

    // Assert
    Assert.Equal(expected, actual);
}