public void TestTFIDF()
{
    // Small four-document corpus (the classic naive-bayes "Chinese/Beijing"
    // example) split into two classes: "china" (3 docs) and "japan" (1 doc).
    var stringTableBuilder = new StringTableBuilder();
    var bag = new ClassificationBag {
        Classification = new[] {
            Tuple.Create(new[] { "Chinese", "Beijing", "Chinese" }, true),
            Tuple.Create(new[] { "Chinese", "Chinese", "Shanghai" }, true),
            Tuple.Create(new[] { "Chinese", "Macao" }, true),
            Tuple.Create(new[] { "Tokyo", "Japan", "Chinese" }, false),
        }.Select(d => new IndexedClassification {
            Name = d.Item2 ? "china" : "japan",
            // Map each token to a stable integer index via the string table.
            Data = d.Item1.Select(s => stringTableBuilder.GetIndex(s)).ToArray()
        }).ToArray()
    };
    // MSTest's Assert.AreEqual signature is (expected, actual); keep that
    // order so failure messages report the values the right way round.
    Assert.AreEqual(4, bag.Classification.Length);
    Assert.AreEqual(3, bag.Classification[0].Data.Length);

    // Converting to sparse vectors collapses the bag to one entry per class
    // label (2 classes), each over the 4-token vocabulary — confirmed by the
    // asserts below.
    var set = bag.ConvertToSparseVectors(true);
    Assert.AreEqual(2, set.Classification.Length);
    Assert.AreEqual(4, set.Classification[0].Data.Length);

    // TF-IDF weighting should rescale values but preserve the shape of the set.
    var tfidf = set.TFIDF();
    Assert.AreEqual(2, tfidf.Classification.Length);
    Assert.AreEqual(4, tfidf.Classification[0].Data.Length);
}
/// <summary>
/// Multinomial naive bayes preserves the count of each feature within the model. Useful for long documents.
/// </summary>
/// <param name="data">The training data</param>
/// <returns>A model that can be used for classification</returns>
public static MultinomialNaiveBayes TrainMultinomialNaiveBayes(this ClassificationBag data)
{
    // Feed every labelled example into the trainer, then build the model.
    var bayesTrainer = new MultinomialNaiveBayesTrainer();
    foreach (var item in data.Classification)
        bayesTrainer.AddClassification(item.Name, item.Data);
    return bayesTrainer.Train();
}
///// <summary> ///// Random projections allow you to reduce the dimensions of a matrix while still preserving significant information ///// </summary> ///// <param name="lap">Linear algebra provider</param> ///// <param name="inputSize">The vector size to reduce from</param> ///// <returns></returns> //public static IRandomProjection CreateRandomProjection(this ILinearAlgebraProvider lap, int inputSize) //{ // var reducedSize = RandomProjection.MinDim(inputSize); // return CreateRandomProjection(lap, inputSize, reducedSize); //} ///// <summary> ///// Markov models summarise sequential data (over a window of size 2) ///// </summary> ///// <typeparam name="T">The data type within the model</typeparam> ///// <param name="data">An enumerable of sequences of type T</param> ///// <returns>A sequence of markov model observations</returns> //public static MarkovModel2<T> TrainMarkovModel2<T>(this IEnumerable<IEnumerable<T>> data) //{ // var trainer = new MarkovModelTrainer2<T>(); // foreach (var sequence in data) // trainer.Add(sequence); // return trainer.Build(); //} ///// <summary> ///// Markov models summarise sequential data (over a window of size 3) ///// </summary> ///// <typeparam name="T">The data type within the model</typeparam> ///// <param name="data">An enumerable of sequences of type T</param> ///// <returns>A sequence of markov model observations</returns> //public static MarkovModel3<T> TrainMarkovModel3<T>(this IEnumerable<IEnumerable<T>> data) //{ // var trainer = new MarkovModelTrainer3<T>(); // foreach (var sequence in data) // trainer.Add(sequence); // return trainer.Build(); //} /// <summary> /// Bernoulli naive bayes treats each feature as either 1 or 0 - all feature counts are discarded. Useful for short documents. 
/// </summary>
/// <param name="data">The training data</param>
/// <returns>A model that can be used for classification</returns>
public static BernoulliNaiveBayes TrainBernoulliNaiveBayes(this ClassificationBag data)
{
    // Register each labelled example with the trainer before building the model.
    var bayesTrainer = new BernoulliNaiveBayesTrainer();
    foreach (var item in data.Classification)
        bayesTrainer.AddClassification(item.Name, item.Data);
    return bayesTrainer.Train();
}