// Generates the ordered list of sentences to be printed into the .txt file as
// the predicted book.
//
// Greedy construction: seed with a random sentence, then repeatedly ask the
// model to score candidate next sentences (via GenerateNextSentences) and
// append the highest-scoring candidate until the pool is exhausted.
//
// allSentences        - pool of candidate sentences. A defensive copy is taken
//                       so the caller's list is NOT emptied (the previous
//                       implementation destructively consumed it, which broke
//                       repeated generation from the same BookCollection).
// model               - trained classifier used to score candidates.
// numAdjacentExamples - number of candidate next sentences produced per step.
// Returns every input sentence, in predicted book order; empty in -> empty out.
public List<SentenceExample> GeneratePredictedBook(List<SentenceExample> allSentences, DecisionTree model, int numAdjacentExamples)
{
    var predictedBook = new List<SentenceExample>();
    if (allSentences == null || allSentences.Count == 0)
    {
        // Previously an empty pool crashed on allSentences[rand.Next(0, 0)].
        return predictedBook;
    }

    // Work on a copy so repeated calls still see the full sentence pool.
    var remaining = new List<SentenceExample>(allSentences);
    var rand = new Random();

    // Seed the book with a random sentence.
    var currentSentence = remaining[rand.Next(0, remaining.Count)];
    remaining.Remove(currentSentence);
    predictedBook.Add(currentSentence);

    while (remaining.Count != 0)
    {
        var nextSentences = GenerateNextSentences(remaining, currentSentence, numAdjacentExamples, rand);
        var labels = model.Predict(ClassificationUtil.ConstructMatrixX(bagOfWords, nextSentences));

        // Argmax over predicted labels: the best-scoring candidate becomes the
        // next sentence of the book.
        int maxValue = labels.Max();
        int maxIndex = labels.ToList().IndexOf(maxValue);
        var bestNextSentence = nextSentences[maxIndex];

        remaining.Remove(bestNextSentence);
        predictedBook.Add(bestNextSentence);
        currentSentence = bestNextSentence;
    }

    return predictedBook;
}
// Builds the supervised training set for a group of books and stores the
// resulting BookCollection for later use by GenerateBook.
//
// Pipeline: pool all sentences from the requested books, shuffle, fit k-means
// on half the data to stamp a cluster id onto every sentence, then collect
// user-labeled training examples for a random subset via
// DisplayAdjacentSentences.
//
// ids                   - keys (in _books) of the books to train on; >= 2.
// numExamplesToClassify - max number of sentences presented for labeling.
// numAdjacentExamples   - candidates displayed per presented sentence.
// numClusters           - number of k-means clusters. Previously a hard-coded
//                         magic number (4); now a hyperparameter whose default
//                         preserves the old behavior.
// Throws ArgumentException when fewer than 2 ids are given.
public void TrainModel(List<string> ids, int numExamplesToClassify, int numAdjacentExamples, int numClusters = 4)
{
    // At least 2 ids to train on.
    if (ids.Count < 2)
    {
        throw new ArgumentException("At least 2 ids must be specified.", nameof(ids));
    }

    // HashSet gives O(1) membership checks instead of O(ids.Count) per book.
    var idSet = new HashSet<string>(ids);
    var trainingBooks = _books.Where(book => idSet.Contains(book.Key)).ToList();

    // Pool every sentence from the selected books.
    var allSentences = new List<SentenceExample>();
    foreach (var book in trainingBooks)
    {
        allSentences.AddRange(book.Value.sentences);
    }

    var rand = new Random();
    RandomUtil.Shuffle(allSentences, rand);

    // Pseudo-label each sentence with its cluster id: fit on half the
    // (shuffled) data, then predict on all of it.
    var clusteringModel = new KMeans(numClusters);
    clusteringModel.Fit(ClassificationUtil.ConstructMatrixX(_bagOfWords, allSentences.Take(allSentences.Count / 2).ToList()));
    var clusterList = clusteringModel.Predict(ClassificationUtil.ConstructMatrixX(_bagOfWords, allSentences));
    for (int i = 0; i < clusterList.Count; i++)
    {
        allSentences[i].classification = clusterList[i];
    }

    // Random assortment of sentences to train on (the list is already
    // shuffled, so a prefix is a random subset).
    var sentencesToClassify = allSentences.Take(numExamplesToClassify).ToList();

    // Get training data using sentences from user input.
    var trainingData = DisplayAdjacentSentences(sentencesToClassify, numAdjacentExamples, rand);

    var bookCol = new BookCollection(ids);
    bookCol.sentences = allSentences;
    bookCol.trainingExamples = trainingData.Item1;
    bookCol.trainingLabels = trainingData.Item2;
    bookCol.bagOfWords = _bagOfWords;
    _bookCollections.Add(bookCol.id, bookCol);
}
// Trains a decision tree on the stored training examples for the given book
// collection, generates the predicted sentence ordering, and writes it as a
// .txt file two directories above the working directory.
//
// id                  - key of a BookCollection previously created by TrainModel.
// maxDepth            - maximum depth of the decision tree.
// numTrees            - currently unused; kept for interface compatibility
//                       (it parameterized the previously-tried RandomForest).
// numAdjacentExamples - candidates considered per step during generation.
// Returns the path of the written file.
public string GenerateBook(string id, int maxDepth, int numTrees, int numAdjacentExamples)
{
    var bookCol = _bookCollections[id];

    // Train model. (RandomForest(maxDepth, numTrees) was tried previously.)
    var model = new DecisionTree(maxDepth, new DecisionStumpInfoGain());
    var XTrain = ClassificationUtil.ConstructMatrixX(_bagOfWords, bookCol.trainingExamples);
    model.Fit(XTrain, bookCol.trainingLabels);

    // Copy so GeneratePredictedBook cannot destructively consume the stored
    // sentence list (it removes sentences from the list it is given as it
    // places them into the book).
    var allSentences = new List<SentenceExample>(bookCol.sentences);

    var bookGenerator = new BookGenerator(_bagOfWords);
    string[] stringOutput = bookGenerator.GeneratePredictedBook(allSentences, model, numAdjacentExamples)
        .Select(x => x.sentence)
        .ToArray();

    // Path.Combine instead of hard-coded Windows separators (@"..\..\"), so
    // the path is valid on every platform.
    var outputLocation = System.IO.Path.Combine("..", "..", bookCol.id + ".txt");
    System.IO.File.WriteAllLines(outputLocation, stringOutput);
    return outputLocation;
}