// Characters the corpus is meant to keep: 0-9, a-z, A-Z, '.', '?', '!', ';' and the apostrophe.
// Equivalent "strip everything else" pattern: [^a-zA-Z0-9\.\?\!;' ]

/// <summary>
/// Trains the model on a static corpus file.
/// </summary>
/// <remarks>
/// Every phrase returned by <c>RegexLogic.GetPhrasesFromFile</c> is lower-cased and, unless the
/// lower-cased form is in <c>exceptionList</c>, stripped of the characters <c>. ? ! ; ~</c>.
/// The resulting word is passed to <c>ObserveEvent</c> together with the running chain.
/// Tokens that are empty after stripping are skipped.
/// </remarks>
/// <param name="fileName">Path of the corpus file, handed to <c>RegexLogic.GetPhrasesFromFile</c>.</param>
public void TrainModel(string fileName)
{
    Queue<string> chain = new Queue<string>();
    string[] phrases = RegexLogic.GetPhrasesFromFile(fileName);

    foreach (string phrase in phrases)
    {
        string word = phrase.ToLower();
        if (!exceptionList.Contains(word))
        {
            // Entries of exceptionList keep their punctuation (presumably abbreviations -- confirm
            // against its definition); everything else loses the terminator characters.
            word = Regex.Replace(word, "[\\.\\?\\!;~]", "");
        }

        if (word == "")
        {
            // Punctuation-only token: skip it and keep training. The test methods skip such
            // tokens the same way; stopping here would drop the remainder of the corpus.
            continue;
        }

        // A phrase that differs from its normalised word is treated as the end of a sequence.
        // NOTE(review): the comparison uses the original-case phrase, so a capitalised phrase
        // also counts as a terminator -- confirm this matches what UpdateTestState does.
        bool terminator = (phrase != word);

        ObserveEvent(chain, word);

        if (terminator)
        {
            // Assumes ObserveEvent left at least one entry in the chain; Dequeue throws on an
            // empty queue -- TODO confirm. ChainPush supplies the chain used for the next phrase.
            chain.Dequeue();
            chain = ChainPush(chain);
        }
    }

    Console.WriteLine("Trained on file " + fileName);
}
/// <summary>
/// Scores the model against the file at <c>testFilePath</c> and returns the average per-event score.
/// </summary>
/// <remarks>
/// Each phrase is reduced to a word with <c>GetWordFromPhrase</c>. Empty words only increment
/// <c>fake</c>. Every other word is scored by <c>EvaluateWord</c> against a fresh copy of the
/// <c>evidence</c> queue, after which <c>UpdateTestState</c> is called with the word and phrase.
/// </remarks>
/// <returns>
/// The sum of all word scores divided by <c>events</c>. The result is not a finite number when
/// <c>events</c> is zero. (<c>events</c> is presumably maintained by <c>UpdateTestState</c> -- confirm.)
/// </returns>
public double TestModelValuation()
{
    // Last backslash-separated segment of the path, used as the test's name in the debugger.
    string testName = testFilePath.Split('\\').Last();
    Debugger.StartTest(model, testName);

    double total = 0;
    foreach (string phrase in RegexLogic.GetPhrasesFromFile(testFilePath))
    {
        string word = GetWordFromPhrase(phrase);
        if (word != "")
        {
            // Score against a copy so the shared evidence queue itself is not handed out.
            Queue<string> evidenceCopy = new Queue<string>(evidence.ToArray());
            total += EvaluateWord(evidenceCopy, word);
            UpdateTestState(word, phrase);
        }
        else
        {
            // Nothing left of the phrase once reduced to a word: count it and move on.
            fake++;
        }
    }

    double modelScore = total / (double)events;
    Debugger.Log(String.Format("{0}: {1}", testFilePath, modelScore));
    Debugger.FinishTest(model, testName);
    Console.WriteLine();
    return modelScore;
}
/// <summary>
/// Runs the model's word prediction over the file at <c>testFilePath</c> and reports how many
/// predictions matched the actual word.
/// </summary>
/// <remarks>
/// Each phrase is reduced to a word with <c>GetWordFromPhrase</c>. Empty words only increment
/// <c>fake</c>. For every other word, <c>PredictWord</c> is called with a fresh copy of the
/// <c>evidence</c> queue; an exact match increments <c>correctPredictions</c> and is logged via
/// <c>Debugger.LogMatch</c>. <c>UpdateTestState</c> is then called with the word and phrase.
/// </remarks>
/// <returns>A tuple of (<c>correctPredictions</c>, <c>events</c>) as they stand after the run.</returns>
public Tuple<int, int> TestModelPrediction()
{
    // Last backslash-separated segment of the path, used as the test's name in the debugger.
    string testName = testFilePath.Split('\\').Last();
    Debugger.StartTest(model, testName);

    foreach (string phrase in RegexLogic.GetPhrasesFromFile(testFilePath))
    {
        string word = GetWordFromPhrase(phrase);
        if (word == "")
        {
            // Nothing left of the phrase once reduced to a word: count it and move on.
            fake++;
            continue;
        }

        // Predict from a copy so the shared evidence queue itself is not handed out.
        Queue<string> evidenceCopy = new Queue<string>(evidence.ToArray());
        string guess = PredictWord(evidenceCopy, word);
        bool matched = (guess == word);
        if (matched)
        {
            correctPredictions++;
            Debugger.LogMatch(model, testName, word);
        }

        UpdateTestState(word, phrase);
    }

    string summary = String.Format(
        "{0}:\n\tevents: {1}\n\tcorrect: {2}\n\tfake: {3}",
        testFilePath, events, correctPredictions, fake);
    Debugger.Log(summary);
    Debugger.FinishTest(model, testName);
    Console.WriteLine();

    return new Tuple<int, int>(correctPredictions, events);
}
/// <summary>
/// Prints the file name to the console, then logs how many phrases in the file still contain
/// something after the characters <c>. ? ! ; ~</c> are removed.
/// </summary>
/// <param name="file">Path of the file, handed to <c>RegexLogic.GetPhrasesFromFile</c>.</param>
private static void CountFile(string file)
{
    Console.WriteLine(file);

    // A phrase counts as a word when it is not made up solely of the stripped punctuation.
    int wordCount = RegexLogic.GetPhrasesFromFile(file)
        .Count(phrase => Regex.Replace(phrase, "[\\.\\?\\!;~]", "") != "");

    Debugger.Log(String.Format("{0}: {1} words", file, wordCount));
}