// Feeds the default pre-processing pipeline a list made up solely of bracket
// tokens and expects nothing to be left in the result.
public void PreProcessingPipelineStopWordsTest()
{
    // Arrange: round, square and curly brackets, each paired with the tag used in the fixture.
    var bracketTokens = new List<Tokenizer.WordTag>();
    bracketTokens.Add(new Tokenizer.WordTag("(", "("));
    bracketTokens.Add(new Tokenizer.WordTag(")", ")"));
    bracketTokens.Add(new Tokenizer.WordTag("[", "["));
    bracketTokens.Add(new Tokenizer.WordTag("]", "]"));
    bracketTokens.Add(new Tokenizer.WordTag("{", "nil"));
    bracketTokens.Add(new Tokenizer.WordTag("}", "nil"));

    var expected = new List<Tokenizer.WordTag>();

    // Act
    List<Tokenizer.WordTag> actual = TextPreprocessing.PreProcessingPipeline(bracketTokens);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Runs the pipeline with keepOnlyCapitalizedWords enabled on four casings of
// "hello" and expects only "Hello" and "HELLO" to be returned, in input order.
public void PreProcessingPipelineKeepCapitalWordsTest()
{
    // Arrange
    const string nounTag = "nn";
    var mixedCaseTokens = new List<Tokenizer.WordTag>
    {
        new Tokenizer.WordTag("Hello", nounTag),
        new Tokenizer.WordTag("hello", nounTag),
        new Tokenizer.WordTag("HELLO", nounTag),
        new Tokenizer.WordTag("hELLO", nounTag),
    };
    var expected = new List<Tokenizer.WordTag>
    {
        new Tokenizer.WordTag("Hello", nounTag),
        new Tokenizer.WordTag("HELLO", nounTag),
    };

    // Act
    List<Tokenizer.WordTag> actual =
        TextPreprocessing.PreProcessingPipeline(mixedCaseTokens, keepOnlyCapitalizedWords: true);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Runs the default pipeline on tokens that mix digits, letters and symbols and
// expects exactly "abc", "def", "defg", "abc" and "ABC" to come back, in that order.
public void PreProcessingPipelineDigitsTest()
{
    // Arrange: every token carries the same "cd" tag; only the word text varies.
    const string numberTag = "cd";
    string[] rawWords =
    {
        "123.03",
        "9780",
        "9780-965",
        "abc",
        "123d",
        "123de",
        "123def",
        "123456defg",
        "abc-123",
        "123++-",
        "9A78B087C",
        "9A78B087",
    };
    string[] expectedWords = { "abc", "def", "defg", "abc", "ABC" };

    var digitTokens = new List<Tokenizer.WordTag>();
    foreach (string rawWord in rawWords)
    {
        digitTokens.Add(new Tokenizer.WordTag(rawWord, numberTag));
    }

    var expected = new List<Tokenizer.WordTag>();
    foreach (string expectedWord in expectedWords)
    {
        expected.Add(new Tokenizer.WordTag(expectedWord, numberTag));
    }

    // Act
    List<Tokenizer.WordTag> actual = TextPreprocessing.PreProcessingPipeline(digitTokens);

    // Assert
    Assert.AreEqual(expected, actual);
}
// Runs the pipeline with toLowerOption enabled on five casings of "hello" and
// expects every one of them to come back as lower-case "hello".
public void PreProcessingPipelineAllWordsToLowerTest()
{
    // Arrange
    const string nounTag = "nn";
    string[] casings = { "Hello", "hello", "HeLLo", "hELLO", "HELLO" };

    var mixedCaseTokens = new List<Tokenizer.WordTag>();
    var expected = new List<Tokenizer.WordTag>();
    foreach (string casing in casings)
    {
        mixedCaseTokens.Add(new Tokenizer.WordTag(casing, nounTag));
        // One fresh lower-case token per input token.
        expected.Add(new Tokenizer.WordTag("hello", nounTag));
    }

    // Act
    List<Tokenizer.WordTag> actual =
        TextPreprocessing.PreProcessingPipeline(mixedCaseTokens, toLowerOption: true);

    // Assert
    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Program entry point. Exactly one of three modes is compiled in, selected by a
/// compilation symbol:
/// <list type="bullet">
/// <item><c>RULE_70_30</c> - trains on the "dataset\70_30\train" folder, decodes the
/// "dataset\70_30\test" folder and prints the evaluation matrix and accuracies.</item>
/// <item><c>CROSS_VALIDATION</c> - runs the same train/decode/evaluate cycle once per fold
/// and prints the per-fold results followed by the averages over all folds.</item>
/// <item><c>DEMO_APP</c> - loads previously saved model files from the "models" folder and
/// tags sentences typed on the console in an endless loop.</item>
/// </list>
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Three directory levels above the current working directory, with a trailing backslash.
    string path = Directory.GetParent(Directory.GetCurrentDirectory()).Parent.Parent.FullName + "\\";

#if (RULE_70_30)
    Console.WriteLine("You chose Rule 70% - training, 30% - testing for the data-set!");
    const string BrownfolderTrain = "dataset\\70_30\\train", BrownfolderTest = "dataset\\70_30\\test";

    #region Load Train Files & pre-process data
    var text = LoadAndReadFolderFiles(BrownfolderTrain);
    var oldWords = Tokenizer.SeparateTagFromWord(Tokenizer.TokenizePennTreebank(text));
    var words = SpeechPartClassifier.GetNewHierarchicTags(oldWords);
    // Two views of the training corpus: capitalized words only (case kept) and all words lower-cased.
    var capWords = TextPreprocessing.PreProcessingPipeline(words, toLowerOption: false, keepOnlyCapitalizedWords: true);
    var uncapWords = TextPreprocessing.PreProcessingPipeline(words, toLowerOption: true, keepOnlyCapitalizedWords: false);
    #endregion

    #region Load Test Files & pre-process data
    var textTest = LoadAndReadFolderFiles(BrownfolderTest);
    var oldWordsTest = Tokenizer.SeparateTagFromWord(Tokenizer.TokenizePennTreebank(textTest));
    var wordsTest = SpeechPartClassifier.GetNewHierarchicTags(oldWordsTest);
    wordsTest = TextPreprocessing.PreProcessingPipeline(wordsTest);
    wordsTest = TextPreprocessing.Cleaning.EliminateDuplicateSequenceOfEndOfSentenceTags(wordsTest);
    #endregion

    Console.WriteLine("Done with loading and creating tokens for train & test files!");

    #region Part of Speech Model Training
    PartOfSpeechModel tagger = new PartOfSpeechModel();
    Stopwatch sw = new Stopwatch();
    sw.Start();
    tagger.CreateHiddenMarkovModel(uncapWords, capWords, smoothingCoef: 1);
    tagger.CalculateHiddenMarkovModelProbabilitiesForTestCorpus(wordsTest, model: "trigram");
    sw.Stop();
    #endregion

    #region Debug for Emissions & Transitions matrix & write trained files
    //foreach (var model in tagger.EmissionFreq)
    //{
    //    Console.WriteLine(model.Word);
    //    foreach (var item in model.TagFreq)
    //    {
    //        Console.WriteLine(" " + item.Key + " -> " + item.Value);
    //    }
    //}
    //foreach (var item in tagger.UnigramFreq)
    //    Console.WriteLine(item.Key + " -> " + item.Value);
    //foreach (var item in tagger.BigramTransition)
    //    Console.WriteLine(item.Key + " -> " + item.Value);
    //foreach (var item in tagger.TrigramTransition)
    //    Console.WriteLine(item.Key + " -> " + item.Value);

    // The WriteToTxtFile calls below produce the JSON files that the DEMO_APP mode reads.
    //WriteToTxtFile("Models", "emissionWithCapital.json", JsonConvert.SerializeObject(tagger.CapitalEmissionFreq));
    //WriteToTxtFile("Models", "emission.json", JsonConvert.SerializeObject(tagger.EmissionFreq));
    //WriteToTxtFile("Models", "unigram.json", JsonConvert.SerializeObject(tagger.UnigramFreq));
    //WriteToTxtFile("Models", "bigram.json", JsonConvert.SerializeObject(tagger.BigramTransition));
    //WriteToTxtFile("Models", "trigram.json", JsonConvert.SerializeObject(tagger.TrigramTransition));
    //WriteToTxtFile("Models", "nonCapitalizedPrefix.json", JsonConvert.SerializeObject(tagger.PrefixEmissionProbabilities));
    //WriteToTxtFile("Models", "capitalizedPrefix.json", JsonConvert.SerializeObject(tagger.PrefixCapitalizedWordEmissionProbabilities));
    //WriteToTxtFile("Models", "nonCapitalizedSuffix.json", JsonConvert.SerializeObject(tagger.SuffixEmissionProbabilities));
    //WriteToTxtFile("Models", "capitalizedSuffix.json", JsonConvert.SerializeObject(tagger.SuffixCapitalizedWordEmissionProbabilities));
    //Console.WriteLine("Done writing models on filesystem!");
    #endregion

    Console.WriteLine("Done with training POS MODEL & calculating probabilities! Time: " + sw.ElapsedMilliseconds + " ms");
    Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");

    #region Decoding Viterbi Model
    Decoder decoder = new Decoder();
    sw.Restart();
    decoder.ViterbiDecoding(tagger, wordsTest, modelForward: "trigram", modelBackward: "trigram", mode: "backward", beam: 0);
    sw.Stop();
    #endregion

    Console.WriteLine("Done with DECODING VITERBI MODEL! Time: " + sw.ElapsedMilliseconds + " ms");

    #region Old method to guess probabilities
    //decoder.UnknownWords = new HashSet<string>();
    //decoder.PredictedTags = new List<string>();
    //foreach (var tw in wordsTest)
    //{
    //    HMMTagger.EmissionModel modelMax;
    //    modelMax = tagger.WordTagsEmissionFrequence.Find(x => x.Word == tw.word);
    //    if (modelMax != null)
    //    {
    //        string maxTag = modelMax.TagFreq.OrderByDescending(x => x.Value).FirstOrDefault().Key;
    //        // case default-tag NN ONLY
    //        //decoder.PredictedTags.Add("NN");
    //        // case maxTag
    //        decoder.PredictedTags.Add(maxTag);
    //    }
    //    else
    //    {
    //        const string deftag = "NN";
    //        decoder.PredictedTags.Add(deftag); // NULL / NN
    //        decoder.UnknownWords.Add(tw.word);
    //    }
    //}
    #endregion

    #region Debug for Emissions & Transitions
    //foreach (var item in decoder.EmissionProbabilities)
    //{
    //    Console.WriteLine(item.Word);
    //    foreach (var item2 in item.TagFreq)
    //        Console.WriteLine("\t" + item2.Key + " -> " + item2.Value);
    //}
    //foreach (var item in decoder.UnigramProbabilities)
    //    Console.WriteLine("UNI: " + item.Key + "->" + item.Value);
    //foreach (var item in decoder.BigramTransitionProbabilities)
    //    Console.WriteLine("BI: " + item.Key + " -> " + item.Value);
    //foreach (var item in decoder.TrigramTransitionProbabilities)
    //    Console.WriteLine("TRI: " + item.Key + " -> " + item.Value);
    //foreach (var item in decoder.ViterbiGraph)
    //{
    //    foreach (var item2 in item)
    //        Console.Write(item2.CurrentTag + ":" + item2.value + " ");
    //    Console.WriteLine();
    //}
    //Console.WriteLine("Predicted tags: ");
    //foreach (var item in decoder.PredictedTags)
    //    Console.Write(item + " ");
    Console.WriteLine("testwords: " + wordsTest.Count + " , predwords: " + decoder.PredictedTags.Count);
    #endregion

    Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");

    #region Evaluations & results
    Evaluation eval = new Evaluation();
    eval.CreateSupervizedEvaluationsMatrix(wordsTest, decoder.PredictedTags, decoder.UnknownWords, fbeta: 1);

    //using (System.IO.StreamWriter file = new System.IO.StreamWriter(path + "statistics\\" + "bdt.csv"))
    //{
    //    file.WriteLine("TAG,ACCURACY,PRECISION,RECALL(TPR),SPECIFICITY(TNR),F1-SCORE");
    //    var fullMatrix = eval.PrintClassificationResultsMatrix();
    //    for (int i = 0; i < eval.GetFullMatrixLineLength(); i++)
    //    {
    //        for (int j = 0; j < eval.GetFullMatrixColLength(); j++)
    //            file.Write(fullMatrix[i][j] + ",");
    //        file.WriteLine();
    //    }
    //}

    Console.WriteLine("TAG ACCURACY PRECISION RECALL(TPR) SPECIFICITY(TNR) F1-SCORE");
    var fullMatrix = eval.PrintClassificationResultsMatrix();
    for (int i = 0; i < eval.GetFullMatrixLineLength(); i++)
    {
        for (int j = 0; j < eval.GetFullMatrixColLength(); j++)
        {
            Console.Write(fullMatrix[i][j] + " ");
        }
        Console.WriteLine();
    }

    Console.WriteLine("\nAccuracy for known words: " + eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "k"));
    Console.WriteLine("Accuracy for unknown words: " + eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "u"));
    Console.WriteLine("Accuracy on both: " + eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "k+u"));
    #endregion

    Console.WriteLine("+");

    #region Count known&unknown words
    int unkwordscount = 0, knownwordscount = 0;
    foreach (var item in wordsTest)
    {
        if (decoder.UnknownWords.Contains(item.word))
        {
            unkwordscount++;
        }
        else
        {
            knownwordscount++;
        }
    }

    // The label says "(%)", so scale the fraction by 100 before printing.
    Console.WriteLine("Unknown words (count): " + unkwordscount + " | Procentage (%): " + (100f * unkwordscount / wordsTest.Count));
    Console.WriteLine("Known words (count): " + knownwordscount + " | Procentage (%): " + (100f * knownwordscount / wordsTest.Count));
    Console.WriteLine("Total words (count): " + wordsTest.Count);
    #endregion

    //using (System.IO.StreamWriter file = new System.IO.StreamWriter(path + "statistics\\" + "unknown_words.csv"))
    //{
    //    file.WriteLine("Unknown Words");
    //    foreach(var item in decoder.UnknownWords)
    //    {
    //        file.WriteLine("\"" + item + "\"");
    //    }
    //}

    #region Suffix & Prefix hitrate
    //List<string> suffixStr = new List<string>();
    //List<string> prefixStr = new List<string>();
    //List<Tuple<int, int>> suffixHR = new List<Tuple<int, int>>();
    //List<Tuple<int, int>> prefixHR = new List<Tuple<int, int>>();

    //foreach (var item in tagger.SuffixEmissionProbabilities)
    //{
    //    suffixStr.Add(item.Word);
    //    suffixHR.Add(new Tuple<int, int>(0, 0));
    //}
    //foreach (var item in tagger.PrefixEmissionProbabilities)
    //{
    //    prefixStr.Add(item.Word);
    //    prefixHR.Add(new Tuple<int, int>(0, 0));
    //}

    //for (int i = 0; i < wordsTest.Count; i++)
    //{
    //    if (!decoder.UnknownWords.Contains(wordsTest[i].word)) continue;
    //    for (int j = 0; j < suffixStr.Count; j++)
    //    {
    //        if (wordsTest[i].word.EndsWith(suffixStr[j]))
    //        {
    //            int hitr = suffixHR[j].Item1;
    //            int allr = suffixHR[j].Item2 + 1;
    //            if (wordsTest[i].tag == decoder.PredictedTags[i])
    //                suffixHR[j] = new Tuple<int, int>(hitr + 1, allr);
    //            else suffixHR[j] = new Tuple<int, int>(hitr, allr);
    //            break;
    //        }
    //    }

    //    for (int j = 0; j < prefixStr.Count; j++)
    //    {
    //        if (wordsTest[i].word.ToLower().StartsWith(prefixStr[j]))
    //        {
    //            int hitr = prefixHR[j].Item1;
    //            int allr = prefixHR[j].Item2 + 1;
    //            if (wordsTest[i].tag == decoder.PredictedTags[i])
    //                prefixHR[j] = new Tuple<int, int>(hitr + 1, allr);
    //            else prefixHR[j] = new Tuple<int, int>(hitr, allr);
    //            break;
    //        }
    //    }
    //}

    //Console.WriteLine("Prefixes: ");
    //for (int i = 0; i < prefixStr.Count; i++)
    //{
    //    Console.WriteLine(prefixStr[i] + ": (" + prefixHR[i].Item1 + ", " + prefixHR[i].Item2 + ") -> " + (float)prefixHR[i].Item1 / prefixHR[i].Item2);
    //}

    //Console.WriteLine("\nSuffixes: ");
    //for (int i = 0; i < suffixStr.Count; i++)
    //{
    //    Console.WriteLine(suffixStr[i] + ": (" + suffixHR[i].Item1 + ", " + suffixHR[i].Item2 + ") -> " + (float)suffixHR[i].Item1 / suffixHR[i].Item2);
    //}
    #endregion

    #region Save predictions tags to excel
    //using (System.IO.StreamWriter file = new System.IO.StreamWriter(path + "statistics\\" + "trigram_bidirectional.csv"))
    //{
    //    file.WriteLine("Word,Real Tag,Prediction Tag,Is in Train T/F,Predicted T/F");
    //    for (int i = 0; i < wordsTest.Count; i++)
    //    {
    //        bool isInTrain = true, predictedB = false;
    //        if (decoder.UnknownWords.Contains(wordsTest[i].word))
    //            isInTrain = false;
    //        if (wordsTest[i].tag == decoder.PredictedTags[i])
    //            predictedB = true;
    //        file.WriteLine("\"" + wordsTest[i].word + "\"," + wordsTest[i].tag + "," + decoder.PredictedTags[i] + "," + isInTrain + "," + predictedB);
    //    }
    //}
    #endregion

#elif (CROSS_VALIDATION)
    const int FOLDS = 4;
    const bool SHUFFLE = true;
    const string CVPATH = "dataset\\crossvalidation";
    Console.WriteLine("You chose Cross-Validation for the data-set! Folds: " + FOLDS + ", Shuffle-option: " + SHUFFLE);
    string BrownFolderPath = path + CVPATH;

    #region Part of Speech Tag Frequence Count
    //var tx = LoadAndReadFolderFiles("dataset\\crossvalidation");
    //var ow = Tokenizer.SeparateTagFromWord(Tokenizer.WordTokenizeCorpus(tx));
    //var nw = SpeechPartClassification.GetNewHierarchicTags(ow);
    //var res = SpeechPartClassification.SpeechPartFrequence(nw);
    //foreach (var item in res)
    //    Console.WriteLine(item.Key + ": " + item.Value);
    #endregion

    // Per-fold results; all four lists hold fractions and are averaged and scaled by 100 after the loop.
    List<float> knownacc = new List<float>(),
                unknownacc = new List<float>(),
                totalacc = new List<float>(),
                procentageunk = new List<float>();

    CrossValidation cv = new CrossValidation(filePath: BrownFolderPath, fold: FOLDS, shuffle: SHUFFLE); // with randomness
    Console.WriteLine("Done with loading dataset & splitting them into folds!\n");

    for (int foldNumber = 0; foldNumber < FOLDS; foldNumber++)
    {
        #region Load Train Files & pre-process data
        var text = cv.TrainFile[foldNumber];
        var oldWords = Tokenizer.SeparateTagFromWord(Tokenizer.TokenizePennTreebank(text));
        var words = SpeechPartClassifier.GetNewHierarchicTags(oldWords);
        var capWords = TextPreprocessing.PreProcessingPipeline(words, toLowerOption: false, keepOnlyCapitalizedWords: true);
        var uncapWords = TextPreprocessing.PreProcessingPipeline(words, toLowerOption: true, keepOnlyCapitalizedWords: false);
        #endregion

        #region Load Test Files & pre-process data
        var textTest = cv.TestFile[foldNumber];
        var oldWordsTest = Tokenizer.SeparateTagFromWord(Tokenizer.TokenizePennTreebank(textTest));
        var wordsTest = SpeechPartClassifier.GetNewHierarchicTags(oldWordsTest);
        wordsTest = TextPreprocessing.PreProcessingPipeline(wordsTest);
        wordsTest = TextPreprocessing.Cleaning.EliminateDuplicateSequenceOfEndOfSentenceTags(wordsTest);
        #endregion

        Console.WriteLine("Done with loading and creating tokens for train & test files!");

        #region Hidden Markov Model Training
        PartOfSpeechModel tagger = new PartOfSpeechModel();
        Stopwatch sw = new Stopwatch();
        sw.Start();
        tagger.CreateHiddenMarkovModel(uncapWords, capWords);
        tagger.CalculateHiddenMarkovModelProbabilitiesForTestCorpus(wordsTest, model: "trigram");
        sw.Stop();
        Console.WriteLine("Done with training POS MODEL & calculating probabilities! Time: " + sw.ElapsedMilliseconds + " ms");
        //Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        #endregion

        #region Decoding Viterbi Model
        Decoder decoder = new Decoder();
        sw.Restart();
        decoder.ViterbiDecoding(tagger, wordsTest, modelForward: "trigram", modelBackward: "trigram", mode: "f+b");
        sw.Stop();
        Console.WriteLine("Done with DECODING VITERBI MODEL! Time: " + sw.ElapsedMilliseconds + " ms");
        //Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
        #endregion

        #region Evaluations & results
        Evaluation eval = new Evaluation();
        //eval.CreateSupervizedEvaluationsMatrix(wordsTest, decoder.PredictedTags, decoder.UnknownWords, fbeta: 1);
        //Console.WriteLine("TAG\t\tACCURACY\t\tPRECISION\t\tRECALL(TPR)\t\tF1-SCORE\t\tSPECIFICITY(TNR)");
        //var fullMatrix = eval.PrintClassificationResultsMatrix();
        //for (int i = 0; i < eval.GetFullMatrixLineLength(); i++)
        //{
        //    for (int j = 0; j < eval.GetFullMatrixColLength(); j++)
        //        Console.Write(fullMatrix[i][j] + "\t\t");
        //    Console.WriteLine();
        //}

        var ka = eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "k");
        knownacc.Add(ka);
        var unkw = eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "u");
        unknownacc.Add(unkw);
        var tot = eval.GetNaiveAccuracy(wordsTest, decoder.PredictedTags, decoder.UnknownWords, evalMode: "k+u");
        totalacc.Add(tot);

        Console.WriteLine("\nAccuracy for known words: " + ka);
        Console.WriteLine("Accuracy for unknown words: " + unkw);
        Console.WriteLine("Accuracy on both: " + tot);
        #endregion

        Console.WriteLine("+");

        #region Count known&unknown words
        int unkwordscount = 0, knownwordscount = 0;
        foreach (var item in wordsTest)
        {
            if (decoder.UnknownWords.Contains(item.word))
            {
                unkwordscount++;
            }
            else
            {
                knownwordscount++;
            }
        }

        // Store the raw fraction (the summary below scales it by 100), but print it as a real
        // percentage so the per-fold output agrees with its "(%)" label and with the summary.
        var proc = (float)unkwordscount / wordsTest.Count;
        procentageunk.Add(proc);
        Console.WriteLine("Unknown words (count): " + unkwordscount + " | Procentage (%): " + (proc * 100f));
        Console.WriteLine("Known words (count): " + knownwordscount + " | Procentage (%): " + (100f * knownwordscount / wordsTest.Count));
        Console.WriteLine("Total words (count): " + wordsTest.Count);
        #endregion

        Console.WriteLine("\n\n[FOLD " + (foldNumber + 1) + "/" + FOLDS + " DONE!]\n\n");
    }

    // Average each metric over the folds and express it as a percentage rounded to 3 decimals.
    var known = (float)knownacc.Sum() / FOLDS;
    known = (float)Math.Round(known * 100, 3);
    var unk = (float)unknownacc.Sum() / FOLDS;
    unk = (float)Math.Round(unk * 100, 3);
    var total = (float)totalacc.Sum() / FOLDS;
    total = (float)Math.Round(total * 100, 3);
    var procunk = (float)procentageunk.Sum() / FOLDS;
    procunk = (float)Math.Round(procunk * 100, 3);

    Console.WriteLine("Procentage (%): " + procunk);
    Console.WriteLine("Accuracy for all unknown words: " + unk);
    Console.WriteLine("\nAccuracy for all known words: " + known);
    Console.WriteLine("Accuracy on all total: " + total);

#elif (DEMO_APP)
    #region Load & convert to model
    string modelsPath = path + "\\models\\";

    string unigram = File.ReadAllText(modelsPath + "unigram.json");
    string bigram = File.ReadAllText(modelsPath + "bigram.json");
    string trigram = File.ReadAllText(modelsPath + "trigram.json");
    string capitalizedPrefix = File.ReadAllText(modelsPath + "capitalizedPrefix.json");
    string nonCapitalizedPrefix = File.ReadAllText(modelsPath + "nonCapitalizedPrefix.json");
    string capitalizedSuffix = File.ReadAllText(modelsPath + "capitalizedSuffix.json");
    string nonCapitalizedSuffix = File.ReadAllText(modelsPath + "nonCapitalizedSuffix.json");
    string emission = File.ReadAllText(modelsPath + "emission.json");
    string emissionWithCapital = File.ReadAllText(modelsPath + "emissionWithCapital.json");

    var unigramFreq = JsonConvert.DeserializeObject<Dictionary<string, int>>(unigram);
    var bigramNonConverted = JsonConvert.DeserializeObject<Dictionary<string, int>>(bigram);
    var trigramNonConverted = JsonConvert.DeserializeObject<Dictionary<string, int>>(trigram);
    var capitalizedPrefixProb = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionProbabilisticModel>>(capitalizedPrefix);
    var nonCapitalizedPrefixProb = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionProbabilisticModel>>(nonCapitalizedPrefix);
    var capitalizedSuffixProb = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionProbabilisticModel>>(capitalizedSuffix);
    var nonCapitalizedSuffixProb = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionProbabilisticModel>>(nonCapitalizedSuffix);
    var emissionFreq = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionModel>>(emission);
    var emissionWithCapitalFreq = JsonConvert.DeserializeObject<List<PartOfSpeechModel.EmissionModel>>(emissionWithCapital);

    Dictionary<Tuple<string, string>, int> bigramFreq = new Dictionary<Tuple<string, string>, int>();
    Dictionary<Tuple<string, string, string>, int> trigramFreq = new Dictionary<Tuple<string, string, string>, int>();

    // The n-gram keys were serialized as strings; split each key on ',' and strip
    // parentheses and spaces from every part to rebuild the tuple key.
    foreach (var item in bigramNonConverted)
    {
        string[] split = item.Key.Split(',');
        var charsToRemove = new string[] { "(", ")", " " };
        foreach (var c in charsToRemove)
        {
            split[0] = split[0].Replace(c, string.Empty);
            split[1] = split[1].Replace(c, string.Empty);
        }
        bigramFreq.Add(new Tuple<string, string>(split[0], split[1]), item.Value);
    }

    foreach (var item in trigramNonConverted)
    {
        string[] split = item.Key.Split(',');
        var charsToRemove = new string[] { "(", ")", " " };
        foreach (var c in charsToRemove)
        {
            split[0] = split[0].Replace(c, string.Empty);
            split[1] = split[1].Replace(c, string.Empty);
            split[2] = split[2].Replace(c, string.Empty);
        }
        trigramFreq.Add(new Tuple<string, string, string>(split[0], split[1], split[2]), item.Value);
    }
    #endregion

    PartOfSpeechModel model = new PartOfSpeechModel(emissionFreq, emissionWithCapitalFreq, unigramFreq, bigramFreq, trigramFreq,
        nonCapitalizedSuffixProb, nonCapitalizedPrefixProb, capitalizedSuffixProb, capitalizedPrefixProb);
    NLP.Decoder decoder = new NLP.Decoder();

    string input = null;
    List<string> preprocessedInput;

    while (true)
    {
        // Keep asking until tokenization yields at least one non-empty token.
        do
        {
            if (string.IsNullOrWhiteSpace(input))
            {
                input = read();
            }
            preprocessedInput = Tokenizer.TokenizeSentenceWords(input);
            input = null;
        } while (preprocessedInput.Count == 0 || preprocessedInput[0] == string.Empty);

        preprocessedInput = TextPreprocessing.PreProcessingPipeline(preprocessedInput);
        if (preprocessedInput.Count == 0)
        {
            // The pipeline filtered out every token, so there is nothing to tag. Without this
            // guard the last-element lookup on the empty inputTest list below would throw.
            // input is already null here, so the loop above prompts again.
            continue;
        }

        model.CalculateHiddenMarkovModelProbabilitiesForTestCorpus(preprocessedInput, model: "trigram");

        List<Tokenizer.WordTag> inputTest = new List<Tokenizer.WordTag>();
        foreach (var item in preprocessedInput)
        {
            if (item == "." || item == "!" || item == "?")
            {
                inputTest.Add(new Tokenizer.WordTag(item, "."));
            }
            else
            {
                inputTest.Add(new Tokenizer.WordTag(item, ""));
            }
        }

        if (inputTest[inputTest.Count - 1].tag != ".") // safe case check
        {
            inputTest.Add(new Tokenizer.WordTag(".", "."));
        }

        decoder.ViterbiDecoding(model, inputTest, modelForward: "trigram", modelBackward: "trigram", mode: "f+b");

        // Count how often each predicted tag occurs, then convert the counts to percentages
        // (one decimal) keyed by the index returned from getIndexForConversion.
        Dictionary<string, int> histogram = new Dictionary<string, int>();
        Dictionary<int, double> freqHisto = new Dictionary<int, double>();
        foreach (var item in decoder.PredictedTags)
        {
            if (histogram.ContainsKey(item))
            {
                histogram[item] += 1;
            }
            else
            {
                histogram.Add(item, 1);
            }
        }

        int sum = histogram.Sum(x => x.Value);
        foreach (var item in histogram)
        {
            int index = getIndexForConversion(item.Key);
            double val = Math.Round(((double)item.Value / sum) * 100.0d, 1);
            freqHisto.Add(index, val);
        }

        header(freqHisto);
        Console.ResetColor();
        Console.Write("Tagged Sentence: ");
        for (int i = 0; i < decoder.PredictedTags.Count; i++)
        {
            int index = getIndexForConversion(decoder.PredictedTags[i]);
            Console.ForegroundColor = frColor[index];
            Console.BackgroundColor = bkColor[index];
            Console.Write(" " + inputTest[i].word + " ");
            emptySpace();
        }

        Console.WriteLine();
        Console.WriteLine();
        Console.Write("Enter your sentence here: ");
        input = Console.ReadLine();
        Console.Clear();
    }
#endif
}