/// <summary>
/// Reports how many tokens of the validation split do not occur in the
/// vocabulary built from the first split of the corpus.
/// </summary>
/// <param name="args">
/// Optional command-line arguments. When <c>args[0]</c> is present and non-blank it is
/// used as the corpus path; otherwise the built-in default path is used.
/// </param>
static void Main(string[] args)
{
    // Default corpus location, kept so that running without arguments behaves as before.
    string file1 = @"C:\Users\azend\Documents\GitHubVisualStudio\UWNLP\Assignment1\LanguageModels.UnitTests\TestData\brown.txt";
    if (args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0]))
    {
        file1 = args[0];
    }

    SentenceNormalizer normalizer = new SentenceNormalizer(1, "{{*}}", "{{END}}", " ", ".");
    HashSet<string> vocabulary = new HashSet<string>(StringComparer.Ordinal);

    // NOTE(review): presumably an 80/10/10 split; this method treats index 0 as the
    // data the vocabulary is built from and index 1 as the validation data.
    List<List<string>> splitCorpus = SplitCorpus(file1, 80, 10, 10);

    // Collect every distinct token seen in the first split.
    foreach (string sentence in splitCorpus[0])
    {
        string normalizedSentence = normalizer.Normalize(sentence);
        foreach (string token in normalizer.Tokenize(normalizedSentence))
        {
            vocabulary.Add(token);
        }
    }

    // Counts are whole numbers, so keep them integral and convert only for the ratio.
    long validationWords = 0;
    long unknownWords = 0;
    foreach (string sentence in splitCorpus[1])
    {
        string normalizedSentence = normalizer.Normalize(sentence);
        foreach (string token in normalizer.Tokenize(normalizedSentence))
        {
            validationWords++;
            if (!vocabulary.Contains(token))
            {
                unknownWords++;
            }
        }
    }

    // Guard the empty-validation case (previously printed NaN) and scale to a real
    // percentage so the value matches its label.
    double unknownPercentage = validationWords == 0
        ? 0.0
        : 100.0 * unknownWords / validationWords;

    Console.WriteLine("Total number of words in validation: {0}", validationWords);
    Console.WriteLine("Number of unseen words in validation is: {0}", unknownWords);
    Console.WriteLine("Percentage is: {0}%", unknownPercentage);
}
/// <summary>
/// Creates fresh instances of the members this class works with, configured from
/// <c>Settings</c>: the sentence normalizer, the n-gram counter, the vocabulary set,
/// the set of words seen only once, and the PML cache.
/// </summary>
private void Init()
{
    Normalizer = new SentenceNormalizer(
        Settings.NGramOrder,
        Settings.StartToken,
        Settings.EndToken,
        Settings.Separator,
        Settings.PossibleEnd);

    NGramCounter = new NGramCounter(Settings);

    // Both word sets compare strings with the comparer configured in Settings.
    var wordComparer = Settings.StringComparer;
    Vocabulary = new HashSet<string>(wordComparer);
    _wordsSeenOnlyOnce = new HashSet<string>(wordComparer);

    // Starts empty; entries are keyed by NGram.
    PMLCache = new Dictionary<NGram, double>();
}