Exemple #1
0
        /// <summary>
        /// Reports how many tokens of the second corpus split do not occur in the first split.
        /// </summary>
        /// <remarks>
        /// The vocabulary is built from every token of <c>splitCorpus[0]</c>; each token of
        /// <c>splitCorpus[1]</c> (reported as "validation" in the output) is then looked up in it.
        /// NOTE(review): the 80/10/10 arguments to <c>SplitCorpus</c> presumably describe a
        /// train/validation/test split in percent - confirm against <c>SplitCorpus</c>.
        /// </remarks>
        /// <param name="args">
        /// Optional command-line arguments. When present, <c>args[0]</c> is used as the corpus
        /// path; otherwise the original hard-coded path is used.
        /// </param>
        static void Main(string[] args)
        {
            // Fallback kept for backward compatibility with the original behavior.
            const string defaultCorpusPath = @"C:\Users\azend\Documents\GitHubVisualStudio\UWNLP\Assignment1\LanguageModels.UnitTests\TestData\brown.txt";
            string file1 = (args != null && args.Length > 0) ? args[0] : defaultCorpusPath;

            SentenceNormalizer normalizer = new SentenceNormalizer(1, "{{*}}", "{{END}}", " ", ".");
            HashSet<string> vocabulary = new HashSet<string>(StringComparer.Ordinal);

            List<List<string>> splitCorpus = SplitCorpus(file1, 80, 10, 10);

            // Collect every token seen in the first split.
            foreach (string sentence in splitCorpus[0])
            {
                string normalizedSentence = normalizer.Normalize(sentence);
                foreach (string token in normalizer.Tokenize(normalizedSentence))
                {
                    vocabulary.Add(token);
                }
            }

            // Integral counters: these are counts, not measurements.
            long validationWords = 0;
            long unknownWords = 0;
            foreach (string sentence in splitCorpus[1])
            {
                string normalizedSentence = normalizer.Normalize(sentence);
                foreach (string token in normalizer.Tokenize(normalizedSentence))
                {
                    validationWords++;
                    if (!vocabulary.Contains(token))
                    {
                        unknownWords++;
                    }
                }
            }

            // Guard the empty-split case (the original printed NaN) and scale to a real percentage.
            double unknownPercentage = validationWords == 0
                ? 0.0
                : 100.0 * unknownWords / validationWords;

            Console.WriteLine("Total number of words in validation: {0}", validationWords);
            Console.WriteLine("Number of unseen words in validation is: {0}", unknownWords);
            Console.WriteLine("Percentage is: {0:F2}%", unknownPercentage);
        }
Exemple #2
0
 /// <summary>
 /// Creates fresh instances of the members this class works with, all configured from
 /// <c>Settings</c>: the sentence normalizer, the n-gram counter, the vocabulary set, the
 /// set of words seen only once, and the (initially empty) PML cache.
 /// </summary>
 private void Init()
 {
     Normalizer = new SentenceNormalizer(
         Settings.NGramOrder,
         Settings.StartToken,
         Settings.EndToken,
         Settings.Separator,
         Settings.PossibleEnd);

     NGramCounter = new NGramCounter(Settings);

     // Both word sets compare strings with the comparer configured in the settings.
     var comparer = Settings.StringComparer;
     Vocabulary = new HashSet<string>(comparer);
     _wordsSeenOnlyOnce = new HashSet<string>(comparer);

     // The cache starts out empty.
     PMLCache = new Dictionary<NGram, double>();
 }