/// <summary>
/// Reads every tagged sentence supplied by <paramref name="reader"/> and appends it to the
/// training state held by this object and by <c>maxentTagger</c>.
/// </summary>
/// <remarks>
/// For each sentence the method:
/// optionally rewrites every word through <c>maxentTagger.wordFunction</c> (when it is non-null);
/// records each word under its tag in <c>maxentTagger.tagTokens</c>;
/// appends the end-of-sentence word/tag pair;
/// then, for every position including the end-of-sentence marker, adds a <c>WordTag</c> to
/// <c>pairs</c>, a <c>DataWordTag</c> to <c>v</c>, and increments the word's tag count in
/// <paramref name="wordTagCounts"/>.
/// Progress is logged every 100000 sentences and a summary is logged at the end.
/// </remarks>
/// <param name="reader">Source of tagged sentences; its <c>Filename()</c> is used in log messages.</param>
/// <param name="wordTagCounts">
/// Per-word tag counters, updated in place. A counter is created the first time a word is seen.
/// </param>
private void LoadFile(ITaggedFileReader reader, IDictionary<string, IntCounter<string>> wordTagCounts) {
    log.Info("Loading tagged words from " + reader.Filename());

    // Scratch buffers holding the current sentence's words and tags (plus the EOS marker).
    List<string> words = new List<string>();
    List<string> tags = new List<string>();

    int numSentences = 0;
    int numWords = 0;
    int maxLen = int.MinValue;
    int minLen = int.MaxValue;

    foreach (IList<TaggedWord> rawSentence in reader) {
        // A foreach iteration variable cannot be reassigned in C#, so the (possibly
        // rewritten) sentence lives in its own local.
        IList<TaggedWord> sentence = rawSentence;
        if (maxentTagger.wordFunction != null) {
            IList<TaggedWord> newSentence = new List<TaggedWord>(rawSentence.Count);
            foreach (TaggedWord original in rawSentence) {
                TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.Apply(original.Word()), original.Tag());
                newSentence.Add(newWord);
            }
            sentence = newSentence;
        }

        // NOTE(review): a null token is skipped here but is still counted by sentence.Count
        // below, so the indexed loop would run past the end of words/tags. Confirm that
        // readers never yield null tokens.
        foreach (TaggedWord tw in sentence) {
            if (tw != null) {
                words.Add(tw.Word());
                tags.Add(tw.Tag());
                // NOTE(review): assumes tagTokens is a generic dictionary keyed by tag, where
                // ContainsKey is the key-membership test - confirm against the field declaration.
                if (!maxentTagger.tagTokens.ContainsKey(tw.Tag())) {
                    maxentTagger.tagTokens[tw.Tag()] = Generics.NewHashSet<string>();
                }
                maxentTagger.tagTokens[tw.Tag()].Add(tw.Word());
            }
        }

        if (sentence.Count > maxLen) {
            maxLen = sentence.Count;
        }
        if (sentence.Count < minLen) {
            minLen = sentence.Count;
        }

        // Terminate the sentence with the end-of-sentence marker; it is counted as one extra element.
        words.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
        tags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
        numElements = numElements + sentence.Count + 1;

        // Offset of this sentence's first element: every earlier sentence contributed its
        // words plus one EOS marker.
        int sentenceStart = totalWords + totalSentences;

        // iterate over the words in the sentence (the "+ 1" covers the EOS marker)
        for (int i = 0; i < sentence.Count + 1; i++) {
            History h = new History(sentenceStart, sentenceStart + sentence.Count, sentenceStart + i, pairs, maxentTagger.extractors);
            string tag = tags[i];
            string word = words[i];
            pairs.Add(new WordTag(word, tag));
            int y = maxentTagger.AddTag(tag);
            DataWordTag dat = new DataWordTag(h, y, tag);
            v.Add(dat);

            // The dictionary indexer throws KeyNotFoundException for an unseen key, so look
            // the counter up with TryGetValue and create it on first use.
            IntCounter<string> tagCounts;
            if (!wordTagCounts.TryGetValue(word, out tagCounts) || tagCounts == null) {
                tagCounts = new IntCounter<string>();
                wordTagCounts[word] = tagCounts;
            }
            tagCounts.IncrementCount(tag, 1);
        }

        totalSentences++;
        totalWords += sentence.Count;
        numSentences++;
        numWords += sentence.Count;
        words.Clear();
        tags.Clear();

        if ((numSentences % 100000) == 0) {
            log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
        }
    }

    log.Info("Read " + numWords + " words from " + reader.Filename() + " [done].");
    log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
}