Esempio n. 1
0
        /// <summary>
        /// Reads every sentence supplied by <paramref name="reader"/> and, for each token plus one
        /// end-of-sentence marker per sentence, appends a <c>WordTag</c> to <c>pairs</c>, appends a
        /// <c>DataWordTag</c> to <c>v</c>, and increments the (word, tag) count in
        /// <paramref name="wordTagCounts"/>.
        /// </summary>
        /// <remarks>
        /// Also updates the instance counters <c>numElements</c>, <c>totalWords</c> and
        /// <c>totalSentences</c>, registers each tag through <c>maxentTagger.AddTag</c>, and records
        /// the words seen for each tag in <c>maxentTagger.tagTokens</c>. When
        /// <c>maxentTagger.wordFunction</c> is non-null it is applied to every word before the word
        /// is recorded.
        /// </remarks>
        /// <param name="reader">Source of tagged sentences; its <c>Filename()</c> is used in log output.</param>
        /// <param name="wordTagCounts">
        /// Map from word to a per-tag counter. A counter is created for a word the first time that
        /// word is seen.
        /// </param>
        private void LoadFile(ITaggedFileReader reader, IDictionary <string, IntCounter <string> > wordTagCounts)
        {
            log.Info("Loading tagged words from " + reader.Filename());
            List <string> words        = new List <string>();
            List <string> tags         = new List <string>();
            int           numSentences = 0;
            int           numWords     = 0;
            int           maxLen       = int.MinValue;
            int           minLen       = int.MaxValue;

            foreach (IList <TaggedWord> rawSentence in reader)
            {
                // A foreach iteration variable is read-only in C#, so the (possibly transformed)
                // sentence is held in a separate local rather than assigned back to the loop variable.
                IList <TaggedWord> sentence = rawSentence;
                if (maxentTagger.wordFunction != null)
                {
                    // NOTE(review): a null token would throw here on word.Word(), even though the
                    // loop below guards against null tokens - confirm whether nulls can occur.
                    IList <TaggedWord> newSentence = new List <TaggedWord>(sentence.Count);
                    foreach (TaggedWord word in sentence)
                    {
                        TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.Apply(word.Word()), word.Tag());
                        newSentence.Add(newWord);
                    }
                    sentence = newSentence;
                }

                foreach (TaggedWord tw in sentence)
                {
                    if (tw != null)
                    {
                        string tokenWord = tw.Word();
                        string tokenTag  = tw.Tag();
                        words.Add(tokenWord);
                        tags.Add(tokenTag);
                        // NOTE(review): the type of tagTokens is declared outside this method; if it is
                        // an IDictionary, Contains(key) should be ContainsKey/TryGetValue - confirm.
                        if (!maxentTagger.tagTokens.Contains(tokenTag))
                        {
                            maxentTagger.tagTokens[tokenTag] = Generics.NewHashSet <string>();
                        }
                        maxentTagger.tagTokens[tokenTag].Add(tokenWord);
                    }
                }

                // Track the shortest and longest sentence lengths for the log summary.
                if (sentence.Count > maxLen)
                {
                    maxLen = sentence.Count;
                }
                if (sentence.Count < minLen)
                {
                    minLen = sentence.Count;
                }

                // Every sentence is terminated by an explicit end-of-sentence word/tag pair,
                // hence the "+ 1" in the element count and in the loop bound below.
                words.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
                tags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
                numElements = numElements + sentence.Count + 1;

                // Iterate over the words in the sentence, including the end-of-sentence marker.
                // NOTE(review): null tokens were skipped when filling words/tags but are still counted
                // by sentence.Count, so a sentence containing a null token would index past the end
                // of these lists - confirm the intended handling.
                for (int i = 0; i < sentence.Count + 1; i++)
                {
                    History h    = new History(totalWords + totalSentences, totalWords + totalSentences + sentence.Count, totalWords + totalSentences + i, pairs, maxentTagger.extractors);
                    string  tag  = tags[i];
                    string  word = words[i];
                    pairs.Add(new WordTag(word, tag));
                    int         y   = maxentTagger.AddTag(tag);
                    DataWordTag dat = new DataWordTag(h, y, tag);
                    v.Add(dat);

                    // The IDictionary indexer throws KeyNotFoundException for an unseen key, so use
                    // TryGetValue to create the counter the first time a word is seen.
                    IntCounter <string> tagCounts;
                    if (!wordTagCounts.TryGetValue(word, out tagCounts) || tagCounts == null)
                    {
                        tagCounts           = new IntCounter <string>();
                        wordTagCounts[word] = tagCounts;
                    }
                    tagCounts.IncrementCount(tag, 1);
                }

                totalSentences++;
                totalWords += sentence.Count;
                numSentences++;
                numWords += sentence.Count;

                // The per-sentence buffers are reused for the next sentence.
                words.Clear();
                tags.Clear();

                if ((numSentences % 100000) == 0)
                {
                    log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
                }
            }
            log.Info("Read " + numWords + " words from " + reader.Filename() + " [done].");
            log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
        }