/// <summary>
 /// Gets feature statistics from the training data described by the TaggerConfig.
 /// </summary>
 /// <remarks>
 /// This is the start of the training process. The constructor:
 /// <list type="number">
 /// <item><description>reads the tagged data through a <c>ReadDataTagged</c> built from
 /// <paramref name="config"/>, <paramref name="maxentTagger"/> and a fresh <c>PairsHolder</c>;</description></item>
 /// <item><description>for every data token, registers its history in <c>tHistories</c>, adds the
 /// regular and rare templates, and stores the (history index, tag index) pair in
 /// <c>vArray</c>;</description></item>
 /// <item><description>calls <c>Ptilde()</c>, copies <c>xSize</c>/<c>ySize</c> onto the tagger,
 /// calls <c>HashHistories()</c> and finally <c>GetFeaturesNew()</c>.</description></item>
 /// </list>
 /// </remarks>
 /// <param name="config">Configuration handed to <c>ReadDataTagged</c>.</param>
 /// <param name="maxentTagger">Tagger being trained; its <c>xSize</c> and <c>ySize</c> are overwritten here.</param>
 /// <exception cref="System.IO.IOException"/>
 protected internal TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger)
     : this(maxentTagger)
 {
     log.Info("TaggerExperiments: adding word/tags");
     PairsHolder    pairs = new PairsHolder();
     ReadDataTagged c     = new ReadDataTagged(config, maxentTagger, pairs);

     // One row per data token, two columns per row: [0] = history index, [1] = tag index.
     // The rows must be allocated up front; an empty jagged array would make the
     // assignments in the loop below throw IndexOutOfRangeException.
     int size = c.GetSize();
     vArray = new int[size][];
     for (int row = 0; row < size; row++)
     {
         vArray[row] = new int[2];
     }

     InitTemplatesNew();
     log.Info("Featurizing tagged data tokens...");
     for (int i = 0; i < size; i++)
     {
         DataWordTag d    = c.Get(i);
         string      yS   = d.GetY();
         History     h    = d.GetHistory();
         int         indX = tHistories.Add(h);
         int         indY = d.GetYInd();
         AddTemplatesNew(h, yS);
         AddRareTemplatesNew(h, yS);
         vArray[i][0] = indX;
         vArray[i][1] = indY;
     }
     log.Info("Featurized " + c.GetSize() + " data tokens [done].");
     c.Release();
     Ptilde();
     maxentTagger.xSize = xSize;
     maxentTagger.ySize = ySize;
     log.Info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);
     HashHistories();
     // if we'll look at occurring tags only, we need the histories and pairs still
     if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly)
     {
         tHistories.Release();
         pairs.Clear();
     }
     GetFeaturesNew();
 }
// Example #2
// 0
        /// <summary>
        /// Reads every tagged sentence from <paramref name="reader"/> and records its tokens
        /// as training data.
        /// </summary>
        /// <remarks>
        /// For each sentence the method:
        /// <list type="bullet">
        /// <item><description>rewrites every word through <c>maxentTagger.wordFunction</c> when one is set;</description></item>
        /// <item><description>records each word under its tag in <c>maxentTagger.tagTokens</c>;</description></item>
        /// <item><description>appends the end-of-sentence word/tag and then, for every position
        /// (including the end-of-sentence marker), adds a <c>WordTag</c> to <c>pairs</c>, a
        /// <c>DataWordTag</c> to <c>v</c>, and bumps the word's tag count in
        /// <paramref name="wordTagCounts"/>;</description></item>
        /// <item><description>updates <c>numElements</c>, <c>totalSentences</c> and <c>totalWords</c>.</description></item>
        /// </list>
        /// Progress is logged every 100000 sentences and a summary is logged at the end.
        /// </remarks>
        /// <param name="reader">Source of tagged sentences; its <c>Filename()</c> is used in log messages.</param>
        /// <param name="wordTagCounts">
        /// Per-word tag counters. A counter is created and stored the first time a word is seen.
        /// </param>
        private void LoadFile(ITaggedFileReader reader, IDictionary <string, IntCounter <string> > wordTagCounts)
        {
            log.Info("Loading tagged words from " + reader.Filename());
            List <string> words        = new List <string>();
            List <string> tags         = new List <string>();
            int           numSentences = 0;
            int           numWords     = 0;
            int           maxLen       = int.MinValue;
            int           minLen       = int.MaxValue;

            foreach (IList <TaggedWord> rawSentence in reader)
            {
                // A foreach iteration variable is read-only in C#, so the (possibly
                // rewritten) sentence lives in its own local.
                IList <TaggedWord> sentence = rawSentence;
                if (maxentTagger.wordFunction != null)
                {
                    IList <TaggedWord> newSentence = new List <TaggedWord>(rawSentence.Count);
                    foreach (TaggedWord word in rawSentence)
                    {
                        TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.Apply(word.Word()), word.Tag());
                        newSentence.Add(newWord);
                    }
                    sentence = newSentence;
                }
                // NOTE(review): null tokens are skipped here but still counted in
                // sentence.Count, so a null token would misalign words/tags with the
                // index loop below - confirm the reader never yields null tokens.
                foreach (TaggedWord tw in sentence)
                {
                    if (tw != null)
                    {
                        words.Add(tw.Word());
                        tags.Add(tw.Tag());
                        // NOTE(review): tagTokens is declared outside this block; if it is a
                        // generic IDictionary this membership test should be ContainsKey - confirm.
                        if (!maxentTagger.tagTokens.Contains(tw.Tag()))
                        {
                            maxentTagger.tagTokens[tw.Tag()] = Generics.NewHashSet <string>();
                        }
                        maxentTagger.tagTokens[tw.Tag()].Add(tw.Word());
                    }
                }
                maxLen = (sentence.Count > maxLen ? sentence.Count : maxLen);
                minLen = (sentence.Count < minLen ? sentence.Count : minLen);
                words.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
                tags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
                numElements = numElements + sentence.Count + 1;

                // Offset of this sentence's first token; each earlier sentence contributed
                // its words plus one end-of-sentence marker.
                int sentenceStart = totalWords + totalSentences;
                int sentenceEnd   = sentenceStart + sentence.Count;

                // iterate over the words in the sentence (plus the end-of-sentence marker)
                for (int i = 0; i < sentence.Count + 1; i++)
                {
                    History h    = new History(sentenceStart, sentenceEnd, sentenceStart + i, pairs, maxentTagger.extractors);
                    string  tag  = tags[i];
                    string  word = words[i];
                    pairs.Add(new WordTag(word, tag));
                    int         y   = maxentTagger.AddTag(tag);
                    DataWordTag dat = new DataWordTag(h, y, tag);
                    v.Add(dat);

                    // The IDictionary indexer throws KeyNotFoundException for an unseen
                    // key, so look the counter up with TryGetValue and create it on demand.
                    IntCounter <string> tagCounts;
                    if (!wordTagCounts.TryGetValue(word, out tagCounts) || tagCounts == null)
                    {
                        tagCounts           = new IntCounter <string>();
                        wordTagCounts[word] = tagCounts;
                    }
                    tagCounts.IncrementCount(tag, 1);
                }
                totalSentences++;
                totalWords += sentence.Count;
                numSentences++;
                numWords += sentence.Count;
                words.Clear();
                tags.Clear();
                if ((numSentences % 100000) == 0)
                {
                    log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
                }
            }
            log.Info("Read " + numWords + " words from " + reader.Filename() + " [done].");
            log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
        }