/// <summary>This method gets feature statistics from a training file found in the TaggerConfig.</summary>
/// <remarks>
/// This method gets feature statistics from a training file found in the TaggerConfig.
/// It is the start of the training process: it reads the tagged data, featurizes every
/// token, records the (history, tag) index pairs in vArray, and builds the feature set.
/// </remarks>
/// <param name="config">Configuration naming the training file and options.</param>
/// <param name="maxentTagger">The tagger whose xSize/ySize and feature tables are populated.</param>
/// <exception cref="System.IO.IOException"/>
protected internal TaggerExperiments(TaggerConfig config, MaxentTagger maxentTagger)
    : this(maxentTagger)
{
    log.Info("TaggerExperiments: adding word/tags");
    PairsHolder pairs = new PairsHolder();
    ReadDataTagged c = new ReadDataTagged(config, maxentTagger, pairs);
    // BUG FIX: vArray was initialized as an empty jagged array ("new int[][] { }"),
    // so "vArray[i][0] = ..." in the loop below threw IndexOutOfRangeException on
    // the very first iteration. Allocate one row per data token instead; rows are
    // filled inside the loop. (Matches the original Java "new int[size][2]".)
    // NOTE(review): "size" is assumed to equal c.GetSize() here, as the loop bound
    // and the "Featurized ..." log message suggest — confirm where it is set.
    vArray = new int[size][];
    InitTemplatesNew();
    log.Info("Featurizing tagged data tokens...");
    for (int i = 0; i < size; i++)
    {
        DataWordTag d = c.Get(i);
        string yS = d.GetY();
        History h = d.GetHistory();
        int indX = tHistories.Add(h);
        int indY = d.GetYInd();
        AddTemplatesNew(h, yS);
        AddRareTemplatesNew(h, yS);
        // Row i records the history index and the class index for token i.
        vArray[i] = new int[] { indX, indY };
    }
    log.Info("Featurized " + c.GetSize() + " data tokens [done].");
    c.Release();
    Ptilde();
    maxentTagger.xSize = xSize;
    maxentTagger.ySize = ySize;
    log.Info("xSize [num Phi templates] = " + xSize + "; ySize [num classes] = " + ySize);
    HashHistories();
    // If we'll look at occurring tags only, we need the histories and pairs still;
    // otherwise they have served their purpose and can be freed now.
    if (!maxentTagger.occurringTagsOnly && !maxentTagger.possibleTagsOnly)
    {
        tHistories.Release();
        pairs.Clear();
    }
    GetFeaturesNew();
}
/// <summary>
/// Reads all tagged sentences from <paramref name="reader"/>, accumulating the global
/// word/tag pairs, per-word tag counters, and the DataWordTag history items used for
/// training. Updates totalSentences/totalWords/numElements and the tagger's tagTokens map.
/// </summary>
/// <param name="reader">Source of tagged sentences; its Filename() is used for logging.</param>
/// <param name="wordTagCounts">Map from word to a counter of tags observed for it; updated in place.</param>
private void LoadFile(ITaggedFileReader reader, IDictionary<string, IntCounter<string>> wordTagCounts)
{
    log.Info("Loading tagged words from " + reader.Filename());
    List<string> words = new List<string>();
    List<string> tags = new List<string>();
    int numSentences = 0;
    int numWords = 0;
    int maxLen = int.MinValue;
    int minLen = int.MaxValue;
    foreach (IList<TaggedWord> rawSentence in reader)
    {
        // BUG FIX: the original assigned to the foreach iteration variable
        // ("sentence = newSentence;"), which does not compile in C# (CS1656) —
        // a leftover from the Java source. Copy into a local that can be replaced.
        IList<TaggedWord> sentence = rawSentence;
        if (maxentTagger.wordFunction != null)
        {
            // Apply the word-normalizing function to every token, keeping its tag.
            IList<TaggedWord> newSentence = new List<TaggedWord>(sentence.Count);
            foreach (TaggedWord word in sentence)
            {
                TaggedWord newWord = new TaggedWord(maxentTagger.wordFunction.Apply(word.Word()), word.Tag());
                newSentence.Add(newWord);
            }
            sentence = newSentence;
        }
        foreach (TaggedWord tw in sentence)
        {
            if (tw != null)
            {
                words.Add(tw.Word());
                tags.Add(tw.Tag());
                // Record that this word type occurred with this tag.
                if (!maxentTagger.tagTokens.Contains(tw.Tag()))
                {
                    maxentTagger.tagTokens[tw.Tag()] = Generics.NewHashSet<string>();
                }
                maxentTagger.tagTokens[tw.Tag()].Add(tw.Word());
            }
        }
        maxLen = (sentence.Count > maxLen ? sentence.Count : maxLen);
        minLen = (sentence.Count < minLen ? sentence.Count : minLen);
        // Append the end-of-sentence marker so the History window sees it too.
        words.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosWord);
        tags.Add(Edu.Stanford.Nlp.Tagger.Common.Tagger.EosTag);
        numElements = numElements + sentence.Count + 1;
        // Iterate over the words in the sentence, plus the EOS marker.
        for (int i = 0; i < sentence.Count + 1; i++)
        {
            History h = new History(totalWords + totalSentences, totalWords + totalSentences + sentence.Count, totalWords + totalSentences + i, pairs, maxentTagger.extractors);
            string tag = tags[i];
            string word = words[i];
            pairs.Add(new WordTag(word, tag));
            int y = maxentTagger.AddTag(tag);
            DataWordTag dat = new DataWordTag(h, y, tag);
            v.Add(dat);
            // BUG FIX: the original read "wordTagCounts[word]" and then checked for
            // null — in C# a missing key makes the IDictionary indexer throw
            // KeyNotFoundException (unlike Java's Map.get, which returns null), so
            // the first unseen word would crash. Use TryGetValue instead.
            IntCounter<string> tagCounts;
            if (!wordTagCounts.TryGetValue(word, out tagCounts))
            {
                tagCounts = new IntCounter<string>();
                wordTagCounts[word] = tagCounts;
            }
            tagCounts.IncrementCount(tag, 1);
        }
        totalSentences++;
        totalWords += sentence.Count;
        numSentences++;
        numWords += sentence.Count;
        words.Clear();
        tags.Clear();
        if ((numSentences % 100000) == 0)
        {
            log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words ... [still reading]");
        }
    }
    log.Info("Read " + numWords + " words from " + reader.Filename() + " [done].");
    log.Info("Read " + numSentences + " sentences, min " + minLen + " words, max " + maxLen + " words.");
}