/// <summary>
/// Prunes rare features from the training data. Collects whichever
/// per-feature-kind thresholds are active into featureThresholds, then
/// applies the universal threshold and the collected ones to the dataset,
/// reporting how many feature types were removed.
/// </summary>
private void ApplyThresholds(WeightedDataset data)
{
    // Gather the thresholds that apply to this configuration. A threshold
    // participates only when its count (second) is positive and, for the
    // optional feature kinds, when the extractor actually produces them.
    if (wordThreshold.second > 0)
    {
        featureThresholds.Add(wordThreshold);
    }
    if (featExtractor.chars && charThreshold.second > 0)
    {
        featureThresholds.Add(charThreshold);
    }
    if (featExtractor.bigrams && bigramThreshold.second > 0)
    {
        featureThresholds.Add(bigramThreshold);
    }
    if ((featExtractor.conjunctions || featExtractor.mildConjunctions) && conjThreshold.second > 0)
    {
        featureThresholds.Add(conjThreshold);
    }
    int featureTypesBefore = data.NumFeatureTypes();
    // Blanket threshold first, then the kind-specific thresholds.
    if (universalThreshold > 0)
    {
        data.ApplyFeatureCountThreshold(universalThreshold);
    }
    if (featureThresholds.Count > 0)
    {
        data.ApplyFeatureCountThreshold(featureThresholds);
    }
    int removedCount = featureTypesBefore - data.NumFeatureTypes();
    if (removedCount > 0)
    {
        Verbose("Thresholding removed " + removedCount + " features.");
    }
}
/// <summary>
/// Converts the accumulated word/tag counts into a weighted dataset,
/// builds a Laplace-smoothed tag distribution, thresholds rare features,
/// and trains the linear classifier stored in <c>scorer</c>.
/// Releases <c>datumCounter</c> once the dataset has been built.
/// </summary>
public virtual void FinishTraining()
{
    IntCounter<string> tagCounts = new IntCounter<string>();
    WeightedDataset trainingSet = new WeightedDataset(datumCounter.Size());
    foreach (TaggedWord taggedWord in datumCounter.KeySet())
    {
        int weight = datumCounter.GetIntCount(taggedWord);
        // Optionally restrict training to words seen at most trainCountThreshold times.
        if (trainOnLowCount && weight > trainCountThreshold)
        {
            continue;
        }
        // Words registered as function words are excluded from training.
        if (functionWordTags.Contains(taggedWord.Word()))
        {
            continue;
        }
        tagCounts.IncrementCount(taggedWord.Tag());
        if (trainByType)
        {
            // Count each word type once instead of weighting by token frequency.
            weight = 1;
        }
        trainingSet.Add(new BasicDatum(featExtractor.MakeFeatures(taggedWord.Word()), taggedWord.Tag()), weight);
    }
    datumCounter = null; // raw counts are no longer needed; allow collection
    tagDist = Distribution.LaplaceSmoothedDistribution(tagCounts, tagCounts.Size(), 0.5);
    tagCounts = null;
    ApplyThresholds(trainingSet);
    Verbose("Making classifier...");
    QNMinimizer minimizer = new QNMinimizer();
    //new ResultStoringMonitor(5, "weights")); // minim.shutUp();
    LinearClassifierFactory classifierFactory = new LinearClassifierFactory(minimizer);
    classifierFactory.SetTol(tol);
    classifierFactory.SetSigma(sigma);
    scorer = classifierFactory.TrainClassifier(trainingSet);
    Verbose("Done training.");
}