/// <summary>
/// Trains a pairwise model by feeding it individual examples until the model's training budget is used up.
/// </summary>
/// <remarks>
/// Reads the feature compressor and the extracted per-document examples from the files named by
/// <c>StatisticalCorefTrainer.compressorFile</c> and <c>StatisticalCorefTrainer.extractedFeaturesFile</c>,
/// builds the example list, then repeatedly shuffles it (fixed seed 0) and calls <c>model.Learn</c> on each
/// example. When the budget is exhausted the model is persisted with <c>model.WriteModel()</c>.
/// </remarks>
/// <param name="model">The model that is trained and then written out.</param>
/// <param name="anaphoricityModel">
/// When true the examples are produced by <c>GetAnaphoricityExamples</c>; otherwise by <c>GetExamples</c>.
/// </param>
/// <exception cref="System.Exception"/>
public static void TrainClassification(PairwiseModel model, bool anaphoricityModel)
{
    int numTrainingExamples = model.GetNumTrainingExamples();

    Redwood.Log("scoref-train", "Reading compression...");
    Compressor<string> compressor = IOUtils.ReadObjectFromFile(StatisticalCorefTrainer.compressorFile);

    Redwood.Log("scoref-train", "Reading train data...");
    IList<DocumentExamples> trainDocuments = IOUtils.ReadObjectFromFile(StatisticalCorefTrainer.extractedFeaturesFile);

    Redwood.Log("scoref-train", "Building train set...");
    IList<Pair<Example, IDictionary<int, CompressedFeatureVector>>> allExamples = anaphoricityModel
        ? GetAnaphoricityExamples(trainDocuments)
        : GetExamples(trainDocuments);

    Redwood.Log("scoref-train", "Training...");
    Random random = new Random(0);
    int i = 0;

    // With no examples the inner loop would never run and so could never raise the stop flag;
    // skip training instead of spinning forever.
    bool stopTraining = allExamples.Count == 0;
    if (stopTraining)
    {
        Redwood.Log("scoref-train", "No training examples found; skipping training.");
    }

    while (!stopTraining)
    {
        // Reshuffle on every pass so repeated sweeps see the examples in a different order.
        Java.Util.Collections.Shuffle(allExamples, random);
        foreach (Pair<Example, IDictionary<int, CompressedFeatureVector>> pair in allExamples)
        {
            // NOTE(review): the counter is compared before it is incremented, so
            // numTrainingExamples + 1 examples are learned in total. Kept as in the original.
            if (i++ > numTrainingExamples)
            {
                stopTraining = true;
                break;
            }
            if (i % 10000 == 0)
            {
                // .NET composite formatting uses {index:format} placeholders; printf-style
                // specifiers such as %d would be emitted verbatim and the arguments ignored.
                Redwood.Log("scoref-train", string.Format(
                    System.Globalization.CultureInfo.InvariantCulture,
                    "On train example {0}/{1} = {2:F2}%",
                    i, numTrainingExamples, 100.0 * i / numTrainingExamples));
            }
            model.Learn(pair.first, pair.second, compressor);
        }
    }

    Redwood.Log("scoref-train", "Writing models...");
    model.WriteModel();
}
/// <summary>
/// Trains a pairwise model in ranking mode: for every mention, the highest-scoring positive example is
/// contrasted with the highest-scoring negative example and the pair is passed to the model's learn step.
/// </summary>
/// <remarks>
/// Reads the feature compressor and the extracted per-document examples from the files named by
/// <c>StatisticalCorefTrainer.compressorFile</c> and <c>StatisticalCorefTrainer.extractedFeaturesFile</c>.
/// For each of <c>model.GetNumEpochs()</c> epochs the documents are shuffled; within a document the examples
/// are grouped by <c>mentionId2</c> and the groups are shuffled. When the model is a
/// <c>MaxMarginMentionRanker</c>, an extra example built from the first example of the group is appended and
/// negative scores are adjusted by the ranker's per-error-type costs before the maximum is taken.
/// The model is persisted with <c>model.WriteModel()</c> at the end.
/// </remarks>
/// <param name="model">The model that is trained and then written out.</param>
/// <exception cref="System.Exception"/>
public static void TrainRanking(PairwiseModel model)
{
    Redwood.Log("scoref-train", "Reading compression...");
    Compressor<string> compressor = IOUtils.ReadObjectFromFile(StatisticalCorefTrainer.compressorFile);

    Redwood.Log("scoref-train", "Reading train data...");
    IList<DocumentExamples> trainDocuments = IOUtils.ReadObjectFromFile(StatisticalCorefTrainer.extractedFeaturesFile);

    // The model's runtime type does not change during training, so resolve it once.
    MaxMarginMentionRanker ranker = model as MaxMarginMentionRanker;

    Redwood.Log("scoref-train", "Training...");
    for (int i = 0; i < model.GetNumEpochs(); i++)
    {
        Java.Util.Collections.Shuffle(trainDocuments);
        int j = 0;
        foreach (DocumentExamples doc in trainDocuments)
        {
            j++;
            Redwood.Log("scoref-train", "On epoch: " + i + " / " + model.GetNumEpochs() + ", document: " + j + " / " + trainDocuments.Count);

            // Group the document's examples by the mention they belong to (mentionId2).
            IDictionary<int, IList<Example>> mentionToPotentialAntecedents = new Dictionary<int, IList<Example>>();
            foreach (Example e in doc.examples)
            {
                int mention = e.mentionId2;
                IList<Example> potentialAntecedents;
                // The Dictionary indexer throws on a missing key, so probe with TryGetValue.
                if (!mentionToPotentialAntecedents.TryGetValue(mention, out potentialAntecedents))
                {
                    potentialAntecedents = new List<Example>();
                    mentionToPotentialAntecedents[mention] = potentialAntecedents;
                }
                potentialAntecedents.Add(e);
            }

            IList<IList<Example>> examples = new List<IList<Example>>(mentionToPotentialAntecedents.Values);
            Java.Util.Collections.Shuffle(examples);
            foreach (IList<Example> es in examples)
            {
                if (es.Count == 0)
                {
                    continue;
                }

                if (ranker != null)
                {
                    // True when no example in the group is labelled positive. The predicate is
                    // reconstructed: it is the reading consistent with the assertions below
                    // (a positive example is a new link exactly when noAntecedent holds).
                    bool noAntecedent = true;
                    foreach (Example candidate in es)
                    {
                        if (candidate.label == 1)
                        {
                            noAntecedent = false;
                            break;
                        }
                    }

                    // Append an extra example derived from the first one, flagged with noAntecedent.
                    es.Add(new Example(es[0], noAntecedent));

                    // Pass 1: find the best-scoring positive example.
                    double maxPositiveScore = -double.MaxValue;
                    Example maxScoringPositive = null;
                    foreach (Example candidate in es)
                    {
                        double score = model.Predict(candidate, doc.mentionFeatures, compressor);
                        if (candidate.label == 1)
                        {
                            System.Diagnostics.Debug.Assert((!noAntecedent ^ candidate.IsNewLink()));
                            if (score > maxPositiveScore)
                            {
                                maxPositiveScore = score;
                                maxScoringPositive = candidate;
                            }
                        }
                    }
                    System.Diagnostics.Debug.Assert((maxScoringPositive != null));

                    // Pass 2: find the negative example with the highest cost-adjusted score.
                    // It runs after pass 1 because the multiplicative cost uses maxPositiveScore.
                    double maxNegativeScore = -double.MaxValue;
                    Example maxScoringNegative = null;
                    MaxMarginMentionRanker.ErrorType maxScoringEt = null;
                    foreach (Example candidate in es)
                    {
                        double score = model.Predict(candidate, doc.mentionFeatures, compressor);
                        if (candidate.label != 1)
                        {
                            System.Diagnostics.Debug.Assert((!(noAntecedent && candidate.IsNewLink())));

                            // Choose the error type for this negative example.
                            // NOTE(review): Wl/Fl/Fn/FnPron are presumably wrong-link, false-link,
                            // false-new and false-new-pronoun - confirm against ErrorType.
                            MaxMarginMentionRanker.ErrorType et = MaxMarginMentionRanker.ErrorType.Wl;
                            if (noAntecedent && !candidate.IsNewLink())
                            {
                                et = MaxMarginMentionRanker.ErrorType.Fl;
                            }
                            else if (!noAntecedent && candidate.IsNewLink())
                            {
                                if (candidate.mentionType2 == Dictionaries.MentionType.Pronominal)
                                {
                                    et = MaxMarginMentionRanker.ErrorType.FnPron;
                                }
                                else
                                {
                                    et = MaxMarginMentionRanker.ErrorType.Fn;
                                }
                            }

                            // Apply the cost either as a scale on the margin or as an additive bonus.
                            if (ranker.multiplicativeCost)
                            {
                                score = ranker.costs[et.id] * (1 - maxPositiveScore + score);
                            }
                            else
                            {
                                score += ranker.costs[et.id];
                            }

                            if (score > maxNegativeScore)
                            {
                                maxNegativeScore = score;
                                maxScoringNegative = candidate;
                                maxScoringEt = et;
                            }
                        }
                    }
                    System.Diagnostics.Debug.Assert((maxScoringNegative != null));

                    ranker.Learn(maxScoringPositive, maxScoringNegative, doc.mentionFeatures, compressor, maxScoringEt);
                }
                else
                {
                    // Generic model: one pass tracking the best positive and best negative by raw score.
                    double maxPositiveScore = -double.MaxValue;
                    double maxNegativeScore = -double.MaxValue;
                    Example maxScoringPositive = null;
                    Example maxScoringNegative = null;
                    foreach (Example candidate in es)
                    {
                        double score = model.Predict(candidate, doc.mentionFeatures, compressor);
                        if (candidate.label == 1)
                        {
                            if (score > maxPositiveScore)
                            {
                                maxPositiveScore = score;
                                maxScoringPositive = candidate;
                            }
                        }
                        else
                        {
                            if (score > maxNegativeScore)
                            {
                                maxNegativeScore = score;
                                maxScoringNegative = candidate;
                            }
                        }
                    }

                    // NOTE(review): either argument is null when the group has no positive or no
                    // negative example; the original passes them through unchanged, so this does too.
                    model.Learn(maxScoringPositive, maxScoringNegative, doc.mentionFeatures, compressor, 1);
                }
            }
        }
    }

    Redwood.Log("scoref-train", "Writing models...");
    model.WriteModel();
}