/// <summary>
/// Completes training. Stores the model returned by
/// <c>uwModelTrainer.FinishTraining()</c> in <c>uwModel</c>, then calls
/// <c>Tune()</c>, and finally calls <c>InitRulesWithWord()</c>.
/// </summary>
public virtual void FinishTraining()
{
    // Obtain the finished unknown-word model from its trainer first;
    // the remaining steps run after it has been stored.
    var finishedModel = uwModelTrainer.FinishTraining();
    uwModel = finishedModel;

    Tune();

    // index the possible tags for each word
    InitRulesWithWord();
}
// Records the number of times word/tag pair was seen in training data.
// Counts of each tag (stored as a Label) on unknown words.
// tag (Label) --> signature --> count

/// <summary>
/// Resets the trainer's counters and tables, derives the unknown-word
/// feature flags from <c>op.lexOptions</c>, logs which features are active,
/// and builds the model through <c>BuildUWM()</c>.
/// </summary>
/// <param name="op">Options; <c>lexOptions</c> and <c>trainOptions</c> are read here.</param>
/// <param name="lex">Lexicon, forwarded to the base implementation.</param>
/// <param name="wordIndex">Word index, forwarded to the base implementation.</param>
/// <param name="tagIndex">Tag index, forwarded to the base implementation.</param>
/// <param name="totalTrees">
/// Multiplied by <c>op.trainOptions.fractionBeforeUnseenCounting</c> to obtain
/// <c>indexToStartUnkCounting</c>.
/// </param>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    // Start every training run from empty counters and tables.
    seenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    tagHash = Generics.NewHashMap();
    tc = new ClassicCounter<ILabel>();
    c = Generics.NewHashMap();
    seenEnd = Generics.NewHashSet();

    // Feature flags: suffix and capitalization features need signatures
    // switched on (> 0); Good-Turing is used only when the setting is exactly 0.
    bool signaturesEnabled = op.lexOptions.useUnknownWordSignatures > 0;
    useEnd = op.lexOptions.unknownSuffixSize > 0 && signaturesEnabled;
    useFirstCap = signaturesEnabled;
    useGT = op.lexOptions.useUnknownWordSignatures == 0;
    useFirst = false;

    // useFirst was just set to false, so this message is never emitted here.
    if (useFirst)
    {
        log.Info("Including first letter for unknown words.");
    }

    if (useFirstCap)
    {
        log.Info("Including whether first letter is capitalized for unknown words");
    }

    if (useEnd)
    {
        log.Info("Classing unknown word as the average of their equivalents by identity of last " + op.lexOptions.unknownSuffixSize + " letters.");
    }

    if (useGT)
    {
        log.Info("Using Good-Turing smoothing for unknown words.");
    }

    indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;

    // A Good-Turing trainer exists only when Good-Turing smoothing is enabled.
    if (useGT)
    {
        unknownGTTrainer = new UnknownGTTrainer();
    }
    else
    {
        unknownGTTrainer = null;
    }

    // Built last, after all the fields above have been assigned.
    model = BuildUWM();
}
// boundary tag -- assumed not a real tag

/// <summary>
/// Prepares training for the French unknown-word model: computes
/// <c>indexToStartUnkCounting</c>, creates empty seen/unseen counters, and
/// constructs a <c>FrenchUnknownWordModel</c> that receives the unseen counter.
/// </summary>
/// <param name="op">Options; <c>trainOptions.fractionBeforeUnseenCounting</c> is read here.</param>
/// <param name="lex">Lexicon passed to the base implementation and to the model.</param>
/// <param name="wordIndex">Word index passed to the base implementation and to the model.</param>
/// <param name="tagIndex">Tag index passed to the base implementation and to the model.</param>
/// <param name="totalTrees">
/// Multiplied by <c>op.trainOptions.fractionBeforeUnseenCounting</c> to obtain
/// <c>indexToStartUnkCounting</c>.
/// </param>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    var unseenFraction = op.trainOptions.fractionBeforeUnseenCounting;
    indexToStartUnkCounting = totalTrees * unseenFraction;

    seenCounter = new ClassicCounter<IntTaggedWord>();

    // The same unseen-counter instance is stored in the field and handed to the model.
    var freshUnseenCounter = new ClassicCounter<IntTaggedWord>();
    unSeenCounter = freshUnseenCounter;

    model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);
}
// Records the number of times word/tag pair was seen in training data.
// c has a map from tags as Label to a Counter from word
// signatures to Strings; it is used to collect counts that will
// initialize the probabilities in tagHash
// tc record the marginal counts for each tag as an unknown. It
// should be the same as c's totalCount ??

/// <summary>
/// Prepares training for the Chinese unknown-word model: decides between
/// first-character averaging and Good-Turing smoothing, logs the choice,
/// resets the counters and tables, and constructs a
/// <c>ChineseUnknownWordModel</c>.
/// </summary>
/// <param name="op">Options; <c>lexOptions</c>, <c>tlpParams</c> and <c>trainOptions</c> are read here.</param>
/// <param name="lex">
/// Lexicon; when it is a <c>ChineseLexicon</c>, its
/// <c>useGoodTuringUnknownWordModel</c> value decides the Good-Turing setting.
/// </param>
/// <param name="wordIndex">Word index passed to the base implementation and to the model.</param>
/// <param name="tagIndex">Tag index passed to the base implementation and to the model.</param>
/// <param name="totalTrees">
/// Multiplied by <c>op.trainOptions.fractionBeforeUnseenCounting</c> to obtain
/// <c>indexToStartUnkCounting</c>.
/// </param>
public override void InitializeTraining(Options op, ILexicon lex, IIndex<string> wordIndex, IIndex<string> tagIndex, double totalTrees)
{
    base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    // Resolve the Good-Turing preference: the lexicon wins, then the
    // treebank parameters, otherwise the class-level default applies.
    bool preferGoodTuring;
    if (lex is ChineseLexicon)
    {
        preferGoodTuring = ((ChineseLexicon)lex).useGoodTuringUnknownWordModel;
    }
    else if (op.tlpParams is ChineseTreebankParserParams)
    {
        preferGoodTuring = ((ChineseTreebankParserParams)op.tlpParams).useGoodTuringUnknownWordModel;
    }
    else
    {
        preferGoodTuring = ChineseTreebankParserParams.DefaultUseGoodTurningUnknownWordModel;
    }

    // Defaults: first-character averaging on, Good-Turing only when
    // signatures are set to exactly 0.
    useFirst = true;
    useGT = op.lexOptions.useUnknownWordSignatures == 0;

    // An explicit Good-Turing preference overrides both defaults.
    if (preferGoodTuring)
    {
        useGT = true;
        useFirst = false;
    }

    useUnicodeType = op.lexOptions.useUnicodeType;

    if (useFirst)
    {
        log.Info("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + useUnicodeType);
    }

    if (useGT)
    {
        log.Info("ChineseUWM: using Good-Turing smoothing for unknown words.");
    }

    // Start every training run from empty counters and tables.
    c = Generics.NewHashMap();
    tc = new ClassicCounter<ILabel>();
    unSeenCounter = new ClassicCounter<IntTaggedWord>();
    seenCounter = new ClassicCounter<IntTaggedWord>();
    seenFirst = Generics.NewHashSet();
    tagHash = Generics.NewHashMap();

    indexToStartUnkCounting = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;

    // The trainer and its unknownGT table exist only when Good-Turing is on;
    // otherwise the model receives null for the table.
    IDictionary<string, float> goodTuringTable;
    if (useGT)
    {
        unknownGTTrainer = new UnknownGTTrainer();
        goodTuringTable = unknownGTTrainer.unknownGT;
    }
    else
    {
        unknownGTTrainer = null;
        goodTuringTable = null;
    }

    model = new ChineseUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, goodTuringTable, useGT, seenFirst);
}
/// <summary>
/// Forwards the given unknown-word model to <c>chineseLexicon</c>.
/// </summary>
/// <param name="uwm">The unknown-word model passed on to <c>chineseLexicon.SetUnknownWordModel</c>.</param>
public virtual void SetUnknownWordModel(IUnknownWordModel uwm)
    => chineseLexicon.SetUnknownWordModel(uwm);
/// <summary>
/// Replaces the unknown-word model held in <c>uwModel</c>.
/// </summary>
/// <param name="uwm">The model to store; it is assigned as-is, with no validation.</param>
public void SetUnknownWordModel(IUnknownWordModel uwm)
    => uwModel = uwm;
/// <summary>
/// Accepts an unknown-word model but does nothing with it: the body is empty.
/// Being virtual, it can be overridden by subclasses.
/// </summary>
/// <param name="uwm">The unknown-word model; ignored by this implementation.</param>
public virtual void SetUnknownWordModel(IUnknownWordModel uwm)
{
    // No-op: this implementation does not store or forward the model.
}