Example #1
0
 /// <summary>
 /// Completes training: takes the finished unknown-word model from the
 /// trainer, runs <c>Tune()</c>, and then rebuilds the per-word rule index.
 /// </summary>
 /// <remarks>
 /// The three steps run in this fixed order; the rule index is built last.
 /// </remarks>
 public virtual void FinishTraining()
 {
     // Step 1: the trainer hands back the unknown-word model it has built.
     this.uwModel = uwModelTrainer.FinishTraining();

     // Step 2: tuning pass.
     Tune();

     // Step 3: index the possible tags for each word.
     InitRulesWithWord();
 }
 // Records the number of times word/tag pair was seen in training data.
 // Counts of each tag (stored as a Label) on unknown words.
 // tag (Label) --> signature --> count
 /// <summary>
 /// Prepares this trainer for a new training run: delegates to the base
 /// implementation, allocates fresh counters and lookup tables, derives the
 /// unknown-word feature flags from <c>op.lexOptions</c>, logs the active
 /// flags, and finally builds the unknown-word model via <c>BuildUWM()</c>.
 /// </summary>
 /// <param name="op">Options; <c>lexOptions</c> and <c>trainOptions</c> are read here.</param>
 /// <param name="lex">Lexicon, forwarded to the base implementation.</param>
 /// <param name="wordIndex">Word index, forwarded to the base implementation.</param>
 /// <param name="tagIndex">Tag index, forwarded to the base implementation.</param>
 /// <param name="totalTrees">
 /// Scaled by <c>op.trainOptions.fractionBeforeUnseenCounting</c> to obtain
 /// <c>indexToStartUnkCounting</c>.
 /// </param>
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

     // Fresh, empty training state.
     this.seenCounter   = new ClassicCounter <IntTaggedWord>();
     this.unSeenCounter = new ClassicCounter <IntTaggedWord>();
     this.tagHash       = Generics.NewHashMap();
     this.tc            = new ClassicCounter <ILabel>();
     this.c             = Generics.NewHashMap();
     this.seenEnd       = Generics.NewHashSet();

     // Feature flags. Signature-based features need a positive signature
     // setting; Good-Turing is selected only when the setting is exactly 0.
     bool signaturesEnabled = op.lexOptions.useUnknownWordSignatures > 0;
     this.useEnd      = signaturesEnabled && op.lexOptions.unknownSuffixSize > 0;
     this.useFirstCap = signaturesEnabled;
     this.useGT       = (op.lexOptions.useUnknownWordSignatures == 0);
     this.useFirst    = false;

     // Report the active configuration (message order is kept stable).
     if (useFirst)
     {
         log.Info("Including first letter for unknown words.");
     }

     if (useFirstCap)
     {
         log.Info("Including whether first letter is capitalized for unknown words");
     }

     if (useEnd)
     {
         log.Info("Classing unknown word as the average of their equivalents by identity of last " + op.lexOptions.unknownSuffixSize + " letters.");
     }

     if (useGT)
     {
         log.Info("Using Good-Turing smoothing for unknown words.");
     }

     this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);

     // A Good-Turing trainer exists only when Good-Turing smoothing is on.
     if (useGT)
     {
         this.unknownGTTrainer = new UnknownGTTrainer();
     }
     else
     {
         this.unknownGTTrainer = null;
     }

     // Must stay last: the model is built after all state above is assigned.
     this.model = BuildUWM();
 }
Example #3
0
 // boundary tag -- assumed not a real tag
 /// <summary>
 /// Prepares this trainer for a new training run: delegates to the base
 /// implementation, computes the threshold stored in
 /// <c>indexToStartUnkCounting</c>, resets both word/tag counters, and
 /// creates a new <c>FrenchUnknownWordModel</c> around the unseen counter.
 /// </summary>
 /// <param name="op">Options; <c>trainOptions.fractionBeforeUnseenCounting</c> is read here.</param>
 /// <param name="lex">Lexicon, passed to the base implementation and to the model.</param>
 /// <param name="wordIndex">Word index, passed to the base implementation and to the model.</param>
 /// <param name="tagIndex">Tag index, passed to the base implementation and to the model.</param>
 /// <param name="totalTrees">Scaled by the configured fraction to obtain the counting threshold.</param>
 public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
 {
     base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

     double unkCountingStart = totalTrees * op.trainOptions.fractionBeforeUnseenCounting;
     this.indexToStartUnkCounting = unkCountingStart;

     this.seenCounter   = new ClassicCounter <IntTaggedWord>();
     this.unSeenCounter = new ClassicCounter <IntTaggedWord>();

     // The model receives the freshly created unseen counter instance.
     this.model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex, this.unSeenCounter);
 }
        // Records the number of times word/tag pair was seen in training data.
        // c has a map from tags as Label to a Counter from word
        // signatures to Strings; it is used to collect counts that will
        // initialize the probabilities in tagHash
        // tc records the marginal counts for each tag as an unknown. It
        // should presumably equal c's totalCount (unverified).
        /// <summary>
        /// Prepares this trainer for a new training run: delegates to the base
        /// implementation, decides between first-character averaging and
        /// Good-Turing smoothing, logs the choice, allocates fresh training
        /// state, and constructs a new <c>ChineseUnknownWordModel</c>.
        /// </summary>
        /// <remarks>
        /// The Good-Turing preference is taken from <paramref name="lex"/> when it
        /// is a <c>ChineseLexicon</c>, otherwise from <c>op.tlpParams</c> when that
        /// is a <c>ChineseTreebankParserParams</c>, otherwise from the static
        /// default on <c>ChineseTreebankParserParams</c>. When the preference is
        /// set it forces <c>useGT</c> on and <c>useFirst</c> off.
        /// </remarks>
        /// <param name="op">Options; <c>lexOptions</c>, <c>trainOptions</c> and <c>tlpParams</c> are read here.</param>
        /// <param name="lex">Lexicon, passed to the base implementation and to the model.</param>
        /// <param name="wordIndex">Word index, passed to the base implementation and to the model.</param>
        /// <param name="tagIndex">Tag index, passed to the base implementation and to the model.</param>
        /// <param name="totalTrees">
        /// Scaled by <c>op.trainOptions.fractionBeforeUnseenCounting</c> to obtain
        /// <c>indexToStartUnkCounting</c>.
        /// </param>
        public override void InitializeTraining(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, double totalTrees)
        {
            base.InitializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

            // Start from the static default; a more specific source may override it below.
            bool preferGoodTuring = ChineseTreebankParserParams.DefaultUseGoodTurningUnknownWordModel;

            // Baseline flags before the Good-Turing preference is applied.
            this.useFirst = true;
            this.useGT    = (op.lexOptions.useUnknownWordSignatures == 0);

            if (lex is ChineseLexicon)
            {
                preferGoodTuring = ((ChineseLexicon)lex).useGoodTuringUnknownWordModel;
            }
            else if (op.tlpParams is ChineseTreebankParserParams)
            {
                preferGoodTuring = ((ChineseTreebankParserParams)op.tlpParams).useGoodTuringUnknownWordModel;
            }

            // An explicit Good-Turing preference replaces first-character averaging.
            if (preferGoodTuring)
            {
                this.useGT    = true;
                this.useFirst = false;
            }

            this.useUnicodeType = op.lexOptions.useUnicodeType;

            // Report the active configuration (message order is kept stable).
            if (this.useFirst)
            {
                log.Info("ChineseUWM: treating unknown word as the average of their equivalents by first-character identity. useUnicodeType: " + useUnicodeType);
            }

            if (this.useGT)
            {
                log.Info("ChineseUWM: using Good-Turing smoothing for unknown words.");
            }

            // Fresh, empty training state.
            this.c             = Generics.NewHashMap();
            this.tc            = new ClassicCounter <ILabel>();
            this.unSeenCounter = new ClassicCounter <IntTaggedWord>();
            this.seenCounter   = new ClassicCounter <IntTaggedWord>();
            this.seenFirst     = Generics.NewHashSet();
            this.tagHash       = Generics.NewHashMap();

            this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);

            // The trainer, and the table it exposes, exist only under Good-Turing smoothing.
            this.unknownGTTrainer = (useGT) ? new UnknownGTTrainer() : null;
            IDictionary <string, float> unknownGT = (useGT) ? unknownGTTrainer.unknownGT : null;

            this.model = new ChineseUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter, tagHash, unknownGT, useGT, seenFirst);
        }
Example #5
0
 /// <summary>
 /// Forwards the given unknown-word model to the wrapped
 /// <c>chineseLexicon</c>; nothing is stored on this object itself.
 /// </summary>
 /// <param name="uwm">The unknown-word model to hand to the wrapped lexicon.</param>
 public virtual void SetUnknownWordModel(IUnknownWordModel uwm)
     => chineseLexicon.SetUnknownWordModel(uwm);
Example #6
0
 /// <summary>
 /// Replaces the unknown-word model held in <c>uwModel</c> with the given one.
 /// </summary>
 /// <param name="uwm">The unknown-word model to store; no validation is performed.</param>
 public void SetUnknownWordModel(IUnknownWordModel uwm)
     => this.uwModel = uwm;
 /// <summary>
 /// No-op implementation: the supplied unknown-word model is ignored.
 /// </summary>
 /// <remarks>
 /// The method is virtual, so a subclass may override it.
 /// NOTE(review): presumably this type has no unknown-word model to replace — confirm.
 /// </remarks>
 /// <param name="uwm">The unknown-word model; unused here.</param>
 public virtual void SetUnknownWordModel(IUnknownWordModel uwm)
 {
 }