Пример #1
0
 /// <summary>
 /// Creates a lexicon that shares the given word and tag indices and copies
 /// its tuning switches out of <paramref name="op"/>.
 /// </summary>
 /// <param name="op">Parser options; its lexOptions, trainOptions and testOptions are read here.</param>
 /// <param name="wordIndex">Word index, stored by reference.</param>
 /// <param name="tagIndex">Tag index, stored by reference.</param>
 public BaseLexicon(Options op, IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     this.op        = op;
     this.wordIndex = wordIndex;
     this.tagIndex  = tagIndex;

     trainOptions = op.trainOptions;
     testOptions  = op.testOptions;

     // Lexicon-specific settings all come from the same options object.
     var lexOpts = op.lexOptions;
     flexiTag                      = lexOpts.flexiTag;
     useSignatureForKnownSmoothing = lexOpts.useSignatureForKnownSmoothing;
     smoothInUnknownsThreshold     = lexOpts.smoothInUnknownsThreshold;
     smartMutation                 = lexOpts.smartMutation;

     // Only the trainer's class name is recorded here. Per the original note, the
     // UnknownWordModel is constructed by reflection because lexicons and unknown
     // word models are not well encapsulated from each other.
     // When no trainer is configured, the base trainer's name is used.
     uwModelTrainerClass = lexOpts.uwModelTrainer
                           ?? "edu.stanford.nlp.parser.lexparser.BaseUnknownWordModelTrainer";
 }
Пример #2
0
 /// <summary>
 /// Creates a tree annotator from a head finder, language-specific parser
 /// parameters and the training options held by <paramref name="op"/>.
 /// </summary>
 /// <param name="hf">Head finder, stored by reference.</param>
 /// <param name="tlpp">Treebank language parser parameters, stored by reference.</param>
 /// <param name="op">Parser options; only its trainOptions are read here.</param>
 public TreeAnnotator(IHeadFinder hf, ITreebankLangParserParams tlpp, Options op)
 {
     tlpParams = tlpp;
     this.hf   = hf;

     // A fresh factory is created per annotator instance.
     tf = new LabeledScoredTreeFactory();

     trainOptions = op.trainOptions;
 }
 /// <summary>
 /// Creates a grammar smoother over the given state and tag indices.
 /// </summary>
 /// <param name="trainOptions">Training options, stored by reference.</param>
 /// <param name="stateIndex">State index, stored by reference.</param>
 /// <param name="tagIndex">Tag index, stored by reference.</param>
 public LinearGrammarSmoother(TrainOptions trainOptions, IIndex <string> stateIndex, IIndex <string> tagIndex)
 {
     // Build the set of annotation-introducing characters from the class-level list.
     // NOTE (carried over from the original comments): that list must not contain '@',
     // because '@' marks synthetic nodes; the characters were reportedly taken from
     // PennTreebankLanguagePack.
     var introducingChars = Arrays.AsList(annotationIntroducingChars);
     annoteChars = Generics.NewHashSet(introducingChars);

     this.trainOptions = trainOptions;
     this.stateIndex   = stateIndex;
     this.tagIndex     = tagIndex;
 }
Пример #4
0
        /// <summary>
        /// Annotates a training tree, optionally post-splits it, adds a root and binarizes it.
        /// </summary>
        /// <remarks>
        /// The input is normally expected to be a Penn-Treebank-style tree whose top node
        /// is an extra node with a unary expansion. The original documentation says that
        /// otherwise an extra node is added and the user is warned; that handling is not
        /// visible in this method (presumably it lives in AddRoot - verify there).
        /// While trainOptions.printTreeTransformations is positive the intermediate trees
        /// are printed, and the counter is decremented once per call.
        /// </remarks>
        /// <param name="t">The tree to transform.</param>
        /// <returns>
        /// The binarized tree, additionally passed through the CNF transformer when
        /// forceCNF is set.
        /// </returns>
        public virtual Tree TransformTree(Tree t)
        {
            if (trainOptions.printTreeTransformations > 0)
            {
                TrainOptions.PrintTrainTree(null, "ORIGINAL TREE:", t);
            }

            // Step 1: annotation, followed by the optional selective post-split.
            Tree annotated = annotator.TransformTree(t);
            if (trainOptions.selectivePostSplit)
            {
                annotated = postSplitter.TransformTree(annotated);
            }

            if (trainOptions.printTreeTransformations > 0)
            {
                TrainOptions.PrintTrainTree(trainOptions.printAnnotatedPW, "ANNOTATED TREE:", annotated);
            }

            // Step 2: optional bookkeeping over the annotated tree.
            if (trainOptions.printAnnotatedRuleCounts)
            {
                // Local trees are counted on a deep copy built with string labels.
                Tree copy = annotated.DeepCopy(new LabeledScoredTreeFactory(), new StringLabelFactory());
                ICollection <Tree> rules = copy.LocalTrees();
                foreach (Tree rule in rules)
                {
                    annotatedRuleCounts.IncrementCount(rule);
                }
            }

            if (trainOptions.printAnnotatedStateCounts)
            {
                foreach (Tree node in annotated)
                {
                    if (node.IsLeaf())
                    {
                        continue;
                    }
                    annotatedStateCounts.IncrementCount(node.Label().Value());
                }
            }

            // Step 3: the root is added only after annotation; per the original note,
            // adding it first would leave no way to percolate the heads at the top.
            AddRoot(annotated);

            // Per the original note, this leaves a few non-binarized rules at the top.
            Tree binarized = binarizer.TransformTree(annotated);

            if (trainOptions.printTreeTransformations > 0)
            {
                TrainOptions.PrintTrainTree(trainOptions.printBinarizedPW, "BINARIZED TREE:", binarized);
                // One fewer tree remains to be printed.
                trainOptions.printTreeTransformations--;
            }

            return forceCNF
                ? new CNFTransformers.ToCNFTransformer().TransformTree(binarized)
                : binarized;
        }
Пример #5
0
 /// <summary>
 /// Wires up the annotator, binarizer, optional post-splitter and optional
 /// counters used when transforming training trees.
 /// </summary>
 /// <param name="annotationHF">Head finder handed to the annotator.</param>
 /// <param name="binarizationHF">Head finder handed to the binarizer.</param>
 /// <param name="tlpParams">Language-specific parameters; also the source of the treebank language pack.</param>
 /// <param name="forceCNF">Stored flag; when set, transformed trees are additionally converted to CNF.</param>
 /// <param name="insideFactor">Passed through to the binarizer.</param>
 /// <param name="doSubcategorization">Selects a full TreeAnnotator when true, a TreeNullAnnotator otherwise.</param>
 /// <param name="op">Parser options; its trainOptions drive most of the choices below.</param>
 public TreeAnnotatorAndBinarizer(IHeadFinder annotationHF, IHeadFinder binarizationHF, ITreebankLangParserParams tlpParams, bool forceCNF, bool insideFactor, bool doSubcategorization, Options op)
 {
     trainOptions = op.trainOptions;

     if (!doSubcategorization)
     {
         annotator = new TreeAnnotatorAndBinarizer.TreeNullAnnotator(annotationHF);
     }
     else
     {
         annotator = new TreeAnnotator(annotationHF, tlpParams, op);
     }

     binarizer = new TreeBinarizer(
         binarizationHF,
         tlpParams.TreebankLanguagePack(),
         insideFactor,
         trainOptions.markovFactor,
         trainOptions.markovOrder,
         trainOptions.CompactGrammar() > 0,
         trainOptions.CompactGrammar() > 1,
         trainOptions.HselCut,
         trainOptions.markFinalStates,
         trainOptions.simpleBinarizedLabels,
         trainOptions.noRebinarization);

     // The post-splitter exists only when selective post-splitting is enabled.
     postSplitter = trainOptions.selectivePostSplit ? new PostSplitter(tlpParams, op) : null;

     tf            = new LabeledScoredTreeFactory(new CategoryWordTagFactory());
     tlp           = tlpParams.TreebankLanguagePack();
     this.forceCNF = forceCNF;

     // Each counter is allocated only when its corresponding print option is on.
     annotatedRuleCounts  = trainOptions.printAnnotatedRuleCounts ? new ClassicCounter <Tree>() : null;
     annotatedStateCounts = trainOptions.printAnnotatedStateCounts ? new ClassicCounter <string>() : null;
 }
Пример #6
0
 /// <summary>
 /// Creates an unknown word model from already-collected statistics and the
 /// lexicon options in <paramref name="op"/>.
 /// </summary>
 /// <param name="op">Parser options; its lexOptions and trainOptions are read here.</param>
 /// <param name="lex">Lexicon, stored by reference.</param>
 /// <param name="wordIndex">Word index, stored by reference.</param>
 /// <param name="tagIndex">Tag index, stored by reference.</param>
 /// <param name="unSeenCounter">Counter stored by reference as unSeenCounter.</param>
 /// <param name="tagHash">Per-label counters, stored by reference.</param>
 /// <param name="unknownGT">Map stored by reference as unknownGT.</param>
 /// <param name="seenEnd">Collection stored by reference as seenEnd.</param>
 public BaseUnknownWordModel(Options op, ILexicon lex, IIndex <string> wordIndex, IIndex <string> tagIndex, ClassicCounter <IntTaggedWord> unSeenCounter, IDictionary <ILabel, ClassicCounter <string> > tagHash, IDictionary <string, float> unknownGT,
                             ICollection <string> seenEnd)
 {
     var lexOpts        = op.lexOptions;
     var signatureLevel = lexOpts.useUnknownWordSignatures;
     bool usesSignatures = signatureLevel > 0;

     // TODO: refactor these terms into BaseUnknownWordModelTrainer
     endLength    = lexOpts.unknownSuffixSize;
     useEnd       = lexOpts.unknownSuffixSize > 0 && usesSignatures;
     useFirstCap  = usesSignatures;
     useGT        = signatureLevel == 0;   // set exactly when the signature level is zero
     useFirst     = false;
     unknownLevel = signatureLevel;

     this.lex           = lex;
     trainOptions       = op.trainOptions;
     this.wordIndex     = wordIndex;
     this.tagIndex      = tagIndex;
     this.unSeenCounter = unSeenCounter;
     this.tagHash       = tagHash;
     this.seenEnd       = seenEnd;
     this.unknownGT     = unknownGT;
 }
 /// <summary>
 /// Creates a post-splitter that takes its head finder from
 /// <paramref name="tlpParams"/> and its training options from <paramref name="op"/>.
 /// </summary>
 /// <param name="tlpParams">Language-specific parameters, stored by reference.</param>
 /// <param name="op">Parser options; only its trainOptions are read here.</param>
 public PostSplitter(ITreebankLangParserParams tlpParams, Options op)
 {
     this.tlpParams = tlpParams;

     // The head finder comes from the language parameters rather than being passed in.
     hf = tlpParams.HeadFinder();

     trainOptions = op.trainOptions;
 }