public FilterConfusingRules(LexicalizedParser parser)
        {
            BinaryGrammar   binaryGrammar = parser.bg;
            UnaryGrammar    unaryGrammar  = parser.ug;
            Options         op            = parser.GetOp();
            IIndex <string> stateIndex    = parser.stateIndex;

            foreach (UnaryRule unaryRule in unaryGrammar)
            {
                // record only the basic category of each rule's child state
                string childState = stateIndex.Get(unaryRule.child);
                string childBasic = op.Langpack().BasicCategory(childState);
                unaryRules.Add(childBasic);
            }
            foreach (BinaryRule binaryRule in binaryGrammar)
            {
                // record only the basic categories of each rule's two child states
                string leftState  = stateIndex.Get(binaryRule.leftChild);
                string leftBasic  = op.Langpack().BasicCategory(leftState);
                string rightState = stateIndex.Get(binaryRule.rightChild);
                string rightBasic = op.Langpack().BasicCategory(rightState);
                binaryRules.Add(leftBasic, rightBasic);
            }
        }
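A minimal usage sketch of the constructor above, assuming the port mirrors the Java API's LexicalizedParser.LoadModel; the model path and the filter variable are hypothetical:

LexicalizedParser parser = LexicalizedParser.LoadModel("/path/to/englishPCFG.ser.gz");  // hypothetical path
FilterConfusingRules filter = new FilterConfusingRules(parser);
// The filter now knows which child basic categories occur in the grammar's unary and
// binary rules, so training trees containing unseen rule shapes can be screened out.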
        public virtual LexicalizedParser GetParserDataFromTreebank(Treebank trainTreebank)
        {
            log.Info("Binarizing training trees...");
            IList <Tree> binaryTrainTrees = GetAnnotatedBinaryTreebankFromTreebank(trainTreebank);

            Timing.Tick("done.");
            IIndex <string> stateIndex = new HashIndex <string>();

            log.Info("Extracting PCFG...");
            IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex);
            Pair <UnaryGrammar, BinaryGrammar> bgug = bgExtractor.Extract(binaryTrainTrees);
            BinaryGrammar bg = bgug.second;

            bg.SplitRules();
            UnaryGrammar ug = bgug.first;

            ug.PurgeRules();
            Timing.Tick("done.");
            log.Info("Extracting Lexicon...");
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();
            ILexicon        lex       = op.tlpParams.Lex(op, wordIndex, tagIndex);

            lex.InitializeTraining(binaryTrainTrees.Count);
            lex.Train(binaryTrainTrees);
            lex.FinishTraining();
            Timing.Tick("done.");
            IExtractor <IDependencyGrammar> dgExtractor = op.tlpParams.DependencyGrammarExtractor(op, wordIndex, tagIndex);
            IDependencyGrammar dg = null;

            if (op.doDep)
            {
                log.Info("Extracting Dependencies...");
                dg = dgExtractor.Extract(binaryTrainTrees);
                dg.SetLexicon(lex);
                Timing.Tick("done.");
            }
            log.Info("Done extracting grammars and lexicon.");
            return new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
        }
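A hedged driver sketch for the training method above; trainer (the object exposing GetParserDataFromTreebank), the Options instance op, and both paths are hypothetical stand-ins, and MemoryTreebank, LoadPath, and SaveParserToSerialized are assumed to carry over from the Java API:

Treebank trainTreebank = op.tlpParams.MemoryTreebank();     // empty treebank of the right flavor
trainTreebank.LoadPath("/path/to/training/trees");          // hypothetical path
LexicalizedParser parser = trainer.GetParserDataFromTreebank(trainTreebank);
parser.SaveParserToSerialized("/path/to/trained/model.ser.gz");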
 /// <param name="op">the parameters of the parser</param>
 /// <param name="stateIndex">index mapping state numbers to state names</param>
 /// <param name="unaryGrammar">unary rules of the trained grammar</param>
 /// <param name="binaryGrammar">binary rules of the trained grammar</param>
 public DVModel(Options op, IIndex <string> stateIndex, UnaryGrammar unaryGrammar, BinaryGrammar binaryGrammar)
 {
     this.op = op;
     rand    = new Random(op.trainOptions.randomSeed);
     ReadWordVectors();
     // Binary matrices will be n x (2n+1), unary matrices will be n x (n+1)
     numRows = op.lexOptions.numHid;
     numCols = op.lexOptions.numHid;
     // Build one matrix for each basic category.
     // We assume that each state that has the same basic
     // category is using the same transformation matrix.
     // Use a TreeMap because we want values to be
     // sorted by key later on when building theta vectors
     binaryTransform     = TwoDimensionalMap.TreeMap();
     unaryTransform      = Generics.NewTreeMap();
     binaryScore         = TwoDimensionalMap.TreeMap();
     unaryScore          = Generics.NewTreeMap();
     numBinaryMatrices   = 0;
     numUnaryMatrices    = 0;
     binaryTransformSize = numRows * (numCols * 2 + 1);
     unaryTransformSize  = numRows * (numCols + 1);
     binaryScoreSize     = numCols;
     unaryScoreSize      = numCols;
     if (op.trainOptions.useContextWords)
     {
         binaryTransformSize += numRows * numCols * 2;
         unaryTransformSize  += numRows * numCols * 2;
     }
     identity = SimpleMatrix.Identity(numRows);
     foreach (UnaryRule unaryRule in unaryGrammar)
     {
         // only make one matrix per basic category; every refined child
         // state with the same basic category shares that matrix
         string childState = stateIndex.Get(unaryRule.child);
         string childBasic = BasicCategory(childState);
         AddRandomUnaryMatrix(childBasic);
     }
     foreach (BinaryRule binaryRule in binaryGrammar)
     {
         // only make one matrix per pair of basic categories; refined child
         // states with the same basic categories share that matrix
         string leftState  = stateIndex.Get(binaryRule.leftChild);
         string leftBasic  = BasicCategory(leftState);
         string rightState = stateIndex.Get(binaryRule.rightChild);
         string rightBasic = BasicCategory(rightState);
         AddRandomBinaryMatrix(leftBasic, rightBasic);
     }
 }
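For orientation, the sizes computed above follow the n x (2n+1) and n x (n+1) shapes noted in the comment, the extra column being a bias term. With numHid = 25, for example, each binary transform holds 25 x 51 = 1275 parameters and each unary transform 25 x 26 = 650, while each score vector has 25 entries; enabling useContextWords adds 25 x 25 x 2 = 1250 parameters to each transform size.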
 internal NullGrammarProjection(BinaryGrammar bg, UnaryGrammar ug)
 {
     this.ug = ug;
     this.bg = bg;
 }
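NullGrammarProjection just stores the two grammars and acts as the identity projection when no grammar coarsening is wanted. A minimal sketch of how a caller might consult it, assuming the port exposes PascalCased versions of the Java GrammarProjection methods (Project, SourceUG); the interface name and the state index value below are assumptions:

IGrammarProjection projection = new NullGrammarProjection(bg, ug);
int fineState = 7;                                   // some fine-grained state index (illustrative)
int coarseState = projection.Project(fineState);     // identity projection: returns 7 unchanged
UnaryGrammar sourceUg = projection.SourceUG();       // the unary grammar passed to the constructor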