public OutsideRuleFilter(BinaryGrammar bg, IIndex <string> stateIndex, IIndex <string> tagIndex)
        {
            this.tagIndex = tagIndex;
            int numStates = stateIndex.Size();

            numTags = tagIndex.Size();
            Allocate(numStates);
            for (int state = 0; state < numStates; state++)
            {
                string         stateStr = stateIndex.Get(state);
                IList <string> left     = new List <string>();
                IList <string> right    = new List <string>();
                if (!bg.IsSynthetic(state))
                {
                    RegisterRule(left, right, state);
                    continue;
                }
                bool           foundSemi = false;
                bool           foundDots = false;
                IList <string> array     = left;
                StringBuilder  sb        = new StringBuilder();
                for (int c = 0; c < stateStr.Length; c++)
                {
                    if (stateStr[c] == ':')
                    {
                        foundSemi = true;
                        continue;
                    }
                    if (!foundSemi)
                    {
                        continue;
                    }
                    if (stateStr[c] == ' ')
                    {
                        if (sb.Length > 0)
                        {
                            string str = sb.ToString();
                            if (!tagIndex.Contains(str))
                            {
                                str = null;
                            }
                            array.Add(str);
                            sb = new StringBuilder();
                        }
                        continue;
                    }
                    if (!foundDots && stateStr[c] == '.')
                    {
                        c        += 3;
                        foundDots = true;
                        array     = right;
                        continue;
                    }
                    sb.Append(stateStr[c]);
                }
                RegisterRule(left, right, state);
            }
        }
 internal BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection projection, IIndex <string> stateIndex
                          , IIndex <string> wordIndex, IIndex <string> tagIndex)
 {
     this.fscorer    = fscorer;
     this.projection = projection;
     this.dparser    = dparser;
     this.scorer     = scorer;
     this.bg         = bg;
     this.ug         = ug;
     this.dg         = dg;
     this.lex        = lex;
     this.op         = op;
     this.stateIndex = stateIndex;
     this.wordIndex  = wordIndex;
     this.tagIndex   = tagIndex;
     tempEdge        = new Edge(op.testOptions.exhaustiveTest);
     tempHook        = new Hook(op.testOptions.exhaustiveTest);
 }
Пример #3
0
        protected internal virtual IDictionary <string, TransducerGraph> ConvertGrammarToGraphs(Pair <UnaryGrammar, BinaryGrammar> grammar, ICollection <UnaryRule> unaryRules, ICollection <BinaryRule> binaryRules)
        {
            int           numRules = 0;
            UnaryGrammar  ug       = grammar.first;
            BinaryGrammar bg       = grammar.second;
            IDictionary <string, TransducerGraph> graphs = Generics.NewHashMap();

            // go through the BinaryGrammar and add everything
            foreach (BinaryRule rule in bg)
            {
                numRules++;
                bool wasAdded = AddOneBinaryRule(rule, graphs);
                if (!wasAdded)
                {
                    // add it for later, since we don't make graphs for these
                    binaryRules.Add(rule);
                }
            }
            // now we need to use the UnaryGrammar to
            // add start and end Arcs to the graphs
            foreach (UnaryRule rule_1 in ug)
            {
                numRules++;
                bool wasAdded = AddOneUnaryRule(rule_1, graphs);
                if (!wasAdded)
                {
                    // add it for later, since we don't make graphs for these
                    unaryRules.Add(rule_1);
                }
            }
            if (verbose)
            {
                System.Console.Out.WriteLine("Number of raw rules: " + numRules);
                System.Console.Out.WriteLine("Number of raw states: " + stateIndex.Size());
            }
            return(graphs);
        }
Пример #4
0
        /// <param name="graphs">a Map from String categories to TransducerGraph objects</param>
        /// <param name="unaryRules">is a Set of UnaryRule objects that we need to add</param>
        /// <param name="binaryRules">is a Set of BinaryRule objects that we need to add</param>
        /// <returns>a new Pair of UnaryGrammar, BinaryGrammar</returns>
        protected internal virtual Pair <UnaryGrammar, BinaryGrammar> ConvertGraphsToGrammar(ICollection <TransducerGraph> graphs, ICollection <UnaryRule> unaryRules, ICollection <BinaryRule> binaryRules)
        {
            // first go through all the existing rules and number them with new numberer
            newStateIndex = new HashIndex <string>();
            foreach (UnaryRule rule in unaryRules)
            {
                string parent = stateIndex.Get(rule.parent);
                rule.parent = newStateIndex.AddToIndex(parent);
                string child = stateIndex.Get(rule.child);
                rule.child = newStateIndex.AddToIndex(child);
            }
            foreach (BinaryRule rule_1 in binaryRules)
            {
                string parent = stateIndex.Get(rule_1.parent);
                rule_1.parent = newStateIndex.AddToIndex(parent);
                string leftChild = stateIndex.Get(rule_1.leftChild);
                rule_1.leftChild = newStateIndex.AddToIndex(leftChild);
                string rightChild = stateIndex.Get(rule_1.rightChild);
                rule_1.rightChild = newStateIndex.AddToIndex(rightChild);
            }
            // now go through the graphs and add the rules
            foreach (TransducerGraph graph in graphs)
            {
                object startNode = graph.GetStartNode();
                foreach (TransducerGraph.Arc arc in graph.GetArcs())
                {
                    // TODO: make sure these are the strings we're looking for
                    string source      = arc.GetSourceNode().ToString();
                    string target      = arc.GetTargetNode().ToString();
                    object input       = arc.GetInput();
                    string inputString = input.ToString();
                    double output      = ((double)arc.GetOutput());
                    if (source.Equals(startNode))
                    {
                        // make a UnaryRule
                        UnaryRule ur = new UnaryRule(newStateIndex.AddToIndex(target), newStateIndex.AddToIndex(inputString), SmartNegate(output));
                        unaryRules.Add(ur);
                    }
                    else
                    {
                        if (inputString.Equals(End) || inputString.Equals(Epsilon))
                        {
                            // make a UnaryRule
                            UnaryRule ur = new UnaryRule(newStateIndex.AddToIndex(target), newStateIndex.AddToIndex(source), SmartNegate(output));
                            unaryRules.Add(ur);
                        }
                        else
                        {
                            // make a BinaryRule
                            // figure out whether the input was generated on the left or right
                            int  length      = inputString.Length;
                            char leftOrRight = inputString[length - 1];
                            inputString = Sharpen.Runtime.Substring(inputString, 0, length - 1);
                            BinaryRule br;
                            if (leftOrRight == '<' || leftOrRight == '[')
                            {
                                br = new BinaryRule(newStateIndex.AddToIndex(target), newStateIndex.AddToIndex(inputString), newStateIndex.AddToIndex(source), SmartNegate(output));
                            }
                            else
                            {
                                if (leftOrRight == '>' || leftOrRight == ']')
                                {
                                    br = new BinaryRule(newStateIndex.AddToIndex(target), newStateIndex.AddToIndex(source), newStateIndex.AddToIndex(inputString), SmartNegate(output));
                                }
                                else
                                {
                                    throw new Exception("Arc input is in unexpected format: " + arc);
                                }
                            }
                            binaryRules.Add(br);
                        }
                    }
                }
            }
            // by now, the unaryRules and binaryRules Sets have old untouched and new rules with scores
            ClassicCounter <string> symbolCounter = new ClassicCounter <string>();

            if (outputType == RawCounts)
            {
                // now we take the sets of rules and turn them into grammars
                // the scores of the rules we are given are actually counts
                // so we count parent symbol occurrences
                foreach (UnaryRule rule_2 in unaryRules)
                {
                    symbolCounter.IncrementCount(newStateIndex.Get(rule_2.parent), rule_2.score);
                }
                foreach (BinaryRule rule_3 in binaryRules)
                {
                    symbolCounter.IncrementCount(newStateIndex.Get(rule_3.parent), rule_3.score);
                }
            }
            // now we put the rules in the grammars
            int numStates = newStateIndex.Size();
            // this should be smaller than last one
            int           numRules = 0;
            UnaryGrammar  ug       = new UnaryGrammar(newStateIndex);
            BinaryGrammar bg       = new BinaryGrammar(newStateIndex);

            foreach (UnaryRule rule_4 in unaryRules)
            {
                if (outputType == RawCounts)
                {
                    double count = symbolCounter.GetCount(newStateIndex.Get(rule_4.parent));
                    rule_4.score = (float)Math.Log(rule_4.score / count);
                }
                ug.AddRule(rule_4);
                numRules++;
            }
            foreach (BinaryRule rule_5 in binaryRules)
            {
                if (outputType == RawCounts)
                {
                    double count = symbolCounter.GetCount(newStateIndex.Get(rule_5.parent));
                    rule_5.score = (float)Math.Log((rule_5.score - op.trainOptions.ruleDiscount) / count);
                }
                bg.AddRule(rule_5);
                numRules++;
            }
            if (verbose)
            {
                System.Console.Out.WriteLine("Number of minimized rules: " + numRules);
                System.Console.Out.WriteLine("Number of minimized states: " + newStateIndex.Size());
            }
            ug.PurgeRules();
            bg.SplitRules();
            return(new Pair <UnaryGrammar, BinaryGrammar>(ug, bg));
        }
Пример #5
0
        /* some documentation for Roger's convenience
         * {pcfg,dep,combo}{PE,DE,TE} are precision/dep/tagging evals for the models
         *
         * parser is the PCFG parser
         * dparser is the dependency parser
         * bparser is the combining parser
         *
         * during testing:
         * tree is the test tree (gold tree)
         * binaryTree is the gold tree binarized
         * tree2b is the best PCFG paser, binarized
         * tree2 is the best PCFG parse (debinarized)
         * tree3 is the dependency parse, binarized
         * tree3db is the dependency parser, debinarized
         * tree4 is the best combo parse, binarized and then debinarized
         * tree4b is the best combo parse, binarized
         */
        public static void Main(string[] args)
        {
            Options op = new Options(new EnglishTreebankParserParams());

            // op.tlpParams may be changed to something else later, so don't use it till
            // after options are parsed.
            StringUtils.LogInvocationString(log, args);
            string path          = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
            int    trainLow      = 200;
            int    trainHigh     = 2199;
            int    testLow       = 2200;
            int    testHigh      = 2219;
            string serializeFile = null;
            int    i             = 0;

            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-path") && (i + 1 < args.Length))
                {
                    path = args[i + 1];
                    i   += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-train") && (i + 2 < args.Length))
                    {
                        trainLow  = System.Convert.ToInt32(args[i + 1]);
                        trainHigh = System.Convert.ToInt32(args[i + 2]);
                        i        += 3;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-test") && (i + 2 < args.Length))
                        {
                            testLow  = System.Convert.ToInt32(args[i + 1]);
                            testHigh = System.Convert.ToInt32(args[i + 2]);
                            i       += 3;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-serialize") && (i + 1 < args.Length))
                            {
                                serializeFile = args[i + 1];
                                i            += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tLPP") && (i + 1 < args.Length))
                                {
                                    try
                                    {
                                        op.tlpParams = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                                    }
                                    catch (TypeLoadException e)
                                    {
                                        log.Info("Class not found: " + args[i + 1]);
                                        throw new Exception(e);
                                    }
                                    catch (InstantiationException e)
                                    {
                                        log.Info("Couldn't instantiate: " + args[i + 1] + ": " + e.ToString());
                                        throw new Exception(e);
                                    }
                                    catch (MemberAccessException e)
                                    {
                                        log.Info("illegal access" + e);
                                        throw new Exception(e);
                                    }
                                    i += 2;
                                }
                                else
                                {
                                    if (args[i].Equals("-encoding"))
                                    {
                                        // sets encoding for TreebankLangParserParams
                                        op.tlpParams.SetInputEncoding(args[i + 1]);
                                        op.tlpParams.SetOutputEncoding(args[i + 1]);
                                        i += 2;
                                    }
                                    else
                                    {
                                        i = op.SetOptionOrWarn(args, i);
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // System.out.println(tlpParams.getClass());
            ITreebankLanguagePack tlp = op.tlpParams.TreebankLanguagePack();

            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
            //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
            PrintWriter pw = op.tlpParams.Pw();

            op.testOptions.Display();
            op.trainOptions.Display();
            op.Display();
            op.tlpParams.Display();
            // setup tree transforms
            Treebank       trainTreebank = op.tlpParams.MemoryTreebank();
            MemoryTreebank testTreebank  = op.tlpParams.TestMemoryTreebank();

            // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
            // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
            // blippTreebank.loadPath(blippPath, "", true);
            Timing.StartTime();
            log.Info("Reading trees...");
            testTreebank.LoadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
            if (op.testOptions.increasingLength)
            {
                testTreebank.Sort(new TreeLengthComparator());
            }
            trainTreebank.LoadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
            Timing.Tick("done.");
            log.Info("Binarizing trees...");
            TreeAnnotatorAndBinarizer binarizer;

            if (!op.trainOptions.leftToRight)
            {
                binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            else
            {
                binarizer = new TreeAnnotatorAndBinarizer(op.tlpParams.HeadFinder(), new LeftHeadFinder(), op.tlpParams, op.forceCNF, !op.trainOptions.OutsideFactor(), true, op);
            }
            CollinsPuncTransformer collinsPuncTransformer = null;

            if (op.trainOptions.collinsPunc)
            {
                collinsPuncTransformer = new CollinsPuncTransformer(tlp);
            }
            ITreeTransformer debinarizer      = new Debinarizer(op.forceCNF);
            IList <Tree>     binaryTrainTrees = new List <Tree>();

            if (op.trainOptions.selectiveSplit)
            {
                op.trainOptions.splitters = ParentAnnotationStats.GetSplitCategories(trainTreebank, op.trainOptions.tagSelectiveSplit, 0, op.trainOptions.selectiveSplitCutOff, op.trainOptions.tagSelectiveSplitCutOff, op.tlpParams.TreebankLanguagePack());
                if (op.trainOptions.deleteSplitters != null)
                {
                    IList <string> deleted = new List <string>();
                    foreach (string del in op.trainOptions.deleteSplitters)
                    {
                        string baseDel    = tlp.BasicCategory(del);
                        bool   checkBasic = del.Equals(baseDel);
                        for (IEnumerator <string> it = op.trainOptions.splitters.GetEnumerator(); it.MoveNext();)
                        {
                            string elem     = it.Current;
                            string baseElem = tlp.BasicCategory(elem);
                            bool   delStr   = checkBasic && baseElem.Equals(baseDel) || elem.Equals(del);
                            if (delStr)
                            {
                                it.Remove();
                                deleted.Add(elem);
                            }
                        }
                    }
                    log.Info("Removed from vertical splitters: " + deleted);
                }
            }
            if (op.trainOptions.selectivePostSplit)
            {
                ITreeTransformer myTransformer = new TreeAnnotator(op.tlpParams.HeadFinder(), op.tlpParams, op);
                Treebank         annotatedTB   = trainTreebank.Transform(myTransformer);
                op.trainOptions.postSplitters = ParentAnnotationStats.GetSplitCategories(annotatedTB, true, 0, op.trainOptions.selectivePostSplitCutOff, op.trainOptions.tagSelectivePostSplitCutOff, op.tlpParams.TreebankLanguagePack());
            }
            if (op.trainOptions.hSelSplit)
            {
                binarizer.SetDoSelectiveSplit(false);
                foreach (Tree tree in trainTreebank)
                {
                    if (op.trainOptions.collinsPunc)
                    {
                        tree = collinsPuncTransformer.TransformTree(tree);
                    }
                    //tree.pennPrint(tlpParams.pw());
                    tree = binarizer.TransformTree(tree);
                }
                //binaryTrainTrees.add(tree);
                binarizer.SetDoSelectiveSplit(true);
            }
            foreach (Tree tree_1 in trainTreebank)
            {
                if (op.trainOptions.collinsPunc)
                {
                    tree_1 = collinsPuncTransformer.TransformTree(tree_1);
                }
                tree_1 = binarizer.TransformTree(tree_1);
                binaryTrainTrees.Add(tree_1);
            }
            if (op.testOptions.verbose)
            {
                binarizer.DumpStats();
            }
            IList <Tree> binaryTestTrees = new List <Tree>();

            foreach (Tree tree_2 in testTreebank)
            {
                if (op.trainOptions.collinsPunc)
                {
                    tree_2 = collinsPuncTransformer.TransformTree(tree_2);
                }
                tree_2 = binarizer.TransformTree(tree_2);
                binaryTestTrees.Add(tree_2);
            }
            Timing.Tick("done.");
            // binarization
            BinaryGrammar      bg = null;
            UnaryGrammar       ug = null;
            IDependencyGrammar dg = null;
            // DependencyGrammar dgBLIPP = null;
            ILexicon        lex        = null;
            IIndex <string> stateIndex = new HashIndex <string>();
            // extract grammars
            IExtractor <Pair <UnaryGrammar, BinaryGrammar> > bgExtractor = new BinaryGrammarExtractor(op, stateIndex);

            //Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
            // Extractor lexExtractor = new LexiconExtractor();
            //Extractor dgExtractor = new DependencyMemGrammarExtractor();
            if (op.doPCFG)
            {
                log.Info("Extracting PCFG...");
                Pair <UnaryGrammar, BinaryGrammar> bgug = null;
                if (op.trainOptions.cheatPCFG)
                {
                    IList <Tree> allTrees = new List <Tree>(binaryTrainTrees);
                    Sharpen.Collections.AddAll(allTrees, binaryTestTrees);
                    bgug = bgExtractor.Extract(allTrees);
                }
                else
                {
                    bgug = bgExtractor.Extract(binaryTrainTrees);
                }
                bg = bgug.second;
                bg.SplitRules();
                ug = bgug.first;
                ug.PurgeRules();
                Timing.Tick("done.");
            }
            log.Info("Extracting Lexicon...");
            IIndex <string> wordIndex = new HashIndex <string>();
            IIndex <string> tagIndex  = new HashIndex <string>();

            lex = op.tlpParams.Lex(op, wordIndex, tagIndex);
            lex.InitializeTraining(binaryTrainTrees.Count);
            lex.Train(binaryTrainTrees);
            lex.FinishTraining();
            Timing.Tick("done.");
            if (op.doDep)
            {
                log.Info("Extracting Dependencies...");
                binaryTrainTrees.Clear();
                IExtractor <IDependencyGrammar> dgExtractor = new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
                // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams,true));
                // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
                //dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new TransformTreeDependency(tlpParams));
                //dg = (DependencyGrammar) dgExtractor.extract(new ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new TransformTreeDependency(tlpParams));
                // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
                dg = dgExtractor.Extract(binaryTrainTrees);
                //uses information whether the words are known or not, discards unknown words
                Timing.Tick("done.");
                //System.out.print("Extracting Unknown Word Model...");
                //UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
                //Timing.tick("done.");
                System.Console.Out.Write("Tuning Dependency Model...");
                dg.Tune(binaryTestTrees);
                //System.out.println("TUNE DEPS: "+tuneDeps);
                Timing.Tick("done.");
            }
            BinaryGrammar      boundBG = bg;
            UnaryGrammar       boundUG = ug;
            IGrammarProjection gp      = new NullGrammarProjection(bg, ug);

            // serialization
            if (serializeFile != null)
            {
                log.Info("Serializing parser...");
                LexicalizedParser parser = new LexicalizedParser(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op);
                parser.SaveParserToSerialized(serializeFile);
                Timing.Tick("done.");
            }
            // test: pcfg-parse and output
            ExhaustivePCFGParser parser_1 = null;

            if (op.doPCFG)
            {
                parser_1 = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
            }
            ExhaustiveDependencyParser dparser = ((op.doDep && !op.testOptions.useFastFactored) ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex) : null);
            IScorer scorer = (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser_1, gp, op), dparser) : null);
            //Scorer scorer = parser;
            BiLexPCFGParser bparser = null;

            if (op.doPCFG && op.doDep)
            {
                bparser = (op.testOptions.useN5) ? new BiLexPCFGParser.N5BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex) : new BiLexPCFGParser(scorer, parser_1, dparser, bg, ug, dg, lex, op, gp, stateIndex
                                                                                                                                                                                                    , wordIndex, tagIndex);
            }
            Evalb        pcfgPE         = new Evalb("pcfg  PE", true);
            Evalb        comboPE        = new Evalb("combo PE", true);
            AbstractEval pcfgCB         = new Evalb.CBEval("pcfg  CB", true);
            AbstractEval pcfgTE         = new TaggingEval("pcfg  TE");
            AbstractEval comboTE        = new TaggingEval("combo TE");
            AbstractEval pcfgTEnoPunct  = new TaggingEval("pcfg nopunct TE");
            AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
            AbstractEval depTE          = new TaggingEval("depnd TE");
            AbstractEval depDE          = new UnlabeledAttachmentEval("depnd DE", true, null, tlp.PunctuationWordRejectFilter());
            AbstractEval comboDE        = new UnlabeledAttachmentEval("combo DE", true, null, tlp.PunctuationWordRejectFilter());

            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(op.tlpParams);
            }
            // int[] countByLength = new int[op.testOptions.maxLength+1];
            // Use a reflection ruse, so one can run this without needing the
            // tagger.  Using a function rather than a MaxentTagger means we
            // can distribute a version of the parser that doesn't include the
            // entire tagger.
            IFunction <IList <IHasWord>, List <TaggedWord> > tagger = null;

            if (op.testOptions.preTag)
            {
                try
                {
                    Type[]   argsClass = new Type[] { typeof(string) };
                    object[] arguments = new object[] { op.testOptions.taggerSerializedFile };
                    tagger = (IFunction <IList <IHasWord>, List <TaggedWord> >)Sharpen.Runtime.GetType("edu.stanford.nlp.tagger.maxent.MaxentTagger").GetConstructor(argsClass).NewInstance(arguments);
                }
                catch (Exception e)
                {
                    log.Info(e);
                    log.Info("Warning: No pretagging of sentences will be done.");
                }
            }
            for (int tNum = 0; tNum < ttSize; tNum++)
            {
                Tree tree        = testTreebank[tNum];
                int  testTreeLen = tree_2.Yield().Count;
                if (testTreeLen > op.testOptions.maxLength)
                {
                    continue;
                }
                Tree binaryTree = binaryTestTrees[tNum];
                // countByLength[testTreeLen]++;
                System.Console.Out.WriteLine("-------------------------------------");
                System.Console.Out.WriteLine("Number: " + (tNum + 1));
                System.Console.Out.WriteLine("Length: " + testTreeLen);
                //tree.pennPrint(pw);
                // System.out.println("XXXX The binary tree is");
                // binaryTree.pennPrint(pw);
                //System.out.println("Here are the tags in the lexicon:");
                //System.out.println(lex.showTags());
                //System.out.println("Here's the tagnumberer:");
                //System.out.println(Numberer.getGlobalNumberer("tags").toString());
                long timeMil1 = Runtime.CurrentTimeMillis();
                Timing.Tick("Starting parse.");
                if (op.doPCFG)
                {
                    //log.info(op.testOptions.forceTags);
                    if (op.testOptions.forceTags)
                    {
                        if (tagger != null)
                        {
                            //System.out.println("Using a tagger to set tags");
                            //System.out.println("Tagged sentence as: " + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
                            parser_1.Parse(AddLast(tagger.Apply(CutLast(Wordify(binaryTree.Yield())))));
                        }
                        else
                        {
                            //System.out.println("Forcing tags to match input.");
                            parser_1.Parse(CleanTags(binaryTree.TaggedYield(), tlp));
                        }
                    }
                    else
                    {
                        // System.out.println("XXXX Parsing " + binaryTree.yield());
                        parser_1.Parse(binaryTree.YieldHasWord());
                    }
                }
                //Timing.tick("Done with pcfg phase.");
                if (op.doDep)
                {
                    dparser.Parse(binaryTree.YieldHasWord());
                }
                //Timing.tick("Done with dependency phase.");
                bool bothPassed = false;
                if (op.doPCFG && op.doDep)
                {
                    bothPassed = bparser.Parse(binaryTree.YieldHasWord());
                }
                //Timing.tick("Done with combination phase.");
                long timeMil2 = Runtime.CurrentTimeMillis();
                long elapsed  = timeMil2 - timeMil1;
                log.Info("Time: " + ((int)(elapsed / 100)) / 10.00 + " sec.");
                //System.out.println("PCFG Best Parse:");
                Tree tree2b = null;
                Tree tree2  = null;
                //System.out.println("Got full best parse...");
                if (op.doPCFG)
                {
                    tree2b = parser_1.GetBestParse();
                    tree2  = debinarizer.TransformTree(tree2b);
                }
                //System.out.println("Debinarized parse...");
                //tree2.pennPrint();
                //System.out.println("DepG Best Parse:");
                Tree tree3   = null;
                Tree tree3db = null;
                if (op.doDep)
                {
                    tree3 = dparser.GetBestParse();
                    // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
                    tree3db = debinarizer.TransformTree(tree3);
                    tree3.PennPrint(pw);
                }
                //tree.pennPrint();
                //((Tree)binaryTrainTrees.get(tNum)).pennPrint();
                //System.out.println("Combo Best Parse:");
                Tree tree4 = null;
                if (op.doPCFG && op.doDep)
                {
                    try
                    {
                        tree4 = bparser.GetBestParse();
                        if (tree4 == null)
                        {
                            tree4 = tree2b;
                        }
                    }
                    catch (ArgumentNullException)
                    {
                        log.Info("Blocked, using PCFG parse!");
                        tree4 = tree2b;
                    }
                }
                if (op.doPCFG && !bothPassed)
                {
                    tree4 = tree2b;
                }
                //tree4.pennPrint();
                if (op.doDep)
                {
                    depDE.Evaluate(tree3, binaryTree, pw);
                    depTE.Evaluate(tree3db, tree_2, pw);
                }
                ITreeTransformer tc      = op.tlpParams.Collinizer();
                ITreeTransformer tcEvalb = op.tlpParams.CollinizerEvalb();
                if (op.doPCFG)
                {
                    // System.out.println("XXXX Best PCFG was: ");
                    // tree2.pennPrint();
                    // System.out.println("XXXX Transformed best PCFG is: ");
                    // tc.transformTree(tree2).pennPrint();
                    //System.out.println("True Best Parse:");
                    //tree.pennPrint();
                    //tc.transformTree(tree).pennPrint();
                    pcfgPE.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    pcfgCB.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    Tree tree4b = null;
                    if (op.doDep)
                    {
                        comboDE.Evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
                        tree4b = tree4;
                        tree4  = debinarizer.TransformTree(tree4);
                        if (op.nodePrune)
                        {
                            NodePruner np = new NodePruner(parser_1, debinarizer);
                            tree4 = np.Prune(tree4);
                        }
                        //tree4.pennPrint();
                        comboPE.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw);
                    }
                    //pcfgTE.evaluate(tree2, tree);
                    pcfgTE.Evaluate(tcEvalb.TransformTree(tree2), tcEvalb.TransformTree(tree_2), pw);
                    pcfgTEnoPunct.Evaluate(tc.TransformTree(tree2), tc.TransformTree(tree_2), pw);
                    if (op.doDep)
                    {
                        comboTE.Evaluate(tcEvalb.TransformTree(tree4), tcEvalb.TransformTree(tree_2), pw);
                        comboTEnoPunct.Evaluate(tc.TransformTree(tree4), tc.TransformTree(tree_2), pw);
                    }
                    System.Console.Out.WriteLine("PCFG only: " + parser_1.ScoreBinarizedTree(tree2b, 0));
                    //tc.transformTree(tree2).pennPrint();
                    tree2.PennPrint(pw);
                    if (op.doDep)
                    {
                        System.Console.Out.WriteLine("Combo: " + parser_1.ScoreBinarizedTree(tree4b, 0));
                        // tc.transformTree(tree4).pennPrint(pw);
                        tree4.PennPrint(pw);
                    }
                    System.Console.Out.WriteLine("Correct:" + parser_1.ScoreBinarizedTree(binaryTree, 0));

                    /*
                     * if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
                     * System.out.println("SCORE INVERSION");
                     * parser.validateBinarizedTree(binaryTree,0);
                     * }
                     */
                    tree_2.PennPrint(pw);
                }
                // end if doPCFG
                if (op.testOptions.evalb)
                {
                    if (op.doPCFG && op.doDep)
                    {
                        EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree4));
                    }
                    else
                    {
                        if (op.doPCFG)
                        {
                            EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree2));
                        }
                        else
                        {
                            if (op.doDep)
                            {
                                EvalbFormatWriter.WriteEVALBline(tcEvalb.TransformTree(tree_2), tcEvalb.TransformTree(tree3db));
                            }
                        }
                    }
                }
            }
            // end for each tree in test treebank
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            // op.testOptions.display();
            if (op.doPCFG)
            {
                pcfgPE.Display(false, pw);
                System.Console.Out.WriteLine("Grammar size: " + stateIndex.Size());
                pcfgCB.Display(false, pw);
                if (op.doDep)
                {
                    comboPE.Display(false, pw);
                }
                pcfgTE.Display(false, pw);
                pcfgTEnoPunct.Display(false, pw);
                if (op.doDep)
                {
                    comboTE.Display(false, pw);
                    comboTEnoPunct.Display(false, pw);
                }
            }
            if (op.doDep)
            {
                depTE.Display(false, pw);
                depDE.Display(false, pw);
            }
            if (op.doPCFG && op.doDep)
            {
                comboDE.Display(false, pw);
            }
        }
Пример #6
0
 public IterativeCKYPCFGParser(BinaryGrammar bg, UnaryGrammar ug, ILexicon lex, Options op, IIndex <string> stateIndex, IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(bg, ug, lex, op, stateIndex, wordIndex, tagIndex)
 {
 }
 internal NullGrammarProjection(BinaryGrammar bg, UnaryGrammar ug)
 {
     this.ug = ug;
     this.bg = bg;
 }
 internal N5BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser leach, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IGrammarProjection proj, IIndex <string> stateIndex
                            , IIndex <string> wordIndex, IIndex <string> tagIndex)
     : base(scorer, fscorer, leach, bg, ug, dg, lex, op, proj, stateIndex, wordIndex, tagIndex)
 {
 }
 public BiLexPCFGParser(IScorer scorer, ExhaustivePCFGParser fscorer, ExhaustiveDependencyParser dparser, BinaryGrammar bg, UnaryGrammar ug, IDependencyGrammar dg, ILexicon lex, Options op, IIndex <string> stateIndex, IIndex <string> wordIndex,
                        IIndex <string> tagIndex)
     : this(scorer, fscorer, dparser, bg, ug, dg, lex, op, new NullGrammarProjection(bg, ug), stateIndex, wordIndex, tagIndex)
 {
 }
        internal LexicalizedParserQuery(LexicalizedParser parser)
        {
            this.op = parser.GetOp();
            BinaryGrammar      bg         = parser.bg;
            UnaryGrammar       ug         = parser.ug;
            ILexicon           lex        = parser.lex;
            IDependencyGrammar dg         = parser.dg;
            IIndex <string>    stateIndex = parser.stateIndex;
            IIndex <string>    wordIndex  = new DeltaIndex <string>(parser.wordIndex);
            IIndex <string>    tagIndex   = parser.tagIndex;

            this.debinarizer     = new Debinarizer(op.forceCNF);
            this.boundaryRemover = new BoundaryRemover();
            if (op.doPCFG)
            {
                if (op.testOptions.iterativeCKY)
                {
                    pparser = new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
                else
                {
                    pparser = new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
            }
            else
            {
                pparser = null;
            }
            if (op.doDep)
            {
                dg.SetLexicon(lex);
                if (!op.testOptions.useFastFactored)
                {
                    dparser = new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
                }
                else
                {
                    dparser = null;
                }
            }
            else
            {
                dparser = null;
            }
            if (op.doDep && op.doPCFG)
            {
                if (op.testOptions.useFastFactored)
                {
                    MLEDependencyGrammar mledg = (MLEDependencyGrammar)dg;
                    int numToFind = 1;
                    if (op.testOptions.printFactoredKGood > 0)
                    {
                        numToFind = op.testOptions.printFactoredKGood;
                    }
                    bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
                }
                else
                {
                    IScorer scorer = new TwinScorer(pparser, dparser);
                    //Scorer scorer = parser;
                    if (op.testOptions.useN5)
                    {
                        bparser = new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                    else
                    {
                        bparser = new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                }
            }
            else
            {
                bparser = null;
            }
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
        }