/// <summary>
        /// An example command line for training a new parser:
        /// <br />
        /// nohup java -mx6g edu.stanford.nlp.parser.dvparser.DVParser -cachedTrees /scr/nlp/data/dvparser/wsj/cached.wsj.train.simple.ser.gz -train -testTreebank  /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj/22 2200-2219 -debugOutputFrequency 400 -nofilter -trainingThreads 5 -parser /u/nlp/data/lexparser/wsjPCFG.nocompact.simple.ser.gz -trainingIterations 40 -batchSize 25 -model /scr/nlp/data/dvparser/wsj/wsj.combine.v2.ser.gz -unkWord "*UNK*" -dvCombineCategories &gt; /scr/nlp/data/dvparser/wsj/wsj.combine.v2.out 2&gt;&amp;1 &amp;
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                Help();
                System.Environment.Exit(2);
            }
            log.Info("Running DVParser with arguments:");
            foreach (string arg in args)
            {
                log.Info("  " + arg);
            }
            log.Info();
            string         parserPath           = null;
            string         trainTreebankPath    = null;
            IFileFilter    trainTreebankFilter  = null;
            string         cachedTrainTreesPath = null;
            bool           runGradientCheck     = false;
            bool           runTraining          = false;
            string         testTreebankPath     = null;
            IFileFilter    testTreebankFilter   = null;
            string         initialModelPath     = null;
            string         modelPath            = null;
            bool           filter            = true;
            string         resultsRecordPath = null;
            IList <string> unusedArgs        = new List <string>();
            // These parameters can be null or 0 if the model was not
            // serialized with the new parameters.  Setting the options at the
            // command line will override these defaults.
            // TODO: if/when we integrate back into the main branch and
            // rebuild models, we can get rid of this
            IList <string> argsWithDefaults = new List <string>(Arrays.AsList(new string[] { "-wordVectorFile", Options.LexOptions.DefaultWordVectorFile, "-dvKBest", int.ToString(TrainOptions.DefaultKBest), "-batchSize", int.ToString(TrainOptions.DefaultBatchSize
                                                                                                                                                                                                                                          ), "-trainingIterations", int.ToString(TrainOptions.DefaultTrainingIterations), "-qnIterationsPerBatch", int.ToString(TrainOptions.DefaultQnIterationsPerBatch), "-regCost", double.ToString(TrainOptions.DefaultRegcost), "-learningRate", double
                                                                                             .ToString(TrainOptions.DefaultLearningRate), "-deltaMargin", double.ToString(TrainOptions.DefaultDeltaMargin), "-unknownNumberVector", "-unknownDashedWordVectors", "-unknownCapsVector", "-unknownchinesepercentvector", "-unknownchinesenumbervector"
                                                                                             , "-unknownchineseyearvector", "-unkWord", "*UNK*", "-transformMatrixType", "DIAGONAL", "-scalingForInit", double.ToString(TrainOptions.DefaultScalingForInit), "-trainWordVectors" }));

            Sharpen.Collections.AddAll(argsWithDefaults, Arrays.AsList(args));
            args = Sharpen.Collections.ToArray(argsWithDefaults, new string[argsWithDefaults.Count]);
            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser"))
                {
                    parserPath = args[argIndex + 1];
                    argIndex  += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                        {
                            Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                            argIndex            = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                            trainTreebankPath   = treebankDescription.First();
                            trainTreebankFilter = treebankDescription.Second();
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-cachedTrees"))
                            {
                                cachedTrainTreesPath = args[argIndex + 1];
                                argIndex            += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-runGradientCheck"))
                                {
                                    runGradientCheck = true;
                                    argIndex++;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                                    {
                                        runTraining = true;
                                        argIndex++;
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                                        {
                                            modelPath = args[argIndex + 1];
                                            argIndex += 2;
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-nofilter"))
                                            {
                                                filter = false;
                                                argIndex++;
                                            }
                                            else
                                            {
                                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining"))
                                                {
                                                    runTraining      = true;
                                                    filter           = false;
                                                    initialModelPath = args[argIndex + 1];
                                                    argIndex        += 2;
                                                }
                                                else
                                                {
                                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-resultsRecord"))
                                                    {
                                                        resultsRecordPath = args[argIndex + 1];
                                                        argIndex         += 2;
                                                    }
                                                    else
                                                    {
                                                        unusedArgs.Add(args[argIndex++]);
                                                    }
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            if (parserPath == null && modelPath == null)
            {
                throw new ArgumentException("Must supply either a base parser model with -parser or a serialized DVParser with -model");
            }
            if (!runTraining && modelPath == null && !runGradientCheck)
            {
                throw new ArgumentException("Need to either train a new model, run the gradient check or specify a model to load with -model");
            }
            string[] newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            Edu.Stanford.Nlp.Parser.Dvparser.DVParser dvparser = null;
            LexicalizedParser lexparser = null;

            if (initialModelPath != null)
            {
                lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(initialModelPath, newArgs));
                DVModel model = GetModelFromLexicalizedParser(lexparser);
                dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
            }
            else
            {
                if (runTraining || runGradientCheck)
                {
                    lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserPath, newArgs));
                    dvparser  = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(lexparser);
                }
                else
                {
                    if (modelPath != null)
                    {
                        lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
                        DVModel model = GetModelFromLexicalizedParser(lexparser);
                        dvparser = new Edu.Stanford.Nlp.Parser.Dvparser.DVParser(model, lexparser);
                    }
                }
            }
            IList <Tree> trainSentences = new List <Tree>();
            IdentityHashMap <Tree, byte[]> trainCompressedParses = Generics.NewIdentityHashMap();

            if (cachedTrainTreesPath != null)
            {
                foreach (string path in cachedTrainTreesPath.Split(","))
                {
                    IList <Pair <Tree, byte[]> > cache = IOUtils.ReadObjectFromFile(path);
                    foreach (Pair <Tree, byte[]> pair in cache)
                    {
                        trainSentences.Add(pair.First());
                        trainCompressedParses[pair.First()] = pair.Second();
                    }
                    log.Info("Read in " + cache.Count + " trees from " + path);
                }
            }
            if (trainTreebankPath != null)
            {
                // TODO: make the transformer a member of the model?
                ITreeTransformer transformer = BuildTrainTransformer(dvparser.GetOp());
                Treebank         treebank    = dvparser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(trainTreebankPath, trainTreebankFilter);
                treebank = treebank.Transform(transformer);
                log.Info("Read in " + treebank.Count + " trees from " + trainTreebankPath);
                CacheParseHypotheses cacher = new CacheParseHypotheses(dvparser.parser);
                CacheParseHypotheses.CacheProcessor processor = new CacheParseHypotheses.CacheProcessor(cacher, lexparser, dvparser.op.trainOptions.dvKBest, transformer);
                foreach (Tree tree in treebank)
                {
                    trainSentences.Add(tree);
                    trainCompressedParses[tree] = processor.Process(tree).second;
                }
                //System.out.println(tree);
                log.Info("Finished parsing " + treebank.Count + " trees, getting " + dvparser.op.trainOptions.dvKBest + " hypotheses each");
            }
            if ((runTraining || runGradientCheck) && filter)
            {
                log.Info("Filtering rules for the given training set");
                dvparser.dvModel.SetRulesForTrainingSet(trainSentences, trainCompressedParses);
                log.Info("Done filtering rules; " + dvparser.dvModel.numBinaryMatrices + " binary matrices, " + dvparser.dvModel.numUnaryMatrices + " unary matrices, " + dvparser.dvModel.wordVectors.Count + " word vectors");
            }
            //dvparser.dvModel.printAllMatrices();
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = dvparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            //    runGradientCheck= true;
            if (runGradientCheck)
            {
                log.Info("Running gradient check on " + trainSentences.Count + " trees");
                dvparser.RunGradientCheck(trainSentences, trainCompressedParses);
            }
            if (runTraining)
            {
                log.Info("Training the RNN parser");
                log.Info("Current train options: " + dvparser.GetOp().trainOptions);
                dvparser.Train(trainSentences, trainCompressedParses, testTreebank, modelPath, resultsRecordPath);
                if (modelPath != null)
                {
                    dvparser.SaveModel(modelPath);
                }
            }
            if (testTreebankPath != null)
            {
                EvaluateTreebank evaluator = new EvaluateTreebank(dvparser.AttachModelToLexicalizedParser());
                evaluator.TestOnTreebank(testTreebank);
            }
            log.Info("Successfully ran DVParser");
        }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            string         outputPath         = null;
            string         inputPath          = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                    {
                        outputPath = args[argIndex + 1];
                        argIndex  += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                        {
                            inputPath = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                            {
                                Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                                argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                                testTreebankPath   = treebankDescription.First();
                                testTreebankFilter = treebankDescription.Second();
                            }
                            else
                            {
                                unusedArgs.Add(args[argIndex++]);
                            }
                        }
                    }
                }
            }
            string[]          newArgs = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser parser  = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
            DVModel           model   = DVParser.GetModelFromLexicalizedParser(parser);
            File outputFile           = new File(outputPath);

            FileSystem.CheckNotExistsOrFail(outputFile);
            FileSystem.MkdirOrFail(outputFile);
            int count = 0;

            if (inputPath != null)
            {
                Reader input = new BufferedReader(new FileReader(inputPath));
                DocumentPreprocessor processor = new DocumentPreprocessor(input);
                foreach (IList <IHasWord> sentence in processor)
                {
                    count++;
                    // index from 1
                    IParserQuery pq = parser.ParserQuery();
                    if (!(pq is RerankingParserQuery))
                    {
                        throw new ArgumentException("Expected a RerankingParserQuery");
                    }
                    RerankingParserQuery rpq = (RerankingParserQuery)pq;
                    if (!rpq.Parse(sentence))
                    {
                        throw new Exception("Unparsable sentence: " + sentence);
                    }
                    IRerankerQuery reranker = rpq.RerankerQuery();
                    if (!(reranker is DVModelReranker.Query))
                    {
                        throw new ArgumentException("Expected a DVModelReranker");
                    }
                    DeepTree deepTree = ((DVModelReranker.Query)reranker).GetDeepTrees()[0];
                    IdentityHashMap <Tree, SimpleMatrix> vectors = deepTree.GetVectors();
                    foreach (KeyValuePair <Tree, SimpleMatrix> entry in vectors)
                    {
                        log.Info(entry.Key + "   " + entry.Value);
                    }
                    FileWriter     fout = new FileWriter(outputPath + File.separator + "sentence" + count + ".txt");
                    BufferedWriter bout = new BufferedWriter(fout);
                    bout.Write(SentenceUtils.ListToString(sentence));
                    bout.NewLine();
                    bout.Write(deepTree.GetTree().ToString());
                    bout.NewLine();
                    foreach (IHasWord word in sentence)
                    {
                        OutputMatrix(bout, model.GetWordVector(word.Word()));
                    }
                    Tree rootTree = FindRootTree(vectors);
                    OutputTreeMatrices(bout, rootTree, vectors);
                    bout.Flush();
                    fout.Close();
                }
            }
        }
Пример #3
0
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         dvmodelFile        = null;
            string         lexparserFile      = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-lexparser"))
                {
                    lexparserFile = args[argIndex + 1];
                    argIndex     += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex++]);
                    }
                }
            }
            log.Info("Loading lexparser from: " + lexparserFile);
            string[]          newArgs   = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(lexparserFile, newArgs));

            log.Info("... done");
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            double[] labelResults = new double[weights.Length];
            double[] tagResults   = new double[weights.Length];
            for (int i = 0; i < weights.Length; ++i)
            {
                lexparser.GetOp().baseParserWeight = weights[i];
                EvaluateTreebank evaluator         = new EvaluateTreebank(lexparser);
                evaluator.TestOnTreebank(testTreebank);
                labelResults[i] = evaluator.GetLBScore();
                tagResults[i]   = evaluator.GetTagScore();
            }
            for (int i_1 = 0; i_1 < weights.Length; ++i_1)
            {
                log.Info("LexicalizedParser weight " + weights[i_1] + ": labeled " + labelResults[i_1] + " tag " + tagResults[i_1]);
            }
        }
Пример #4
0
        public static void Main(string[] args)
        {
            IList <string> remainingArgs = Generics.NewArrayList();
            IList <Pair <string, IFileFilter> > trainTreebankPath = null;
            Pair <string, IFileFilter>          testTreebankPath  = null;
            Pair <string, IFileFilter>          devTreebankPath   = null;
            string serializedPath   = null;
            string tlppClass        = null;
            string continueTraining = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-trainTreebank"))
                {
                    if (trainTreebankPath == null)
                    {
                        trainTreebankPath = Generics.NewArrayList();
                    }
                    trainTreebankPath.Add(ArgUtils.GetTreebankDescription(args, argIndex, "-trainTreebank"));
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        testTreebankPath = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex         = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-devTreebank"))
                        {
                            devTreebankPath = ArgUtils.GetTreebankDescription(args, argIndex, "-devTreebank");
                            argIndex        = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-serializedPath") || Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                            {
                                serializedPath = args[argIndex + 1];
                                argIndex      += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tlpp"))
                                {
                                    tlppClass = args[argIndex + 1];
                                    argIndex += 2;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-continueTraining"))
                                    {
                                        continueTraining = args[argIndex + 1];
                                        argIndex        += 2;
                                    }
                                    else
                                    {
                                        remainingArgs.Add(args[argIndex]);
                                        ++argIndex;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            string[] newArgs = new string[remainingArgs.Count];
            newArgs = Sharpen.Collections.ToArray(remainingArgs, newArgs);
            if (trainTreebankPath == null && serializedPath == null)
            {
                throw new ArgumentException("Must specify a treebank to train from with -trainTreebank or a parser to load with -serializedPath");
            }
            ShiftReduceParser parser = null;

            if (trainTreebankPath != null)
            {
                log.Info("Training ShiftReduceParser");
                log.Info("Initial arguments:");
                log.Info("   " + StringUtils.Join(args));
                if (continueTraining != null)
                {
                    parser = ((ShiftReduceParser)ShiftReduceParser.LoadModel(continueTraining, ArrayUtils.Concatenate(ForceTags, newArgs)));
                }
                else
                {
                    ShiftReduceOptions op = BuildTrainingOptions(tlppClass, newArgs);
                    parser = new ShiftReduceParser(op);
                }
                parser.Train(trainTreebankPath, devTreebankPath, serializedPath);
                parser.SaveModel(serializedPath);
            }
            if (serializedPath != null && parser == null)
            {
                parser = ((ShiftReduceParser)ShiftReduceParser.LoadModel(serializedPath, ArrayUtils.Concatenate(ForceTags, newArgs)));
            }
            //parser.outputStats();
            if (testTreebankPath != null)
            {
                log.Info("Loading test trees from " + testTreebankPath.First());
                Treebank testTreebank = parser.op.tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath.First(), testTreebankPath.Second());
                log.Info("Loaded " + testTreebank.Count + " trees");
                EvaluateTreebank evaluator = new EvaluateTreebank(parser.op, null, parser);
                evaluator.TestOnTreebank(testTreebank);
            }
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            IList <string> baseModelPaths     = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-baseModels"))
                        {
                            argIndex++;
                            baseModelPaths = new List <string>();
                            while (argIndex < args.Length && args[argIndex][0] != '-')
                            {
                                baseModelPaths.Add(args[argIndex++]);
                            }
                            if (baseModelPaths.Count == 0)
                            {
                                throw new ArgumentException("Found an argument -baseModels with no actual models named");
                            }
                        }
                        else
                        {
                            unusedArgs.Add(args[argIndex++]);
                        }
                    }
                }
            }
            string[]          newArgs          = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser underlyingParser = null;
            Options           options          = null;
            LexicalizedParser combinedParser   = null;

            if (baseModelPaths != null)
            {
                IList <DVModel> dvparsers = new List <DVModel>();
                foreach (string baseModelPath in baseModelPaths)
                {
                    log.Info("Loading serialized DVParser from " + baseModelPath);
                    LexicalizedParser dvparser = ((LexicalizedParser)LexicalizedParser.LoadModel(baseModelPath));
                    IReranker         reranker = dvparser.reranker;
                    if (!(reranker is DVModelReranker))
                    {
                        throw new ArgumentException("Expected parsers with DVModel embedded");
                    }
                    dvparsers.Add(((DVModelReranker)reranker).GetModel());
                    if (underlyingParser == null)
                    {
                        underlyingParser = dvparser;
                        options          = underlyingParser.GetOp();
                        // TODO: other parser's options?
                        options.SetOptions(newArgs);
                    }
                    log.Info("... done");
                }
                combinedParser = LexicalizedParser.CopyLexicalizedParser(underlyingParser);
                CombinedDVModelReranker reranker_1 = new CombinedDVModelReranker(options, dvparsers);
                combinedParser.reranker = reranker_1;
                combinedParser.SaveParserToSerialized(modelPath);
            }
            else
            {
                throw new ArgumentException("Need to specify -model to load an already prepared CombinedParser");
            }
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = combinedParser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
                EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.GetOp(), null, combinedParser);
                evaluator.TestOnTreebank(testTreebank);
            }
        }
Пример #6
0
        /// <summary>
        /// An example of a command line is
        /// <br />
        /// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel = null;
            string output      = null;
            IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
            int dvKBest    = 200;
            int numThreads = 1;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
                {
                    dvKBest   = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    output    = args[argIndex + 1];
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    treebanks.Add(treebankDescription);
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
                {
                    numThreads = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex  += 2;
                    continue;
                }
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
            if (parserModel == null)
            {
                throw new ArgumentException("Need to supply a parser model with -model");
            }
            if (output == null)
            {
                throw new ArgumentException("Need to supply an output filename with -output");
            }
            if (treebanks.IsEmpty())
            {
                throw new ArgumentException("Need to supply a treebank with -treebank");
            }
            log.Info("Writing output to " + output);
            log.Info("Loading parser model " + parserModel);
            log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
            LexicalizedParser    parser      = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", int.ToString(dvKBest)));
            CacheParseHypotheses cacher      = new CacheParseHypotheses(parser);
            ITreeTransformer     transformer = DVParser.BuildTrainTransformer(parser.GetOp());
            IList <Tree>         sentences   = new List <Tree>();

            foreach (Pair <string, IFileFilter> description in treebanks)
            {
                log.Info("Reading trees from " + description.first);
                Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(description.first, description.second);
                treebank = treebank.Transform(transformer);
                Sharpen.Collections.AddAll(sentences, treebank);
            }
            log.Info("Processing " + sentences.Count + " trees");
            IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();

            transformer = new SynchronizedTreeTransformer(transformer);
            MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

            foreach (Tree tree in sentences)
            {
                wrapper.Put(tree);
                while (wrapper.Peek())
                {
                    cache.Add(wrapper.Poll());
                    if (cache.Count % 10 == 0)
                    {
                        System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                cache.Add(wrapper.Poll());
                if (cache.Count % 10 == 0)
                {
                    System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                }
            }
            System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
            IOUtils.WriteObjectToFile(cache, output);
        }
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            string         outputPath         = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                        {
                            outputPath = args[argIndex + 1];
                            argIndex  += 2;
                        }
                        else
                        {
                            unusedArgs.Add(args[argIndex++]);
                        }
                    }
                }
            }
            if (modelPath == null)
            {
                throw new ArgumentException("Need to specify -model");
            }
            if (testTreebankPath == null)
            {
                throw new ArgumentException("Need to specify -testTreebank");
            }
            if (outputPath == null)
            {
                throw new ArgumentException("Need to specify -output");
            }
            string[]          newArgs      = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser lexparser    = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
            Treebank          testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            FileWriter     @out = new FileWriter(outputPath);
            BufferedWriter bout = new BufferedWriter(@out);

            log.Info("Parsing " + testTreebank.Count + " trees");
            int count = 0;
            IList <FindNearestNeighbors.ParseRecord> records = Generics.NewArrayList();

            foreach (Tree goldTree in testTreebank)
            {
                IList <Word> tokens      = goldTree.YieldWords();
                IParserQuery parserQuery = lexparser.ParserQuery();
                if (!parserQuery.Parse(tokens))
                {
                    throw new AssertionError("Could not parse: " + tokens);
                }
                if (!(parserQuery is RerankingParserQuery))
                {
                    throw new ArgumentException("Expected a LexicalizedParser with a Reranker attached");
                }
                RerankingParserQuery rpq = (RerankingParserQuery)parserQuery;
                if (!(rpq.RerankerQuery() is DVModelReranker.Query))
                {
                    throw new ArgumentException("Expected a LexicalizedParser with a DVModel attached");
                }
                DeepTree     tree       = ((DVModelReranker.Query)rpq.RerankerQuery()).GetDeepTrees()[0];
                SimpleMatrix rootVector = null;
                foreach (KeyValuePair <Tree, SimpleMatrix> entry in tree.GetVectors())
                {
                    if (entry.Key.Label().Value().Equals("ROOT"))
                    {
                        rootVector = entry.Value;
                        break;
                    }
                }
                if (rootVector == null)
                {
                    throw new AssertionError("Could not find root nodevector");
                }
                @out.Write(tokens + "\n");
                @out.Write(tree.GetTree() + "\n");
                for (int i = 0; i < rootVector.GetNumElements(); ++i)
                {
                    @out.Write("  " + rootVector.Get(i));
                }
                @out.Write("\n\n\n");
                count++;
                if (count % 10 == 0)
                {
                    log.Info("  " + count);
                }
                records.Add(new FindNearestNeighbors.ParseRecord(tokens, goldTree, tree.GetTree(), rootVector, tree.GetVectors()));
            }
            log.Info("  done parsing");
            IList <Pair <Tree, SimpleMatrix> > subtrees = Generics.NewArrayList();

            foreach (FindNearestNeighbors.ParseRecord record in records)
            {
                foreach (KeyValuePair <Tree, SimpleMatrix> entry in record.nodeVectors)
                {
                    if (entry.Key.GetLeaves().Count <= maxLength)
                    {
                        subtrees.Add(Pair.MakePair(entry.Key, entry.Value));
                    }
                }
            }
            log.Info("There are " + subtrees.Count + " subtrees in the set of trees");
            PriorityQueue <ScoredObject <Pair <Tree, Tree> > > bestmatches = new PriorityQueue <ScoredObject <Pair <Tree, Tree> > >(101, ScoredComparator.DescendingComparator);

            for (int i_1 = 0; i_1 < subtrees.Count; ++i_1)
            {
                log.Info(subtrees[i_1].First().YieldWords());
                log.Info(subtrees[i_1].First());
                for (int j = 0; j < subtrees.Count; ++j)
                {
                    if (i_1 == j)
                    {
                        continue;
                    }
                    // TODO: look at basic category?
                    double normF = subtrees[i_1].Second().Minus(subtrees[j].Second()).NormF();
                    bestmatches.Add(new ScoredObject <Pair <Tree, Tree> >(Pair.MakePair(subtrees[i_1].First(), subtrees[j].First()), normF));
                    if (bestmatches.Count > 100)
                    {
                        bestmatches.Poll();
                    }
                }
                IList <ScoredObject <Pair <Tree, Tree> > > ordered = Generics.NewArrayList();
                while (bestmatches.Count > 0)
                {
                    ordered.Add(bestmatches.Poll());
                }
                Java.Util.Collections.Reverse(ordered);
                foreach (ScoredObject <Pair <Tree, Tree> > pair in ordered)
                {
                    log.Info(" MATCHED " + pair.Object().second.YieldWords() + " ... " + pair.Object().Second() + " with a score of " + pair.Score());
                }
                log.Info();
                log.Info();
                bestmatches.Clear();
            }

            /*
             * for (int i = 0; i < records.size(); ++i) {
             * if (i % 10 == 0) {
             * log.info("  " + i);
             * }
             * List<ScoredObject<ParseRecord>> scored = Generics.newArrayList();
             * for (int j = 0; j < records.size(); ++j) {
             * if (i == j) continue;
             *
             * double score = 0.0;
             * int matches = 0;
             * for (Map.Entry<Tree, SimpleMatrix> first : records.get(i).nodeVectors.entrySet()) {
             * for (Map.Entry<Tree, SimpleMatrix> second : records.get(j).nodeVectors.entrySet()) {
             * String firstBasic = dvparser.dvModel.basicCategory(first.getKey().label().value());
             * String secondBasic = dvparser.dvModel.basicCategory(second.getKey().label().value());
             * if (firstBasic.equals(secondBasic)) {
             ++matches;
             * double normF = first.getValue().minus(second.getValue()).normF();
             * score += normF * normF;
             * }
             * }
             * }
             * if (matches == 0) {
             * score = Double.POSITIVE_INFINITY;
             * } else {
             * score = score / matches;
             * }
             * //double score = records.get(i).vector.minus(records.get(j).vector).normF();
             * scored.add(new ScoredObject<ParseRecord>(records.get(j), score));
             * }
             * Collections.sort(scored, ScoredComparator.ASCENDING_COMPARATOR);
             *
             * out.write(records.get(i).sentence.toString() + "\n");
             * for (int j = 0; j < numNeighbors; ++j) {
             * out.write("   " + scored.get(j).score() + ": " + scored.get(j).object().sentence + "\n");
             * }
             * out.write("\n\n");
             * }
             * log.info();
             */
            bout.Flush();
            @out.Flush();
            @out.Close();
        }