public FilterConfusingRules(LexicalizedParser parser)
        {
            BinaryGrammar   binaryGrammar = parser.bg;
            UnaryGrammar    unaryGrammar  = parser.ug;
            Options         op            = parser.GetOp();
            IIndex <string> stateIndex    = parser.stateIndex;

            foreach (UnaryRule unaryRule in unaryGrammar)
            {
                // only make one matrix for each parent state, and only use the
                // basic category for that
                string childState = stateIndex.Get(unaryRule.child);
                string childBasic = op.Langpack().BasicCategory(childState);
                unaryRules.Add(childBasic);
            }
            foreach (BinaryRule binaryRule in binaryGrammar)
            {
                // only make one matrix for each parent state, and only use the
                // basic category for that
                string leftState  = stateIndex.Get(binaryRule.leftChild);
                string leftBasic  = op.Langpack().BasicCategory(leftState);
                string rightState = stateIndex.Get(binaryRule.rightChild);
                string rightBasic = op.Langpack().BasicCategory(rightState);
                binaryRules.Add(leftBasic, rightBasic);
            }
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string taggerFile = null;
            string inputFile  = null;
            string outputFile = null;
            double weight     = 1.0;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tagger"))
                {
                    taggerFile = args[argIndex + 1];
                    argIndex  += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                    {
                        inputFile = args[argIndex + 1];
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                        {
                            outputFile = args[argIndex + 1];
                            argIndex  += 2;
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-weight"))
                            {
                                weight    = double.ValueOf(args[argIndex + 1]);
                                argIndex += 2;
                            }
                            else
                            {
                                throw new ArgumentException("Unknown argument: " + args[argIndex]);
                            }
                        }
                    }
                }
            }
            LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(inputFile));
            MaxentTagger      tagger = new MaxentTagger(taggerFile);

            parser.reranker = new TaggerReranker(tagger, parser.GetOp());
            parser.SaveParserToSerialized(outputFile);
        }
Ejemplo n.º 3
0
        public virtual void RunTest(string[] args)
        {
            // get a parser from file
            LexicalizedParser pd = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));

            op = pd.GetOp();
            // in case a serialized options was read in
            Treebank testTreebank = op.tlpParams.MemoryTreebank();
            int      testlow      = System.Convert.ToInt32(args[2]);
            int      testhigh     = System.Convert.ToInt32(args[3]);

            testTreebank.LoadPath(args[1], new NumberRangeFileFilter(testlow, testhigh, true));
            op.SetOptionsOrWarn(args, 4, args.Length);
            TestOnTreebank(pd, new EnglishTreebankParserParams(), testTreebank, args[1], pd.stateIndex);
        }
 public DVParser(LexicalizedParser parser)
 {
     this.parser = parser;
     this.op     = parser.GetOp();
     if (op.trainOptions.randomSeed == 0)
     {
         op.trainOptions.randomSeed = Runtime.NanoTime();
         log.Info("Random seed not set, using randomly chosen seed of " + op.trainOptions.randomSeed);
     }
     else
     {
         log.Info("Random seed set to " + op.trainOptions.randomSeed);
     }
     log.Info("Word vector file: " + op.lexOptions.wordVectorFile);
     log.Info("Size of word vectors: " + op.lexOptions.numHid);
     log.Info("Number of hypothesis trees to train against: " + op.trainOptions.dvKBest);
     log.Info("Number of trees in one batch: " + op.trainOptions.batchSize);
     log.Info("Number of iterations of trees: " + op.trainOptions.trainingIterations);
     log.Info("Number of qn iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
     log.Info("Learning rate: " + op.trainOptions.learningRate);
     log.Info("Delta margin: " + op.trainOptions.deltaMargin);
     log.Info("regCost: " + op.trainOptions.regCost);
     log.Info("Using unknown word vector for numbers: " + op.trainOptions.unknownNumberVector);
     log.Info("Using unknown dashed word vector heuristics: " + op.trainOptions.unknownDashedWordVectors);
     log.Info("Using unknown word vector for capitalized words: " + op.trainOptions.unknownCapsVector);
     log.Info("Using unknown number vector for Chinese words: " + op.trainOptions.unknownChineseNumberVector);
     log.Info("Using unknown year vector for Chinese words: " + op.trainOptions.unknownChineseYearVector);
     log.Info("Using unknown percent vector for Chinese words: " + op.trainOptions.unknownChinesePercentVector);
     log.Info("Initial matrices scaled by: " + op.trainOptions.scalingForInit);
     log.Info("Training will use " + op.trainOptions.trainingThreads + " thread(s)");
     log.Info("Context words are " + ((op.trainOptions.useContextWords) ? "on" : "off"));
     log.Info("Model will " + ((op.trainOptions.dvSimplifiedModel) ? string.Empty : "not ") + "be simplified");
     this.dvModel = new DVModel(op, parser.stateIndex, parser.ug, parser.bg);
     if (dvModel.unaryTransform.Count != dvModel.unaryScore.Count)
     {
         throw new AssertionError("Unary transform and score size not the same");
     }
     if (dvModel.binaryTransform.Size() != dvModel.binaryScore.Size())
     {
         throw new AssertionError("Binary transform and score size not the same");
     }
 }
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-parser"]        = int.Parse(3);
            flagsToNumArgs["-lex"]           = int.Parse(3);
            flagsToNumArgs["-test"]          = int.Parse(2);
            flagsToNumArgs["-out"]           = int.Parse(1);
            flagsToNumArgs["-lengthPenalty"] = int.Parse(1);
            flagsToNumArgs["-penaltyType"]   = int.Parse(1);
            flagsToNumArgs["-maxLength"]     = int.Parse(1);
            flagsToNumArgs["-stats"]         = int.Parse(2);
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
            bool        eval = argMap.Contains("-eval");
            PrintWriter pw   = null;

            if (argMap.Contains("-out"))
            {
                pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream((argMap["-out"])[0]), "GB18030"), true);
            }
            log.Info("ChineseCharacterBasedLexicon called with args:");
            ChineseTreebankParserParams ctpp = new ChineseTreebankParserParams();

            for (int i = 0; i < args.Length; i++)
            {
                ctpp.SetOptionFlag(args, i);
                log.Info(" " + args[i]);
            }
            log.Info();
            Options op = new Options(ctpp);

            if (argMap.Contains("-stats"))
            {
                string[]       statArgs         = (argMap["-stats"]);
                MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    trainFilt        = new NumberRangesFileFilter(statArgs[1], false);
                rawTrainTreebank.LoadPath(new File(statArgs[0]), trainFilt);
                log.Info("Done reading trees.");
                MemoryTreebank trainTreebank;
                if (argMap.Contains("-annotate"))
                {
                    trainTreebank = new MemoryTreebank();
                    TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                    foreach (Tree tree in rawTrainTreebank)
                    {
                        trainTreebank.Add(annotator.TransformTree(tree));
                    }
                    log.Info("Done annotating trees.");
                }
                else
                {
                    trainTreebank = rawTrainTreebank;
                }
                PrintStats(trainTreebank, pw);
                System.Environment.Exit(0);
            }
            int maxLength = 1000000;

            //    Test.verbose = true;
            if (argMap.Contains("-norm"))
            {
                op.testOptions.lengthNormalization = true;
            }
            if (argMap.Contains("-maxLength"))
            {
                maxLength = System.Convert.ToInt32((argMap["-maxLength"])[0]);
            }
            op.testOptions.maxLength = 120;
            bool combo = argMap.Contains("-combo");

            if (combo)
            {
                ctpp.useCharacterBasedLexicon = true;
                op.testOptions.maxSpanForTags = 10;
                op.doDep  = false;
                op.dcTags = false;
            }
            LexicalizedParser lp  = null;
            ILexicon          lex = null;

            if (argMap.Contains("-parser"))
            {
                string[] parserArgs = (argMap["-parser"]);
                if (parserArgs.Length > 1)
                {
                    IFileFilter trainFilt = new NumberRangesFileFilter(parserArgs[1], false);
                    lp = LexicalizedParser.TrainFromTreebank(parserArgs[0], trainFilt, op);
                    if (parserArgs.Length == 3)
                    {
                        string filename = parserArgs[2];
                        log.Info("Writing parser in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lp);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string parserFile = parserArgs[0];
                    lp = LexicalizedParser.LoadModel(parserFile, op);
                }
                lex  = lp.GetLexicon();
                op   = lp.GetOp();
                ctpp = (ChineseTreebankParserParams)op.tlpParams;
            }
            if (argMap.Contains("-rad"))
            {
                ctpp.useUnknownCharacterModel = true;
            }
            if (argMap.Contains("-lengthPenalty"))
            {
                ctpp.lengthPenalty = double.Parse((argMap["-lengthPenalty"])[0]);
            }
            if (argMap.Contains("-penaltyType"))
            {
                ctpp.penaltyType = System.Convert.ToInt32((argMap["-penaltyType"])[0]);
            }
            if (argMap.Contains("-lex"))
            {
                string[] lexArgs = (argMap["-lex"]);
                if (lexArgs.Length > 1)
                {
                    IIndex <string> wordIndex = new HashIndex <string>();
                    IIndex <string> tagIndex  = new HashIndex <string>();
                    lex = ctpp.Lex(op, wordIndex, tagIndex);
                    MemoryTreebank rawTrainTreebank = op.tlpParams.MemoryTreebank();
                    IFileFilter    trainFilt        = new NumberRangesFileFilter(lexArgs[1], false);
                    rawTrainTreebank.LoadPath(new File(lexArgs[0]), trainFilt);
                    log.Info("Done reading trees.");
                    MemoryTreebank trainTreebank;
                    if (argMap.Contains("-annotate"))
                    {
                        trainTreebank = new MemoryTreebank();
                        TreeAnnotator annotator = new TreeAnnotator(ctpp.HeadFinder(), ctpp, op);
                        foreach (Tree tree in rawTrainTreebank)
                        {
                            tree = annotator.TransformTree(tree);
                            trainTreebank.Add(tree);
                        }
                        log.Info("Done annotating trees.");
                    }
                    else
                    {
                        trainTreebank = rawTrainTreebank;
                    }
                    lex.InitializeTraining(trainTreebank.Count);
                    lex.Train(trainTreebank);
                    lex.FinishTraining();
                    log.Info("Done training lexicon.");
                    if (lexArgs.Length == 3)
                    {
                        string filename = lexArgs.Length == 3 ? lexArgs[2] : "parsers/chineseCharLex.ser.gz";
                        log.Info("Writing lexicon in serialized format to file " + filename + " ");
                        System.Console.Error.Flush();
                        ObjectOutputStream @out = IOUtils.WriteStreamFromString(filename);
                        @out.WriteObject(lex);
                        @out.Close();
                        log.Info("done.");
                    }
                }
                else
                {
                    string lexFile = lexArgs.Length == 1 ? lexArgs[0] : "parsers/chineseCharLex.ser.gz";
                    log.Info("Reading Lexicon from file " + lexFile);
                    ObjectInputStream @in = IOUtils.ReadStreamFromString(lexFile);
                    try
                    {
                        lex = (ILexicon)@in.ReadObject();
                    }
                    catch (TypeLoadException)
                    {
                        throw new Exception("Bad serialized file: " + lexFile);
                    }
                    @in.Close();
                }
            }
            if (argMap.Contains("-test"))
            {
                bool segmentWords = ctpp.segment;
                bool parse        = lp != null;
                System.Diagnostics.Debug.Assert((parse || segmentWords));
                //      WordCatConstituent.collinizeWords = argMap.containsKey("-collinizeWords");
                //      WordCatConstituent.collinizeTags = argMap.containsKey("-collinizeTags");
                IWordSegmenter seg = null;
                if (segmentWords)
                {
                    seg = (IWordSegmenter)lex;
                }
                string[]       testArgs     = (argMap["-test"]);
                MemoryTreebank testTreebank = op.tlpParams.MemoryTreebank();
                IFileFilter    testFilt     = new NumberRangesFileFilter(testArgs[1], false);
                testTreebank.LoadPath(new File(testArgs[0]), testFilt);
                ITreeTransformer          subcategoryStripper = op.tlpParams.SubcategoryStripper();
                ITreeTransformer          collinizer          = ctpp.Collinizer();
                WordCatEquivalenceClasser eqclass             = new WordCatEquivalenceClasser();
                WordCatEqualityChecker    eqcheck             = new WordCatEqualityChecker();
                EquivalenceClassEval      basicEval           = new EquivalenceClassEval(eqclass, eqcheck, "basic");
                EquivalenceClassEval      collinsEval         = new EquivalenceClassEval(eqclass, eqcheck, "collinized");
                IList <string>            evalTypes           = new List <string>(3);
                bool goodPOS = false;
                if (segmentWords)
                {
                    evalTypes.Add(WordCatConstituent.wordType);
                    if (ctpp.segmentMarkov && !parse)
                    {
                        evalTypes.Add(WordCatConstituent.tagType);
                        goodPOS = true;
                    }
                }
                if (parse)
                {
                    evalTypes.Add(WordCatConstituent.tagType);
                    evalTypes.Add(WordCatConstituent.catType);
                    if (combo)
                    {
                        evalTypes.Add(WordCatConstituent.wordType);
                        goodPOS = true;
                    }
                }
                TreeToBracketProcessor proc = new TreeToBracketProcessor(evalTypes);
                log.Info("Testing...");
                foreach (Tree goldTop in testTreebank)
                {
                    Tree             gold         = goldTop.FirstChild();
                    IList <IHasWord> goldSentence = gold.YieldHasWord();
                    if (goldSentence.Count > maxLength)
                    {
                        log.Info("Skipping sentence; too long: " + goldSentence.Count);
                        continue;
                    }
                    else
                    {
                        log.Info("Processing sentence; length: " + goldSentence.Count);
                    }
                    IList <IHasWord> s;
                    if (segmentWords)
                    {
                        StringBuilder goldCharBuf = new StringBuilder();
                        foreach (IHasWord aGoldSentence in goldSentence)
                        {
                            StringLabel word = (StringLabel)aGoldSentence;
                            goldCharBuf.Append(word.Value());
                        }
                        string goldChars = goldCharBuf.ToString();
                        s = seg.Segment(goldChars);
                    }
                    else
                    {
                        s = goldSentence;
                    }
                    Tree tree;
                    if (parse)
                    {
                        tree = lp.ParseTree(s);
                        if (tree == null)
                        {
                            throw new Exception("PARSER RETURNED NULL!!!");
                        }
                    }
                    else
                    {
                        tree = Edu.Stanford.Nlp.Trees.Trees.ToFlatTree(s);
                        tree = subcategoryStripper.TransformTree(tree);
                    }
                    if (pw != null)
                    {
                        if (parse)
                        {
                            tree.PennPrint(pw);
                        }
                        else
                        {
                            IEnumerator sentIter = s.GetEnumerator();
                            for (; ;)
                            {
                                Word word = (Word)sentIter.Current;
                                pw.Print(word.Word());
                                if (sentIter.MoveNext())
                                {
                                    pw.Print(" ");
                                }
                                else
                                {
                                    break;
                                }
                            }
                        }
                        pw.Println();
                    }
                    if (eval)
                    {
                        ICollection ourBrackets;
                        ICollection goldBrackets;
                        ourBrackets  = proc.AllBrackets(tree);
                        goldBrackets = proc.AllBrackets(gold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(tree, gold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(gold, tree));
                        }
                        basicEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nScores:");
                        basicEval.DisplayLast();
                        Tree collinsTree = collinizer.TransformTree(tree);
                        Tree collinsGold = collinizer.TransformTree(gold);
                        ourBrackets  = proc.AllBrackets(collinsTree);
                        goldBrackets = proc.AllBrackets(collinsGold);
                        if (goodPOS)
                        {
                            Sharpen.Collections.AddAll(ourBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsTree, collinsGold));
                            Sharpen.Collections.AddAll(goldBrackets, TreeToBracketProcessor.CommonWordTagTypeBrackets(collinsGold, collinsTree));
                        }
                        collinsEval.Eval(ourBrackets, goldBrackets);
                        System.Console.Out.WriteLine("\nCollinized scores:");
                        collinsEval.DisplayLast();
                        System.Console.Out.WriteLine();
                    }
                }
                if (eval)
                {
                    basicEval.Display();
                    System.Console.Out.WriteLine();
                    collinsEval.Display();
                }
            }
        }
 public DVParser(DVModel model, LexicalizedParser parser)
 {
     this.parser  = parser;
     this.op      = parser.GetOp();
     this.dvModel = model;
 }
Ejemplo n.º 7
0
        /// <summary>This example shows a few more ways of providing input to a parser.</summary>
        /// <remarks>
        /// This example shows a few more ways of providing input to a parser.
        /// Usage: ParserDemo2 [grammar [textFile]]
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

            string[]                        options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
            LexicalizedParser               lp      = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
            ITreebankLanguagePack           tlp     = lp.GetOp().Langpack();
            IGrammaticalStructureFactory    gsf     = tlp.GrammaticalStructureFactory();
            IEnumerable <IList <IHasWord> > sentences;

            if (args.Length > 1)
            {
                DocumentPreprocessor      dp  = new DocumentPreprocessor(args[1]);
                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                foreach (IList <IHasWord> sentence in dp)
                {
                    tmp.Add(sentence);
                }
                sentences = tmp;
            }
            else
            {
                // Showing tokenization and parsing in code a couple of different ways.
                string[]         sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
                IList <IHasWord> sentence = new List <IHasWord>();
                foreach (string word in sent)
                {
                    sentence.Add(new Word(word));
                }
                string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
                // Use the default tokenizer for this TreebankLanguagePack
                ITokenizer <IHasWord> toke      = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
                IList <IHasWord>      sentence2 = toke.Tokenize();
                string[] sent3 = new string[] { "It", "can", "can", "it", "." };
                string[] tag3  = new string[] { "PRP", "MD", "VB", "PRP", "." };
                // Parser gets second "can" wrong without help
                IList <TaggedWord> sentence3 = new List <TaggedWord>();
                for (int i = 0; i < sent3.Length; i++)
                {
                    sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
                }
                Tree parse = lp.Parse(sentence3);
                parse.PennPrint();
                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                tmp.Add(sentence);
                tmp.Add(sentence2);
                tmp.Add(sentence3);
                sentences = tmp;
            }
            foreach (IList <IHasWord> sentence_1 in sentences)
            {
                Tree parse = lp.Parse(sentence_1);
                parse.PennPrint();
                System.Console.Out.WriteLine();
                GrammaticalStructure    gs  = gsf.NewGrammaticalStructure(parse);
                IList <TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
                System.Console.Out.WriteLine(tdl);
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine("The words of the sentence:");
                foreach (ILabel lab in parse.Yield())
                {
                    if (lab is CoreLabel)
                    {
                        System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
                    }
                    else
                    {
                        System.Console.Out.WriteLine(lab);
                    }
                }
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine(parse.TaggedYield());
                System.Console.Out.WriteLine();
            }
            // This method turns the String into a single sentence using the
            // default tokenizer for the TreebankLanguagePack.
            string sent3_1 = "This is one last test!";

            lp.Parse(sent3_1).PennPrint();
        }
Ejemplo n.º 8
0
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         dvmodelFile        = null;
            string         lexparserFile      = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-lexparser"))
                {
                    lexparserFile = args[argIndex + 1];
                    argIndex     += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        unusedArgs.Add(args[argIndex++]);
                    }
                }
            }
            log.Info("Loading lexparser from: " + lexparserFile);
            string[]          newArgs   = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser lexparser = ((LexicalizedParser)LexicalizedParser.LoadModel(lexparserFile, newArgs));

            log.Info("... done");
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            double[] labelResults = new double[weights.Length];
            double[] tagResults   = new double[weights.Length];
            for (int i = 0; i < weights.Length; ++i)
            {
                lexparser.GetOp().baseParserWeight = weights[i];
                EvaluateTreebank evaluator         = new EvaluateTreebank(lexparser);
                evaluator.TestOnTreebank(testTreebank);
                labelResults[i] = evaluator.GetLBScore();
                tagResults[i]   = evaluator.GetTagScore();
            }
            for (int i_1 = 0; i_1 < weights.Length; ++i_1)
            {
                log.Info("LexicalizedParser weight " + weights[i_1] + ": labeled " + labelResults[i_1] + " tag " + tagResults[i_1]);
            }
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            IList <string> baseModelPaths     = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-baseModels"))
                        {
                            argIndex++;
                            baseModelPaths = new List <string>();
                            while (argIndex < args.Length && args[argIndex][0] != '-')
                            {
                                baseModelPaths.Add(args[argIndex++]);
                            }
                            if (baseModelPaths.Count == 0)
                            {
                                throw new ArgumentException("Found an argument -baseModels with no actual models named");
                            }
                        }
                        else
                        {
                            unusedArgs.Add(args[argIndex++]);
                        }
                    }
                }
            }
            string[]          newArgs          = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser underlyingParser = null;
            Options           options          = null;
            LexicalizedParser combinedParser   = null;

            if (baseModelPaths != null)
            {
                IList <DVModel> dvparsers = new List <DVModel>();
                foreach (string baseModelPath in baseModelPaths)
                {
                    log.Info("Loading serialized DVParser from " + baseModelPath);
                    LexicalizedParser dvparser = ((LexicalizedParser)LexicalizedParser.LoadModel(baseModelPath));
                    IReranker         reranker = dvparser.reranker;
                    if (!(reranker is DVModelReranker))
                    {
                        throw new ArgumentException("Expected parsers with DVModel embedded");
                    }
                    dvparsers.Add(((DVModelReranker)reranker).GetModel());
                    if (underlyingParser == null)
                    {
                        underlyingParser = dvparser;
                        options          = underlyingParser.GetOp();
                        // TODO: other parser's options?
                        options.SetOptions(newArgs);
                    }
                    log.Info("... done");
                }
                combinedParser = LexicalizedParser.CopyLexicalizedParser(underlyingParser);
                CombinedDVModelReranker reranker_1 = new CombinedDVModelReranker(options, dvparsers);
                combinedParser.reranker = reranker_1;
                combinedParser.SaveParserToSerialized(modelPath);
            }
            else
            {
                throw new ArgumentException("Need to specify -model to load an already prepared CombinedParser");
            }
            Treebank testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = combinedParser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
                EvaluateTreebank evaluator = new EvaluateTreebank(combinedParser.GetOp(), null, combinedParser);
                evaluator.TestOnTreebank(testTreebank);
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// An example of a command line is
        /// <br />
        /// java -mx1g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model /scr/horatio/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached9.simple.ser.gz  -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-202
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/wsjPCFG.nocompact.simple.ser.gz -output cached.train.simple.ser.gz -treebank /afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj 200-2199 -numThreads 6
        /// <br />
        /// java -mx4g edu.stanford.nlp.parser.dvparser.CacheParseHypotheses -model ~/scr/dvparser/chinese/xinhuaPCFG.ser.gz -output cached.xinhua.train.ser.gz -treebank /afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed  026-270,301-499,600-999
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel = null;
            string output      = null;
            IList <Pair <string, IFileFilter> > treebanks = Generics.NewArrayList();
            int dvKBest    = 200;
            int numThreads = 1;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dvKBest"))
                {
                    dvKBest   = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parser") || args[argIndex].Equals("-model"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    output    = args[argIndex + 1];
                    argIndex += 2;
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                {
                    Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-treebank");
                    argIndex = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                    treebanks.Add(treebankDescription);
                    continue;
                }
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numThreads"))
                {
                    numThreads = System.Convert.ToInt32(args[argIndex + 1]);
                    argIndex  += 2;
                    continue;
                }
                throw new ArgumentException("Unknown argument " + args[argIndex]);
            }
            if (parserModel == null)
            {
                throw new ArgumentException("Need to supply a parser model with -model");
            }
            if (output == null)
            {
                throw new ArgumentException("Need to supply an output filename with -output");
            }
            if (treebanks.IsEmpty())
            {
                throw new ArgumentException("Need to supply a treebank with -treebank");
            }
            log.Info("Writing output to " + output);
            log.Info("Loading parser model " + parserModel);
            log.Info("Writing " + dvKBest + " hypothesis trees for each tree");
            LexicalizedParser    parser      = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel, "-dvKBest", int.ToString(dvKBest)));
            CacheParseHypotheses cacher      = new CacheParseHypotheses(parser);
            ITreeTransformer     transformer = DVParser.BuildTrainTransformer(parser.GetOp());
            IList <Tree>         sentences   = new List <Tree>();

            foreach (Pair <string, IFileFilter> description in treebanks)
            {
                log.Info("Reading trees from " + description.first);
                Treebank treebank = parser.GetOp().tlpParams.MemoryTreebank();
                treebank.LoadPath(description.first, description.second);
                treebank = treebank.Transform(transformer);
                Sharpen.Collections.AddAll(sentences, treebank);
            }
            log.Info("Processing " + sentences.Count + " trees");
            IList <Pair <Tree, byte[]> > cache = Generics.NewArrayList();

            transformer = new SynchronizedTreeTransformer(transformer);
            MulticoreWrapper <Tree, Pair <Tree, byte[]> > wrapper = new MulticoreWrapper <Tree, Pair <Tree, byte[]> >(numThreads, new CacheParseHypotheses.CacheProcessor(cacher, parser, dvKBest, transformer));

            foreach (Tree tree in sentences)
            {
                wrapper.Put(tree);
                while (wrapper.Peek())
                {
                    cache.Add(wrapper.Poll());
                    if (cache.Count % 10 == 0)
                    {
                        System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                    }
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                cache.Add(wrapper.Poll());
                if (cache.Count % 10 == 0)
                {
                    System.Console.Out.WriteLine("Processed " + cache.Count + " trees");
                }
            }
            System.Console.Out.WriteLine("Finished processing " + cache.Count + " trees");
            IOUtils.WriteObjectToFile(cache, output);
        }
 public EvaluateTreebank(LexicalizedParser parser)
     : this(parser.GetOp(), parser.lex, parser)
 {
 }
Ejemplo n.º 12
0
        /// <summary>
        /// Command line arguments for this program:
        /// <br />
        /// -output: the model file to output
        /// -input: a list of model files to input
        /// </summary>
        public static void Main(string[] args)
        {
            string         outputModelFilename = null;
            IList <string> inputModelFilenames = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    outputModelFilename = args[argIndex + 1];
                    argIndex           += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                    {
                        for (++argIndex; argIndex < args.Length && !args[argIndex].StartsWith("-"); ++argIndex)
                        {
                            Sharpen.Collections.AddAll(inputModelFilenames, Arrays.AsList(args[argIndex].Split(",")));
                        }
                    }
                    else
                    {
                        throw new Exception("Unknown argument " + args[argIndex]);
                    }
                }
            }
            if (outputModelFilename == null)
            {
                log.Info("Need to specify output model name with -output");
                System.Environment.Exit(2);
            }
            if (inputModelFilenames.Count == 0)
            {
                log.Info("Need to specify input model names with -input");
                System.Environment.Exit(2);
            }
            log.Info("Averaging " + inputModelFilenames);
            log.Info("Outputting result to " + outputModelFilename);
            LexicalizedParser lexparser = null;
            IList <DVModel>   models    = Generics.NewArrayList();

            foreach (string filename in inputModelFilenames)
            {
                LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(filename));
                if (lexparser == null)
                {
                    lexparser = parser;
                }
                models.Add(DVParser.GetModelFromLexicalizedParser(parser));
            }
            IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryTransformMaps = CollectionUtils.TransformAsList(models, null);
            IList <TwoDimensionalMap <string, string, SimpleMatrix> > binaryScoreMaps     = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      unaryTransformMaps           = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      unaryScoreMaps          = CollectionUtils.TransformAsList(models, null);
            IList <IDictionary <string, SimpleMatrix> >      wordMaps                = CollectionUtils.TransformAsList(models, null);
            TwoDimensionalMap <string, string, SimpleMatrix> binaryTransformAverages = AverageBinaryMatrices(binaryTransformMaps);
            TwoDimensionalMap <string, string, SimpleMatrix> binaryScoreAverages     = AverageBinaryMatrices(binaryScoreMaps);
            IDictionary <string, SimpleMatrix> unaryTransformAverages                = AverageUnaryMatrices(unaryTransformMaps);
            IDictionary <string, SimpleMatrix> unaryScoreAverages = AverageUnaryMatrices(unaryScoreMaps);
            IDictionary <string, SimpleMatrix> wordAverages       = AverageUnaryMatrices(wordMaps);
            DVModel  newModel  = new DVModel(binaryTransformAverages, unaryTransformAverages, binaryScoreAverages, unaryScoreAverages, wordAverages, lexparser.GetOp());
            DVParser newParser = new DVParser(newModel, lexparser);

            newParser.SaveModel(outputModelFilename);
        }
        internal LexicalizedParserQuery(LexicalizedParser parser)
        {
            this.op = parser.GetOp();
            BinaryGrammar      bg         = parser.bg;
            UnaryGrammar       ug         = parser.ug;
            ILexicon           lex        = parser.lex;
            IDependencyGrammar dg         = parser.dg;
            IIndex <string>    stateIndex = parser.stateIndex;
            IIndex <string>    wordIndex  = new DeltaIndex <string>(parser.wordIndex);
            IIndex <string>    tagIndex   = parser.tagIndex;

            this.debinarizer     = new Debinarizer(op.forceCNF);
            this.boundaryRemover = new BoundaryRemover();
            if (op.doPCFG)
            {
                if (op.testOptions.iterativeCKY)
                {
                    pparser = new IterativeCKYPCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
                else
                {
                    pparser = new ExhaustivePCFGParser(bg, ug, lex, op, stateIndex, wordIndex, tagIndex);
                }
            }
            else
            {
                pparser = null;
            }
            if (op.doDep)
            {
                dg.SetLexicon(lex);
                if (!op.testOptions.useFastFactored)
                {
                    dparser = new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex);
                }
                else
                {
                    dparser = null;
                }
            }
            else
            {
                dparser = null;
            }
            if (op.doDep && op.doPCFG)
            {
                if (op.testOptions.useFastFactored)
                {
                    MLEDependencyGrammar mledg = (MLEDependencyGrammar)dg;
                    int numToFind = 1;
                    if (op.testOptions.printFactoredKGood > 0)
                    {
                        numToFind = op.testOptions.printFactoredKGood;
                    }
                    bparser = new FastFactoredParser(pparser, mledg, op, numToFind, wordIndex, tagIndex);
                }
                else
                {
                    IScorer scorer = new TwinScorer(pparser, dparser);
                    //Scorer scorer = parser;
                    if (op.testOptions.useN5)
                    {
                        bparser = new BiLexPCFGParser.N5BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                    else
                    {
                        bparser = new BiLexPCFGParser(scorer, pparser, dparser, bg, ug, dg, lex, op, stateIndex, wordIndex, tagIndex);
                    }
                }
            }
            else
            {
                bparser = null;
            }
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
        }
        /// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            string         modelPath          = null;
            string         outputPath         = null;
            string         testTreebankPath   = null;
            IFileFilter    testTreebankFilter = null;
            IList <string> unusedArgs         = new List <string>();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-model"))
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-testTreebank"))
                    {
                        Pair <string, IFileFilter> treebankDescription = ArgUtils.GetTreebankDescription(args, argIndex, "-testTreebank");
                        argIndex           = argIndex + ArgUtils.NumSubArgs(args, argIndex) + 1;
                        testTreebankPath   = treebankDescription.First();
                        testTreebankFilter = treebankDescription.Second();
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                        {
                            outputPath = args[argIndex + 1];
                            argIndex  += 2;
                        }
                        else
                        {
                            unusedArgs.Add(args[argIndex++]);
                        }
                    }
                }
            }
            if (modelPath == null)
            {
                throw new ArgumentException("Need to specify -model");
            }
            if (testTreebankPath == null)
            {
                throw new ArgumentException("Need to specify -testTreebank");
            }
            if (outputPath == null)
            {
                throw new ArgumentException("Need to specify -output");
            }
            string[]          newArgs      = Sharpen.Collections.ToArray(unusedArgs, new string[unusedArgs.Count]);
            LexicalizedParser lexparser    = ((LexicalizedParser)LexicalizedParser.LoadModel(modelPath, newArgs));
            Treebank          testTreebank = null;

            if (testTreebankPath != null)
            {
                log.Info("Reading in trees from " + testTreebankPath);
                if (testTreebankFilter != null)
                {
                    log.Info("Filtering on " + testTreebankFilter);
                }
                testTreebank = lexparser.GetOp().tlpParams.MemoryTreebank();
                testTreebank.LoadPath(testTreebankPath, testTreebankFilter);
                log.Info("Read in " + testTreebank.Count + " trees for testing");
            }
            FileWriter     @out = new FileWriter(outputPath);
            BufferedWriter bout = new BufferedWriter(@out);

            log.Info("Parsing " + testTreebank.Count + " trees");
            int count = 0;
            IList <FindNearestNeighbors.ParseRecord> records = Generics.NewArrayList();

            foreach (Tree goldTree in testTreebank)
            {
                IList <Word> tokens      = goldTree.YieldWords();
                IParserQuery parserQuery = lexparser.ParserQuery();
                if (!parserQuery.Parse(tokens))
                {
                    throw new AssertionError("Could not parse: " + tokens);
                }
                if (!(parserQuery is RerankingParserQuery))
                {
                    throw new ArgumentException("Expected a LexicalizedParser with a Reranker attached");
                }
                RerankingParserQuery rpq = (RerankingParserQuery)parserQuery;
                if (!(rpq.RerankerQuery() is DVModelReranker.Query))
                {
                    throw new ArgumentException("Expected a LexicalizedParser with a DVModel attached");
                }
                DeepTree     tree       = ((DVModelReranker.Query)rpq.RerankerQuery()).GetDeepTrees()[0];
                SimpleMatrix rootVector = null;
                foreach (KeyValuePair <Tree, SimpleMatrix> entry in tree.GetVectors())
                {
                    if (entry.Key.Label().Value().Equals("ROOT"))
                    {
                        rootVector = entry.Value;
                        break;
                    }
                }
                if (rootVector == null)
                {
                    throw new AssertionError("Could not find root nodevector");
                }
                @out.Write(tokens + "\n");
                @out.Write(tree.GetTree() + "\n");
                for (int i = 0; i < rootVector.GetNumElements(); ++i)
                {
                    @out.Write("  " + rootVector.Get(i));
                }
                @out.Write("\n\n\n");
                count++;
                if (count % 10 == 0)
                {
                    log.Info("  " + count);
                }
                records.Add(new FindNearestNeighbors.ParseRecord(tokens, goldTree, tree.GetTree(), rootVector, tree.GetVectors()));
            }
            log.Info("  done parsing");
            IList <Pair <Tree, SimpleMatrix> > subtrees = Generics.NewArrayList();

            foreach (FindNearestNeighbors.ParseRecord record in records)
            {
                foreach (KeyValuePair <Tree, SimpleMatrix> entry in record.nodeVectors)
                {
                    if (entry.Key.GetLeaves().Count <= maxLength)
                    {
                        subtrees.Add(Pair.MakePair(entry.Key, entry.Value));
                    }
                }
            }
            log.Info("There are " + subtrees.Count + " subtrees in the set of trees");
            PriorityQueue <ScoredObject <Pair <Tree, Tree> > > bestmatches = new PriorityQueue <ScoredObject <Pair <Tree, Tree> > >(101, ScoredComparator.DescendingComparator);

            for (int i_1 = 0; i_1 < subtrees.Count; ++i_1)
            {
                log.Info(subtrees[i_1].First().YieldWords());
                log.Info(subtrees[i_1].First());
                for (int j = 0; j < subtrees.Count; ++j)
                {
                    if (i_1 == j)
                    {
                        continue;
                    }
                    // TODO: look at basic category?
                    double normF = subtrees[i_1].Second().Minus(subtrees[j].Second()).NormF();
                    bestmatches.Add(new ScoredObject <Pair <Tree, Tree> >(Pair.MakePair(subtrees[i_1].First(), subtrees[j].First()), normF));
                    if (bestmatches.Count > 100)
                    {
                        bestmatches.Poll();
                    }
                }
                IList <ScoredObject <Pair <Tree, Tree> > > ordered = Generics.NewArrayList();
                while (bestmatches.Count > 0)
                {
                    ordered.Add(bestmatches.Poll());
                }
                Java.Util.Collections.Reverse(ordered);
                foreach (ScoredObject <Pair <Tree, Tree> > pair in ordered)
                {
                    log.Info(" MATCHED " + pair.Object().second.YieldWords() + " ... " + pair.Object().Second() + " with a score of " + pair.Score());
                }
                log.Info();
                log.Info();
                bestmatches.Clear();
            }

            /*
             * for (int i = 0; i < records.size(); ++i) {
             * if (i % 10 == 0) {
             * log.info("  " + i);
             * }
             * List<ScoredObject<ParseRecord>> scored = Generics.newArrayList();
             * for (int j = 0; j < records.size(); ++j) {
             * if (i == j) continue;
             *
             * double score = 0.0;
             * int matches = 0;
             * for (Map.Entry<Tree, SimpleMatrix> first : records.get(i).nodeVectors.entrySet()) {
             * for (Map.Entry<Tree, SimpleMatrix> second : records.get(j).nodeVectors.entrySet()) {
             * String firstBasic = dvparser.dvModel.basicCategory(first.getKey().label().value());
             * String secondBasic = dvparser.dvModel.basicCategory(second.getKey().label().value());
             * if (firstBasic.equals(secondBasic)) {
             ++matches;
             * double normF = first.getValue().minus(second.getValue()).normF();
             * score += normF * normF;
             * }
             * }
             * }
             * if (matches == 0) {
             * score = Double.POSITIVE_INFINITY;
             * } else {
             * score = score / matches;
             * }
             * //double score = records.get(i).vector.minus(records.get(j).vector).normF();
             * scored.add(new ScoredObject<ParseRecord>(records.get(j), score));
             * }
             * Collections.sort(scored, ScoredComparator.ASCENDING_COMPARATOR);
             *
             * out.write(records.get(i).sentence.toString() + "\n");
             * for (int j = 0; j < numNeighbors; ++j) {
             * out.write("   " + scored.get(j).score() + ": " + scored.get(j).object().sentence + "\n");
             * }
             * out.write("\n\n");
             * }
             * log.info();
             */
            bout.Flush();
            @out.Flush();
            @out.Close();
        }