// TODO: perhaps the output streams could be passed in
 /// <summary>
 /// Parse the files with names given in the String array args elements from
 /// index argIndex on.
 /// </summary>
 /// <remarks>
 /// Parse the files with names given in the String array args elements from
 /// index argIndex on.  Convenience method which builds and invokes a ParseFiles object.
 /// </remarks>
 public static void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter
                                     , Options op, TreePrint treePrint, LexicalizedParser pqFactory)
 where _T0 : IHasWord
     {
      Edu.Stanford.Nlp.Parser.Lexparser.ParseFiles pf = new Edu.Stanford.Nlp.Parser.Lexparser.ParseFiles(op, treePrint, pqFactory);
      pf.ParseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter);
     }
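 // A minimal usage sketch for the convenience method above (not part of the
 // original source). It assumes `lp` (a loaded LexicalizedParser), `op` (its
 // Options) and `treePrint` are already configured as in the other examples on
 // this page; the file names and the null delimiter/escaper/tag arguments are
 // illustrative placeholders, not the only valid values.
 string[] filesToParse = new string[] { "input1.txt", "input2.txt" };
 ITokenizerFactory<CoreLabel> tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
 ParseFiles(filesToParse, 0, false, tf, null, null, null, null, op, treePrint, lp);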
Example #2
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.  Output is handled with a
        /// TreePrint object.  Note that the options used when creating the
        /// TreePrint can determine what results to print out.  Once again,
        /// one can capture the output by passing a PrintWriter to
        /// TreePrint.printTree. This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            string[]          sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> rawWords = SentenceUtils.ToCoreLabelList(sent);
            Tree parse = lp.Apply(rawWords);

            parse.PennPrint();
            System.Console.Out.WriteLine();
            // This option shows loading and using an explicit tokenizer
            string sent2 = "This is another sentence.";
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tok       = tokenizerFactory.GetTokenizer(new StringReader(sent2));
            IList <CoreLabel>             rawWords2 = tok.Tokenize();

            parse = lp.Apply(rawWords2);
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
            GrammaticalStructure         gs  = gsf.NewGrammaticalStructure(parse);
            IList <TypedDependency>      tdl = gs.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(tdl);
            System.Console.Out.WriteLine();
            // You can also use a TreePrint object to print trees and dependencies
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.PrintTree(parse);
        }
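        // Follow-up sketch for the note in the remarks about capturing output
        // (not part of the original demo): TreePrint.PrintTree also accepts a
        // PrintWriter, so wrapping a StringWriter collects the formatted tree as
        // a string instead of writing to the console. This reuses the `tp` and
        // `parse` locals from DemoAPI above.
        StringWriter treeStrWriter = new StringWriter();
        tp.PrintTree(parse, new PrintWriter(treeStrWriter, true));
        string capturedTree = treeStrWriter.ToString();
        System.Console.Out.WriteLine(capturedTree);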
Example #3
        public static void ParseString(string sentence)
        {
            // Path to models extracted from `stanford-corenlp-3.9.1-models.jar`
            var modelsDirectory = @"../../../data/paket-files/stanford-corenlp-3.9.1-models/edu/stanford/nlp/models";
            var model           = @"/lexparser/englishPCFG.ser.gz";
            //var model = @"/parser/nndep/english_SD.gz";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + model);

            // This sample shows parsing a list of correctly tokenized words
            //var rawWords = SentenceUtils.toCoreLabelList(sentence);
            //var tree = lp.apply(rawWords);
            //tree.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new StringReader(sentence);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            //sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree
            //var tlp = new PennTreebankLanguagePack();
            //var gsf = tlp.grammaticalStructureFactory();
            //var gs = gsf.newGrammaticalStructure(tree2);
            //var tdl = gs.typedDependenciesCCprocessed();
            //Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);
        }
        /// <summary>Parse a (speech) lattice with the PCFG parser.</summary>
        /// <param name="lr">a lattice to parse</param>
        /// <returns>Whether the lattice could be parsed by the grammar</returns>
        internal virtual bool Parse(HTKLatticeReader lr)
        {
            TreePrint   treePrint = GetTreePrint();
            PrintWriter pwOut     = op.tlpParams.Pw();

            parseSucceeded   = false;
            parseNoMemory    = false;
            parseUnparsable  = false;
            parseSkipped     = false;
            parseFallback    = false;
            whatFailed       = null;
            originalSentence = null;
            if (lr.GetNumStates() > op.testOptions.maxLength + 1)
            {
                // + 1 for boundary symbol
                parseSkipped = true;
                throw new NotSupportedException("Lattice too big: " + lr.GetNumStates());
            }
            if (op.doPCFG)
            {
                if (!pparser.Parse(lr))
                {
                    return(parseSucceeded);
                }
                if (op.testOptions.verbose)
                {
                    pwOut.Println("PParser output");
                    treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
                }
            }
            parseSucceeded = true;
            return(true);
        }
        public void ProcessText(string inputText)
        {
            // Alternative model root previously tried: "nlp.stanford.edu\\stanford-parser-full-2017-06-09\\models"
            var jarRoot         = "C:\\stanford-parser-full-2016-10-31\\stanford-parser-3.7.0-models";
            var modelsDirectory = jarRoot + "\\edu\\stanford\\nlp\\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            // This option shows loading and using an explicit tokenizer
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new StringReader(inputText);
            var rawWords         = tokenizerFactory.getTokenizer(sentReader).tokenize();

            sentReader.close();
            var tree = lp.apply(rawWords);

            //Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree);
            var tdl = gs.typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree);
        }
Example #6
        public ParsedStatementFactory.ParseResult ParseStatement(string input)
        {
            var sent2Reader = new StringReader(input);
            var rawWords2   = _tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            var parse       = _lp.apply(rawWords2);

            var gs  = _structureFactory.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();
            //System.Console.WriteLine("newGrammaticalStructure:\n{0}\n", gs);
            //System.Console.WriteLine("typedDependenciesCCprocessed:\n{0}\n", tdl);
            //var tp = new TreePrint("penn,typedDependenciesCollapsed");
            //tp.printTree(parse);
            //return new ParsedStatement(parse);

            var xmlTreePrint = new TreePrint("xmlTree, dependencies", "xml, collapsedDependencies", _tlp);
            var stream       = new ByteArrayOutputStream();

            xmlTreePrint.printTree(parse, new PrintWriter(stream));

            string xmlOutput = stream.toString() + "</s>";

            //System.Console.WriteLine("xml:\n{0}\n", xmlOutput);

            return(ParsedStatementFactory.CreateParsedStatement(xmlOutput));
            //System.Console.WriteLine("TreePrint: \n{0}\n", parse);
        }
Example #7
        public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
        {
            op           = new Options();
            op.tlpParams = new ArabicTreebankParserParams();
            op.SetOptions("-arabicFactored");
            op.testOptions.maxLength = maxSentLen;
            op.testOptions.MaxItems  = 5000000;
            //500000 is the default for Arabic, but we have substantially more edges now
            op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
            // WSG: Just set this to some high value so that extractBestParse()
            // actually calls the lattice reader (e.g., this says that we can't have a word longer than
            // 80 characters...seems sensible for Arabic)
            op.testOptions.maxSpanForTags = 80;
            treePrint           = op.testOptions.TreePrint(op.tlpParams);
            debinarizer         = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
            Timing.StartTime();
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            trainTreebank.LoadPath(trainTreebankFile);
            lp = GetParserDataFromTreebank(trainTreebank);
            MakeParsers();
            if (Verbose)
            {
                op.Display();
                string lexNumRules = (pparser != null) ? int.ToString(lp.lex.NumRules()) : string.Empty;
                log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
                log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules
                         );
                log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
                log.Info("Lexicon is " + lp.lex.GetType().FullName);
            }
            return(Parse(inputStream));
        }
Example #8
        public void ParseEasySentence()
        {
            // This option shows parsing a list of correctly tokenized words
            var sent     = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = SentenceUtils.toCoreLabelList(sent);
            var parse    = _lp.apply(rawWords);

            Assert.NotNull(parse);
            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2            = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            using var sent2Reader = new StringReader(sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            parse = _lp.apply(rawWords2);
            Assert.NotNull(parse);

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();

            TestContext.Out.WriteLine($"\n{tdl}\n");

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            Assert.NotNull(tp);
            tp.printTree(parse);
        }
Example #9
        public virtual void TestConll2007()
        {
            Tree test = Tree.ValueOf("((S (NP (PRP It)) (VP (VBZ is) (RB not) (ADJP (JJ normal)) (SBAR (IN for) (S (NP (NNS dogs)) (VP (TO to) (VP (VB be) (VP (VBG vomiting)))))))))");

            string[]     words     = new string[] { "It", "is", "not", "normal", "for", "dogs", "to", "be", "vomiting" };
            string[]     tags      = new string[] { "PRP", "VBZ", "RB", "JJ", "IN", "NNS", "TO", "VB", "VBG" };
            TreePrint    treePrint = new TreePrint("conll2007");
            StringWriter writer    = new StringWriter();
            PrintWriter  wrapped   = new PrintWriter(writer);

            treePrint.PrintTree(test, wrapped);
            wrapped.Close();
            string @out = writer.ToString();

            string[] lines = @out.Trim().Split("\n");
            for (int i = 0; i < lines.Length; ++i)
            {
                string[] pieces  = lines[i].Trim().Split("\\s+");
                int      lineNum = int.Parse(pieces[0]);
                NUnit.Framework.Assert.AreEqual((i + 1), lineNum);
                NUnit.Framework.Assert.AreEqual(words[i], pieces[1]);
                NUnit.Framework.Assert.AreEqual(tags[i], pieces[3]);
                NUnit.Framework.Assert.AreEqual(tags[i], pieces[4]);
            }
        }
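        // Sketch (not from the test above) showing the same "conll2007" format
        // written out directly; as the assertions above rely on, each token is a
        // tab-separated line whose first column is the 1-based index, second the
        // word, and fourth and fifth the part-of-speech tag. The tree literal is
        // only an illustration.
        Tree demoTree = Tree.ValueOf("((S (NP (PRP It)) (VP (VBZ works))))");
        TreePrint conllPrint = new TreePrint("conll2007");
        StringWriter conllWriter = new StringWriter();
        PrintWriter conllOut = new PrintWriter(conllWriter);
        conllPrint.PrintTree(demoTree, conllOut);
        conllOut.Close();
        System.Console.Out.WriteLine(conllWriter.ToString());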
Example #10
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };

            java.util.List rawWords = Sentence.toCoreLabelList(sent);
            Tree           parse    = lp.apply(rawWords);

            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            const string     Sent2            = "This is another sentence.";
            TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(Sent2);

            java.util.List rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            parse = lp.apply(rawWords2);

            var tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            GrammaticalStructure        gs  = gsf.newGrammaticalStructure(parse);

            java.util.List tdl = gs.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", tdl);

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(parse);
        }
Example #11
        //use Stanford.NLP.Net to parse the sentence
        static Tree Parse(string sent)
        {
            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new java.io.StringReader(sent);
            var rawWords         = tokenizerFactory.getTokenizer(sentReader).tokenize();

            sentReader.close();
            var tree = lp.apply(rawWords);

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree);
            var tdl = gs.typedDependenciesCCprocessed();

            // Extract collapsed dependencies from parsed tree
            //var tp = new TreePrint("penn,typedDependenciesCollapsed");
            var tp = new TreePrint("penn");

            tp.printTree(tree);

            return(tree);
        }
Example #12
        static void Main()
        {
            // Path to models extracted from `stanford-parser-3.7.0-models.jar`
            var jarRoot = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-parser-full-2016-10-31\models\";
            var modelsDirectory = jarRoot + @"\edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This sample shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = SentenceUtils.toCoreLabelList(sent);
            var tree = lp.apply(rawWords);
            tree.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2 = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(tree2);
        }
        /// <summary>Generates the XML content for a constituent tree</summary>
        private static void AddConstituentTreeInfo(Element treeInfo, Tree tree, TreePrint constituentTreePrinter)
        {
            StringWriter treeStrWriter = new StringWriter();

            constituentTreePrinter.PrintTree(tree, new PrintWriter(treeStrWriter, true));
            string temp = treeStrWriter.ToString();

            //log.info(temp);
            treeInfo.AppendChild(temp);
        }
Example #14
 private static void DisplayTree(Tree t, TreePrint tp, PrintWriter pw)
 {
     if (t == null)
     {
         pw.Println("null");
     }
     else
     {
         tp.PrintTree(t, pw);
     }
 }
Example #15
        public ParsedStatement ParseSentence(string input)
        {
            var sent2Reader = new StringReader(input);
            var rawWords2   = _tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            var parse       = _lp.apply(rawWords2);

            var gs  = _structureFactory.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();

            System.Console.WriteLine("\n{0}\n", tdl);

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(parse);
            return(new ParsedStatement(parse));
            //System.Console.WriteLine("TreePrint: \n{0}\n", parse);
        }
     public ParseFiles(Options op, TreePrint treePrint, LexicalizedParser pqFactory)
 {
     this.op        = op;
     this.pqFactory = pqFactory;
     this.treePrint = treePrint;
     this.tlp       = op.tlpParams.TreebankLanguagePack();
     this.pwOut     = op.tlpParams.Pw();
     this.pwErr     = op.tlpParams.Pw(System.Console.Error);
     if (op.testOptions.verbose)
     {
         pwErr.Println("Sentence final words are: " + Arrays.AsList(tlp.SentenceFinalPunctuationWords()));
         pwErr.Println("File encoding is: " + op.tlpParams.GetInputEncoding());
     }
     // evaluation setup
     this.runningAverages = bool.ParseBoolean(op.testOptions.evals.GetProperty("runningAverages"));
     this.summary         = bool.ParseBoolean(op.testOptions.evals.GetProperty("summary"));
     if (bool.ParseBoolean(op.testOptions.evals.GetProperty("pcfgLL")))
     {
         this.pcfgLL = new AbstractEval.ScoreEval("pcfgLL", runningAverages);
     }
     else
     {
         this.pcfgLL = null;
     }
     if (bool.ParseBoolean(op.testOptions.evals.GetProperty("depLL")))
     {
         this.depLL = new AbstractEval.ScoreEval("depLL", runningAverages);
     }
     else
     {
         this.depLL = null;
     }
     if (bool.ParseBoolean(op.testOptions.evals.GetProperty("factLL")))
     {
         this.factLL = new AbstractEval.ScoreEval("factLL", runningAverages);
     }
     else
     {
         this.factLL = null;
     }
 }
Example #17
 static int Main(string[] args)
 {
     if (args.Length < 1 || !File.Exists(args[0]))
     {
         Console.WriteLine("FATAL: First command line parameter must be an existing file");
         return(-1);
     }
     try
     {
         FileLoader loader = new FileLoader(json_tree.json_tree.encodingClass, json_tree.json_tree.unicodeDetection, args[0]);
         Debug.Assert(!loader.IsBinaryFile());
         string src;
         if (loader.LoadFile(out src))
         {
             json_tree.json_tree jsonParser = new json_tree.json_tree(src, Console.Out);
             bool bMatches = jsonParser.json_text();
             if (bMatches)
             {
                 Console.WriteLine("SUCCESS: Json Parser matched input file '{0}'", args[0]);
                 TreePrint tprint = new TreePrint(Console.Out, src, 60, new NodePrinter(jsonParser).GetNodeName, false);
                 tprint.PrintTree(jsonParser.GetRoot(), 0, 0);
             }
             else
             {
                 Console.WriteLine("FAILURE: Json Parser did not match input file '{0]'", args[0]);
             }
             return(0);
         }
         else
         {
             Console.WriteLine("FATAL: File '{0}' could not be loaded", args[0]);
             return(-1);
         }
     }
     catch (Exception e)
     {
         Console.WriteLine("FATAL: Program terminated by exception '{0}'", e.Message);
         return(-1);
     }
 }
Example #18
        static void Main()
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var jarRoot         = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-parser-full-2015-12-09\models\";
            var modelsDirectory = jarRoot + @"\edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This sample shows parsing a list of correctly tokenized words
            var sent     = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = Sentence.toCoreLabelList(sent);
            var tree     = lp.apply(rawWords);

            tree.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2            = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new StringReader(sent2);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);
        }
Example #19
        //use Stanford.NLP.Net to parse the sentence
        Tree Parse(string sent)
        {
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new java.io.StringReader(sent);
            var rawWords         = tokenizerFactory.getTokenizer(sentReader).tokenize();

            sentReader.close();
            var tree = lp.apply(rawWords);

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree);
            var tdl = gs.typedDependenciesCCprocessed();

            // Extract collapsed dependencies from parsed tree
            //var tp = new TreePrint("penn,typedDependenciesCollapsed");
            var tp = new TreePrint("penn");

            tp.printTree(tree);

            return(tree);
        }
Example #20
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = Sentence.toCoreLabelList(sent);
            var parse = lp.apply(rawWords);
            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            const string Sent2 = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(Sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            parse = lp.apply(rawWords2);

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();
            System.Console.WriteLine("\n{0}\n", tdl);

            var tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(parse);
        }
Example #21
    public string Tags(string input)
    {
        // Path to models extracted from `stanford-parser-3.6.0-models.jar`
        var jarRoot         = @"";
        var modelsDirectory = jarRoot;

        var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");


        // This option shows loading and using an explicit tokenizer
        var sent2            = input;
        var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        var sent2Reader      = new java.io.StringReader(sent2);
        var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

        sent2Reader.close();
        var tree2 = lp.apply(rawWords2);

        // Extract dependencies from lexical tree
        var tlp = new PennTreebankLanguagePack();
        var gsf = tlp.grammaticalStructureFactory();
        var gs  = gsf.newGrammaticalStructure(tree2);
        var tdl = gs.typedDependenciesCCprocessed();


        // Extract collapsed dependencies from parsed tree
        var tp = new TreePrint("penn,typedDependenciesCollapsed");

        UnityEngine.Debug.Log(tdl);
        //tp.printTree(tree2);

        for (int i = 0; i < tdl.size(); i++)
        {
            TypedDependency node = (TypedDependency)tdl.get(i);

            string relation = node.reln().getShortName();

            if (relation.Contains("nsubj"))
            {
                IndexedWord act = node.gov();
                //node.dep().getword()
                action = act.value();

                UnityEngine.Debug.Log("This is the action " + action);

                IndexedWord subject = node.dep();
                subj = subject.value();

                UnityEngine.Debug.Log("This is the subject " + subj);
            }

            if (relation.Contains("dobj"))
            {
                IndexedWord act = node.gov();
                //node.dep().getword()
                action = act.value();
                UnityEngine.Debug.Log("This is the action " + action);

                IndexedWord tar = node.dep();
                target = tar.value();
                UnityEngine.Debug.Log("This is the target " + target);
            }

            if (relation.Contains("nmod"))
            {
                IndexedWord tar_two = node.dep();
                second_target = tar_two.value();
                UnityEngine.Debug.Log("This is the target second " + second_target);
            }
        }

        return(tdl.ToString());
    }
Example #22
        // = false;
        // not an instantiable class
        /// <summary>Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ...</summary>
        /// <remarks>
        /// Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile file-with-trees [-po matching-pattern operation] operation-file-1 operation-file-2 ... operation-file-n
        /// <h4>Arguments:</h4>
        /// Each argument should be the name of a transformation file that contains a list of pattern
        /// and transformation operation list pairs.  That is, it is a sequence of pairs of a
        /// <see cref="Edu.Stanford.Nlp.Trees.Tregex.TregexPattern"/>
        /// pattern on one or more lines, then a
        /// blank line (empty or whitespace), then a list of transformation operations one per line
        /// (as specified by <b>Legal operation syntax</b> below) to apply when the pattern is matched,
        /// and then another blank line (empty or whitespace).
        /// Note the need for blank lines: The code crashes if they are not present as separators
        /// (although the blank line at the end of the file can be omitted).
        /// The script file can include comment lines, either whole comment lines or
        /// trailing comments introduced by %, which extend to the end of line.  A needed percent
        /// mark can be escaped by a preceding backslash.
        /// <p>
        /// For example, if you want to excise an SBARQ node whenever it is the parent of an SQ node,
        /// and relabel the SQ node to S, your transformation file would look like this:
        /// <blockquote>
        /// <code>
        /// SBARQ=n1 &lt; SQ=n2<br />
        /// <br />
        /// excise n1 n1<br />
        /// relabel n2 S
        /// </code>
        /// </blockquote>
        /// <h4>Options:</h4>
        /// <ul>
        /// <li>
        /// <c>-treeFile &lt;filename&gt;</c>
        /// specify the name of the file that has the trees you want to transform.
        /// <li>
        /// <c>-po &lt;matchPattern&gt; &lt;operation&gt;</c>
        /// Apply a single operation to every tree using the specified match pattern and the specified operation.  Use this option
        /// when you want to quickly try the effect of one pattern/surgery combination, and are too lazy to write a transformation file.
        /// <li>
        /// <c>-s</c>
        /// Print each output tree on one line (default is pretty-printing).
        /// <li>
        /// <c>-m</c>
        /// For every tree that had a matching pattern, print "before" (prepended as "Operated on:") and "after" (prepended as "Result:").  Unoperated on trees just pass through the transducer as usual.
        /// <li>
        /// <c>-encoding X</c>
        /// Uses character set X for input and output of trees.
        /// <li>
        /// <c>-macros &lt;filename&gt;</c>
        /// A file of macros to use on the tregex pattern.  Macros should be one per line, with original and replacement separated by tabs.
        /// <li>
        /// <c>-hf &lt;headFinder-class-name&gt;</c>
        /// use the specified
        /// <see cref="Edu.Stanford.Nlp.Trees.IHeadFinder"/>
        /// class to determine headship relations.
        /// <li>
        /// <c>-hfArg &lt;string&gt;</c>
        /// pass a string argument in to the
        /// <see cref="Edu.Stanford.Nlp.Trees.IHeadFinder"/>
        /// class's constructor.
        /// <c>-hfArg</c>
        /// can be used multiple times to pass in multiple arguments.
        /// <li>
        /// <c>-trf &lt;TreeReaderFactory-class-name&gt;</c>
        /// use the specified
        /// <see cref="Edu.Stanford.Nlp.Trees.ITreeReaderFactory"/>
        /// class to read trees from files.
        /// </ul>
        /// <h4>Legal operation syntax:</h4>
        /// <ul>
        /// <li>
        /// <c>delete &lt;name&gt;</c>
        /// deletes the node and everything below it.
        /// <li>
        /// <c>prune &lt;name&gt;</c>
        /// Like delete, but if, after the pruning, the parent has no children anymore, the parent is pruned too.  Pruning continues to affect all ancestors until one is found with remaining children.  This may result in a null tree.
        /// <li>
        /// <c>excise &lt;name1&gt; &lt;name2&gt;</c>
        /// The name1 node should either dominate or be the same as the name2 node.  This excises out everything from
        /// name1 to name2.  All the children of name2 go into the parent of name1, where name1 was.
        /// <li>
        /// <c>relabel &lt;name&gt; &lt;new-label&gt;</c>
        /// Relabels the node to have the new label. <br />
        /// There are three possible forms: <br />
        /// <c>relabel nodeX VP</c>
        /// - for changing a node label to an
        /// alphanumeric string <br />
        /// <c>relabel nodeX /''/</c>
        /// - for relabeling a node to
        /// something that isn't a valid identifier without quoting <br />
        /// <c>relabel nodeX /^VB(.*)$/verb\\/$1/</c>
        /// - for regular
        /// expression based relabeling. In this case, all matches of the
        /// regular expression against the node label are replaced with the
        /// replacement String.  This has the semantics of Java/Perl's
        /// replaceAll: you may use capturing groups and put them in
        /// replacements with $n. For example, if the pattern is /foo/bar/
        /// and the node matched is "foo", the replaceAll semantics result in
        /// "barbar".  If the pattern is /^foo(.*)$/bar$1/ and node matched is
        /// "foofoo", relabel will result in "barfoo".  <br />
        /// When using the regex replacement method, you can also use the
        /// sequences ={node} and %{var} in the replacement string to use
        /// captured nodes or variable strings in the replacement string.
        /// For example, if the Tregex pattern was "duck=bar" and the relabel
        /// is /foo/={bar}/, "foofoo" will be replaced with "duckduck". <br />
        /// To concatenate two nodes named in the tregex pattern, for
        /// example, you can use the pattern /^.*$/={foo}={bar}/.  Note that
        /// the ^.*$ is necessary to make sure the regex pattern only matches
        /// and replaces once on the entire node name. <br />
        /// To get an "=" or a "%" in the replacement, use \ escaping.
        /// Also, as in the example you can escape a slash in the middle of
        /// the second and third forms with \\/ and \\\\. <br />
        /// <li>
        /// <c>insert &lt;name&gt; &lt;position&gt;</c>
        /// or
        /// <c>insert &lt;tree&gt; &lt;position&gt;</c>
        /// inserts the named node or tree into the position specified.
        /// <li>
        /// <c>move &lt;name&gt; &lt;position&gt;</c>
        /// moves the named node into the specified position.
        /// <p>Right now the  only ways to specify position are:
        /// <p>
        /// <c>$+ &lt;name&gt;</c>
        /// the left sister of the named node<br />
        /// <c>$- &lt;name&gt;</c>
        /// the right sister of the named node<br />
        /// <c>&gt;i &lt;name&gt;</c>
        /// the i_th daughter of the named node<br />
        /// <c>&gt;-i &lt;name&gt;</c>
        /// the i_th daughter, counting from the right, of the named node.
        /// <li>
        /// <c>replace &lt;name1&gt; &lt;name2&gt;</c>
        /// deletes name1 and inserts a copy of name2 in its place.
        /// <li>
        /// <c>replace &lt;name&gt; &lt;tree&gt; &lt;tree2&gt;...</c>
        /// deletes name and inserts the new tree(s) in its place.  If
        /// more than one replacement tree is given, each of the new
        /// subtrees will be added in order where the old tree was.
        /// Multiple subtrees at the root is an illegal operation and
        /// will throw an exception.
        /// <li>
        /// <c>createSubtree &lt;auxiliary-tree-or-label&gt; &lt;name1&gt; [&lt;name2&gt;]</c>
        /// Create a subtree out of all the nodes from
        /// <c>&lt;name1&gt;</c>
        /// through
        /// <c>&lt;name2&gt;</c>
        /// . The subtree is moved to the foot of the given
        /// auxiliary tree, and the tree is inserted where the nodes of
        /// the subtree used to reside. If a simple label is provided as
        /// the first argument, the subtree is given a single parent with
        /// a name corresponding to the label.  To limit the operation to
        /// just one node, elide
        /// <c>&lt;name2&gt;</c>
        /// .
        /// <li>
        /// <c>adjoin &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Adjoins the specified auxiliary tree into the named node.
        /// The daughters of the target node will become the daughters of the foot of the auxiliary tree.
        /// <li>
        /// <c>adjoinH &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Similar to adjoin, but preserves the target node
        /// and makes it the root of
        /// <c>&lt;tree&gt;</c>
        /// . (It is still accessible as
        /// <c>name</c>
        /// .  The root of the
        /// auxiliary tree is ignored.)
        /// <li>
        /// <c>adjoinF &lt;auxiliary_tree&gt; &lt;name&gt;</c>
        /// Similar to adjoin,
        /// but preserves the target node and makes it the foot of
        /// <c>&lt;tree&gt;</c>
        /// .
        /// (It is still accessible as
        /// <c>name</c>
        /// , and retains its status as parent of its children.
        /// The root of the auxiliary tree is ignored.)
        /// <li>
        /// <c>coindex &lt;name1&gt; &lt;name2&gt; ... &lt;nameM&gt;</c>
        /// Puts a (Penn Treebank style)
        /// coindexation suffix of the form "-N" on each of nodes name_1 through name_m.  The value of N will be
        /// automatically generated in reference to the existing coindexations in the tree, so that there is never
        /// an accidental clash of indices across things that are not meant to be coindexed.
        /// </ul>
        /// <p>
        /// In the context of
        /// <c>adjoin</c>
        /// ,
        /// <c>adjoinH</c>
        /// ,
        /// <c>adjoinF</c>
        /// , and
        /// <c>createSubtree</c>
        /// , an auxiliary
        /// tree is a tree in Penn Treebank format with
        /// <c>@</c>
        /// on
        /// exactly one of the leaves denoting the foot of the tree.
        /// The operations which use the foot use the labeled node.
        /// For example:
        /// </p>
        /// <blockquote>
        /// Tsurgeon:
        /// <c>adjoin (FOO (BAR@)) foo</c>
        /// <br />
        /// Tregex:
        /// <c>B=foo</c>
        /// <br />
        /// Input:
        /// <c>(A (B 1 2))</c>
        /// Output:
        /// <c>(A (FOO (BAR 1 2)))</c>
        /// </blockquote>
        /// <p>
        /// Tsurgeon applies the same operation to the same tree for as long
        /// as the given tregex operation matches.  This means that infinite
        /// loops are very easy to cause.  One common situation where this comes up
        /// is an insert operation that repeats infinitely many times
        /// unless you add an expression to the tregex that matches against
        /// the inserted pattern.  For example, this pattern will infinite loop:
        /// </p>
        /// <blockquote>
        /// <code>
        /// TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP"); <br />
        /// TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
        /// </code>
        /// </blockquote>
        /// <p>
        /// This pattern, though, will terminate:
        /// </p>
        /// <blockquote>
        /// <code>
        /// TregexPattern tregex = TregexPattern.compile("S=node &lt;&lt; NP !&lt;&lt; foo"); <br />
        /// TsurgeonPattern tsurgeon = Tsurgeon.parseOperation("insert (NP foo) &gt;-1 node");
        /// </code>
        /// </blockquote>
        /// <p>
        /// Tsurgeon has (very) limited support for conditional statements.
        /// If a pattern is prefaced with
        /// <c>if exists &lt;name&gt;</c>
        /// ,
        /// the rest of the pattern will only execute if
        /// the named node was found in the corresponding TregexMatcher.
        /// </p>
        /// </remarks>
        /// <param name="args">
        /// a list of names of files each of which contains a single tregex matching pattern plus a list, one per line,
        /// of transformation operations to apply to the matched pattern.
        /// </param>
        /// <exception cref="System.Exception">If an I/O error or pattern syntax error occurs</exception>
        public static void Main(string[] args)
        {
            string headFinderClassName = null;
            string headFinderOption    = "-hf";

            string[] headFinderArgs      = null;
            string   headFinderArgOption = "-hfArg";
            string   encoding            = "UTF-8";
            string   encodingOption      = "-encoding";

            if (args.Length == 0)
            {
                log.Info("Usage: java edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon [-s] -treeFile <file-with-trees> [-po <matching-pattern> <operation>] <operation-file-1> <operation-file-2> ... <operation-file-n>");
                System.Environment.Exit(0);
            }
            string treePrintFormats;
            string singleLineOption = "-s";
            string verboseOption    = "-v";
            string matchedOption    = "-m";
            // if set, then print original form of trees that are matched & thus operated on
            string patternOperationOption = "-po";
            string treeFileOption         = "-treeFile";
            string trfOption     = "-trf";
            string macroOption   = "-macros";
            string macroFilename = string.Empty;
            IDictionary <string, int> flagMap = Generics.NewHashMap();

            flagMap[patternOperationOption] = 2;
            flagMap[treeFileOption]         = 1;
            flagMap[trfOption]        = 1;
            flagMap[singleLineOption] = 0;
            flagMap[encodingOption]   = 1;
            flagMap[headFinderOption] = 1;
            flagMap[macroOption]      = 1;
            IDictionary <string, string[]> argsMap = StringUtils.ArgsToMap(args, flagMap);

            args = argsMap[null];
            if (argsMap.Contains(headFinderOption))
            {
                headFinderClassName = argsMap[headFinderOption][0];
            }
            if (argsMap.Contains(headFinderArgOption))
            {
                headFinderArgs = argsMap[headFinderArgOption];
            }
            if (argsMap.Contains(verboseOption))
            {
                verbose = true;
            }
            if (argsMap.Contains(singleLineOption))
            {
                treePrintFormats = "oneline,";
            }
            else
            {
                treePrintFormats = "penn,";
            }
            if (argsMap.Contains(encodingOption))
            {
                encoding = argsMap[encodingOption][0];
            }
            if (argsMap.Contains(macroOption))
            {
                macroFilename = argsMap[macroOption][0];
            }
            TreePrint          tp    = new TreePrint(treePrintFormats, new PennTreebankLanguagePack());
            PrintWriter        pwOut = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);
            ITreeReaderFactory trf;

            if (argsMap.Contains(trfOption))
            {
                string trfClass = argsMap[trfOption][0];
                trf = ReflectionLoading.LoadByReflection(trfClass);
            }
            else
            {
                trf = new TregexPattern.TRegexTreeReaderFactory();
            }
            Treebank trees = new DiskTreebank(trf, encoding);

            if (argsMap.Contains(treeFileOption))
            {
                trees.LoadPath(argsMap[treeFileOption][0]);
            }
            if (trees.IsEmpty())
            {
                log.Info("Warning: No trees specified to operate on.  Use -treeFile path option.");
            }
            TregexPatternCompiler compiler;

            if (headFinderClassName == null)
            {
                compiler = new TregexPatternCompiler();
            }
            else
            {
                IHeadFinder hf;
                if (headFinderArgs == null)
                {
                    hf = ReflectionLoading.LoadByReflection(headFinderClassName);
                }
                else
                {
                    hf = ReflectionLoading.LoadByReflection(headFinderClassName, (object[])headFinderArgs);
                }
                compiler = new TregexPatternCompiler(hf);
            }
            Macros.AddAllMacros(compiler, macroFilename, encoding);
            IList <Pair <TregexPattern, TsurgeonPattern> > ops = new List <Pair <TregexPattern, TsurgeonPattern> >();

            if (argsMap.Contains(patternOperationOption))
            {
                TregexPattern   matchPattern = compiler.Compile(argsMap[patternOperationOption][0]);
                TsurgeonPattern p            = ParseOperation(argsMap[patternOperationOption][1]);
                ops.Add(new Pair <TregexPattern, TsurgeonPattern>(matchPattern, p));
            }
            else
            {
                foreach (string arg in args)
                {
                    IList <Pair <TregexPattern, TsurgeonPattern> > pairs = GetOperationsFromFile(arg, encoding, compiler);
                    foreach (Pair <TregexPattern, TsurgeonPattern> pair in pairs)
                    {
                        if (verbose)
                        {
                            log.Info(pair.Second());
                        }
                        ops.Add(pair);
                    }
                }
            }
            foreach (Tree t in trees)
            {
                Tree original = t.DeepCopy();
                Tree result   = ProcessPatternsOnTree(ops, t);
                if (argsMap.Contains(matchedOption) && matchedOnTree)
                {
                    pwOut.Println("Operated on: ");
                    DisplayTree(original, tp, pwOut);
                    pwOut.Println("Result: ");
                }
                DisplayTree(result, tp, pwOut);
            }
        }
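        // Programmatic sketch of the pattern/operation pairing documented above
        // (mirrors the -po branch of Main, with illustrative pattern, operation,
        // and tree; the Tsurgeon.ParseOperation / Tsurgeon.ProcessPatternsOnTree
        // names are assumed to match this port's public statics used in Main).
        TregexPatternCompiler demoCompiler = new TregexPatternCompiler();
        TregexPattern demoMatch = demoCompiler.Compile("SBARQ=n1 < SQ=n2");
        TsurgeonPattern demoSurgery = Tsurgeon.ParseOperation("relabel n2 S");
        IList<Pair<TregexPattern, TsurgeonPattern>> demoOps = new List<Pair<TregexPattern, TsurgeonPattern>>();
        demoOps.Add(new Pair<TregexPattern, TsurgeonPattern>(demoMatch, demoSurgery));
        // Once the SQ node is relabeled to S, the pattern no longer matches, so application terminates.
        Tree demoTree = Tree.ValueOf("(ROOT (SBARQ (SQ (VBZ Is) (NP (DT this)) (ADJP (JJ right)))))");
        Tree demoResult = Tsurgeon.ProcessPatternsOnTree(demoOps, demoTree);
        new TreePrint("penn").PrintTree(demoResult);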
Example #23
        /// <summary>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// </summary>
        /// <remarks>
        /// This method lets you train and test a segmenter relative to a
        /// Treebank.
        /// <p>
        /// <i>Implementation note:</i> This method is largely cloned from
        /// LexicalizedParser's main method.  Should we try to have it be able
        /// to train segmenters to stop things going out of sync?
        /// </remarks>
        public static void Main(string[] args)
        {
            bool     train = false;
            bool     saveToSerializedFile      = false;
            bool     saveToTextFile            = false;
            string   serializedInputFileOrUrl  = null;
            string   textInputFileOrUrl        = null;
            string   serializedOutputFileOrUrl = null;
            string   textOutputFileOrUrl       = null;
            string   treebankPath = null;
            Treebank testTreebank = null;
            // Treebank tuneTreebank = null;
            string      testPath    = null;
            IFileFilter testFilter  = null;
            IFileFilter trainFilter = null;
            string      encoding    = null;
            // variables needed to process the files to be parsed
            ITokenizerFactory <Word> tokenizerFactory = null;
            //    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor();
            bool tokenized = false;
            // whether or not the input file has already been tokenized
            IFunction <IList <IHasWord>, IList <IHasWord> > escaper = new ChineseEscaper();
            // int tagDelimiter = -1;
            // String sentenceDelimiter = "\n";
            // boolean fromXML = false;
            int argIndex = 0;

            if (args.Length < 1)
            {
                log.Info("usage: java edu.stanford.nlp.parser.lexparser." + "LexicalizedParser parserFileOrUrl filename*");
                return;
            }
            Options op = new Options();

            op.tlpParams = new ChineseTreebankParserParams();
            // while loop through option arguments
            while (argIndex < args.Length && args[argIndex][0] == '-')
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train"))
                {
                    train = true;
                    saveToSerializedFile = true;
                    int numSubArgs = NumSubArgs(args, argIndex);
                    argIndex++;
                    if (numSubArgs > 1)
                    {
                        treebankPath = args[argIndex];
                        argIndex++;
                    }
                    else
                    {
                        throw new Exception("Error: -train option must have treebankPath as first argument.");
                    }
                    if (numSubArgs == 2)
                    {
                        trainFilter = new NumberRangesFileFilter(args[argIndex++], true);
                    }
                    else
                    {
                        if (numSubArgs >= 3)
                        {
                            try
                            {
                                int low  = System.Convert.ToInt32(args[argIndex]);
                                int high = System.Convert.ToInt32(args[argIndex + 1]);
                                trainFilter = new NumberRangeFileFilter(low, high, true);
                                argIndex   += 2;
                            }
                            catch (NumberFormatException)
                            {
                                // maybe it's a ranges expression?
                                trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                                argIndex++;
                            }
                        }
                    }
                }
                else
                {
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-encoding"))
                    {
                        // sets encoding for TreebankLangParserParams
                        encoding = args[argIndex + 1];
                        op.tlpParams.SetInputEncoding(encoding);
                        op.tlpParams.SetOutputEncoding(encoding);
                        argIndex += 2;
                    }
                    else
                    {
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-loadFromSerializedFile"))
                        {
                            // load the parser from a binary serialized file
                            // the next argument must be the path to the parser file
                            serializedInputFileOrUrl = args[argIndex + 1];
                            argIndex += 2;
                        }
                        else
                        {
                            // doesn't make sense to load from TextFile -pichuan
                            //      } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {
                            //        // load the parser from declarative text file
                            //        // the next argument must be the path to the parser file
                            //        textInputFileOrUrl = args[argIndex + 1];
                            //        argIndex += 2;
                            if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToSerializedFile"))
                            {
                                saveToSerializedFile      = true;
                                serializedOutputFileOrUrl = args[argIndex + 1];
                                argIndex += 2;
                            }
                            else
                            {
                                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-saveToTextFile"))
                                {
                                    // save the parser to declarative text file
                                    saveToTextFile      = true;
                                    textOutputFileOrUrl = args[argIndex + 1];
                                    argIndex           += 2;
                                }
                                else
                                {
                                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank"))
                                    {
                                        // the next argument is the treebank path and range for testing
                                        int numSubArgs = NumSubArgs(args, argIndex);
                                        argIndex++;
                                        if (numSubArgs == 1)
                                        {
                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                        }
                                        else
                                        {
                                            if (numSubArgs > 1)
                                            {
                                                testPath = args[argIndex++];
                                                if (numSubArgs == 2)
                                                {
                                                    testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                }
                                                else
                                                {
                                                    if (numSubArgs >= 3)
                                                    {
                                                        try
                                                        {
                                                            int low  = System.Convert.ToInt32(args[argIndex]);
                                                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                                                            testFilter = new NumberRangeFileFilter(low, high, true);
                                                            argIndex  += 2;
                                                        }
                                                        catch (NumberFormatException)
                                                        {
                                                            // maybe it's a ranges expression?
                                                            testFilter = new NumberRangesFileFilter(args[argIndex++], true);
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }
                                    else
                                    {
                                        int j = op.tlpParams.SetOptionFlag(args, argIndex);
                                        if (j == argIndex)
                                        {
                                            log.Info("Unknown option ignored: " + args[argIndex]);
                                            j++;
                                        }
                                        argIndex = j;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // end while loop through arguments
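            // For orientation, a hypothetical invocation of this trainer (the values below are
            // illustrative, not taken from any documented run) would be:
            //   java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter \
            //     -train trainFilesPath 1 199 chineseSegmenter.ser.gz
            // The flags handled in the loop above are consumed first; the remaining positional
            // arguments (treebank path, optional numeric range, output file) are read in order below.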
            ITreebankLangParserParams tlpParams = op.tlpParams;

            // all other arguments are order dependent and
            // are processed in order below
            Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter cs = null;
            if (!train && op.testOptions.verbose)
            {
                System.Console.Out.WriteLine("Currently " + new DateTime());
                PrintArgs(args, System.Console.Out);
            }
            if (train)
            {
                PrintArgs(args, System.Console.Out);
                // so we train a parser using the treebank
                if (treebankPath == null)
                {
                    // the next arg must be the treebank path, since it wasn't given earlier
                    treebankPath = args[argIndex];
                    argIndex++;
                    if (args.Length > argIndex + 1)
                    {
                        try
                        {
                            // the next two args might be the range
                            int low  = System.Convert.ToInt32(args[argIndex]);
                            int high = System.Convert.ToInt32(args[argIndex + 1]);
                            trainFilter = new NumberRangeFileFilter(low, high, true);
                            argIndex   += 2;
                        }
                        catch (NumberFormatException)
                        {
                            // maybe it's a ranges expression?
                            trainFilter = new NumberRangesFileFilter(args[argIndex], true);
                            argIndex++;
                        }
                    }
                }
                Treebank        trainTreebank = MakeTreebank(treebankPath, op, trainFilter);
                IIndex <string> wordIndex     = new HashIndex <string>();
                IIndex <string> tagIndex      = new HashIndex <string>();
                cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(trainTreebank, op, wordIndex, tagIndex);
            }
            else
            {
                if (textInputFileOrUrl != null)
                {
                    // so we load the segmenter from a text grammar file
                    // XXXXX fix later -pichuan
                    //cs = new LexicalizedParser(textInputFileOrUrl, true, op);
                }
                else
                {
                    // so we load a serialized segmenter
                    if (serializedInputFileOrUrl == null)
                    {
                        // the next argument must be the path to the serialized parser
                        serializedInputFileOrUrl = args[argIndex];
                        argIndex++;
                    }
                    try
                    {
                        cs = new Edu.Stanford.Nlp.Parser.Lexparser.ChineseLexiconAndWordSegmenter(serializedInputFileOrUrl, op);
                    }
                    catch (ArgumentException)
                    {
                        log.Info("Error loading segmenter, exiting...");
                        System.Environment.Exit(0);
                    }
                }
            }
            // the following has to go after reading parser to make sure
            // op and tlpParams are the same for train and test
            TreePrint treePrint = op.testOptions.TreePrint(tlpParams);

            if (testFilter != null)
            {
                if (testPath == null)
                {
                    if (treebankPath == null)
                    {
                        throw new Exception("No test treebank path specified...");
                    }
                    else
                    {
                        log.Info("No test treebank path specified.  Using train path: \"" + treebankPath + "\"");
                        testPath = treebankPath;
                    }
                }
                testTreebank = tlpParams.TestMemoryTreebank();
                testTreebank.LoadPath(testPath, testFilter);
            }
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(tlpParams.SisterSplitters()));
            // at this point we should be sure that op.tlpParams is
            // set appropriately (from command line, or from grammar file),
            // and will never change again.  We also set the tlpParams of the
            // LexicalizedParser instance to be the same object.  This is
            // redundancy that we probably should take out eventually.
            //
            // -- Roger
            if (op.testOptions.verbose)
            {
                log.Info("Lexicon is " + cs.GetType().FullName);
            }
            PrintWriter pwOut = tlpParams.Pw();
            PrintWriter pwErr = tlpParams.Pw(System.Console.Error);

            // Now what do we do with the parser we've made
            if (saveToTextFile)
            {
                // save the parser to textGrammar format
                if (textOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToText(cs, textOutputFileOrUrl);
                }
                else
                {
                    log.Info("Usage: must specify a text segmenter data output path");
                }
            }
            if (saveToSerializedFile)
            {
                if (serializedOutputFileOrUrl == null && argIndex < args.Length)
                {
                    // the next argument must be the path to serialize to
                    serializedOutputFileOrUrl = args[argIndex];
                    argIndex++;
                }
                if (serializedOutputFileOrUrl != null)
                {
                    SaveSegmenterDataToSerialized(cs, serializedOutputFileOrUrl);
                }
                else
                {
                    if (textOutputFileOrUrl == null && testTreebank == null)
                    {
                        // no saving/parsing request has been specified
                        log.Info("usage: " + "java edu.stanford.nlp.parser.lexparser.ChineseLexiconAndWordSegmenter" + "-train trainFilesPath [start stop] serializedParserFilename");
                    }
                }
            }
            /* --------------------- Testing part!!!! ----------------------- */
            if (op.testOptions.verbose)
            {
                //      printOptions(false, op);
            }
            if (testTreebank != null || (argIndex < args.Length && Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-treebank")))
            {
                // test parser on treebank
                if (testTreebank == null)
                {
                    // the next argument is the treebank path and range for testing
                    testTreebank = tlpParams.TestMemoryTreebank();
                    if (args.Length < argIndex + 4)
                    {
                        testTreebank.LoadPath(args[argIndex + 1]);
                    }
                    else
                    {
                        int testlow  = System.Convert.ToInt32(args[argIndex + 2]);
                        int testhigh = System.Convert.ToInt32(args[argIndex + 3]);
                        testTreebank.LoadPath(args[argIndex + 1], new NumberRangeFileFilter(testlow, testhigh, true));
                    }
                }
            }
        }
Example #24
        /// <summary>
        /// Called by determineHead and may be overridden in subclasses
        /// if special treatment is necessary for particular categories.
        /// </summary>
        /// <param name="t">The tre to determine the head daughter of</param>
        /// <param name="parent">The parent of t (or may be null)</param>
        /// <returns>The head daughter of t</returns>
        protected internal virtual Tree DetermineNonTrivialHead(Tree t, Tree parent)
        {
            Tree   theHead   = null;
            string motherCat = tlp.BasicCategory(t.Label().Value());

            if (motherCat.StartsWith("@"))
            {
                motherCat = Sharpen.Runtime.Substring(motherCat, 1);
            }
            if (Debug)
            {
                log.Info("Looking for head of " + t.Label() + "; value is |" + t.Label().Value() + "|, " + " baseCat is |" + motherCat + '|');
            }
            // We know we have nonterminals underneath
            // (a bit of a Penn Treebank assumption, but).
            // Look at label.
            // a total special case....
            // first look for POS tag at end
            // this appears to be redundant in the Collins case since the rule already would do that
            //    Tree lastDtr = t.lastChild();
            //    if (tlp.basicCategory(lastDtr.label().value()).equals("POS")) {
            //      theHead = lastDtr;
            //    } else {
            string[][] how  = nonTerminalInfo[motherCat];
            Tree[]     kids = t.Children();
            if (how == null)
            {
                if (Debug)
                {
                    log.Info("Warning: No rule found for " + motherCat + " (first char: " + motherCat[0] + ')');
                    log.Info("Known nonterms are: " + nonTerminalInfo.Keys);
                }
                if (defaultRule != null)
                {
                    if (Debug)
                    {
                        log.Info("  Using defaultRule");
                    }
                    return(TraverseLocate(kids, defaultRule, true));
                }
                else
                {
                    // TreePrint because TreeGraphNode only prints the node number,
                    // doesn't print the tree structure
                    TreePrint    printer = new TreePrint("penn");
                    StringWriter buffer  = new StringWriter();
                    printer.PrintTree(t, new PrintWriter(buffer));
                    // TODO: we could get really fancy and define our own
                    // exception class to represent this
                    throw new ArgumentException("No head rule defined for " + motherCat + " using " + this.GetType() + " in " + buffer.ToString());
                }
            }
            for (int i = 0; i < how.Length; i++)
            {
                bool lastResort = (i == how.Length - 1);
                theHead = TraverseLocate(kids, how[i], lastResort);
                if (theHead != null)
                {
                    break;
                }
            }
            if (Debug)
            {
                log.Info("  Chose " + theHead.Label());
            }
            return(theHead);
        }
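The rule table consulted above, nonTerminalInfo, maps each mother category to an ordered list of search directives. As a rough, purely illustrative sketch of its shape (the categories and directions below are examples, not the actual Stanford tables; a System.Collections.Generic using is assumed), a head-finder subclass might populate it along these lines:

        // Illustrative only: a miniature head-rule table of the kind DetermineNonTrivialHead reads.
        // Each directive starts with a search direction ("left", "right", "rightdis", ...)
        // followed by candidate daughter categories tried in that order.
        IDictionary<string, string[][]> sampleHeadRules = new Dictionary<string, string[][]>
        {
            // NP: prefer the rightmost noun, then fall back to an embedded NP.
            ["NP"] = new[] { new[] { "rightdis", "NN", "NNS", "NNP" }, new[] { "right", "NP" } },
            // VP: prefer the leftmost verb, then an embedded VP.
            ["VP"] = new[] { new[] { "left", "VBD", "VBN", "VBZ", "VB", "VBG", "VBP" }, new[] { "left", "VP" } }
        };
        // TraverseLocate scans the daughters in the requested direction and returns the first one whose
        // basic category matches; for the final directive the loop above allows a last-resort fallback.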
Example #25
        static void Main(string[] args)
        {
            var tf = new edu.stanford.nlp.trees.LabeledScoredTreeFactory(new CustomStringLabelFactory());

            var str   = "(x2 / score :null_edge(x1 / null_tag) :null_edge(x3 / null_tag)	:time(xap0 / before	:quant(x5 / temporal - quantity	:unit(y / year) :null_edge(x4 / null_tag))))";
            var input = new java.io.StringReader(str);

            var treeReader = new edu.stanford.nlp.trees.PennTreeReader(input, tf, new CustomTreeNormalizer(), new CustomTokenizerAdapter(input));

            var t = treeReader.readTree();


            TreePrint p = new TreePrint("penn");

            p.printTree(t);



            //READ RST INFORMATION
            RSTTree tree = new RSTTree("lincon");

            tree.Load(Path.Combine(Root, "rst.xml"));
            tree.EvaluateODonell();

            var sum = tree.Summarize();

            //READ AMR INFORMATION FOR EACH EDU AND ASSOCIATE THE O'DONNELL SCORE
            IGraph g      = new Graph();
            var    parser = new VDS.RDF.Parsing.RdfXmlParser();

            //   NTriplesParser ntparser = new NTriplesParser();
            parser.Load(g, Path.Combine(Root, "output.xml"));
            var document = new AMRDocument();

            document.Load(g);

            foreach (var item in document.EDUSentences)
            {
                item.ApplyRSTWeight(sum.Where(c => c.edu == item.Id).Select(c => c.Weight).First());
            }

            //var rstdocument = new RSTDocumentRepository();
            //rstdocument.DeleteAllNodes();
            //rstdocument.Save(tree);

            AMRNEORepository repo = new AMRNEORepository();

            repo.DeleteAllNodes();
            repo.SaveDocument(document);



            //var ids = Helper.ReadIds(g);
            //foreach (var item in ids)
            //{

            //    item.sentence = Helper.GetSentence(g, item);
            //    item.AddNodes(g);


            //    if (item.id == 22)
            //    {
            //        Console.WriteLine(item.urlid);
            //        Console.WriteLine(item.sentence);
            //        Console.WriteLine(item.Root.uriid);
            //        Console.WriteLine(item.Root.Term.uriid);
            //        Console.WriteLine(item.Root.Term.type);
            //    }

            //}

            //SparqlQueryParser qparser = new SparqlQueryParser();
            ////Then we can parse a SPARQL string into a query

            //StringBuilder querystr = new StringBuilder();
            //querystr.AppendLine("PREFIX amr-core: <http://amr.isi.edu/rdf/core-amr#>");
            //querystr.AppendLine("PREFIX amr-data: <http://amr.isi.edu/amr_data#>");
            //querystr.AppendLine("PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>");
            //querystr.AppendLine("PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>");
            //querystr.AppendLine("PREFIX amr-terms: <http://amr.isi.edu/rdf/amr-terms#>");
            ////querystr.AppendLine("SELECT  ?p WHERE { ?s rdf:type ?p }");
            ////querystr.Append("SELECT ?s ?sentence ?id ?root ?rtype ?amrtype");
            //querystr.Append("SELECT ?root ?rtype  ?amrtypelbl ");
            //querystr.Append("WHERE {");
            //querystr.Append("?s amr-core:has-sentence ?sentence.");
            //querystr.Append("?s amr-core:has-id ?id.");
            //querystr.Append("?s amr-core:root ?root. ");
            //querystr.Append("?root rdf:type ?rtype. ");
            //querystr.Append("?rtype rdf:type ?amrtype. ");
            //querystr.Append("?amrtype rdfs:label ?amrtypelbl. ");
            //querystr.Append("}");

            //SparqlQuery q = qparser.ParseFromString(querystr.ToString());

            ////http://amr.isi.edu/rdf/core-amr#has-id
            //var rset = (SparqlResultSet)g.ExecuteQuery(q);

            //var SB = new StringBuilder();
            //if (rset.Result && rset.Results.Count > 0)
            //{
            //    foreach (var result in rset.Results)
            //    {
            //        foreach (var r in result)
            //        {
            //            Console.WriteLine(r.Key + " " + r.Value);
            //        }

            //        //Do what you want with each result
            //    }
            //}
            //File.WriteAllText("dic.txt", SB.ToString());
            //http://amr.isi.edu/amr_data/22#root01

            //foreach (var item in g.Triples)
            //{


            //    Console.WriteLine(item.Subject);

            //}



            //foreach (var node in g.Nodes)
            //{
            //    Console.WriteLine(node.ToString());
            //}

            //g.SaveToFile("output.rdf");
        }
        /// <summary>Parse a sentence represented as a List of tokens.</summary>
        /// <remarks>
        /// Parse a sentence represented as a List of tokens.
        /// The text must already have been tokenized and
        /// normalized into tokens that are appropriate to the treebank
        /// which was used to train the parser.  The tokens can be of
        /// multiple types, and the list items need not be homogeneous as to type
        /// (in particular, only some words might be given tags):
        /// <ul>
        /// <li>If a token implements HasWord, then the word to be parsed is
        /// given by its word() value.</li>
        /// <li>If a token implements HasTag and the tag() value is not
        /// null or the empty String, then the parser is strongly advised to assign
        /// a part of speech tag that <i>begins</i> with this String.</li>
        /// </ul>
        /// </remarks>
        /// <param name="sentence">The sentence to parse</param>
        /// <returns>true Iff the sentence was accepted by the grammar</returns>
        /// <exception cref="System.NotSupportedException">
        /// If the Sentence is too long or
        /// of zero length or the parse
        /// otherwise fails for resource reasons
        /// </exception>
        private bool ParseInternal <_T0>(IList <_T0> sentence)
            where _T0 : IHasWord
        {
            parseSucceeded   = false;
            parseNoMemory    = false;
            parseUnparsable  = false;
            parseSkipped     = false;
            parseFallback    = false;
            whatFailed       = null;
            addedPunct       = false;
            originalSentence = sentence;
            int length = sentence.Count;

            if (length == 0)
            {
                parseSkipped = true;
                throw new NotSupportedException("Can't parse a zero-length sentence!");
            }
            IList <IHasWord> sentenceB;

            if (op.wordFunction != null)
            {
                sentenceB = Generics.NewArrayList();
                foreach (IHasWord word in originalSentence)
                {
                    if (word is ILabel)
                    {
                        ILabel label    = (ILabel)word;
                        ILabel newLabel = label.LabelFactory().NewLabel(label);
                        if (newLabel is IHasWord)
                        {
                            sentenceB.Add((IHasWord)newLabel);
                        }
                        else
                        {
                            throw new AssertionError("This should have been a HasWord");
                        }
                    }
                    else
                    {
                        if (word is IHasTag)
                        {
                            TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                            sentenceB.Add(tw);
                        }
                        else
                        {
                            sentenceB.Add(new Word(word.Word()));
                        }
                    }
                }
                foreach (IHasWord word_1 in sentenceB)
                {
                    word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
                }
            }
            else
            {
                sentenceB = new List <IHasWord>(sentence);
            }
            if (op.testOptions.addMissingFinalPunctuation)
            {
                addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
            }
            if (length > op.testOptions.maxLength)
            {
                parseSkipped = true;
                throw new NotSupportedException("Sentence too long: length " + length);
            }
            TreePrint   treePrint = GetTreePrint();
            PrintWriter pwOut     = op.tlpParams.Pw();

            //Insert the boundary symbol
            if (sentence[0] is CoreLabel)
            {
                CoreLabel boundary = new CoreLabel();
                boundary.SetWord(LexiconConstants.Boundary);
                boundary.SetValue(LexiconConstants.Boundary);
                boundary.SetTag(LexiconConstants.BoundaryTag);
                boundary.SetIndex(sentence.Count + 1); // 1-based indexing used in the parser
                sentenceB.Add(boundary);
            }
            else
            {
                sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG)
            {
                if (!pparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                if (op.testOptions.verbose)
                {
                    pwOut.Println("PParser output");
                    // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
                    treePrint.PrintTree(GetBestPCFGParse(false), pwOut); // without scores on nodes
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doDep && !op.testOptions.useFastFactored)
            {
                if (!dparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                // cdm nov 2006: should move these printing bits to the main printing section,
                // so don't calculate the best parse twice!
                if (op.testOptions.verbose)
                {
                    pwOut.Println("DParser output");
                    treePrint.PrintTree(dparser.GetBestParse(), pwOut);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG && op.doDep)
            {
                if (!bparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                else
                {
                    parseSucceeded = true;
                }
            }
            return(true);
        }
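ParseInternal itself is private; callers reach it through the parser query objects used elsewhere in this class. The following is a minimal sketch of the calling pattern the remarks above describe, assuming an already-loaded LexicalizedParser named parser and the usual IParserQuery surface of this port (the sentence and tag values are illustrative). Note how untagged Word tokens and pre-tagged TaggedWord tokens can be mixed freely:

        IList<IHasWord> tokens = new List<IHasWord>
        {
            new Word("The"),
            new TaggedWord("contract", "NN"),   // pre-tagged: the parser is advised to choose a tag starting with "NN"
            new Word("expires"),
            new Word(".")
        };
        IParserQuery query = parser.ParserQuery();   // public entry point; ParseInternal is reached indirectly
        if (query.Parse(tokens))
        {
            query.GetBestParse().PennPrint();        // print the 1-best parse
        }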
        /// <summary>Test the parser on a treebank.</summary>
        /// <remarks>
        /// Test the parser on a treebank. Parses will be written to stdout, and
        /// various other information will be written to stderr and stdout,
        /// particularly if <code>op.testOptions.verbose</code> is true.
        /// </remarks>
        /// <param name="testTreebank">The treebank to parse</param>
        /// <returns>
        /// The labeled precision/recall F<sub>1</sub> (EVALB measure)
        /// of the parser on the treebank.
        /// </returns>
        public virtual double TestOnTreebank(Treebank testTreebank)
        {
            log.Info("Testing on treebank");
            Timing    treebankTotalTimer        = new Timing();
            TreePrint treePrint                 = op.testOptions.TreePrint(op.tlpParams);
            ITreebankLangParserParams tlpParams = op.tlpParams;
            ITreebankLanguagePack     tlp       = op.Langpack();
            PrintWriter pwOut;
            PrintWriter pwErr;

            if (op.testOptions.quietEvaluation)
            {
                NullOutputStream quiet = new NullOutputStream();
                pwOut = tlpParams.Pw(quiet);
                pwErr = tlpParams.Pw(quiet);
            }
            else
            {
                pwOut = tlpParams.Pw();
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (op.testOptions.verbose)
            {
                pwErr.Print("Testing ");
                pwErr.Println(testTreebank.TextualSummary(tlp));
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.InitEVALBfiles(tlpParams);
            }
            PrintWriter pwFileOut = null;

            if (op.testOptions.writeOutputFiles)
            {
                string fname = op.testOptions.outputFilesPrefix + "." + op.testOptions.outputFilesExtension;
                try
                {
                    pwFileOut = op.tlpParams.Pw(new FileOutputStream(fname));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            PrintWriter pwStats = null;

            if (op.testOptions.outputkBestEquivocation != null)
            {
                try
                {
                    pwStats = op.tlpParams.Pw(new FileOutputStream(op.testOptions.outputkBestEquivocation));
                }
                catch (IOException ioe)
                {
                    Sharpen.Runtime.PrintStackTrace(ioe);
                }
            }
            if (op.testOptions.testingThreads != 1)
            {
                MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                LinkedList <Tree> goldTrees = new LinkedList <Tree>();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <IHasWord> sentence = GetInputSentence(goldTree);
                    goldTrees.Add(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    wrapper.Put(sentence);
                    while (wrapper.Peek())
                    {
                        IParserQuery pq = wrapper.Poll();
                        goldTree = goldTrees.Poll();
                        ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                    }
                }
                // for tree iterator
                wrapper.Join();
                while (wrapper.Peek())
                {
                    IParserQuery pq         = wrapper.Poll();
                    Tree         goldTree_1 = goldTrees.Poll();
                    ProcessResults(pq, goldTree_1, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            else
            {
                IParserQuery pq = pqFactory.ParserQuery();
                foreach (Tree goldTree in testTreebank)
                {
                    IList <CoreLabel> sentence = GetInputSentence(goldTree);
                    pwErr.Println("Parsing [len. " + sentence.Count + "]: " + SentenceUtils.ListToString(sentence));
                    pq.ParseAndReport(sentence, pwErr);
                    ProcessResults(pq, goldTree, pwErr, pwOut, pwFileOut, pwStats, treePrint);
                }
            }
            // for tree iterator
            //Done parsing...print the results of the evaluations
            treebankTotalTimer.Done("Testing on treebank");
            if (op.testOptions.quietEvaluation)
            {
                pwErr = tlpParams.Pw(System.Console.Error);
            }
            if (saidMemMessage)
            {
                ParserUtils.PrintOutOfMemory(pwErr);
            }
            if (op.testOptions.evalb)
            {
                EvalbFormatWriter.CloseEVALBfiles();
            }
            if (numSkippedEvals != 0)
            {
                pwErr.Printf("Unable to evaluate %d parser hypotheses due to yield mismatch\n", numSkippedEvals);
            }
            // only created here so we know what parser types are supported...
            IParserQuery pq_1 = pqFactory.ParserQuery();

            if (summary)
            {
                if (pcfgLB != null)
                {
                    pcfgLB.Display(false, pwErr);
                }
                if (pcfgChildSpecific != null)
                {
                    pcfgChildSpecific.Display(false, pwErr);
                }
                if (pcfgLA != null)
                {
                    pcfgLA.Display(false, pwErr);
                }
                if (pcfgCB != null)
                {
                    pcfgCB.Display(false, pwErr);
                }
                if (pcfgDA != null)
                {
                    pcfgDA.Display(false, pwErr);
                }
                if (pcfgTA != null)
                {
                    pcfgTA.Display(false, pwErr);
                }
                if (pcfgLL != null && pq_1.GetPCFGParser() != null)
                {
                    pcfgLL.Display(false, pwErr);
                }
                if (depDA != null)
                {
                    depDA.Display(false, pwErr);
                }
                if (depTA != null)
                {
                    depTA.Display(false, pwErr);
                }
                if (depLL != null && pq_1.GetDependencyParser() != null)
                {
                    depLL.Display(false, pwErr);
                }
                if (factLB != null)
                {
                    factLB.Display(false, pwErr);
                }
                if (factChildSpecific != null)
                {
                    factChildSpecific.Display(false, pwErr);
                }
                if (factLA != null)
                {
                    factLA.Display(false, pwErr);
                }
                if (factCB != null)
                {
                    factCB.Display(false, pwErr);
                }
                if (factDA != null)
                {
                    factDA.Display(false, pwErr);
                }
                if (factTA != null)
                {
                    factTA.Display(false, pwErr);
                }
                if (factLL != null && pq_1.GetFactoredParser() != null)
                {
                    factLL.Display(false, pwErr);
                }
                if (pcfgCatE != null)
                {
                    pcfgCatE.Display(false, pwErr);
                }
                foreach (IEval eval in evals)
                {
                    eval.Display(false, pwErr);
                }
                foreach (BestOfTopKEval eval_1 in topKEvals)
                {
                    eval_1.Display(false, pwErr);
                }
            }
            // these ones only have a display mode, so display if turned on!!
            if (pcfgRUO != null)
            {
                pcfgRUO.Display(true, pwErr);
            }
            if (pcfgCUO != null)
            {
                pcfgCUO.Display(true, pwErr);
            }
            if (tsv)
            {
                NumberFormat nf = new DecimalFormat("0.00");
                pwErr.Println("factF1\tfactDA\tfactEx\tpcfgF1\tdepDA\tfactTA\tnum");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && factDA != null)
                {
                    pwErr.Print(nf.Format(factDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(nf.Format(factLB.GetExactPercent()));
                }
                pwErr.Print("\t");
                if (pcfgLB != null)
                {
                    pwErr.Print(nf.Format(pcfgLB.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetDependencyParser() != null && depDA != null)
                {
                    pwErr.Print(nf.Format(depDA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (pq_1.GetPCFGParser() != null && factTA != null)
                {
                    pwErr.Print(nf.Format(factTA.GetEvalbF1Percent()));
                }
                pwErr.Print("\t");
                if (factLB != null)
                {
                    pwErr.Print(factLB.GetNum());
                }
                pwErr.Println();
            }
            double f1 = 0.0;

            if (factLB != null)
            {
                f1 = factLB.GetEvalbF1();
            }
            //Close files (if necessary)
            if (pwFileOut != null)
            {
                pwFileOut.Close();
            }
            if (pwStats != null)
            {
                pwStats.Close();
            }
            if (parserQueryEvals != null)
            {
                foreach (IParserQueryEval parserQueryEval in parserQueryEvals)
                {
                    parserQueryEval.Display(false, pwErr);
                }
            }
            return(f1);
        }
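A hedged sketch of driving this method (the path, file range, and the evaluator variable that owns TestOnTreebank are illustrative): load a memory treebank through the language pack parameters, hand it to TestOnTreebank, and read back the labeled F1.

        Treebank evalBank = op.tlpParams.TestMemoryTreebank();
        evalBank.LoadPath("/path/to/test/treebank", new NumberRangeFileFilter(2200, 2219, true));
        double f1 = evaluator.TestOnTreebank(evalBank);   // `evaluator` is the object that owns this method
        System.Console.Error.WriteLine("Labeled F1 = " + f1);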
        public virtual void ProcessResults(IParserQuery pq, Tree goldTree, PrintWriter pwErr, PrintWriter pwOut, PrintWriter pwFileOut, PrintWriter pwStats, TreePrint treePrint)
        {
            if (pq.SaidMemMessage())
            {
                saidMemMessage = true;
            }
            Tree             tree;
            IList <IHasWord> sentence = pq.OriginalSentence();

            try
            {
                tree = pq.GetBestParse();
            }
            catch (NoSuchParseException)
            {
                tree = null;
            }
            IList <ScoredObject <Tree> > kbestPCFGTrees = null;

            if (tree != null && kbestPCFG > 0)
            {
                kbestPCFGTrees = pq.GetKBestPCFGParses(kbestPCFG);
            }
            //combo parse goes to pwOut (System.out)
            if (op.testOptions.verbose)
            {
                pwOut.Println("ComboParser best");
                Tree ot = tree;
                if (ot != null && !op.tlpParams.TreebankLanguagePack().IsStartSymbol(ot.Value()))
                {
                    ot = ot.TreeFactory().NewTreeNode(op.tlpParams.TreebankLanguagePack().StartSymbol(), Java.Util.Collections.SingletonList(ot));
                }
                treePrint.PrintTree(ot, pwOut);
            }
            else
            {
                treePrint.PrintTree(tree, pwOut);
            }
            // **OUTPUT**
            // print various n-best like outputs (including 1-best)
            // print various statistics
            if (tree != null)
            {
                if (op.testOptions.printAllBestParses)
                {
                    IList <ScoredObject <Tree> > parses = pq.GetBestPCFGParses();
                    int sz = parses.Count;
                    if (sz > 1)
                    {
                        pwOut.Println("There were " + sz + " best PCFG parses with score " + parses[0].Score() + '.');
                        Tree transGoldTree = collinizer.TransformTree(goldTree);
                        int  iii           = 0;
                        foreach (ScoredObject <Tree> sot in parses)
                        {
                            iii++;
                            Tree tb  = sot.Object();
                            Tree tbd = debinarizer.TransformTree(tb);
                            tbd = subcategoryStripper.TransformTree(tbd);
                            pq.RestoreOriginalWords(tbd);
                            pwOut.Println("PCFG Parse #" + iii + " with score " + tbd.Score());
                            tbd.PennPrint(pwOut);
                            Tree tbtr = collinizer.TransformTree(tbd);
                            // pwOut.println("Tree size = " + tbtr.size() + "; depth = " + tbtr.depth());
                            kGoodLB.Evaluate(tbtr, transGoldTree, pwErr);
                        }
                    }
                }
                else
                {
                    // Huang and Chiang (2006) Algorithm 3 output from the PCFG parser
                    if (op.testOptions.printPCFGkBest > 0 && op.testOptions.outputkBestEquivocation == null)
                    {
                        IList <ScoredObject <Tree> > trees = kbestPCFGTrees.SubList(0, op.testOptions.printPCFGkBest);
                        Tree transGoldTree = collinizer.TransformTree(goldTree);
                        int  i             = 0;
                        foreach (ScoredObject <Tree> tp in trees)
                        {
                            i++;
                            pwOut.Println("PCFG Parse #" + i + " with score " + tp.Score());
                            Tree tbd = tp.Object();
                            tbd.PennPrint(pwOut);
                            Tree tbtr = collinizer.TransformTree(tbd);
                            kGoodLB.Evaluate(tbtr, transGoldTree, pwErr);
                        }
                    }
                    else
                    {
                        // Chart parser (factored) n-best list
                        if (op.testOptions.printFactoredKGood > 0 && pq.HasFactoredParse())
                        {
                            // DZ: debug n best trees
                            IList <ScoredObject <Tree> > trees = pq.GetKGoodFactoredParses(op.testOptions.printFactoredKGood);
                            Tree transGoldTree = collinizer.TransformTree(goldTree);
                            int  ii            = 0;
                            foreach (ScoredObject <Tree> tp in trees)
                            {
                                ii++;
                                pwOut.Println("Factored Parse #" + ii + " with score " + tp.Score());
                                Tree tbd = tp.Object();
                                tbd.PennPrint(pwOut);
                                Tree tbtr = collinizer.TransformTree(tbd);
                                kGoodLB.Evaluate(tbtr, transGoldTree, pwOut);
                            }
                        }
                        else
                        {
                            //1-best output
                            if (pwFileOut != null)
                            {
                                pwFileOut.Println(tree.ToString());
                            }
                        }
                    }
                }
                //Print the derivational entropy
                if (op.testOptions.outputkBestEquivocation != null && op.testOptions.printPCFGkBest > 0)
                {
                    IList <ScoredObject <Tree> > trees = kbestPCFGTrees.SubList(0, op.testOptions.printPCFGkBest);
                    double[] logScores = new double[trees.Count];
                    int      treeId    = 0;
                    foreach (ScoredObject <Tree> kBestTree in trees)
                    {
                        logScores[treeId++] = kBestTree.Score();
                    }
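                    // The k-best scores are log probabilities up to a shared constant: subtracting
                    // their log-sum-exp (denom, below) renormalizes them, and the following loop
                    // accumulates the Shannon entropy -sum p * log2(p) of the k-best distribution.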
                    //Re-normalize
                    double entropy = 0.0;
                    double denom   = ArrayMath.LogSum(logScores);
                    foreach (double logScore in logScores)
                    {
                        double logPr = logScore - denom;
                        entropy += System.Math.Exp(logPr) * (logPr / System.Math.Log(2));
                    }
                    entropy *= -1;
                    // negate so the entropy is positive (already in bits via the log-2 division above)
                    pwStats.Printf("%f\t%d\t%d\n", entropy, trees.Count, sentence.Count);
                }
            }
            // **EVALUATION**
            // Perform various evaluations specified by the user
            if (tree != null)
            {
                //Strip subcategories and remove punctuation for evaluation
                tree = subcategoryStripper.TransformTree(tree);
                Tree treeFact = collinizer.TransformTree(tree);
                //Setup the gold tree
                if (op.testOptions.verbose)
                {
                    pwOut.Println("Correct parse");
                    treePrint.PrintTree(goldTree, pwOut);
                }
                Tree transGoldTree = collinizer.TransformTree(goldTree);
                if (transGoldTree != null)
                {
                    transGoldTree = subcategoryStripper.TransformTree(transGoldTree);
                }
                //Can't do evaluation in these two cases
                if (transGoldTree == null)
                {
                    pwErr.Println("Couldn't transform gold tree for evaluation, skipping eval. Gold tree was:");
                    goldTree.PennPrint(pwErr);
                    numSkippedEvals++;
                    return;
                }
                else
                {
                    if (treeFact == null)
                    {
                        pwErr.Println("Couldn't transform hypothesis tree for evaluation, skipping eval. Tree was:");
                        tree.PennPrint(pwErr);
                        numSkippedEvals++;
                        return;
                    }
                    else
                    {
                        if (treeFact.Yield().Count != transGoldTree.Yield().Count)
                        {
                            IList <ILabel> fYield = treeFact.Yield();
                            IList <ILabel> gYield = transGoldTree.Yield();
                            pwErr.Println("WARNING: Evaluation could not be performed due to gold/parsed yield mismatch.");
                            pwErr.Printf("  sizes: gold: %d (transf) %d (orig); parsed: %d (transf) %d (orig).%n", gYield.Count, goldTree.Yield().Count, fYield.Count, tree.Yield().Count);
                            pwErr.Println("  gold: " + SentenceUtils.ListToString(gYield, true));
                            pwErr.Println("  pars: " + SentenceUtils.ListToString(fYield, true));
                            numSkippedEvals++;
                            return;
                        }
                    }
                }
                if (topKEvals.Count > 0)
                {
                    IList <Tree> transGuesses = new List <Tree>();
                    int          kbest        = System.Math.Min(op.testOptions.evalPCFGkBest, kbestPCFGTrees.Count);
                    foreach (ScoredObject <Tree> guess in kbestPCFGTrees.SubList(0, kbest))
                    {
                        transGuesses.Add(collinizer.TransformTree(guess.Object()));
                    }
                    foreach (BestOfTopKEval eval in topKEvals)
                    {
                        eval.Evaluate(transGuesses, transGoldTree, pwErr);
                    }
                }
                //PCFG eval
                Tree treePCFG = pq.GetBestPCFGParse();
                if (treePCFG != null)
                {
                    Tree treePCFGeval = collinizer.TransformTree(treePCFG);
                    if (pcfgLB != null)
                    {
                        pcfgLB.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgChildSpecific != null)
                    {
                        pcfgChildSpecific.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgLA != null)
                    {
                        pcfgLA.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgCB != null)
                    {
                        pcfgCB.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgDA != null)
                    {
                        // Re-index the leaves after Collinization, stripping traces, etc.
                        treePCFGeval.IndexLeaves(true);
                        transGoldTree.IndexLeaves(true);
                        pcfgDA.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgTA != null)
                    {
                        pcfgTA.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgLL != null && pq.GetPCFGParser() != null)
                    {
                        pcfgLL.RecordScore(pq.GetPCFGParser(), pwErr);
                    }
                    if (pcfgRUO != null)
                    {
                        pcfgRUO.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgCUO != null)
                    {
                        pcfgCUO.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                    if (pcfgCatE != null)
                    {
                        pcfgCatE.Evaluate(treePCFGeval, transGoldTree, pwErr);
                    }
                }
                //Dependency eval
                // todo: is treeDep really useful here, or should we really use depDAEval tree (debinarized) throughout? We use it for parse, and it sure seems like we could use it for tag eval, but maybe not factDA?
                Tree treeDep = pq.GetBestDependencyParse(false);
                if (treeDep != null)
                {
                    Tree goldTreeB    = binarizerOnly.TransformTree(goldTree);
                    Tree goldTreeEval = goldTree.DeepCopy();
                    goldTreeEval.IndexLeaves(true);
                    goldTreeEval.PercolateHeads(op.Langpack().HeadFinder());
                    Tree depDAEval = pq.GetBestDependencyParse(true);
                    depDAEval.IndexLeaves(true);
                    depDAEval.PercolateHeadIndices();
                    if (depDA != null)
                    {
                        depDA.Evaluate(depDAEval, goldTreeEval, pwErr);
                    }
                    if (depTA != null)
                    {
                        Tree undoneTree = debinarizer.TransformTree(treeDep);
                        undoneTree = subcategoryStripper.TransformTree(undoneTree);
                        pq.RestoreOriginalWords(undoneTree);
                        // pwErr.println("subcategoryStripped tree: " + undoneTree.toStructureDebugString());
                        depTA.Evaluate(undoneTree, goldTree, pwErr);
                    }
                    if (depLL != null && pq.GetDependencyParser() != null)
                    {
                        depLL.RecordScore(pq.GetDependencyParser(), pwErr);
                    }
                    Tree factTreeB;
                    if (pq.HasFactoredParse())
                    {
                        factTreeB = pq.GetBestFactoredParse();
                    }
                    else
                    {
                        factTreeB = treeDep;
                    }
                    if (factDA != null)
                    {
                        factDA.Evaluate(factTreeB, goldTreeB, pwErr);
                    }
                }
                //Factored parser (1best) eval
                if (factLB != null)
                {
                    factLB.Evaluate(treeFact, transGoldTree, pwErr);
                }
                if (factChildSpecific != null)
                {
                    factChildSpecific.Evaluate(treeFact, transGoldTree, pwErr);
                }
                if (factLA != null)
                {
                    factLA.Evaluate(treeFact, transGoldTree, pwErr);
                }
                if (factTA != null)
                {
                    factTA.Evaluate(tree, boundaryRemover.TransformTree(goldTree), pwErr);
                }
                if (factLL != null && pq.GetFactoredParser() != null)
                {
                    factLL.RecordScore(pq.GetFactoredParser(), pwErr);
                }
                if (factCB != null)
                {
                    factCB.Evaluate(treeFact, transGoldTree, pwErr);
                }
                foreach (IEval eval_1 in evals)
                {
                    eval_1.Evaluate(treeFact, transGoldTree, pwErr);
                }
                if (parserQueryEvals != null)
                {
                    foreach (IParserQueryEval eval in parserQueryEvals)
                    {
                        eval.Evaluate(pq, transGoldTree, pwErr);
                    }
                }
                if (op.testOptions.evalb)
                {
                    // empty out scores just in case
                    NanScores(tree);
                    EvalbFormatWriter.WriteEVALBline(treeFact, transGoldTree);
                }
            }
            pwErr.Println();
        }
Example #29
        public void SentenceParser(string sent2)
        {
            var modelsDirectory = jarRoot + @"edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This option shows loading and using an explicit tokenizer
            sent2 = sent2.ToLower();   // ToLower returns a new string, so the result must be reassigned
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new java.io.StringReader(sent2);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();
            //Console.WriteLine("\n{0}\n", tdl);


            // Extract collapsed dependencies from parsed tree

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);



            ArrayList dep = gs.typedDependenciesCollapsed() as ArrayList;

            foreach (TypedDependency td in dep)
            {
                for (int i = 0; i < keyword.Length; i++)
                {
                    if (td.dep().originalText().Equals(keyword[i]))
                    {
                        keyFlag = true;
                        key     = keyword[i];
                        break;
                    }
                }
                if (keyFlag)
                {
                    break;
                }
            }

            keyFlag = false;


            switch (key)
            {
            case "circle":

                Circle circle = new Circle();
                shape     = circle.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "rectangle":

                Rectangle rect = new Rectangle();
                shape     = rect.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "triangle":

                Triangle tri = new Triangle();
                shape     = tri.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "square":

                Square square = new Square();
                shape     = square.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            default:

                break;
            } //End of Switch

            dependency = tdl.ToString();
        } //End of SentenceParser
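A hypothetical call for illustration (the hosting class name, the keyword array, and the shape classes are application-specific and only assumed here): the method lower-cases the sentence, parses it, scans the collapsed dependencies for the first shape keyword, and then associates the remaining dependents with that shape's properties.

        // Illustrative usage, assuming `keyword` contains entries such as "circle" and "rectangle".
        var commandParser = new ShapeCommandParser();     // hypothetical class hosting SentenceParser
        commandParser.SentenceParser("Draw a red circle with a radius of five");
        // Afterwards the `shape`, `propsUsed`, and `dependency` fields describe the recognized command.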