Example #1
0
        // Use Stanford.NLP.Net to parse one sentence into a phrase-structure Tree.
        // NOTE(review): this reloads the PCFG model on every call, which is expensive;
        // callers that parse many sentences should cache the LexicalizedParser.
        static Tree Parse(string sent)
        {
            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new java.io.StringReader(sent);

            Tree tree;
            try
            {
                var rawWords = tokenizerFactory.getTokenizer(sentReader).tokenize();
                tree = lp.apply(rawWords);
            }
            finally
            {
                // Fix: release the Java reader even if tokenization/parsing throws.
                sentReader.close();
            }

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree);
            var tdl = gs.typedDependenciesCCprocessed();

            // Extract collapsed dependencies from parsed tree
            //var tp = new TreePrint("penn,typedDependenciesCollapsed");
            var tp = new TreePrint("penn");

            tp.printTree(tree);

            return tree;
        }
Example #2
0
        static void Main()
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var jarRoot = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-parser-full-2016-10-31\models\";
            // Fix: jarRoot already ends with '\'; the original prepended another
            // separator and produced "models\\edu" in the resulting path.
            var modelsDirectory = jarRoot + @"edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This sample shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = SentenceUtils.toCoreLabelList(sent);
            var tree = lp.apply(rawWords);
            tree.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2 = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(sent2);
            Tree tree2;
            try
            {
                var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
                tree2 = lp.apply(rawWords2);
            }
            finally
            {
                // Fix: release the reader even if tokenization/parsing throws.
                sent2Reader.close();
            }

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(tree2);
        }
        /// <summary>
        /// Builds a new
        /// <c>SemanticGraph</c>
        /// from the given
        /// <see cref="Edu.Stanford.Nlp.Trees.Tree"/>
        /// using the requested options. This single factory method replaces several
        /// near-identical factories such as
        /// <c>typedDependencies()</c>
        /// ,
        /// <c>typedDependenciesCollapsed()</c>
        /// ,
        /// <c>allTypedDependencies()</c>
        /// and
        /// <c>allTypedDependenciesCollapsed()</c>
        /// . See
        /// <see cref="Edu.Stanford.Nlp.Trees.GrammaticalStructure"/>
        /// for a fuller explanation of the boolean arguments.
        /// </summary>
        /// <param name="tree">A tree representing a phrase structure parse</param>
        /// <param name="mode">Passed through to the underlying <c>MakeFromTree</c> overload</param>
        /// <param name="includeExtras">
        /// Whether to include extra dependencies, which may
        /// result in a non-tree
        /// </param>
        /// <param name="filter">A filter to exclude certain dependencies; ignored if null</param>
        /// <param name="originalDependencies">
        /// generate original Stanford dependencies instead of new
        /// Universal Dependencies
        /// </param>
        /// <param name="includePunctuationDependencies">
        /// When true, an accept-all filter is used; otherwise punctuation is
        /// rejected by word form (original) or by POS tag (Universal)
        /// </param>
        /// <returns>A SemanticGraph</returns>
        public static SemanticGraph MakeFromTree(Tree tree, SemanticGraphFactory.Mode mode, GrammaticalStructure.Extras includeExtras, IPredicate <TypedDependency> filter, bool originalDependencies, bool includePunctuationDependencies)
        {
            GrammaticalStructure structure;

            if (!originalDependencies)
            {
                // Universal Dependencies: punctuation is filtered by POS tag.
                IPredicate <string> tagFilter;
                if (includePunctuationDependencies)
                {
                    tagFilter = Filters.AcceptFilter();
                }
                else
                {
                    tagFilter = new PennTreebankLanguagePack().PunctuationTagRejectFilter();
                }
                structure = new UniversalEnglishGrammaticalStructure(tree, tagFilter, new UniversalSemanticHeadFinder(true));
            }
            else
            {
                // Original Stanford Dependencies: punctuation is filtered by word form.
                IPredicate <string> wordFilter;
                if (includePunctuationDependencies)
                {
                    wordFilter = Filters.AcceptFilter();
                }
                else
                {
                    wordFilter = new PennTreebankLanguagePack().PunctuationWordRejectFilter();
                }
                structure = new EnglishGrammaticalStructure(tree, wordFilter, new SemanticHeadFinder(true));
            }
            return MakeFromTree(structure, mode, includeExtras, filter);
        }
        public void ParseEasySentence()
        {
            // First: parse a list of already-tokenized words.
            var tokens = new[] { "This", "is", "an", "easy", "sentence", "." };
            var labels = SentenceUtils.toCoreLabelList(tokens);
            var parse  = _lp.apply(labels);

            Assert.NotNull(parse);
            parse.pennPrint();

            // Second: parse raw text through an explicit PTB tokenizer.
            var text             = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            using var reader = new StringReader(text);
            var rawTokens = tokenizerFactory.getTokenizer(reader).tokenize();

            parse = _lp.apply(rawTokens);
            Assert.NotNull(parse);

            // Derive CC-processed typed dependencies from the second parse.
            var languagePack = new PennTreebankLanguagePack();
            var factory      = languagePack.grammaticalStructureFactory();
            var structure    = factory.newGrammaticalStructure(parse);
            var dependencies = structure.typedDependenciesCCprocessed();

            TestContext.Out.WriteLine($"\n{dependencies}\n");

            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            Assert.NotNull(printer);
            printer.printTree(parse);
        }
Example #5
0
        // Demonstrates the parser API: parse pre-tokenized words, then raw text
        // via an explicit tokenizer, and print CC-processed/collapsed dependencies.
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };

            java.util.List rawWords = Sentence.toCoreLabelList(sent);
            Tree           parse    = lp.apply(rawWords);

            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            const string     Sent2            = "This is another sentence.";
            TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(Sent2);

            java.util.List rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            // Fix: close the reader after tokenizing (the original leaked it;
            // the sibling examples in this file all close theirs).
            sent2Reader.close();
            parse = lp.apply(rawWords2);

            var tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            GrammaticalStructure        gs  = gsf.newGrammaticalStructure(parse);

            java.util.List tdl = gs.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", tdl);

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(parse);
        }
        // Parses a sentence with the cached _sdpModel and returns its CC-processed
        // typed dependencies, one per line (each line prefixed with '\n').
        public string parse(string sentence)
        {
            // This option shows loading and using an explicit tokenizer
            var sent2 = sentence;
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            // Fix: the original never closed the reader.
            sent2Reader.close();
            var parse = _sdpModel.apply(rawWords2);

            // Extract CC-processed typed dependencies from the parse tree.
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();

            System.Console.WriteLine();

            // Fix: build the result with StringBuilder instead of O(n^2)
            // string concatenation inside the loop.
            var result = new System.Text.StringBuilder();
            for (var it = tdl.iterator(); it.hasNext(); )
            {
                result.Append('\n').Append(it.next());
            }
            return result.ToString();
        }
        // Loads the English PCFG model from disk, parses the given text, and
        // prints its CC-processed and collapsed typed dependencies.
        public void ProcessText(string inputText)
        {
            // Location of the extracted Stanford parser models on disk.
            var jarRoot         = "C:\\stanford-parser-full-2016-10-31\\stanford-parser-3.7.0-models";
            var modelsDirectory = jarRoot + "\\edu\\stanford\\nlp\\models";

            // Load the English PCFG grammar from the models directory.
            var parser = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            // Tokenize the input with an explicit PTB tokenizer.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(inputText);
            var tokens  = factory.getTokenizer(reader).tokenize();

            reader.close();
            var parseTree = parser.apply(tokens);

            // Derive CC-processed typed dependencies from the lexical tree.
            var languagePack = new PennTreebankLanguagePack();
            var gsFactory    = languagePack.grammaticalStructureFactory();
            var structure    = gsFactory.newGrammaticalStructure(parseTree);
            var dependencies = structure.typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", dependencies);

            // Print the tree along with collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.printTree(parseTree);
        }
Example #8
0
        /// <summary>
        /// Checks a list of sentences for correct grammar. Returns a new list of the sentences with correct grammar.
        /// </summary>
        /// <param name="sentences">A list of strings that will have their grammar checked.</param>
        /// <returns>A new list of the sentences with correct grammar, or null when the input set is empty.</returns>
        public static HashSet <string> CheckGrammer(HashSet <string> sentences)
        {
            var answers = new HashSet <string>();

            Console.WriteLine(sentences.Count + " potential sentences\n");
            // NOTE: empty input yields null (not an empty set); callers rely on this.
            if (sentences.Count == 0)
            {
                return null;
            }

            Console.WriteLine("Loading Stanford NLP...");
            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel("..\\..\\..\\packages\\englishPCFG.ser.gz");

            Console.WriteLine("Stanford Parser Loaded!\n");

            // Test the grammar of each candidate sentence.
            foreach (var candidate in sentences)
            {
                // Parse the candidate into a phrase-structure tree.
                var reader    = new StringReader(candidate);
                var parseTree = lp.apply(PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(reader).tokenize());
                reader.close();

                // Classify by top-level clause label: declarative gets ".",
                // inverted/question forms get "?".
                var    treeText  = parseTree.ToString();
                string completed = null;

                if (treeText.Contains("(S "))
                {
                    completed = candidate + ".";
                }
                else if (treeText.Contains("(SINV ") || treeText.Contains("(SBARQ ") || treeText.Contains("(SQ "))
                {
                    completed = candidate + "?";
                }

                // Accept only sentences whose dependencies contain a nominal subject.
                if (completed != null)
                {
                    var languagePack = new PennTreebankLanguagePack();
                    var relations    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(parseTree).typedDependenciesCCprocessed().ToString();

                    if (relations.Contains("nsubj("))
                    {
                        answers.Add(completed);
                    }
                }
            }
            return answers;
        }
        public virtual void TestBasicCategory()
        {
            ITreebankLanguagePack tlp = new PennTreebankLanguagePack();

            // Pairs of (input label, expected basic category).
            string[][] cases =
            {
                new[] { "NP-SBJ-R",   "NP" },
                new[] { "-",          "-" },
                new[] { "-LRB-",      "-LRB-" },
                new[] { "--PU",       "-" },
                new[] { "--PU-U",     "-" },
                new[] { "-LRB--PU",   "-LRB-" },
                new[] { "-LRB--PU-U", "-LRB-" },
            };

            foreach (var pair in cases)
            {
                NUnit.Framework.Assert.AreEqual(pair[1], tlp.BasicCategory(pair[0]));
            }
        }
Example #10
0
        /// <param name="args"/>
        public static void Main(string[] args)
        {
            ITreebankLanguagePack tlp = new PennTreebankLanguagePack();

            System.Console.Out.WriteLine("Start symbol: " + tlp.StartSymbol());
            string start = tlp.StartSymbol();

            System.Console.Out.WriteLine("Should be true: " + (tlp.IsStartSymbol(start)));
            string[] strs = new string[] { "-", "-LLB-", "NP-2", "NP=3", "NP-LGS", "NP-TMP=3" };
            foreach (string str in strs)
            {
                System.Console.Out.WriteLine("String: " + str + " basic: " + tlp.BasicCategory(str) + " basicAndFunc: " + tlp.CategoryAndFunction(str));
            }
        }
Example #11
0
        // Loads the PCFG model and wires up the tokenizer and grammatical
        // structure factory used by the service.
        public NlpService()
        {
            const string parserFileOrUrl = "englishPCFG.ser.gz";

            _lp = LexicalizedParser.loadModel(parserFileOrUrl);
            if (_lp is null)
            {
                throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
            }

            _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            _tlp              = new PennTreebankLanguagePack();
            _structureFactory = _tlp.grammaticalStructureFactory();
        }
        // Tokenizes and parses one sentence with the shared singleton model and
        // converts its typed dependencies into DependencyRelationship objects.
        public static List <DependencyRelationship> ParseDepencyRelationshipsInSentence(string sentence)
        {
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(sentence);
            var tokens  = factory.getTokenizer(reader).tokenize();

            reader.close();
            var parseTree = LoadedLexicalizedParserModel.Instance.apply(tokens);

            // Extract (non-CC-processed) typed dependencies from the lexical tree.
            var languagePack = new PennTreebankLanguagePack();
            var dependencies = languagePack.grammaticalStructureFactory()
                                           .newGrammaticalStructure(parseTree)
                                           .typedDependencies();

            return ParseJavaDependecyRelationships(dependencies);
        }
        // Computes typed dependencies for the given parse; best-effort — on any
        // failure it logs to the console and returns an empty sequence.
        public static IEnumerable <TypedDependency> ComputeDependencies(Parse parse)
        {
            var languagePack = new PennTreebankLanguagePack();
            var factory      = languagePack.GrammaticalStructureFactory();
            var wrappedTree  = new ParseTree(parse);

            try
            {
                return factory.NewGrammaticalStructure(wrappedTree).TypedDependencies();
            }
            catch (Exception)
            {
                // Deliberate best-effort fallback rather than propagating the error.
                Console.WriteLine("Exception when computing deps for {0}", parse);
                return new List <TypedDependency>();
            }
        }
        // Sentence-segments and tokenizes a file with DocumentPreprocessor, then
        // parses each sentence and prints its CC-processed typed dependencies.
        // (An explicit tokenizer could also be created and passed to the
        // DocumentPreprocessor instead of relying on its default.)
        public static void DemoDP(LexicalizedParser lp, string fileName)
        {
            var languagePack = new PennTreebankLanguagePack();
            var factory      = languagePack.grammaticalStructureFactory();

            foreach (List sentence in new DocumentPreprocessor(fileName))
            {
                var parseTree = lp.apply(sentence);
                parseTree.pennPrint();

                var dependencies = factory.newGrammaticalStructure(parseTree).typedDependenciesCCprocessed(true);
                System.Console.WriteLine("\n{0}\n", dependencies);
            }
        }
Example #15
0
        // Loads, sentence-segments and tokenizes a file via DocumentPreprocessor,
        // parsing each sentence and printing its CC-processed dependencies.
        public static void DemoDP(LexicalizedParser lp, string fileName)
        {
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();

            // An explicit tokenizer could also be created here and handed to
            // the DocumentPreprocessor.
            foreach (List sentence in new DocumentPreprocessor(fileName))
            {
                var parse = lp.apply(sentence);
                parse.pennPrint();

                var gs  = gsf.newGrammaticalStructure(parse);
                var tdl = gs.typedDependenciesCCprocessed(true);
                Console.WriteLine("\n{0}\n", tdl);
            }
        }
        // Loads a parser (caller-supplied model or configured default), gathers
        // sentences from a file or from built-in samples, then parses each one
        // and prints the tagged yield and CC-processed dependencies.
        public static void Start(string model, string fileName)
        {
            var grammar = String.IsNullOrEmpty(model) ? Program.ParserModel : model;
            var options = new[] { "-maxLength", "80", "-retainTmpSubcategories" };
            var parser  = LexicalizedParser.loadModel(grammar, options);
            var tlp     = new PennTreebankLanguagePack();
            var gsf     = tlp.grammaticalStructureFactory();

            var sentences = new List<ArrayList>();
            if (string.IsNullOrEmpty(fileName))
            {
                // No file given: use one pre-tokenized sample plus one tokenized string.
                var words    = new[] { "This", "is", "an", "easy", "sentence", "." };
                var wordList = new ArrayList();
                foreach (var w in words)
                {
                    wordList.Add(new Word(w));
                }
                sentences.Add(wordList);

                const string Sent2 = "This is a slightly longer and more complex sentence requiring tokenization.";
                var tokenizer = tlp.getTokenizerFactory().getTokenizer(new StringReader(Sent2));
                sentences.Add((ArrayList)tokenizer.tokenize());
            }
            else
            {
                sentences.AddRange(new DocumentPreprocessor(fileName).Cast<ArrayList>());
            }

            foreach (var sentence in sentences)
            {
                var parse = parser.apply(sentence);
                parse.pennPrint();
                System.Console.WriteLine("\n{0}\n", (parse.taggedYield()));

                var gs  = gsf.newGrammaticalStructure(parse);
                var tdl = gs.typedDependenciesCCprocessed(true);
                System.Console.WriteLine("{0}\n", tdl);
            }
        }
        // Same demo flow as its sibling: resolve the grammar, collect sentences
        // (file or built-in samples), parse and print results for each.
        public static void Start(string model, string fileName)
        {
            var grammar     = String.IsNullOrEmpty(model) ? Program.ParserModel : model;
            var loadOptions = new[] { "-maxLength", "80", "-retainTmpSubcategories" };
            var parser      = LexicalizedParser.loadModel(grammar, loadOptions);
            var langPack    = new PennTreebankLanguagePack();
            var gsFactory   = langPack.grammaticalStructureFactory();

            var toParse = new List <ArrayList>();

            if (string.IsNullOrEmpty(fileName))
            {
                // Built-in samples: one pre-tokenized list, one tokenized string.
                var sample = new ArrayList();
                foreach (var word in new[] { "This", "is", "an", "easy", "sentence", "." })
                {
                    sample.Add(new Word(word));
                }
                toParse.Add(sample);

                const string Sent2 = "This is a slightly longer and more complex sentence requiring tokenization.";
                var tokenizer = langPack.getTokenizerFactory().getTokenizer(new StringReader(Sent2));
                toParse.Add((ArrayList)tokenizer.tokenize());
            }
            else
            {
                toParse.AddRange(new DocumentPreprocessor(fileName).Cast <ArrayList>());
            }

            foreach (var sentence in toParse)
            {
                var parse = parser.apply(sentence);
                parse.pennPrint();
                System.Console.WriteLine("\n{0}\n", (parse.taggedYield()));

                var structure    = gsFactory.newGrammaticalStructure(parse);
                var dependencies = structure.typedDependenciesCCprocessed(true);
                System.Console.WriteLine("{0}\n", dependencies);
            }
        }
        // Sentence-segments and tokenizes a sample file with DocumentPreprocessor,
        // parsing each sentence and printing its CC-processed dependencies.
        // (An explicit tokenizer could also be created and passed to it.)
        public void LoadSentencesFromFile()
        {
            var languagePack = new PennTreebankLanguagePack();
            var factory      = languagePack.grammaticalStructureFactory();

            var preprocessor = new DocumentPreprocessor(Files.DataFile("SampleText.txt"));

            foreach (var sentence in preprocessor.ToSeq().Cast <List>())
            {
                var parse = _lp.apply(sentence);
                Assert.NotNull(parse);
                parse.pennPrint();

                var structure    = factory.newGrammaticalStructure(parse);
                var dependencies = structure.typedDependenciesCCprocessed(true);
                TestContext.Out.WriteLine($"\n{dependencies}\n");
            }
        }
Example #19
0
        static void Main()
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var jarRoot         = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-parser-full-2015-12-09\models\";
            // Fix: jarRoot already ends with '\'; the original prepended a second
            // separator and produced "models\\edu" in the resulting path.
            var modelsDirectory = jarRoot + @"edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This sample shows parsing a list of correctly tokenized words
            var sent     = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = Sentence.toCoreLabelList(sent);
            var tree     = lp.apply(rawWords);

            tree.pennPrint();

            // This option shows loading and using an explicit tokenizer
            var sent2            = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new StringReader(sent2);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", tdl);

            // Extract collapsed dependencies from parsed tree
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);
        }
Example #20
0
        // Use Stanford.NLP.Net to parse the sentence into a phrase-structure Tree.
        Tree Parse(string sent)
        {
            // Tokenize with an explicit PTB tokenizer backed by a Java reader.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new java.io.StringReader(sent);
            var tokens  = factory.getTokenizer(reader).tokenize();

            reader.close();
            var parseTree = lp.apply(tokens);

            // Compute CC-processed typed dependencies from the lexical tree.
            var languagePack = new PennTreebankLanguagePack();
            var structure    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(parseTree);
            var dependencies = structure.typedDependenciesCCprocessed();

            // Print the tree in Penn Treebank format
            // ("penn,typedDependenciesCollapsed" would also emit collapsed deps).
            var printer = new TreePrint("penn");

            printer.printTree(parseTree);

            return parseTree;
        }
        // Demonstrates the parser API on pre-tokenized words and on raw text via
        // an explicit tokenizer, printing CC-processed/collapsed dependencies.
        public static void DemoAPI(LexicalizedParser lp)
        {
            // This option shows parsing a list of correctly tokenized words
            var sent = new[] { "This", "is", "an", "easy", "sentence", "." };
            var rawWords = Sentence.toCoreLabelList(sent);
            var parse = lp.apply(rawWords);
            parse.pennPrint();

            // This option shows loading and using an explicit tokenizer
            const string Sent2 = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader = new StringReader(Sent2);
            var rawWords2 = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
            // Fix: close the reader after tokenizing (the original leaked it;
            // the sibling examples in this file all close theirs).
            sent2Reader.close();
            parse = lp.apply(rawWords2);

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(parse);
            var tdl = gs.typedDependenciesCCprocessed();
            System.Console.WriteLine("\n{0}\n", tdl);

            var tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(parse);
        }
Example #22
0
        /// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
        /// <remarks>
        /// Lets you test out the TreeBinarizer on the command line.
        /// This main method doesn't yet handle as many flags as one would like.
        /// But it does have:
        /// <ul>
        /// <li> -tlp TreebankLanguagePack
        /// <li>-tlpp TreebankLangParserParams
        /// <li>-insideFactor
        /// <li>-markovOrder
        /// </ul>
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: flags as above, as above followed by
        /// treebankPath
        /// </param>
        public static void Main(string[] args)
        {
            ITreebankLangParserParams tlpp = null;
            // TreebankLangParserParams tlpp = new EnglishTreebankParserParams();
            // TreeReaderFactory trf = new LabeledScoredTreeReaderFactory();
            // Looks like it must build CategoryWordTagFactory!!
            ITreeReaderFactory    trf     = null;
            string                fileExt = "mrg";
            IHeadFinder           hf      = new ModCollinsHeadFinder();
            ITreebankLanguagePack tlp     = new PennTreebankLanguagePack();
            bool   insideFactor           = false;
            bool   mf               = false;
            int    mo               = 1;    // markov order (set by -markovOrder)
            bool   uwl              = false;
            bool   uat              = false;
            double sst              = 20.0;
            bool   mfs              = false;
            bool   simpleLabels     = false;
            bool   noRebinarization = false;
            int    i = 0;

            // Consume leading "-flag" arguments; the nested else-ladders of the
            // original are flattened into an equivalent else-if chain.
            while (i < args.Length && args[i].StartsWith("-"))
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && i + 1 < args.Length)
                {
                    try
                    {
                        tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                    }
                    catch (Exception e)
                    {
                        log.Info("Couldn't instantiate: " + args[i + 1]);
                        // Fix: System.Exception has no (Exception) constructor (the
                        // original `new Exception(e)` does not compile); wrap with a
                        // message and keep the cause as InnerException.
                        throw new Exception("Couldn't instantiate: " + args[i + 1], e);
                    }
                    i++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && i + 1 < args.Length)
                {
                    try
                    {
                        tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
                    }
                    catch (Exception e)
                    {
                        log.Info("Couldn't instantiate: " + args[i + 1]);
                        throw new Exception("Couldn't instantiate: " + args[i + 1], e);
                    }
                    i++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
                {
                    insideFactor = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && i + 1 < args.Length)
                {
                    i++;
                    mo = System.Convert.ToInt32(args[i]);
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
                {
                    simpleLabels = true;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
                {
                    noRebinarization = true;
                }
                else
                {
                    log.Info("Unknown option:" + args[i]);
                }
                i++;
            }
            if (i >= args.Length)
            {
                log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
                System.Environment.Exit(0);
            }
            Treebank treebank;

            // A -tlpp parameter object supplies treebank, language pack,
            // file extension and head finder all at once.
            if (tlpp != null)
            {
                treebank = tlpp.MemoryTreebank();
                tlp      = tlpp.TreebankLanguagePack();
                fileExt  = tlp.TreebankFileExtension();
                hf       = tlpp.HeadFinder();
            }
            else
            {
                treebank = new DiskTreebank(trf);
            }
            treebank.LoadPath(args[i], fileExt, true);
            ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);

            // Print each tree before and after binarization.
            foreach (Tree t in treebank)
            {
                Tree newT = tt.TransformTree(t);
                System.Console.Out.WriteLine("Original tree:");
                t.PennPrint();
                System.Console.Out.WriteLine("Binarized tree:");
                newT.PennPrint();
                System.Console.Out.WriteLine();
            }
        }
Example #23
0
        /// <summary>
        /// Tokenizes and parses a sentence, scans its collapsed dependencies for a
        /// known shape keyword, and associates that shape's properties with the
        /// dependency list. Results are stored in the instance fields
        /// (key, shape, propsUsed, dependency).
        /// </summary>
        /// <param name="sent2">The raw input sentence.</param>
        public void SentenceParser(string sent2)
        {
            var modelsDirectory = jarRoot + @"edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // BUG FIX: strings are immutable, so the original bare `sent2.ToLower();`
            // discarded its result and the sentence was never actually lower-cased,
            // breaking the keyword comparison below for capitalized input.
            sent2 = sent2.ToLower();

            // Tokenize with an explicit PTB tokenizer.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new java.io.StringReader(sent2);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract CC-processed dependencies from the lexical tree.
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();

            // Print the parse tree together with its collapsed dependencies.
            var tp = new TreePrint("penn,typedDependenciesCollapsed");
            tp.printTree(tree2);

            // Collapsed dependencies, used for keyword detection and association.
            ArrayList dep = gs.typedDependenciesCollapsed() as ArrayList;

            // Find the first dependency whose dependent matches a known shape keyword.
            foreach (TypedDependency td in dep)
            {
                for (int i = 0; i < keyword.Length; i++)
                {
                    if (td.dep().originalText().Equals(keyword[i]))
                    {
                        keyFlag = true;
                        key     = keyword[i];
                        break;
                    }
                }
                if (keyFlag)
                {
                    break;
                }
            }

            // Reset the flag for the next call; `key` keeps the detected keyword.
            keyFlag = false;

            // Associate the detected shape's properties with the dependency list.
            switch (key)
            {
            case "circle":
                shape     = new Circle().GetProps();
                propsUsed = Associator(shape, dep);
                break;

            case "rectangle":
                shape     = new Rectangle().GetProps();
                propsUsed = Associator(shape, dep);
                break;

            case "triangle":
                shape     = new Triangle().GetProps();
                propsUsed = Associator(shape, dep);
                break;

            case "square":
                shape     = new Square().GetProps();
                propsUsed = Associator(shape, dep);
                break;

            default:
                // No recognized keyword: leave shape/propsUsed untouched.
                break;
            } //End of Switch

            dependency = tdl.ToString();
        } //End of SentenceParser
Example #24
0
        /// <summary>
        /// Demo entry point: tokenizes a sample sentence, parses it with the
        /// OpenNLP English treebank parser, and prints its typed dependencies.
        /// </summary>
        private static void Main(string[] args)
        {
            // NOTE: earlier one-off experiments (console redirection, tokenizer and
            // sentence-detector training, detokenization, tokenization-issue checks)
            // were previously kept here as commented-out code; they have been
            // removed in favor of this note.

            const string sentence =
                "This is a generic bank response, which indicates simply that they are not willing to accept the transaction.";

            // Tokenize the sample sentence with the maximum-entropy tokenizer model.
            var modelsPath = currentDirectory + "../Resources/Models/";
            var tokenizer  = new EnglishMaximumEntropyTokenizer(modelsPath + "EnglishTok.nbin");
            var tokens     = tokenizer.Tokenize(sentence);

            // Parse the tokens into a treebank parse.
            var parser = new OpenNLP.Tools.Parser.EnglishTreebankParser(modelsPath, true, false);
            var parse  = parser.DoParse(tokens);

            // Wrap the OpenNLP parse in a tree and extract typed dependencies.
            var languagePack     = new PennTreebankLanguagePack();
            var structureFactory = languagePack.GrammaticalStructureFactory();
            var parseTree        = new ParseTree(parse);

            Console.WriteLine(parseTree);

            var structure = structureFactory.NewGrammaticalStructure(parseTree);

            foreach (var dependency in structure.TypedDependencies())
            {
                Console.WriteLine(dependency);
            }

            Console.WriteLine("===========");
            Console.WriteLine("OK");
            Console.ReadKey();
        }
Example #25
0
        /// <summary>
        /// Gets dependencies from a sentence.
        /// Extracts the collapsed typed dependencies from the annotation's parse
        /// tree; for nominal-subject relations an extra mirrored entry (governor
        /// and dependent roles swapped) is added as well.
        /// </summary>
        /// <param name="annotation">Sentence annotation carrying a parse tree.</param>
        /// <returns>The collected dependencies.</returns>
        private NotenizerDependencies GetDepencencies(Annotation annotation)
        {
            Tree tree = annotation.get(typeof(TreeCoreAnnotations.TreeAnnotation)) as Tree;
            TreebankLanguagePack treeBankLangPack = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gramStructFact = treeBankLangPack.grammaticalStructureFactory();
            GrammaticalStructure gramStruct = gramStructFact.newGrammaticalStructure(tree);
            java.util.Collection typedDependencies = gramStruct.typedDependenciesCollapsed();
            NotenizerDependencies dependencies = new NotenizerDependencies();

            // BUG FIX: iterate via the Java iterator instead of `as java.util.ArrayList`.
            // The cast yielded null (and a NullReferenceException in the foreach)
            // whenever the returned collection was any other Collection implementation.
            for (java.util.Iterator it = typedDependencies.iterator(); it.hasNext(); )
            {
                TypedDependency typedDependencyLoop = (TypedDependency)it.next();
                NotenizerDependency dep = new NotenizerDependency(typedDependencyLoop);
                dependencies.Add(dep);

                // Nominal subjects also get the complementary token recorded
                // (Dependent <-> Governor swapped).
                if (dep.Relation.IsNominalSubject())
                {
                    NotenizerDependency nsubjComplement = new NotenizerDependency(typedDependencyLoop);
                    nsubjComplement.TokenType = dep.TokenType == TokenType.Dependent ? TokenType.Governor : TokenType.Dependent;
                    dependencies.Add(nsubjComplement);
                }
            }

            return dependencies;
        }
Example #26
0
        // Add WordNet search paths to this as the 'object' parameter?
        /// <summary>
        /// Oxymoron: A terse paradox; the yoking of two contradictory terms.
        /// Scans each sentence's typed dependencies (amod|advmod|acomp|dobj|nsubj|prep)
        /// and uses the analyzer's WordNet search-path tree to test whether the linked
        /// word pair is contradictory; hits are merged and added to a.Figures.
        /// </summary>
        /// <param name="a">Analyzer holding the document, WordNet search paths, and result figures.</param>
        /// <param name="windowSize">Ignored; the window is always one sentence.</param>
        /// <param name="greedy">Optional bool; when true, derivational-form lookups use all word forms.</param>
        public static void FindOxymoron(Analyzer a, int? windowSize, object greedy)
        {
            int ws = windowSize ?? 1; // Not used. The window size is one sentence.
              bool greedySearch = (bool?)greedy ?? false;

              // Parses the 0-based token index out of a node label of the form "word-7".
              GetDependencyIndexDelegate GetDependencyIndex = delegate(TreeGraphNode t)
              {
            return Convert.ToInt32(Regex.Match(t.toString(), "^.*?-(\\d+)\\'*$").Result("$1")) - 1;
              };

              // Visitor run over the WordNet search-path tree: expands each node's
              // candidate word list along its relation (synonyms/antonyms/derived forms)
              // and, once an antonym step has occurred on the path, measures overlap
              // with the second word's derived forms via the shared OxymoronData.
              Action<Miscellaneous.TreeNode<Analyzer.WordNetRelation>, object> WordNetRelationVisitor =
            (Miscellaneous.TreeNode<Analyzer.WordNetRelation> n, object o) =>
              {
            if (n.IsRoot())
              return;

            var oxymoronData = (OxymoronData)o;

            // A non-zero overlap means an oxymoron was already found; stop expanding.
            if (oxymoronData.Overlap.Value != 0)
              return;

            var w1 = oxymoronData.W1;
            var derivedFormsW2 = oxymoronData.GetDerivedFormsW2();

            // Walk up toward the root to see whether an Antonym step was already taken.
            bool checkedAntonyms = false;
            var currentNode = n;
            while (!currentNode.Parent.IsRoot()) {
              currentNode = currentNode.Parent;
              if (currentNode.Value.Relation == WordNetEngine.SynSetRelation.Antonym) {
            checkedAntonyms = true;
            break;
              }
            }

            var p = n.Parent;

            // Seed candidates with w1 at depth one; otherwise reuse the parent's words.
            var candidates = new List<string> { w1 };
            if (!p.IsRoot())
              candidates = p.Value.Words;

            var relation = n.Value.Relation;

            switch(relation) {
              case WordNetEngine.SynSetRelation.SimilarTo:
            n.Value.Words = Token.FindSynonyms(candidates);
            break;

              case WordNetEngine.SynSetRelation.Antonym:
            n.Value.Words = Token.FindAntonyms(candidates);
            if (!checkedAntonyms)
              checkedAntonyms = true;
            break;

              case WordNetEngine.SynSetRelation.DerivationallyRelated:
            n.Value.Words = Token.FindDerivationalForms(candidates, Analyzer.SimilarityPrefixes, Analyzer.MostCommonSimilaritySuffixes, useAllForms: greedySearch ? true : false);
            if (checkedAntonyms) {
              // After an antonym step, also try negation prefixes glued onto w1.
              var negations = new List<string>(Analyzer.NegationPrefixes.Select(x => (string)(x.Clone()) + w1));

              n.Value.Words.AddRange(Token.FindDerivationalForms(negations, null, null, useAllForms: greedySearch ? true : false));
            }
            break;
            }

            if (!checkedAntonyms)
              n.Value.Words.AddRange(candidates);

            n.Value.Words = n.Value.Words.Distinct().ToList(); // Remove duplicates.

            if (oxymoronData.Debug) {
              Console.WriteLine("===================================================");
              Console.WriteLine("Relation: " + relation.ToString());
              //Console.WriteLine("Parent relation: " + p.Value.Relation.ToString());
              Console.WriteLine("Child count: " + n.Children.Count());
              Console.WriteLine("Node candidates:");
              if (n.IsRoot() || n.Value.Words.Count == 0) Console.WriteLine("  None");
              else {
            foreach (var w in n.Value.Words)
              Console.WriteLine("  " + w.ToString());
              }
              if (n.IsLeaf()) Console.WriteLine("LEAF NODE");
              Console.WriteLine("===================================================");
            }

            // Only paths that passed through an Antonym relation can score an overlap.
            if (checkedAntonyms)
              oxymoronData.Overlap.Value = n.Value.Words.Intersect(derivedFormsW2).Count();
              };

              // Visitor that clears every node's word list before a fresh traversal.
              Action<Miscellaneous.TreeNode<Analyzer.WordNetRelation>, object> WordNetRelationNullVisitor =
            (Miscellaneous.TreeNode<Analyzer.WordNetRelation> n, object o) =>
              {
            //Console.WriteLine(n.Value.Relation.ToString());
            n.Value.Words = null;
              };

              // Dependency relations that can link the two halves of an oxymoron.
              string dependencySymbols = @"^(amod|advmod|acomp|dobj|nsubj|prep)$";

              var allSubsequences = new List<List<Subsequence>>();

              TreebankLanguagePack tlp = new PennTreebankLanguagePack();
              GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

              for (int i = 0; i < a.Document.Sentences.Count; ++i) {
            var sentence = a.Document.Sentences[i];
            var subsequenceTokens = new List<SubsequenceToken>();
            foreach (var token in sentence.Tokens)
              subsequenceTokens.Add(new SubsequenceToken(token, sentence));
            var phrases = sentence.Phrases;
            if (phrases.Count > 0) {
              var subsequence = new Subsequence(subsequenceTokens, sentence, phrases[0].Subsequences[0].ContainingSubsequence, i);

              var tree = sentence.Tree;
              GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
              java.util.Collection tdc = gs.typedDependenciesCollapsed();

              // Collect candidate word pairs from the matching dependency relations.
              var candidates = new List<Subsequence>();
              for (java.util.Iterator j = tdc.iterator(); j.hasNext(); ) {
            var td = (TypedDependency)j.next();
            var relation = td.reln().getShortName();
            if (Regex.IsMatch(relation, dependencySymbols)) {
              var governorIndex = GetDependencyIndex(td.gov());
              var dependentIndex = GetDependencyIndex(td.dep());

              var index = Math.Min(governorIndex, dependentIndex);
              var count = Math.Abs(dependentIndex - governorIndex) + 1;
              // NOTE(review): non-short-circuit '|' below — result is correct for
              // bools, but '||' was probably intended; confirm before changing.
              var ss = relation == "prep" ? subsequence.GetRange(index, count) : subsequence.Where((n, k) => k == governorIndex | k == dependentIndex).ToList();

              // Remove any leftover punctuation from the candidate subsequences.
              ss.RemoveAll(n => Regex.IsMatch(n.Tag, Analyzer.PunctuationPatterns));

              candidates.Add(new Subsequence(ss, sentence, subsequence.ContainingSubsequence, i));
            }
              }

              // Determine whether the candidate pairs are oxymorons.
              for (int k = 0; k < candidates.Count; ++k) {
            var list = new List<Subsequence>();

            Token[] pair = { candidates[k][0], candidates[k][candidates[k].Count - 1] };

            // Clear (i.e. null) all the word lists in the WordNet search-path tree.
            a.WordNetSearchPath.Traverse(WordNetRelationNullVisitor);

            // Test the pair in both orders; overlap != 0 marks a hit.
            var overlap = new OxymoronData.IntClass(0);
            a.WordNetSearchPath.Traverse(WordNetRelationVisitor, new OxymoronData(pair, overlap, greedy: greedySearch, debug: false));
            if (overlap.Value == 0) {
              a.WordNetSearchPath.Traverse(WordNetRelationNullVisitor);
              a.WordNetSearchPath.Traverse(WordNetRelationVisitor, new OxymoronData(pair.Reverse().ToArray(), overlap, greedy: greedySearch, debug: false));
            }

            if (overlap.Value != 0) {
              list.Add(candidates[k]);
              allSubsequences.Add(list);
            }
              }
            }
              }

              // Remove duplicate instances and merge those contained in others.
              var figures = MergeFigures(allSubsequences, RhetoricalFigures.Oxymoron, multiWindow: true);

              a.Figures.AddRange(figures);
        }
Example #27
0
    /// <summary>
    /// Parses <paramref name="input"/>, extracts its CC-processed typed
    /// dependencies, and records the action/subject/target fields from the
    /// nsubj, dobj and nmod relations it finds.
    /// </summary>
    /// <param name="input">The sentence to analyse.</param>
    /// <returns>The string form of the typed-dependency list.</returns>
    public string Tags(string input)
    {
        // Path to models extracted from `stanford-parser-3.6.0-models.jar`
        var jarRoot         = @"";
        var modelsDirectory = jarRoot;

        var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

        // Tokenize the input with an explicit PTB tokenizer.
        var sent2            = input;
        var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        var sent2Reader      = new java.io.StringReader(sent2);
        var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

        sent2Reader.close();
        var tree2 = lp.apply(rawWords2);

        // Extract CC-processed dependencies from the lexical tree.
        // (The unused TreePrint local was removed — its only use was commented out.)
        var tlp = new PennTreebankLanguagePack();
        var gsf = tlp.grammaticalStructureFactory();
        var gs  = gsf.newGrammaticalStructure(tree2);
        var tdl = gs.typedDependenciesCCprocessed();

        UnityEngine.Debug.Log(tdl);

        for (int i = 0; i < tdl.size(); i++)
        {
            TypedDependency node = (TypedDependency)tdl.get(i);

            string relation = node.reln().getShortName();

            if (relation.Contains("nsubj"))
            {
                // nsubj: governor is the action verb, dependent is the subject.
                action = node.gov().value();
                UnityEngine.Debug.Log("This is the action " + action);

                subj = node.dep().value();
                UnityEngine.Debug.Log("This is the subject " + subj);
            }

            if (relation.Contains("dobj"))
            {
                // dobj: governor is the action verb, dependent is the target.
                action = node.gov().value();
                UnityEngine.Debug.Log("This is the action " + action);

                target = node.dep().value();
                UnityEngine.Debug.Log("This is the target " + target);
            }

            if (relation.Contains("nmod"))
            {
                // nmod: the nominal modifier acts as a secondary target.
                second_target = node.dep().value();
                UnityEngine.Debug.Log("This is the target second " + second_target);
            }
        }

        return tdl.ToString();
    }