/// <summary>
/// POS-tags every sentence of an input file with the model given on the command
/// line and prints the tagged sentences; then prints the adjectives of a fixed
/// demo sentence to show how to access words and tags.
/// </summary>
/// <exception cref="System.Exception"/>
        public static void Main(string[] args)
        {
            if (args.Length != 2)
            {
                log.Info("usage: java TaggerDemo2 modelFile fileToTag");
                return;
            }
            MaxentTagger tagger = new MaxentTagger(args[0]);
            ITokenizerFactory <CoreLabel> ptbTokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
            BufferedReader r  = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
            PrintWriter    pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, "utf-8"));
            try
            {
                DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);

                documentPreprocessor.SetTokenizerFactory(ptbTokenizerFactory);
                foreach (IList <IHasWord> sentence in documentPreprocessor)
                {
                    IList <TaggedWord> tSentence = tagger.TagSentence(sentence);
                    pw.Println(SentenceUtils.ListToString(tSentence, false));
                }
                // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
                IList <IHasWord>   sent       = SentenceUtils.ToWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
                IList <TaggedWord> taggedSent = tagger.TagSentence(sent);

                foreach (TaggedWord tw in taggedSent)
                {
                    if (tw.Tag().StartsWith("JJ"))
                    {
                        pw.Println(tw.Word());
                    }
                }
            }
            finally
            {
                // BUG FIX: the input reader was leaked; close both streams even on error.
                r.Close();
                pw.Close();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Parses <paramref name="sentence"/> with the English PCFG model and
        /// prints the Penn tree plus collapsed typed dependencies to stdout.
        /// </summary>
        /// <param name="sentence">Raw (untokenized) English sentence.</param>
        public static void ParseString(string sentence)
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var modelsDirectory = @"../../../data/paket-files/stanford-corenlp-3.9.1-models/edu/stanford/nlp/models";
            var model           = @"/lexparser/englishPCFG.ser.gz";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + model);

            // Tokenize the raw sentence with an explicit PTB tokenizer.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new StringReader(sentence);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();  // BUG FIX: the reader's close() call was commented out (leak)
            var tree2 = lp.apply(rawWords2);

            // Print the Penn tree and the collapsed typed dependencies extracted from it.
            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);
        }
        /// <summary>
        /// Uses Stanford.NLP.Net to parse <paramref name="sent"/>: tokenizes it,
        /// applies the lexicalized parser, prints the Penn tree and returns it.
        /// </summary>
        /// <param name="sent">Raw English sentence.</param>
        /// <returns>The parse tree produced by the parser.</returns>
        static Tree Parse(string sent)
        {
            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new java.io.StringReader(sent);
            var rawWords         = tokenizerFactory.getTokenizer(sentReader).tokenize();

            sentReader.close();
            var tree = lp.apply(rawWords);

            // CLEANUP: the typed-dependency extraction (tlp/gsf/gs/tdl) that was
            // computed here was never used by anything, so the dead work was removed.
            var tp = new TreePrint("penn");

            tp.printTree(tree);

            return(tree);
        }
        /// <summary>
        /// Parses <paramref name="inputText"/> with the English PCFG model,
        /// prints its CC-processed typed dependencies, then renders the Penn
        /// tree together with collapsed dependencies.
        /// </summary>
        /// <param name="inputText">Raw English text to parse.</param>
        public void ProcessText(string inputText)
        {
            var jarRoot         = "C:\\stanford-parser-full-2016-10-31\\stanford-parser-3.7.0-models";
            var modelsDirectory = jarRoot + "\\edu\\stanford\\nlp\\models";

            // Load the English PCFG grammar from the extracted models directory.
            var parser = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            // Tokenize the input with an explicit PTB tokenizer.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(inputText);
            var tokens  = factory.getTokenizer(reader).tokenize();

            reader.close();
            var parseTree = parser.apply(tokens);

            // Derive grammatical structure and print the CC-processed dependencies.
            var languagePack     = new PennTreebankLanguagePack();
            var structureFactory = languagePack.grammaticalStructureFactory();
            var dependencies     = structureFactory.newGrammaticalStructure(parseTree).typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", dependencies);

            // Finally render the tree with collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.printTree(parseTree);
        }
Beispiel #5
0
        /// <summary>
        /// Parse the document searching for sentences where the entity is found.
        /// Returns csv lines with the file, the entity, the sentence and the syntax analysis of the sentence.
        /// </summary>
        /// <param name="text">Document text</param>
        /// <param name="entity">Entity.</param>
        /// <param name="origFile">Original file.</param>
        /// <param name="language">Language key used to select the parser model files.</param>
        /// <returns>One string[4] per matching sentence: file, entity, sentence, parse tree.</returns>
        public static List <string[]> Parse(string text, string entity, string origFile, string language)
        {
            var results = new List <string[]>();
            // Load the parser model for the requested language.
            var modelsDirectory    = StanfordEnv.PARSER_MODELS;
            var lexparserDirectory = modelsDirectory + StanfordEnv.GetParserLanguageFiles(language);
            var lp = LexicalizedParser.loadModel(lexparserDirectory);

            string[]      splittedText = SplitText(text);
            List <string> entityLines  = GetEntitiesLines(splittedText, entity);

            // PERF: the tokenizer factory is loop-invariant; build it once instead of per sentence.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            foreach (var line in entityLines)
            {
                // Tokenize and parse the sentence containing the entity.
                var sent2Reader = new java.io.StringReader(line);
                var rawWords2   = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
                sent2Reader.close();
                var tree2 = lp.apply(rawWords2);

                results.Add(new string[] { origFile, entity, line, tree2.ToString() });
            }

            return(results);
        }
        /// <summary>
        /// Exercises both parser entry points: a pre-tokenized word list and an
        /// explicitly tokenized raw sentence; prints trees and dependencies.
        /// </summary>
        public void ParseEasySentence()
        {
            // Case 1: parse an already-tokenized word list.
            var words      = new[] { "This", "is", "an", "easy", "sentence", "." };
            var coreLabels = SentenceUtils.toCoreLabelList(words);
            var tree       = _lp.apply(coreLabels);

            Assert.NotNull(tree);
            tree.pennPrint();

            // Case 2: tokenize a raw string with an explicit tokenizer, then parse.
            var rawSentence = "This is another sentence.";
            var factory     = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            using var reader = new StringReader(rawSentence);
            var tokens = factory.getTokenizer(reader).tokenize();

            tree = _lp.apply(tokens);
            Assert.NotNull(tree);

            // Derive and print the CC-processed typed dependencies.
            var languagePack = new PennTreebankLanguagePack();
            var structure    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(tree);
            var dependencies = structure.typedDependenciesCCprocessed();

            TestContext.Out.WriteLine($"\n{dependencies}\n");

            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            Assert.NotNull(printer);
            printer.printTree(tree);
        }
Beispiel #7
0
        /// <summary>
        /// Demonstrates calling the parser with pre-tokenized words and with an
        /// explicit tokenizer, printing the trees and typed dependencies.
        /// </summary>
        /// <param name="lp">A loaded lexicalized parser.</param>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Parse a list of correctly tokenized words.
            var easyWords = new[] { "This", "is", "an", "easy", "sentence", "." };

            java.util.List labels = Sentence.toCoreLabelList(easyWords);
            Tree           tree   = lp.apply(labels);

            tree.pennPrint();

            // Parse raw text via an explicit tokenizer.
            const string     Sent2   = "This is another sentence.";
            TokenizerFactory factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader = new StringReader(Sent2);

            java.util.List tokens = factory.getTokenizer(reader).tokenize();
            tree = lp.apply(tokens);

            // Derive grammatical structure and print the CC-processed dependencies.
            var tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsFactory = tlp.grammaticalStructureFactory();
            GrammaticalStructure        structure = gsFactory.newGrammaticalStructure(tree);

            java.util.List dependencies = structure.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", dependencies);

            // Render the tree with collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.printTree(tree);
        }
Beispiel #8
0
        /// <summary>
        /// DemoAPI demonstrates other ways of calling the parser with already
        /// tokenized text, or in some cases, raw text that needs to be
        /// tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// Output is handled with a TreePrint object; the options used when
        /// creating the TreePrint determine what results get printed.  One can
        /// capture the output by passing a PrintWriter to TreePrint.PrintTree.
        /// This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Parse a list of correctly tokenized words.
            string[]          easySent   = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> easyLabels = SentenceUtils.ToCoreLabelList(easySent);
            Tree tree = lp.Apply(easyLabels);

            tree.PennPrint();
            System.Console.Out.WriteLine();

            // Parse raw text through an explicit tokenizer.
            string rawSentence = "This is another sentence.";
            ITokenizerFactory <CoreLabel> factory   = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tokenizer = factory.GetTokenizer(new StringReader(rawSentence));
            IList <CoreLabel>             tokens    = tokenizer.Tokenize();

            tree = lp.Apply(tokens);

            // PennTreebankLanguagePack for English.
            ITreebankLanguagePack        tlp       = lp.TreebankLanguagePack();
            IGrammaticalStructureFactory gsFactory = tlp.GrammaticalStructureFactory();
            GrammaticalStructure         structure = gsFactory.NewGrammaticalStructure(tree);
            IList <TypedDependency>      deps      = structure.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(deps);
            System.Console.Out.WriteLine();

            // A TreePrint object can also be used to print trees and dependencies.
            TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.PrintTree(tree);
        }
        /// <summary>
        /// Builds a MUC mention extractor: slurps the corpus file named by the
        /// <c>Constants.MucProp</c> property into <c>fileContents</c> and
        /// prepares the tokenizer factory and Stanford pipeline used later.
        /// </summary>
        /// <param name="dict">Coreference dictionaries, forwarded to the base extractor.</param>
        /// <param name="props">Properties; must supply the MUC corpus file name.</param>
        /// <param name="semantics">Semantics, forwarded to the base extractor.</param>
        /// <exception cref="System.Exception"/>
        public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics)
            : base(dict, semantics)
        {
            string fileName = props.GetProperty(Constants.MucProp);

            fileContents      = IOUtils.SlurpFile(fileName);
            // NOTE(review): currentOffset presumably tracks a read position into
            // fileContents — confirm against the methods that advance it.
            currentOffset     = 0;
            // CoreLabelTokenFactory(false): non-invertible labels; no extra tokenizer options.
            tokenizerFactory  = PTBTokenizer.Factory(new CoreLabelTokenFactory(false), string.Empty);
            stanfordProcessor = LoadStanfordProcessor(props);
        }
Beispiel #10
0
        /// <summary>
        /// Tokenizes <paramref name="text"/> with a PTB tokenizer and prints one
        /// line per token: the token text and its begin/end character offsets.
        /// </summary>
        /// <param name="text">Raw text to tokenize.</param>
        public static void TokenizeText(string text)
        {
            var          reader = new StringReader(text);
            PTBTokenizer ptbt   = new PTBTokenizer(reader, new CoreLabelTokenFactory(), "");

            while (ptbt.hasNext())
            {
                CoreLabel label = (CoreLabel)ptbt.next();
                // IDIOM: interpolation instead of String.Format (identical output).
                Console.WriteLine($"{label.value()}\t| BEGIN_OFFSET: {label.beginPosition()}\t| END_OFFSET: {label.endPosition()}");
            }
            reader.close();  // BUG FIX: the reader was never closed
        }
Beispiel #11
0
        /// <summary>
        /// Checks a list of sentences for correct grammar. Returns a new list of the sentences with correct grammar.
        /// </summary>
        /// <param name="sentences">A list of strings that will have their grammar checked.</param>
        /// <returns>
        /// A new set of the grammatical sentences with punctuation appended,
        /// or null when the input set is empty (existing caller contract).
        /// </returns>
        public static HashSet <string> CheckGrammer(HashSet <string> sentences)
        {
            HashSet <string> answers = new HashSet <string>();

            Console.WriteLine(sentences.Count + " potential sentences\n");
            if (sentences.Count == 0)
            {
                // NOTE: callers rely on null (not an empty set) meaning "nothing to check".
                return(null);
            }

            Console.WriteLine("Loading Stanford NLP...");
            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel("..\\..\\..\\packages\\englishPCFG.ser.gz");

            Console.WriteLine("Stanford Parser Loaded!\n");

            // PERF: the tokenizer factory and language pack are loop-invariant;
            // they were previously re-created for every sentence.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var tlp = new PennTreebankLanguagePack();

            //Test the grammar of each potential sentence that has all english words
            foreach (var curSentence in sentences)
            {
                //Convert the sentence to a tree that Stanford NLP library generates from its parsing
                var tempSentence = curSentence;
                var sentReader   = new StringReader(tempSentence);
                var tree         = lp.apply(tokenizerFactory.getTokenizer(sentReader).tokenize());
                sentReader.close();

                // Classify by the top clause label in the rendered tree:
                // declarative (S) gets a period, question forms get a question mark.
                string strTree    = tree.ToString();
                bool   isSentence = false;

                if (strTree.Contains("(S "))
                {
                    tempSentence = curSentence + ".";
                    isSentence   = true;
                }
                else if (strTree.Contains("(SINV ") || strTree.Contains("(SBARQ ") || strTree.Contains("(SQ "))
                {
                    tempSentence = curSentence + "?";
                    isSentence   = true;
                }

                if (isSentence)
                {
                    // Keep only sentences whose typed dependencies contain a nominal subject.
                    string strRel = tlp.grammaticalStructureFactory().newGrammaticalStructure(tree).typedDependenciesCCprocessed().ToString();

                    if (strRel.Contains("nsubj("))
                    {
                        answers.Add(tempSentence);
                    }
                }
            }
            return(answers);
        }
Beispiel #12
0
        /// <summary>
        /// Loads the English PCFG model and initializes the tokenizer factory,
        /// language pack and grammatical-structure factory used by this service.
        /// </summary>
        /// <exception cref="InvalidOperationException">The parser model could not be loaded.</exception>
        public NlpService()
        {
            const string parserFileOrUrl = "englishPCFG.ser.gz";

            _lp = LexicalizedParser.loadModel(parserFileOrUrl);
            if (_lp == null)
            {
                throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
            }

            _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            _tlp              = new PennTreebankLanguagePack();
            _structureFactory = _tlp.grammaticalStructureFactory();
        }
Beispiel #13
0
        /// <summary>
        /// POS-tags every sentence of the UTF-8 file at <paramref name="fileName"/>
        /// and prints each tagged sentence to the console.
        /// </summary>
        /// <param name="fileName">Path of the text file to tag.</param>
        public static void Execute(string fileName)
        {
            var tagger = new MaxentTagger(TaggerDemo.Model);
            var ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep");
            var r = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "utf-8"));
            try
            {
                var documentPreprocessor = new DocumentPreprocessor(r);

                documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
                foreach (List sentence in documentPreprocessor)
                {
                    var tSentence = tagger.tagSentence(sentence);
                    System.Console.WriteLine(Sentence.listToString(tSentence, false));
                }
            }
            finally
            {
                r.close();  // BUG FIX: the file reader was never closed (handle leak)
            }
        }
        /// <summary>
        /// Tokenizes <paramref name="input"/> with a PTB tokenizer configured to
        /// keep parentheses/brackets literal and normalize quotes.
        /// </summary>
        /// <param name="input">Raw text to tokenize.</param>
        /// <returns>The tokens as strings, or null when tokenization fails (best-effort contract).</returns>
        public string[] Tokenize(string input)
        {
            try
            {
                PTBTokenizer ptb = new PTBTokenizer(new StringReader(input), new WordTokenFactory(), "normalizeParentheses=false,normalizeOtherBrackets=false,asciiQuotes=true,unicodeQuotes=true,untokenizable=noneDelete");
                object[] tokens = ptb.tokenize().toArray();

                return tokens.Select(t => t.ToString()).ToArray();
            }
            catch (Exception)  // CLEANUP: exception variable was captured but unused
            {
                // Deliberate best-effort: callers treat null as "could not tokenize".
                // NOTE(review): consider logging here instead of swallowing silently.
                return null;
            }
        }
Beispiel #15
0
        /// <summary>
        /// Tokenizes the article body with a PTB tokenizer, storing each token
        /// with its 1-based position in tokenizedArticle and echoing it.
        /// </summary>
        static void tokenize()
        {
            tokenizedArticle = new List<Token>();
            int position = 0;

            var tokenizer = new PTBTokenizer(new StringReader(article.Body), new CoreLabelTokenFactory(), "");

            while (tokenizer.hasNext())
            {
                position++;
                var label = (CoreLabel)tokenizer.next();
                tokenizedArticle.Add(new Token(label.toString(), position));

                System.Console.WriteLine(label);
            }
        }
Beispiel #16
0
        /// <summary>
        /// Tokenizes <paramref name="s"/> and returns the tokens upper-cased;
        /// untokenizable characters are silently dropped.
        /// </summary>
        /// <param name="s">Raw text to tokenize.</param>
        /// <returns>Upper-cased token strings in input order.</returns>
        public static List <string> Tokenize(string s)
        {
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

            tokenizerFactory.setOptions("untokenizable=noneDelete");
            var reader = new jio.StringReader(s);
            var words  = tokenizerFactory.getTokenizer(reader).tokenize();
            reader.close();  // BUG FIX: the reader was never closed

            var tokens = new List <string>();

            for (int i = 0; i < words.size(); i++)
            {
                ling.CoreLabel word = (ling.CoreLabel)words.get(i);
                tokens.Add(word.toString().ToUpper());
            }
            return(tokens);
        }
        /// <summary>
        /// Tokenizes and parses <paramref name="sentence"/> with the shared
        /// lexicalized parser model, then converts its typed dependencies into
        /// DependencyRelationship objects.
        /// </summary>
        /// <param name="sentence">Raw English sentence.</param>
        public static List <DependencyRelationship> ParseDepencyRelationshipsInSentence(string sentence)
        {
            // Tokenize the raw sentence with a PTB tokenizer.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(sentence);
            var tokens  = factory.getTokenizer(reader).tokenize();

            reader.close();

            // Parse with the singleton parser model.
            var tree = LoadedLexicalizedParserModel.Instance.apply(tokens);

            // Derive (uncollapsed) typed dependencies from the lexical tree.
            var languagePack = new PennTreebankLanguagePack();
            var structure    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(tree);
            var deps         = structure.typedDependencies();

            return(ParseJavaDependecyRelationships(deps));
        }
Beispiel #18
0
        /// <summary>
        /// Parses <paramref name="sentence"/> and returns the labels of the leaf
        /// nodes whose parent tag is listed in nounLabels (i.e. the nouns).
        /// </summary>
        /// <param name="sentence">Raw English sentence.</param>
        /// <returns>Labels of the noun leaves, in tree order.</returns>
        public static List <string> ExtractNounsFromSemantics(string sentence)
        {
            // Locate the models directory relative to the executing assembly's
            // code base (three directory levels up, then a fixed subpath).
            string assemblyPath    = Assembly.GetExecutingAssembly().GetName().CodeBase;
            string projectPath     = Directory.GetParent(new Uri(Path.GetDirectoryName(Path.GetDirectoryName(Path.GetDirectoryName(assemblyPath)))).LocalPath).FullName;
            string modelsDirectory = Path.GetFullPath(projectPath + @"\Parser\CoreNLP-3.9.1-Models\edu\stanford\nlp\models");

            // Loading english PCFG parser from file
            LexicalizedParser lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This shows loading and using an explicit tokenizer
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new java.io.StringReader(sentence);
            var rawWords         = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree = lp.apply(rawWords);

            // Keep leaves whose parent node (the POS tag) is a noun label.
            // NOTE(review): nounLabels is declared elsewhere — presumably the set
            // of noun POS tags (NN, NNS, NNP, NNPS); confirm at its declaration.
            return(tree.toArray().Cast <LabeledScoredTreeNode>().Where(n => n.isLeaf() && nounLabels.Contains(n.parent(tree).label().value())).Select(n => n.label().ToString()).ToList());
        }
Beispiel #19
0
        /// <summary>
        /// Tokenizes sentence pairs from the MSR paraphrase training file
        /// (tab-separated; sentences in columns 3 and 4) and writes the two
        /// token streams to separate .token files, one sentence per line.
        /// </summary>
        static void Main(string[] args)
        {
            using (TextReader reader = System.IO.File.OpenText("C:\\Data\\msr_paraphrase_train.txt"))
            // ROBUSTNESS: writers are now disposed even when an exception occurs.
            using (TextWriter writer1 = System.IO.File.CreateText("C:\\Data\\msr_paraphrase_train_s1.token"))
            using (TextWriter writer2 = System.IO.File.CreateText("C:\\Data\\msr_paraphrase_train_s2.token"))
            {
                string[] inputdata = reader.ReadToEnd().Split('\n');

                // PERF: the tokenizer factory is loop-invariant; build it once.
                var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

                foreach (string line in inputdata)
                {
                    // BUG FIX: strip Windows '\r' line endings and skip lines with
                    // fewer than 5 fields (the empty line after the trailing '\n'
                    // previously crashed on sp[3] with IndexOutOfRangeException).
                    string[] sp = line.TrimEnd('\r').Split('\t');
                    if (sp.Length < 5)
                    {
                        continue;
                    }

                    var            sent2Reader1 = new java.io.StringReader(sp[3]);
                    java.util.List rawWords1    = tokenizerFactory.getTokenizer(sent2Reader1).tokenize();
                    sent2Reader1.close();
                    var            sent2Reader2 = new java.io.StringReader(sp[4]);
                    java.util.List rawWords2    = tokenizerFactory.getTokenizer(sent2Reader2).tokenize();
                    sent2Reader2.close();

                    for (int i = 0; i < rawWords1.size(); ++i)
                    {
                        writer1.Write(rawWords1.get(i) + " ");
                    }
                    writer1.Write('\n');

                    for (int i = 0; i < rawWords2.size(); ++i)
                    {
                        writer2.Write(rawWords2.get(i) + " ");
                    }
                    writer2.Write('\n');
                }
            }
            System.Console.ReadKey();
        }
Beispiel #20
0
        /// <summary>
        /// Demo entry point: parses one pre-tokenized sentence and one raw
        /// sentence, printing parse trees and typed dependencies.
        /// </summary>
        static void Main()
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var jarRoot         = @"..\..\..\..\paket-files\nlp.stanford.edu\stanford-parser-full-2015-12-09\models\";
            var modelsDirectory = jarRoot + @"\edu\stanford\nlp\models";

            // Load the English PCFG grammar.
            var parser = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // Case 1: parse a list of correctly tokenized words.
            var easyWords  = new[] { "This", "is", "an", "easy", "sentence", "." };
            var easyLabels = Sentence.toCoreLabelList(easyWords);
            var easyTree   = parser.apply(easyLabels);

            easyTree.pennPrint();

            // Case 2: tokenize a raw sentence with an explicit tokenizer, then parse.
            var rawSentence = "This is another sentence.";
            var factory     = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader      = new StringReader(rawSentence);
            var tokens      = factory.getTokenizer(reader).tokenize();

            reader.close();
            var rawTree = parser.apply(tokens);

            // Derive and print the CC-processed typed dependencies.
            var languagePack = new PennTreebankLanguagePack();
            var structure    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(rawTree);
            var deps         = structure.typedDependenciesCCprocessed();

            Console.WriteLine("\n{0}\n", deps);

            // Render the tree together with collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.printTree(rawTree);
        }
Beispiel #21
0
        /// <summary>
        /// Uses Stanford.NLP.Net to parse <paramref name="sent"/>: tokenizes it,
        /// applies the lexicalized parser, prints the Penn tree and returns it.
        /// </summary>
        /// <param name="sent">Raw English sentence.</param>
        /// <returns>The parse tree produced by the parser.</returns>
        Tree Parse(string sent)
        {
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentReader       = new java.io.StringReader(sent);
            var rawWords         = tokenizerFactory.getTokenizer(sentReader).tokenize();

            sentReader.close();
            var tree = lp.apply(rawWords);

            // CLEANUP: the typed-dependency extraction (tlp/gsf/gs/tdl) that was
            // computed here was never used by anything, so the dead work was removed.
            var tp = new TreePrint("penn");

            tp.printTree(tree);

            return(tree);
        }
            /// <summary>
            /// Tokenizes a raw document and converts it into annotated CoreLabels.
            /// SGML-like markup tokens (&lt;p&gt;, &lt;s&gt;, entity begin/end tags, &lt;doc&gt;,
            /// section tags) update paragraph/sentence/word counters and the current
            /// entity/section state and are folded into the Before/After annotations
            /// of neighbouring word tokens; ordinary tokens become CoreLabels carrying
            /// position, section and entity annotations.
            /// </summary>
            /// <param name="doc">Raw document text; null yields null.</param>
            /// <returns>The annotated word tokens (markup tokens excluded).</returns>
            public virtual IList <CoreLabel> Apply(string doc)
            {
                if (doc == null)
                {
                    return(null);
                }
                string section                  = string.Empty;
                string entity                   = "O";
                string entityClass              = string.Empty;
                int    pNum                     = 0;
                int    sNum                     = 0;
                int    wNum                     = 0;
                PTBTokenizer <CoreLabel> ptb    = PTBTokenizer.NewPTBTokenizer(new BufferedReader(new StringReader(doc)), false, true);
                IList <CoreLabel>        words  = ptb.Tokenize();
                IList <CoreLabel>        result = new List <CoreLabel>();
                CoreLabel prev                  = null;
                string    prevString            = string.Empty;

                foreach (CoreLabel word in words)
                {
                    Matcher matcher = sgml.Matcher(word.Word());
                    if (matcher.Matches())
                    {
                        // Markup token: update state and fold its text into the
                        // surrounding tokens' Before/After annotations.
                        string tag = matcher.Group(1);
                        if (Sharpen.Runtime.EqualsIgnoreCase(word.Word(), "<p>"))
                        {
                            // New paragraph: reset sentence and word counters.
                            pNum++;
                            sNum = 0;
                            wNum = 0;
                            if (prev != null)
                            {
                                string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                s += word.OriginalText() + word.After();
                                prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                            }
                            prevString += word.Before() + word.OriginalText();
                        }
                        else
                        {
                            if (Sharpen.Runtime.EqualsIgnoreCase(word.Word(), "<s>"))
                            {
                                // New sentence: reset the word counter.
                                sNum++;
                                wNum = 0;
                                if (prev != null)
                                {
                                    string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                    s += word.OriginalText() + word.After();
                                    prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                                }
                                prevString += word.Before() + word.OriginalText();
                            }
                            else
                            {
                                matcher = beginEntity.Matcher(word.Word());
                                if (matcher.Matches())
                                {
                                    // Entity start: remember class/name; the tag text
                                    // itself is NOT kept in Before/After.
                                    entityClass = matcher.Group(1);
                                    entity      = matcher.Group(2);
                                    if (prev != null)
                                    {
                                        string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                        s += word.After();
                                        prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                                    }
                                    prevString += word.Before();
                                }
                                else
                                {
                                    matcher = endEntity.Matcher(word.Word());
                                    if (matcher.Matches())
                                    {
                                        // Entity end: back to the "outside" state.
                                        entityClass = string.Empty;
                                        entity      = "O";
                                        if (prev != null)
                                        {
                                            string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                            s += word.After();
                                            prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                                        }
                                        prevString += word.Before();
                                    }
                                    else
                                    {
                                        if (Sharpen.Runtime.EqualsIgnoreCase(word.Word(), "<doc>"))
                                        {
                                            prevString += word.Before() + word.OriginalText();
                                        }
                                        else
                                        {
                                            if (Sharpen.Runtime.EqualsIgnoreCase(word.Word(), "</doc>"))
                                            {
                                                // BUG FIX: guard against prev == null — a "</doc>"
                                                // before any word token previously threw a
                                                // NullReferenceException; every sibling branch
                                                // already performs this check.
                                                if (prev != null)
                                                {
                                                    string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                                    s += word.OriginalText();
                                                    prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                                                }
                                            }
                                            else
                                            {
                                                // Any other markup is treated as a section tag.
                                                section = tag.ToUpper();
                                                if (prev != null)
                                                {
                                                    string s = prev.Get(typeof(CoreAnnotations.AfterAnnotation));
                                                    s += word.OriginalText() + word.After();
                                                    prev.Set(typeof(CoreAnnotations.AfterAnnotation), s);
                                                }
                                                prevString += word.Before() + word.OriginalText();
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        // Ordinary token: emit a CoreLabel carrying the accumulated
                        // "before" text and the current position/section/entity state.
                        CoreLabel wi = new CoreLabel();
                        wi.SetWord(word.Word());
                        wi.Set(typeof(CoreAnnotations.OriginalTextAnnotation), word.OriginalText());
                        wi.Set(typeof(CoreAnnotations.BeforeAnnotation), prevString + word.Before());
                        wi.Set(typeof(CoreAnnotations.AfterAnnotation), word.After());
                        wi.Set(typeof(CoreAnnotations.WordPositionAnnotation), string.Empty + wNum);
                        wi.Set(typeof(CoreAnnotations.SentencePositionAnnotation), string.Empty + sNum);
                        wi.Set(typeof(CoreAnnotations.ParaPositionAnnotation), string.Empty + pNum);
                        wi.Set(typeof(CoreAnnotations.SectionAnnotation), section);
                        wi.Set(typeof(CoreAnnotations.AnswerAnnotation), entity);
                        wi.Set(typeof(CoreAnnotations.EntityClassAnnotation), entityClass);
                        wNum++;
                        prevString = string.Empty;
                        result.Add(wi);
                        prev = wi;
                    }
                }
                //log.info(doc);
                //log.info(edu.stanford.nlp.util.StringUtils.join(result, "\n"));
                //System.exit(0);
                return(result);
            }
Beispiel #23
0
    /// <summary>
    /// Parses <paramref name="input"/> with the Stanford lexicalized parser and walks
    /// the CC-processed typed dependencies, storing the words found for nsubj/dobj/nmod
    /// relations in the enclosing class's fields (action, subj, target, second_target).
    /// </summary>
    /// <param name="input">Raw sentence text to tokenize and parse.</param>
    /// <returns>The string form of the typed-dependency list.</returns>
    public string Tags(string input)
    {
        // Path to models extracted from `stanford-parser-3.6.0-models.jar`
        var jarRoot         = @"";
        var modelsDirectory = jarRoot;

        var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

        // This option shows loading and using an explicit tokenizer
        var sent2            = input;
        var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        var sent2Reader      = new java.io.StringReader(sent2);
        var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

        sent2Reader.close();
        var tree2 = lp.apply(rawWords2);

        // Extract dependencies from the lexical tree
        var tlp = new PennTreebankLanguagePack();
        var gsf = tlp.grammaticalStructureFactory();
        var gs  = gsf.newGrammaticalStructure(tree2);
        var tdl = gs.typedDependenciesCCprocessed();

        // NOTE(review): removed an unused TreePrint("penn,typedDependenciesCollapsed")
        // instance — its only use (printTree) was already commented out, so constructing
        // it had no effect.

        UnityEngine.Debug.Log(tdl);

        for (int i = 0; i < tdl.size(); i++)
        {
            TypedDependency node = (TypedDependency)tdl.get(i);

            string relation = node.reln().getShortName();

            // nsubj: governor is the action verb, dependent is the subject.
            if (relation.Contains("nsubj"))
            {
                IndexedWord act = node.gov();
                action = act.value();

                UnityEngine.Debug.Log("This is the action " + action);

                IndexedWord subject = node.dep();
                subj = subject.value();

                UnityEngine.Debug.Log("This is the subject " + subj);
            }

            // dobj: governor is the action verb, dependent is the direct object (target).
            if (relation.Contains("dobj"))
            {
                IndexedWord act = node.gov();
                action = act.value();
                UnityEngine.Debug.Log("This is the action " + action);

                IndexedWord tar = node.dep();
                target = tar.value();
                UnityEngine.Debug.Log("This is the target " + target);
            }

            // nmod: dependent modifies the object — treated as a secondary target.
            if (relation.Contains("nmod"))
            {
                IndexedWord tar_two = node.dep();
                second_target = tar_two.value();
                UnityEngine.Debug.Log("This is the target second " + second_target);
            }
        }

        return(tdl.ToString());
    }
Beispiel #24
0
        /// <summary>
        /// Builds the appropriate TokenizerFactory for the given tokenizer type,
        /// using the options found in the properties file.
        /// </summary>
        /// <remarks>
        /// When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to
        /// retrieve your tokenizer from the properties file, and then add a case to
        /// the switch below to instantiate the new Tokenizer type.
        /// </remarks>
        /// <param name="type">the TokenizerType</param>
        /// <param name="props">the properties file</param>
        /// <param name="extraOptions">extra things that should be passed into the tokenizer constructor</param>
        /// <exception cref="System.ArgumentException">if no valid tokenizer type was provided</exception>
        private static ITokenizerFactory <CoreLabel> InitFactory(TokenizerAnnotator.TokenizerType type, Properties props, string extraOptions)
        {
            // Start from the user-specified options; fall back to the type's defaults.
            // TODO: maybe we should always have getDefaultOptions() and expect the user
            // to turn off default options.  That would require all options to have
            // negated options, but currently there are some which don't have that.
            string options = props.GetProperty("tokenize.options", null);
            if (options == null)
            {
                options = type.GetDefaultOptions();
            }
            // Prepend extraOptions, inserting a comma only if one isn't already there.
            if (extraOptions != null)
            {
                options = extraOptions.EndsWith(",") ? extraOptions + options : extraOptions + ',' + options;
            }
            switch (type)
            {
            case TokenizerAnnotator.TokenizerType.Arabic:
            case TokenizerAnnotator.TokenizerType.Chinese:
            {
                // Segmenter-based languages: no tokenizer factory is produced here.
                return(null);
            }

            case TokenizerAnnotator.TokenizerType.Spanish:
            {
                return(SpanishTokenizer.Factory(new CoreLabelTokenFactory(), options));
            }

            case TokenizerAnnotator.TokenizerType.French:
            {
                return(FrenchTokenizer.Factory(new CoreLabelTokenFactory(), options));
            }

            case TokenizerAnnotator.TokenizerType.Whitespace:
            {
                // End-of-line is significant when either the EOL property or the
                // global newline-splitter property is set.
                bool eolIsSignificant = bool.ValueOf(props.GetProperty(EolProperty, "false")) || bool.ValueOf(props.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));
                return(new WhitespaceTokenizer.WhitespaceTokenizerFactory <CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant));
            }

            case TokenizerAnnotator.TokenizerType.English:
            case TokenizerAnnotator.TokenizerType.German:
            {
                return(PTBTokenizer.Factory(new CoreLabelTokenFactory(), options));
            }

            case TokenizerAnnotator.TokenizerType.Unspecified:
            {
                log.Info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                return(PTBTokenizer.Factory(new CoreLabelTokenFactory(), options));
            }

            default:
            {
                throw new ArgumentException("No valid tokenizer type provided.\n" + "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" + "to specify a tokenizer.");
            }
            }
        }
 /// <summary>
 /// Supplies the tokenizer factory for this language pack, backed by
 /// <see cref="Edu.Stanford.Nlp.Process.PTBTokenizer{T}"/>.
 /// </summary>
 /// <returns>A CoreLabel-producing tokenizer factory</returns>
 public override ITokenizerFactory <IHasWord> GetTokenizerFactory() => PTBTokenizer.CoreLabelFactory();
 /// <summary>
 /// Creates a new (empty) BasicDocument whose tokenization is handled by a
 /// <see cref="Edu.Stanford.Nlp.Process.PTBTokenizer{T}"/>.
 /// Populate it afterwards by calling one of the <tt>init</tt>* methods.
 /// </summary>
 public BasicDocument() : this(PTBTokenizer.Factory())
 {
 }
        // main method only
        /// <summary>
        /// Demo entry point: builds a treebank from a hard-coded tree, a tree file
        /// (-treeFile), or by parsing a sentence file (-sentFile), then prints the
        /// semantic graph of every tree. With -testGraph true, also prints the
        /// collapsed graph in several output formats.
        /// </summary>
        public static void Main(string[] args)
        {
            Treebank   tb           = new MemoryTreebank();
            Properties props        = StringUtils.ArgsToProperties(args);
            string     treeFileName = props.GetProperty("treeFile");
            string     sentFileName = props.GetProperty("sentFile");
            string     testGraph    = props.GetProperty("testGraph");

            if (testGraph == null)
            {
                testGraph = "false";
            }
            string load = props.GetProperty("load");
            string save = props.GetProperty("save");

            if (load != null)
            {
                log.Info("Load not implemented!");
                return;
            }
            if (sentFileName == null && treeFileName == null)
            {
                // No input given: fall back to a canned demonstration tree.
                log.Info("Usage: java SemanticGraph [-sentFile file|-treeFile file] [-testGraph]");
                Tree t = Tree.ValueOf("(ROOT (S (NP (NP (DT An) (NN attempt)) (PP (IN on) (NP (NP (NNP Andres) (NNP Pastrana) (POS 's)) (NN life)))) (VP (VBD was) (VP (VBN carried) (PP (IN out) (S (VP (VBG using) (NP (DT a) (JJ powerful) (NN bomb))))))) (. .)))"
                                      );
                tb.Add(t);
            }
            else
            {
                if (treeFileName != null)
                {
                    tb.LoadPath(treeFileName);
                }
                else
                {
                    // Parse each line of the sentence file into a tree.
                    string[]          options = new string[] { "-retainNPTmpSubcategories" };
                    LexicalizedParser lp      = ((LexicalizedParser)LexicalizedParser.LoadModel("/u/nlp/data/lexparser/englishPCFG.ser.gz", options));
                    BufferedReader    reader  = null;
                    try
                    {
                        reader = IOUtils.ReaderFromString(sentFileName);
                    }
                    catch (IOException e)
                    {
                        throw new RuntimeIOException("Cannot find or open " + sentFileName, e);
                    }
                    try
                    {
                        System.Console.Out.WriteLine("Processing sentence file " + sentFileName);
                        for (string line; (line = reader.ReadLine()) != null;)
                        {
                            System.Console.Out.WriteLine("Processing sentence: " + line);
                            PTBTokenizer <Word> ptb   = PTBTokenizer.NewPTBTokenizer(new StringReader(line));
                            IList <Word>        words = ptb.Tokenize();
                            Tree parseTree            = lp.ParseTree(words);
                            tb.Add(parseTree);
                        }
                    }
                    catch (Exception e)
                    {
                        // BUGFIX: message used to say "key file" (copy-paste); this is the sentence file.
                        throw new Exception("Exception reading sentence file " + sentFileName, e);
                    }
                    finally
                    {
                        // BUGFIX: previously Close() was only reached on the success path,
                        // leaking the reader when an exception was thrown mid-read.
                        reader.Close();
                    }
                }
            }
            foreach (Tree t_1 in tb)
            {
                SemanticGraph sg = SemanticGraphFactory.GenerateUncollapsedDependencies(t_1);
                System.Console.Out.WriteLine(sg.ToString());
                System.Console.Out.WriteLine(sg.ToCompactString());
                if (testGraph.Equals("true"))
                {
                    SemanticGraph g1 = SemanticGraphFactory.GenerateCollapsedDependencies(t_1);
                    System.Console.Out.WriteLine("TEST SEMANTIC GRAPH - graph ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString());
                    System.Console.Out.WriteLine("readable ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString(SemanticGraph.OutputFormat.Readable));
                    System.Console.Out.WriteLine("List of dependencies ----------------------------");
                    System.Console.Out.WriteLine(g1.ToList());
                    System.Console.Out.WriteLine("xml ----------------------------");
                    System.Console.Out.WriteLine(g1.ToString(SemanticGraph.OutputFormat.Xml));
                    System.Console.Out.WriteLine("dot ----------------------------");
                    System.Console.Out.WriteLine(g1.ToDotFormat());
                    System.Console.Out.WriteLine("dot (simple) ----------------------------");
                    System.Console.Out.WriteLine(g1.ToDotFormat("Simple", CoreLabel.OutputFormat.Value));
                }
            }
            // System.out.println(" graph ----------------------------");
            // System.out.println(t.allTypedDependenciesCCProcessed(false));
            if (save != null)
            {
                log.Info("Save not implemented!");
            }
        }
Beispiel #28
0
        /// <summary>
        /// Parses a sentence, scans its collapsed dependencies for the first word
        /// matching the keyword list (stored in <c>key</c>), and — for a recognized
        /// shape keyword — runs the Associator over that shape's properties.
        /// Side effects: sets the fields key, keyFlag, shape, propsUsed, dependency.
        /// </summary>
        /// <param name="sent2">The sentence to parse (lower-cased before tokenizing).</param>
        public void SentenceParser(string sent2)
        {
            var modelsDirectory = jarRoot + @"edu\stanford\nlp\models";

            // Loading english PCFG parser from file
            var lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // BUGFIX: ToLower() returns a new string; the result was previously
            // discarded, so the sentence was never actually lower-cased.
            sent2 = sent2.ToLower();

            // This option shows loading and using an explicit tokenizer
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new java.io.StringReader(sent2);
            var rawWords2        = tokenizerFactory.getTokenizer(sent2Reader).tokenize();

            sent2Reader.close();
            var tree2 = lp.apply(rawWords2);

            // Extract dependencies from lexical tree

            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs  = gsf.newGrammaticalStructure(tree2);
            var tdl = gs.typedDependenciesCCprocessed();
            //Console.WriteLine("\n{0}\n", tdl);


            // Extract collapsed dependencies from parsed tree

            var tp = new TreePrint("penn,typedDependenciesCollapsed");

            tp.printTree(tree2);

            // BUGFIX: the `as` cast yields null when the returned collection is not
            // an ArrayList; guard before enumerating instead of throwing NRE.
            ArrayList dep = gs.typedDependenciesCollapsed() as ArrayList;

            if (dep != null)
            {
                // Find the first dependency whose dependent word matches a keyword.
                foreach (TypedDependency td in dep)
                {
                    for (int i = 0; i < keyword.Length; i++)
                    {
                        if (td.dep().originalText().Equals(keyword[i]))
                        {
                            keyFlag = true;
                            key     = keyword[i];
                            break;
                        }
                    }
                    if (keyFlag)
                    {
                        break;
                    }
                }
            }

            keyFlag = false;

            // Dispatch on the matched shape keyword.
            switch (key)
            {
            case "circle":

                Circle circle = new Circle();
                shape     = circle.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "rectangle":

                Rectangle rect = new Rectangle();
                shape     = rect.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "triangle":

                Triangle tri = new Triangle();
                shape     = tri.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            case "square":

                Square square = new Square();
                shape     = square.GetProps();
                propsUsed = Associator(shape, dep);

                break;

            default:

                break;
            } //End of Switch

            dependency = tdl.ToString();
        } //End of SentenceParser
Beispiel #29
0
        /// <summary>
        /// Configures the sentence splitter from the given properties: newline-based
        /// splitting, a single-sentence no-op splitter, or a token/regex-driven
        /// WordToSentenceProcessor.
        /// </summary>
        /// <param name="properties">CoreNLP pipeline properties (ssplit.*, tokenize.*)</param>
        public WordsToSentencesAnnotator(Properties properties)
        {
            bool nlSplitting = bool.ValueOf(properties.GetProperty(StanfordCoreNLP.NewlineSplitterProperty, "false"));

            if (nlSplitting)
            {
                bool whitespaceTokenization = bool.ValueOf(properties.GetProperty("tokenize.whitespace", "false"));
                if (whitespaceTokenization)
                {
                    if (Runtime.LineSeparator().Equals("\n"))
                    {
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                    else
                    {
                        // throw "\n" in just in case files use that instead of
                        // the system separator
                        // this constructor will keep empty lines as empty sentences
                        WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { Runtime.LineSeparator(), "\n" }));
                        this.countLineNumbers = true;
                        this.wts = wts1;
                    }
                }
                else
                {
                    // this constructor will keep empty lines as empty sentences
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(ArrayUtils.AsImmutableSet(new string[] { PTBTokenizer.GetNewlineToken() }));
                    this.countLineNumbers = true;
                    this.wts = wts1;
                }
            }
            else
            {
                string isOneSentence = properties.GetProperty("ssplit.isOneSentence");
                // BUGFIX: the Java original used Boolean.parseBoolean, which treats
                // null (and unparseable text) as false; bool.Parse throws on null.
                // TryParse restores the intended "null means false" behavior.
                bool oneSentence;
                bool.TryParse(isOneSentence, out oneSentence);
                if (oneSentence)
                {
                    // Treat as one sentence: You get a no-op sentence splitter that always returns all tokens as one sentence.
                    WordToSentenceProcessor <CoreLabel> wts1 = new WordToSentenceProcessor <CoreLabel>(true);
                    this.countLineNumbers = false;
                    this.wts = wts1;
                }
                else
                {
                    // multi token sentence boundaries
                    string boundaryMultiTokenRegex = properties.GetProperty("ssplit.boundaryMultiTokenRegex");
                    // Discard these tokens without marking them as sentence boundaries
                    string tokenPatternsToDiscardProp          = properties.GetProperty("ssplit.tokenPatternsToDiscard");
                    ICollection <string> tokenRegexesToDiscard = null;
                    if (tokenPatternsToDiscardProp != null)
                    {
                        string[] toks = tokenPatternsToDiscardProp.Split(",");
                        tokenRegexesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    // regular boundaries
                    string boundaryTokenRegex     = properties.GetProperty("ssplit.boundaryTokenRegex");
                    string boundaryFollowersRegex = properties.GetProperty("ssplit.boundaryFollowersRegex");
                    // newline boundaries which are discarded.
                    ICollection <string> boundariesToDiscard = null;
                    string bounds = properties.GetProperty("ssplit.boundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] toks = bounds.Split(",");
                        boundariesToDiscard = Generics.NewHashSet(Arrays.AsList(toks));
                    }
                    ICollection <string> htmlElementsToDiscard = null;
                    // HTML boundaries which are discarded
                    bounds = properties.GetProperty("ssplit.htmlBoundariesToDiscard");
                    if (bounds != null)
                    {
                        string[] elements = bounds.Split(",");
                        htmlElementsToDiscard = Generics.NewHashSet(Arrays.AsList(elements));
                    }
                    string nlsb = properties.GetProperty(StanfordCoreNLP.NewlineIsSentenceBreakProperty, StanfordCoreNLP.DefaultNewlineIsSentenceBreak);
                    this.countLineNumbers = false;
                    this.wts = new WordToSentenceProcessor <CoreLabel>(boundaryTokenRegex, boundaryFollowersRegex, boundariesToDiscard, htmlElementsToDiscard, WordToSentenceProcessor.StringToNewlineIsSentenceBreak(nlsb), (boundaryMultiTokenRegex != null) ? TokenSequencePattern
                                                                       .Compile(boundaryMultiTokenRegex) : null, tokenRegexesToDiscard);
                }
            }
            Verbose = bool.ValueOf(properties.GetProperty("ssplit.verbose", "false"));
        }