예제 #1
0
        /// <summary>
        /// Annotates each gold tree in the test treebank, then prints the rule and
        /// grammar score for every local tree with two or more children.
        /// </summary>
        /// <param name="pd">Parser whose grammar supplies the local-tree scores.</param>
        /// <param name="tlpParams">Language-specific parser parameters (supplies the head finder).</param>
        /// <param name="testTreebank">Treebank of gold trees to score.</param>
        /// <param name="treebankRoot">Root path used to load the English split categories.</param>
        /// <param name="stateIndex">Index mapping state names to grammar states.</param>
        private void TestOnTreebank(LexicalizedParser pd, ITreebankLangParserParams tlpParams, Treebank testTreebank, string treebankRoot, IIndex <string> stateIndex)
        {
            Timing.StartTime();
            ITreeTransformer annotator = new TreeAnnotator(tlpParams.HeadFinder(), tlpParams, op);

            // CDM: Aug 2004: With new implementation of treebank split categories,
            // I've hardwired this to load English ones.  Otherwise need training data.
            op.trainOptions.splitters       = ParentAnnotationStats.GetEnglishSplitCategories(treebankRoot);
            op.trainOptions.sisterSplitters = Generics.NewHashSet(Arrays.AsList(op.tlpParams.SisterSplitters()));
            foreach (Tree goldTree in testTreebank)
            {
                // BUG FIX: a foreach iteration variable cannot be reassigned in C#;
                // hold the annotated tree in a separate local instead.
                Tree annotatedTree = annotator.TransformTree(goldTree);
                foreach (Tree localTree in annotatedTree)
                {
                    // Only score genuine local trees: skip leaves, preterminals and
                    // nodes with fewer than two children.
                    if (localTree.IsLeaf() || localTree.IsPreTerminal() || localTree.Children().Length < 2)
                    {
                        continue;
                    }
                    System.Console.Out.WriteLine(LocalTreeToRule(localTree));
                    double score = ComputeLocalTreeScore(localTree, stateIndex, pd);
                    // A score of negative infinity means the rule is absent from the
                    // grammar; it is still printed so mismatches are visible.
                    // (The original code had an empty `if (score == NegativeInfinity)`
                    // block here, which was dead code and has been removed.)
                    System.Console.Out.WriteLine("score: " + score);
                }
            }
        }
예제 #2
0
        // Use Stanford.NLP.Net to parse a single sentence and print its Penn tree.
        static Tree Parse(string sent)
        {
            // Load the English PCFG parser model from disk.
            var parser = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            // Tokenize the raw sentence with an explicit PTB tokenizer.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new java.io.StringReader(sent);
            var tokens  = factory.getTokenizer(reader).tokenize();
            reader.close();

            var tree = parser.apply(tokens);

            // Derive the grammatical structure and CC-processed typed dependencies.
            // (These are computed but only the Penn tree is printed below.)
            var languagePack     = new PennTreebankLanguagePack();
            var structureFactory = languagePack.grammaticalStructureFactory();
            var structure        = structureFactory.newGrammaticalStructure(tree);
            var dependencies     = structure.typedDependenciesCCprocessed();

            // Print the parse in Penn Treebank format.
            //var tp = new TreePrint("penn,typedDependenciesCollapsed");
            var printer = new TreePrint("penn");
            printer.printTree(tree);

            return tree;
        }
 // todo: perhaps the output streams could be passed in
 /// <summary>
 /// Convenience entry point: parses the files named in <paramref name="args"/>
 /// starting at index <paramref name="argIndex"/>.
 /// </summary>
 /// <remarks>
 /// Builds a ParseFiles object and delegates the whole job to it.
 /// </remarks>
 public static void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter
                                     , Options op, TreePrint treePrint, LexicalizedParser pqFactory)
 where _T0 : IHasWord
     {
      var fileParser = new Edu.Stanford.Nlp.Parser.Lexparser.ParseFiles(op, treePrint, pqFactory);
      fileParser.ParseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter, escaper, tagDelimiter);
     }
        /// <summary>
        /// Collects, from the parser's unary and binary grammars, the basic
        /// categories of every rule's child state(s) into unaryRules/binaryRules.
        /// </summary>
        public FilterConfusingRules(LexicalizedParser parser)
        {
            Options         options    = parser.GetOp();
            IIndex <string> stateIndex = parser.stateIndex;

            // Reduce each unary rule's child state to its basic category.
            foreach (UnaryRule rule in parser.ug)
            {
                string childState = stateIndex.Get(rule.child);
                unaryRules.Add(options.Langpack().BasicCategory(childState));
            }
            // Likewise reduce each binary rule's two child states.
            foreach (BinaryRule rule in parser.bg)
            {
                string left  = options.Langpack().BasicCategory(stateIndex.Get(rule.leftChild));
                string right = options.Langpack().BasicCategory(stateIndex.Get(rule.rightChild));
                binaryRules.Add(left, right);
            }
        }
예제 #5
0
        /// <summary>
        /// Parses a raw sentence with the English PCFG model and prints the Penn
        /// tree plus collapsed typed dependencies.
        /// </summary>
        public static void ParseString(string sentence)
        {
            // Path to models extracted from `stanford-parser-3.6.0-models.jar`
            var modelsDirectory = @"../../../data/paket-files/stanford-corenlp-3.9.1-models/edu/stanford/nlp/models";
            var model           = @"/lexparser/englishPCFG.ser.gz";
            //var model = @"/parser/nndep/english_SD.gz";

            // Load the English PCFG parser from disk.
            var parser = LexicalizedParser.loadModel(modelsDirectory + model);

            // Alternative: parse a list of correctly pre-tokenized words.
            //var rawWords = SentenceUtils.toCoreLabelList(sentence);
            //var tree = lp.apply(rawWords);
            //tree.pennPrint();

            // Tokenize explicitly with a PTB tokenizer, then parse the token list.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(sentence);
            var tokens  = factory.getTokenizer(reader).tokenize();
            //sent2Reader.close();
            var parseTree = parser.apply(tokens);

            // Alternative: extract typed dependencies from the lexical tree.
            //var tlp = new PennTreebankLanguagePack();
            //var gsf = tlp.grammaticalStructureFactory();
            //var gs = gsf.newGrammaticalStructure(tree2);
            //var tdl = gs.typedDependenciesCCprocessed();
            //Console.WriteLine("\n{0}\n", tdl);

            // Print the Penn tree together with collapsed typed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");
            printer.printTree(parseTree);
        }
예제 #6
0
        /// <summary>
        /// Demonstrates two ways of invoking the parser: on a pre-tokenized word
        /// list and on raw text run through an explicit tokenizer; prints the
        /// CC-processed typed dependencies of the second parse.
        /// </summary>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // First: parse a list that is already correctly tokenized.
            var words = new[] { "This", "is", "an", "easy", "sentence", "." };
            java.util.List labels = Sentence.toCoreLabelList(words);
            Tree tree = lp.apply(labels);
            tree.pennPrint();

            // Second: tokenize raw text explicitly, then parse the tokens.
            const string RawSentence = "This is another sentence.";
            TokenizerFactory factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader = new StringReader(RawSentence);
            java.util.List tokens = factory.getTokenizer(reader).tokenize();
            tree = lp.apply(tokens);

            // Build CC-processed typed dependencies from the second parse.
            var languagePack = new PennTreebankLanguagePack();
            GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
            GrammaticalStructure structure = structureFactory.newGrammaticalStructure(tree);
            java.util.List dependencies = structure.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", dependencies);

            // A TreePrint can emit both the tree and collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");
            printer.printTree(tree);
        }
        /// <summary>
        /// Reparses the yield of <paramref name="tree"/> with the given parser and
        /// returns its dvKBest best PCFG parses, each optionally transformed;
        /// returns null when the sentence is trivial or reparsing fails.
        /// </summary>
        public static IList <Tree> GetTopParsesForOneTree(LexicalizedParser parser, int dvKBest, Tree tree, ITreeTransformer transformer)
        {
            IParserQuery query = parser.ParserQuery();
            IList <Word> words = tree.YieldWords();

            // The binarized/manipulated trees carry an end-of-sentence symbol as
            // their last word; a yield of one token or fewer leaves nothing to parse.
            if (words.Count <= 1)
            {
                return null;
            }
            words = words.SubList(0, words.Count - 1);
            if (!query.Parse(words))
            {
                log.Info("Failed to use the given parser to reparse sentence \"" + words + "\"");
                return null;
            }

            IList <ScoredObject <Tree> > scored  = query.GetKBestPCFGParses(dvKBest);
            IList <Tree>                 results = new List <Tree>();
            foreach (ScoredObject <Tree> candidate in scored)
            {
                Tree parse = candidate.Object();
                results.Add(transformer == null ? parse : transformer.TransformTree(parse));
            }
            return results;
        }
예제 #8
0
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// Output is handled with a TreePrint object; the options used when
        /// creating the TreePrint determine what gets printed.  A PrintWriter
        /// can be passed to TreePrint.PrintTree to capture the output.
        /// This code is for English.
        /// </remarks>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Parse a list of words that is already correctly tokenized.
            string[]          words  = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> labels = SentenceUtils.ToCoreLabelList(words);
            Tree parse = lp.Apply(labels);

            parse.PennPrint();
            System.Console.Out.WriteLine();

            // Tokenize raw text explicitly, then parse the resulting tokens.
            string raw = "This is another sentence.";
            ITokenizerFactory <CoreLabel> factory   = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tokenizer = factory.GetTokenizer(new StringReader(raw));
            IList <CoreLabel>             tokens    = tokenizer.Tokenize();
            parse = lp.Apply(tokens);

            // For English the language pack is a PennTreebankLanguagePack.
            ITreebankLanguagePack        languagePack     = lp.TreebankLanguagePack();
            IGrammaticalStructureFactory structureFactory = languagePack.GrammaticalStructureFactory();
            GrammaticalStructure         structure        = structureFactory.NewGrammaticalStructure(parse);
            IList <TypedDependency>      dependencies     = structure.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(dependencies);
            System.Console.Out.WriteLine();

            // A TreePrint object can also print trees and dependencies.
            TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");
            printer.PrintTree(parse);
        }
 /// <summary>
 /// Creates the service: materializes the bundled English PCFG model into the
 /// temp directory (if not already present) and loads a parser from it.
 /// </summary>
 public StanfordParsingService()
 {
     // Build the model path once with Path.Combine instead of repeating the
     // same string concatenation three times.
     var modelPath = Path.Combine(Path.GetTempPath(), "englishPCFG.ser.gz");
     if (!System.IO.File.Exists(modelPath))
         System.IO.File.WriteAllBytes(modelPath, Smartifyer.Resources.englishPCFG_ser);
     lp = LexicalizedParser.loadModel(modelPath);
 }
        /// <summary>
        /// Tokenizes and parses the given text with the English PCFG model, then
        /// prints its CC-processed typed dependencies and the Penn tree with
        /// collapsed dependencies.
        /// </summary>
        public void ProcessText(string inputText)
        {
            var jarRoot         = "C:\\stanford-parser-full-2016-10-31\\stanford-parser-3.7.0-models";//\\edu\\stanford\\nlp\\models";//"nlp.stanford.edu\\stanford-parser-full-2017-06-09\\models";
            var modelsDirectory = jarRoot + "\\edu\\stanford\\nlp\\models";

            // Load the English PCFG parser model from disk.
            var parser = LexicalizedParser.loadModel(modelsDirectory + "\\lexparser\\englishPCFG.ser.gz");

            // Tokenize the input with an explicit PTB tokenizer.
            var factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var reader  = new StringReader(inputText);
            var tokens  = factory.getTokenizer(reader).tokenize();
            reader.close();

            var tree = parser.apply(tokens);

            // Derive CC-processed typed dependencies from the parse tree.
            var languagePack     = new PennTreebankLanguagePack();
            var structureFactory = languagePack.grammaticalStructureFactory();
            var structure        = structureFactory.newGrammaticalStructure(tree);
            var dependencies     = structure.typedDependenciesCCprocessed();
            Console.WriteLine("\n{0}\n", dependencies);

            // Print the tree along with collapsed dependencies.
            var printer = new TreePrint("penn,typedDependenciesCollapsed");
            printer.printTree(tree);
        }
예제 #11
0
        /// <summary>
        /// Loads a parser model from -input (forwarding any extra arguments to the
        /// loader) and saves it, serialized, to -output.
        /// </summary>
        public static void Main(string[] args)
        {
            string input  = null;
            string output = null;
            IList <string> extraArgs = Generics.NewArrayList();

            // Consume -input/-output pairs; everything else goes to LoadModel.
            for (int i = 0; i < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-input"))
                {
                    input = args[i + 1];
                    i    += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-output"))
                {
                    output = args[i + 1];
                    i     += 2;
                }
                else
                {
                    extraArgs.Add(args[i++]);
                }
            }
            LexicalizedParser parser = LexicalizedParser.LoadModel(input, extraArgs);
            parser.SaveParserToSerialized(output);
        }
 /// <summary>
 /// Builds a reranker query: a training-style tree transformer, a cost/gradient
 /// scorer over the enclosing reranker's model, and an empty deep-tree list.
 /// </summary>
 public Query(DVModelReranker _enclosing)
 {
     // The enclosing reranker must be stored first; the other fields read it.
     this._enclosing  = _enclosing;
     this.deepTrees   = Generics.NewArrayList();
     this.scorer      = new DVParserCostAndGradient(null, null, this._enclosing.model, this._enclosing.op);
     this.transformer = LexicalizedParser.BuildTrainTransformer(this._enclosing.op);
 }
예제 #13
0
 /// <summary>
 /// Captures everything a cache worker needs: the hypothesis cache, the parser
 /// producing candidate parses, how many best parses to keep, and an optional
 /// transformer applied to each parse.
 /// </summary>
 public CacheProcessor(CacheParseHypotheses cacher, LexicalizedParser parser, int dvKBest, ITreeTransformer transformer)
 {
     this.transformer = transformer;
     this.dvKBest     = dvKBest;
     this.parser      = parser;
     this.cacher      = cacher;
 }
예제 #14
0
파일: Parser.cs 프로젝트: fossabot/atpr
        /// <summary>
        /// Parse the document searching for sentences where the entity found.
        /// Returns a csv line with the file, the entity the sentence and the sintax analisis of the sentences
        /// </summary>
        /// <param name="text">Document text</param>
        /// <param name="entity">Entity.</param>
        /// <param name="origFile">Original file.</param>
        /// <param name="language">Language code used to select the parser model.</param>
        public static List <string[]> Parse(string text, string entity, string origFile, string language)
        {
            var results = new List <string[]>();
            // Load the parser model for the requested language.
            var modelsDirectory    = StanfordEnv.PARSER_MODELS;
            var lexparserDirectory = modelsDirectory + StanfordEnv.GetParserLanguageFiles(language);
            var lp = LexicalizedParser.loadModel(lexparserDirectory);

            string[]      splittedText = SplitText(text);
            List <string> entityLines  = GetEntitiesLines(splittedText, entity);

            // PERF: the tokenizer factory is loop-invariant — create it once here
            // instead of once per entity line.
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            foreach (var line in entityLines)
            {
                // Tokenize and parse the sentence containing the entity.
                var sentReader = new java.io.StringReader(line);
                var rawWords   = tokenizerFactory.getTokenizer(sentReader).tokenize();
                sentReader.close();
                var tree = lp.apply(rawWords);

                results.Add(new string[] { origFile, entity, line, tree.ToString() });
            }

            return(results);
        }
예제 #15
0
        /// <summary>
        /// demoDP demonstrates turning a file into tokens and then parse trees.
        /// </summary>
        /// <remarks>
        /// Trees are printed by calling PennPrint on the Tree object; a
        /// PrintWriter can be passed to pennPrint to capture the output.
        /// This code will work with any supported language.
        /// </remarks>
        public static void DemoDP(LexicalizedParser lp, string filename)
        {
            // DocumentPreprocessor handles loading, sentence-segmenting and
            // tokenizing the file.
            ITreebankLanguagePack languagePack = lp.TreebankLanguagePack();
            // Grammatical structures are only available for languages whose pack
            // supports them (a PennTreebankLanguagePack for English does).
            IGrammaticalStructureFactory structureFactory = null;
            if (languagePack.SupportsGrammaticalStructures())
            {
                structureFactory = languagePack.GrammaticalStructureFactory();
            }
            // A tokenizer could also be created here and passed to DocumentPreprocessor.
            foreach (IList <IHasWord> sentence in new DocumentPreprocessor(filename))
            {
                Tree parse = lp.Apply(sentence);
                parse.PennPrint();
                System.Console.Out.WriteLine();
                if (structureFactory == null)
                {
                    continue;
                }
                GrammaticalStructure structure    = structureFactory.NewGrammaticalStructure(parse);
                ICollection          dependencies = structure.TypedDependenciesCCprocessed();
                System.Console.Out.WriteLine(dependencies);
                System.Console.Out.WriteLine();
            }
        }
예제 #16
0
        /// <summary>
        /// Configures Arabic-factored parser options, trains a parser on the given
        /// treebank, and then parses the sentences read from the input stream.
        /// </summary>
        /// <param name="trainTreebankFile">Treebank path used for training.</param>
        /// <param name="testTreebankFile">Not used in this method; the input to parse comes from <paramref name="inputStream"/>.</param>
        /// <param name="inputStream">Stream of input parsed after training.</param>
        /// <returns>The result of Parse(inputStream).</returns>
        public virtual bool Run(File trainTreebankFile, File testTreebankFile, InputStream inputStream)
        {
            op           = new Options();
            op.tlpParams = new ArabicTreebankParserParams();
            op.SetOptions("-arabicFactored");
            op.testOptions.maxLength = maxSentLen;
            op.testOptions.MaxItems  = 5000000;
            //500000 is the default for Arabic, but we have substantially more edges now
            op.testOptions.outputFormatOptions = "removeTopBracket,includePunctuationDependencies";
            // WSG: Just set this to some high value so that extractBestParse()
            // actually calls the lattice reader (e.g., this says that we can't have a word longer than
            // 80 characters...seems sensible for Arabic
            op.testOptions.maxSpanForTags = 80;
            treePrint           = op.testOptions.TreePrint(op.tlpParams);
            debinarizer         = new Debinarizer(op.forceCNF, new CategoryWordTagFactory());
            subcategoryStripper = op.tlpParams.SubcategoryStripper();
            Timing.StartTime();
            // Load the training treebank from disk and train a parser from it.
            Treebank trainTreebank = op.tlpParams.DiskTreebank();

            trainTreebank.LoadPath(trainTreebankFile);
            lp = GetParserDataFromTreebank(trainTreebank);
            MakeParsers();
            if (Verbose)
            {
                // Log a grammar summary: state/tag/word counts and rule counts
                // (unary, binary, lexicon taggings); rule counts are blank when
                // no PCFG parser was built.
                op.Display();
                string lexNumRules = (pparser != null) ? int.ToString(lp.lex.NumRules()) : string.Empty;
                log.Info("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");
                log.Info("Grammar\t" + lp.stateIndex.Size() + '\t' + lp.tagIndex.Size() + '\t' + lp.wordIndex.Size() + '\t' + (pparser != null ? lp.ug.NumRules() : string.Empty) + '\t' + (pparser != null ? lp.bg.NumRules() : string.Empty) + '\t' + lexNumRules
                         );
                log.Info("ParserPack is " + op.tlpParams.GetType().FullName);
                log.Info("Lexicon is " + lp.lex.GetType().FullName);
            }
            return(Parse(inputStream));
        }
        /// <summary>
        /// Returns a copy of the wrapped parser with a DVModel-based reranker
        /// attached; the original parser is left untouched.
        /// </summary>
        public virtual LexicalizedParser AttachModelToLexicalizedParser()
        {
            LexicalizedParser copy = LexicalizedParser.CopyLexicalizedParser(parser);
            copy.reranker = new DVModelReranker(dvModel);
            return copy;
        }
        /// <summary>
        /// Serializes a parser copy (with the DV model reranker attached) to the
        /// given file, logging progress.
        /// </summary>
        public virtual void SaveModel(string filename)
        {
            log.Info("Saving serialized model to " + filename);
            AttachModelToLexicalizedParser().SaveParserToSerialized(filename);
            log.Info("... done");
        }
예제 #19
0
        /// <summary>
        /// Loads the English PCFG model, first forcing an en-US culture — a
        /// workaround the original author flagged as an ugly fix for the
        /// Stanford parser.
        /// </summary>
        public NpletParser()
        {
            //---THIS IS UGLY FIX FOR STANDFORD PARSER---
            var culture = new CultureInfo("en-US");
            Thread.CurrentThread.CurrentCulture   = culture;
            Thread.CurrentThread.CurrentUICulture = culture;
            //-------------------------------------------

            _parser = LexicalizedParser.loadModel("../../../StanfordModels/englishPCFG.ser.gz");
        }
 /// <summary>
 /// Builds a combined-model query: a training-style tree transformer plus one
 /// cost/gradient scorer per model held by the enclosing reranker.
 /// </summary>
 public Query(CombinedDVModelReranker _enclosing)
 {
     // Store the enclosing reranker first; everything below reads from it.
     this._enclosing  = _enclosing;
     this.transformer = LexicalizedParser.BuildTrainTransformer(this._enclosing.op);
     this.scorers     = Generics.NewArrayList();
     foreach (DVModel dvModel in this._enclosing.models)
     {
         this.scorers.Add(new DVParserCostAndGradient(null, null, dvModel, this._enclosing.op));
     }
 }
        /// <summary>
        /// Extracts the DVModel from a parser whose reranker is a DVModelReranker.
        /// </summary>
        /// <param name="parser">Parser expected to carry a DVModelReranker.</param>
        /// <returns>The DVModel held by the parser's reranker.</returns>
        /// <exception cref="ArgumentException">If the parser's reranker is not a DVModelReranker.</exception>
        public static DVModel GetModelFromLexicalizedParser(LexicalizedParser parser)
        {
            // Pattern matching fuses the type test and the cast into one step,
            // replacing the original `is` check followed by an explicit cast.
            if (parser.reranker is DVModelReranker reranker)
            {
                return(reranker.GetModel());
            }
            throw new ArgumentException("This parser does not contain a DVModel reranker");
        }
예제 #22
0
        /// <summary>
        /// Filters a set of candidate sentences down to those the Stanford parser
        /// judges grammatical, appending '.' or '?' according to the parse.
        /// </summary>
        /// <param name="sentences">Candidate sentences to grammar-check.</param>
        /// <returns>The grammatical sentences (punctuated), or null when the input set is empty.</returns>
        public static HashSet <string> CheckGrammer(HashSet <string> sentences)
        {
            HashSet <string> answers = new HashSet <string>();

            Console.WriteLine(sentences.Count + " potential sentences\n");
            if (sentences.Count == 0)
            {
                return null;
            }

            Console.WriteLine("Loading Stanford NLP...");
            // Load the English PCFG parser model from the packages folder.
            var parser = LexicalizedParser.loadModel("..\\..\\..\\packages\\englishPCFG.ser.gz");
            Console.WriteLine("Stanford Parser Loaded!\n");

            // Tokenize and parse each candidate, then inspect the resulting tree.
            foreach (var candidate in sentences)
            {
                var reader = new StringReader(candidate);
                var tree   = parser.apply(PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(reader).tokenize());
                reader.close();

                // Classify the parse as a statement or a question from the tree's
                // clause labels, choosing the terminal punctuation accordingly.
                string treeText   = tree.ToString();
                string punctuated = null;

                if (treeText.Contains("(S "))
                {
                    punctuated = candidate + ".";
                }
                else if (treeText.Contains("(SINV ") || treeText.Contains("(SBARQ ") || treeText.Contains("(SQ "))
                {
                    punctuated = candidate + "?";
                }

                if (punctuated != null)
                {
                    // Additionally require an explicit nominal subject in the
                    // typed dependencies before accepting the sentence.
                    var    languagePack = new PennTreebankLanguagePack();
                    string relations    = languagePack.grammaticalStructureFactory().newGrammaticalStructure(tree).typedDependenciesCCprocessed().ToString();

                    if (relations.Contains("nsubj("))
                    {
                        answers.Add(punctuated);
                    }
                }
            }
            return answers;
        }
        /// <summary>
        /// Loads the parser named by -model and prints the basic tags and states
        /// of its grammar, followed by the full tag/state/grammar rule counts.
        /// </summary>
        public static void Main(string[] args)
        {
            string parserFile = null;

            for (int i = 0; i < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-model"))
                {
                    parserFile = args[i + 1];
                    i         += 2;
                }
                else
                {
                    string error = "Unknown argument " + args[i];
                    log.Info(error);
                    throw new Exception(error);
                }
            }
            if (parserFile == null)
            {
                log.Info("Must specify a model file with -model");
                System.Environment.Exit(2);
            }
            LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserFile));

            // Collapse every tag to its basic category and report the counts.
            ICollection <string> basicTags = Generics.NewTreeSet();
            foreach (string tag in parser.tagIndex)
            {
                basicTags.Add(parser.TreebankLanguagePack().BasicCategory(tag));
            }
            System.Console.Out.WriteLine("Basic tags: " + basicTags.Count);
            foreach (string basicTag in basicTags)
            {
                System.Console.Out.Write("  " + basicTag);
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All tags size: " + parser.tagIndex.Size());

            // Do the same for the grammar states.
            ICollection <string> basicStates = Generics.NewTreeSet();
            foreach (string state in parser.stateIndex)
            {
                basicStates.Add(parser.TreebankLanguagePack().BasicCategory(state));
            }
            System.Console.Out.WriteLine("Basic states: " + basicStates.Count);
            foreach (string basicState in basicStates)
            {
                System.Console.Out.Write("  " + basicState);
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All states size: " + parser.stateIndex.Size());
            System.Console.Out.WriteLine("Unary grammar size: " + parser.ug.NumRules());
            System.Console.Out.WriteLine("Binary grammar size: " + parser.bg.NumRules());
        }
예제 #24
0
        /// <summary>
        /// Entry helper: loads the default parser model, then runs the file demo
        /// when a file name is supplied or the API demo otherwise.
        /// </summary>
        public static void Start(string fileName)
        {
            LexicalizedParser lp = LexicalizedParser.loadModel(/*Program.ParserModel*/);

            if (String.IsNullOrEmpty(fileName))
            {
                DemoAPI(lp);
            }
            else
            {
                DemoDP(lp, fileName);
            }
        }
예제 #25
0
파일: NLPService.cs 프로젝트: henkin/Nala
        /// <summary>
        /// Loads the English PCFG parser, a PTB tokenizer factory, and the Penn
        /// Treebank language pack with its grammatical structure factory.
        /// </summary>
        public NlpService()
        {
            const string parserFileOrUrl = "englishPCFG.ser.gz";

            _lp = LexicalizedParser.loadModel(parserFileOrUrl);
            if (_lp == null)
            {
                throw new InvalidOperationException("couldn't load " + parserFileOrUrl);
            }

            _tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            _tlp              = new PennTreebankLanguagePack();
            _structureFactory = _tlp.grammaticalStructureFactory();
        }
        /// <summary>
        /// Loads a serialized parser from -input, attaches a MaxentTagger-based
        /// reranker (built from -tagger) to it, and saves the combined parser to
        /// -output.  Accepts an optional -weight argument.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void Main(string[] args)
        {
            string taggerFile = null;
            string inputFile  = null;
            string outputFile = null;
            double weight     = 1.0;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tagger"))
                {
                    taggerFile = args[argIndex + 1];
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                {
                    inputFile = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-output"))
                {
                    outputFile = args[argIndex + 1];
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-weight"))
                {
                    // BUG FIX: `double.ValueOf` does not exist in C# (a leftover of
                    // the Java translation); parse with the invariant culture to
                    // match Java's Double.valueOf semantics.
                    // NOTE(review): the parsed weight is currently never used below.
                    weight    = double.Parse(args[argIndex + 1], System.Globalization.CultureInfo.InvariantCulture);
                    argIndex += 2;
                }
                else
                {
                    throw new ArgumentException("Unknown argument: " + args[argIndex]);
                }
            }
            LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(inputFile));
            MaxentTagger      tagger = new MaxentTagger(taggerFile);

            parser.reranker = new TaggerReranker(tagger, parser.GetOp());
            parser.SaveParserToSerialized(outputFile);
        }
예제 #27
0
        /// <summary>
        /// Loads a serialized parser (args[0]) and a test treebank slice
        /// (args[1], file-number range args[2]..args[3]), applies any remaining
        /// options, and runs TestOnTreebank with English treebank parameters.
        /// </summary>
        public virtual void RunTest(string[] args)
        {
            // Deserialize the parser; adopt its options in case serialized
            // options were read in with it.
            LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(args[0]));
            op = parser.GetOp();

            // Load only the test files whose numbers fall within [args[2], args[3]].
            Treebank testTreebank = op.tlpParams.MemoryTreebank();
            int low  = System.Convert.ToInt32(args[2]);
            int high = System.Convert.ToInt32(args[3]);
            testTreebank.LoadPath(args[1], new NumberRangeFileFilter(low, high, true));

            op.SetOptionsOrWarn(args, 4, args.Length);
            TestOnTreebank(parser, new EnglishTreebankParserParams(), testTreebank, args[1], parser.stateIndex);
        }
        /// <summary>
        /// Verifies that a parser can be deserialized from the GZIP-compressed
        /// model file through an IKVM input stream wrapper.
        /// </summary>
        public void LexicalizedParserTest()
        {
            // The model file on disk is GZIP-compressed.
            var modelPath = Files.Parser.Models("lexparser/englishPCFG.ser.gz");

            using var fileStream = new FileStream(modelPath, FileMode.Open);
            using var wrapper    = new ikvm.io.InputStreamWrapper(fileStream);

            // Only wrap in a GZIP stream when the file name says it is compressed.
            using var objectStream =
                      modelPath.EndsWith(".gz")
                    ? new ObjectInputStream(new GZIPInputStream(wrapper))
                    : new ObjectInputStream(wrapper);

            var parser = LexicalizedParser.loadModel(objectStream);

            Assert.NotNull(parser);
        }
예제 #29
0
        /// <summary>
        /// Segments and tokenizes a file with DocumentPreprocessor, then prints
        /// each sentence's Penn tree and CC-processed typed dependencies.
        /// </summary>
        public static void DemoDP(LexicalizedParser lp, string fileName)
        {
            // English language pack and its grammatical-structure factory.
            var languagePack     = new PennTreebankLanguagePack();
            var structureFactory = languagePack.grammaticalStructureFactory();

            // A tokenizer could also be created here and handed to DocumentPreprocessor.
            foreach (List sentence in new DocumentPreprocessor(fileName))
            {
                var parse = lp.apply(sentence);
                parse.pennPrint();

                var structure    = structureFactory.newGrammaticalStructure(parse);
                var dependencies = structure.typedDependenciesCCprocessed(true);
                System.Console.WriteLine("\n{0}\n", dependencies);
            }
        }
예제 #30
0
        /// <summary>
        /// Parses every sentence of a file and prints each parse tree with its
        /// CC-processed typed dependencies.
        /// </summary>
        public static void DemoDP(LexicalizedParser lp, string fileName)
        {
            // DocumentPreprocessor handles sentence segmentation and
            // tokenization of the input file; a tokenizer could also be
            // created explicitly and passed to it.
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();

            foreach (List sentence in new DocumentPreprocessor(fileName))
            {
                var parse = lp.apply(sentence);
                parse.pennPrint();

                // Derive and print the typed dependencies for this parse.
                var gs = gsf.newGrammaticalStructure(parse);
                var dependencies = gs.typedDependenciesCCprocessed(true);
                Console.WriteLine("\n{0}\n", dependencies);
            }
        }
        /// <summary>
        /// Computes the k-best parses (k = op.trainOptions.dvKBest) for each
        /// gold tree, keyed by reference identity of the gold tree.
        /// Optionally logs progress every 10 trees.
        /// </summary>
        internal static IdentityHashMap <Tree, IList <Tree> > GetTopParses(LexicalizedParser parser, Options op, ICollection <Tree> trees, ITreeTransformer transformer, bool outputUpdates)
        {
            var results = new IdentityHashMap <Tree, IList <Tree> >();

            foreach (Tree goldTree in trees)
            {
                results[goldTree] = GetTopParsesForOneTree(parser, op.trainOptions.dvKBest, goldTree, transformer);
                // Periodic progress report while parsing a large collection.
                if (outputUpdates && results.Count % 10 == 0)
                {
                    log.Info("Processed " + results.Count + " trees");
                }
            }
            if (outputUpdates)
            {
                log.Info("Finished processing " + results.Count + " trees");
            }
            return(results);
        }
예제 #32
0
        /// <summary>
        /// Parses <paramref name="sentence"/> with the English PCFG model and
        /// returns the surface forms of all leaves whose parent (pre-terminal)
        /// label is one of the configured noun tags (<c>nounLabels</c>).
        /// </summary>
        /// <param name="sentence">Raw English text to tokenize and parse.</param>
        /// <returns>List of noun tokens in sentence order.</returns>
        public static List <string> ExtractNounsFromSemantics(string sentence)
        {
            // Resolve the models directory relative to the executing assembly.
            string assemblyPath    = Assembly.GetExecutingAssembly().GetName().CodeBase;
            string projectPath     = Directory.GetParent(new Uri(Path.GetDirectoryName(Path.GetDirectoryName(Path.GetDirectoryName(assemblyPath)))).LocalPath).FullName;
            string modelsDirectory = Path.GetFullPath(projectPath + @"\Parser\CoreNLP-3.9.1-Models\edu\stanford\nlp\models");

            // Loading english PCFG parser from file
            LexicalizedParser lp = LexicalizedParser.loadModel(modelsDirectory + @"\lexparser\englishPCFG.ser.gz");

            // This shows loading and using an explicit tokenizer
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sent2Reader      = new java.io.StringReader(sentence);

            // Fix: close the Java reader even if tokenization or parsing
            // throws (previously it leaked on any exception).
            try
            {
                var rawWords = tokenizerFactory.getTokenizer(sent2Reader).tokenize();
                var tree     = lp.apply(rawWords);

                // Keep leaves whose pre-terminal (POS) tag is a noun label.
                return(tree.toArray().Cast <LabeledScoredTreeNode>().Where(n => n.isLeaf() && nounLabels.Contains(n.parent(tree).label().value())).Select(n => n.label().ToString()).ToList());
            }
            finally
            {
                sent2Reader.close();
            }
        }
 /// <summary>
 /// Builds a DVParser on top of an existing lexicalized parser: adopts the
 /// parser's options, ensures a random seed is set, logs the full training
 /// configuration, and constructs the underlying DVModel from the parser's
 /// state index and unary/binary grammars.
 /// </summary>
 /// <param name="parser">Base parser whose options and grammars seed the model.</param>
 /// <exception cref="AssertionError">
 /// If the model's unary or binary transform/score collections disagree in size.
 /// </exception>
 public DVParser(LexicalizedParser parser)
 {
     this.parser = parser;
     this.op     = parser.GetOp();
     // A seed of 0 means "not set": pick one from the clock so the run is
     // reproducible once the logged seed is reused.
     if (op.trainOptions.randomSeed == 0)
     {
         op.trainOptions.randomSeed = Runtime.NanoTime();
         log.Info("Random seed not set, using randomly chosen seed of " + op.trainOptions.randomSeed);
     }
     else
     {
         log.Info("Random seed set to " + op.trainOptions.randomSeed);
     }
     // Log every training hyperparameter so the configuration of a run can
     // be reconstructed from its log output.
     log.Info("Word vector file: " + op.lexOptions.wordVectorFile);
     log.Info("Size of word vectors: " + op.lexOptions.numHid);
     log.Info("Number of hypothesis trees to train against: " + op.trainOptions.dvKBest);
     log.Info("Number of trees in one batch: " + op.trainOptions.batchSize);
     log.Info("Number of iterations of trees: " + op.trainOptions.trainingIterations);
     log.Info("Number of qn iterations per batch: " + op.trainOptions.qnIterationsPerBatch);
     log.Info("Learning rate: " + op.trainOptions.learningRate);
     log.Info("Delta margin: " + op.trainOptions.deltaMargin);
     log.Info("regCost: " + op.trainOptions.regCost);
     log.Info("Using unknown word vector for numbers: " + op.trainOptions.unknownNumberVector);
     log.Info("Using unknown dashed word vector heuristics: " + op.trainOptions.unknownDashedWordVectors);
     log.Info("Using unknown word vector for capitalized words: " + op.trainOptions.unknownCapsVector);
     log.Info("Using unknown number vector for Chinese words: " + op.trainOptions.unknownChineseNumberVector);
     log.Info("Using unknown year vector for Chinese words: " + op.trainOptions.unknownChineseYearVector);
     log.Info("Using unknown percent vector for Chinese words: " + op.trainOptions.unknownChinesePercentVector);
     log.Info("Initial matrices scaled by: " + op.trainOptions.scalingForInit);
     log.Info("Training will use " + op.trainOptions.trainingThreads + " thread(s)");
     log.Info("Context words are " + ((op.trainOptions.useContextWords) ? "on" : "off"));
     log.Info("Model will " + ((op.trainOptions.dvSimplifiedModel) ? string.Empty : "not ") + "be simplified");
     // Build the recursive neural model from the parser's grammars.
     this.dvModel = new DVModel(op, parser.stateIndex, parser.ug, parser.bg);
     // Sanity checks: each transform matrix must have a matching score vector.
     if (dvModel.unaryTransform.Count != dvModel.unaryScore.Count)
     {
         throw new AssertionError("Unary transform and score size not the same");
     }
     if (dvModel.binaryTransform.Size() != dvModel.binaryScore.Size())
     {
         throw new AssertionError("Binary transform and score size not the same");
     }
 }
        /// <summary>
        /// Attempts to deserialize a lexicalized parser model from
        /// <paramref name="modelFile"/>, recording success in
        /// <c>_isSDPModelLoaded</c>.
        /// </summary>
        /// <param name="modelFile">Path to a serialized model (e.g. englishPCFG.ser.gz).</param>
        /// <returns><c>true</c> if the model loaded; otherwise <c>false</c>.</returns>
        public bool LoadModel(string modelFile)
        {
            try
            {
                _sdpModel = LexicalizedParser.loadModel(modelFile);
                _isSDPModelLoaded = true;
                return true;
            }
            catch (System.Exception ex)
            {
                // Best-effort load: report the failure (fixed "Uable" typo)
                // and the underlying reason instead of swallowing it silently.
                System.Console.WriteLine("Unable to load the Model englishPCFG.ser.gz... ");
                System.Console.WriteLine(ex.Message);
                _isSDPModelLoaded = false;
                return false;
            }
        }
예제 #35
0
        /// <summary>
        /// Demonstrates the parser API: parsing a pre-tokenized word list,
        /// parsing raw text through an explicit PTB tokenizer, extracting
        /// typed dependencies, and printing a combined tree view.
        /// </summary>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Parse a sentence supplied as already-tokenized words.
            var words = new[] { "This", "is", "an", "easy", "sentence", "." };
            var labels = Sentence.toCoreLabelList(words);
            var parse = lp.apply(labels);
            parse.pennPrint();

            // Parse raw text by running it through an explicit tokenizer.
            const string Sent2 = "This is another sentence.";
            var tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
            var sentenceReader = new StringReader(Sent2);
            var tokens = tokenizerFactory.getTokenizer(sentenceReader).tokenize();
            parse = lp.apply(tokens);

            // Derive CC-processed typed dependencies from the second parse.
            var tlp = new PennTreebankLanguagePack();
            var gsf = tlp.grammaticalStructureFactory();
            var gs = gsf.newGrammaticalStructure(parse);
            var dependencies = gs.typedDependenciesCCprocessed();
            System.Console.WriteLine("\n{0}\n", dependencies);

            // Print the tree together with its collapsed dependencies.
            var treePrinter = new TreePrint("penn,typedDependenciesCollapsed");
            treePrinter.printTree(parse);
        }