Пример #1
0
        /// <summary>
        /// Demonstrates turning a file into tokenized sentences and then into
        /// parse trees.
        /// </summary>
        /// <remarks>
        /// Every sentence yielded by a <c>DocumentPreprocessor</c> built from
        /// <paramref name="filename"/> is parsed, and the resulting tree is
        /// printed by calling <c>PennPrint</c> on it.  When the parser's
        /// language pack reports support for grammatical structures, the
        /// CC-processed typed dependencies of the parse are written to
        /// standard output as well.
        /// This code will work with any supported language.
        /// </remarks>
        /// <param name="lp">Parser applied to each sentence of the file.</param>
        /// <param name="filename">Value handed to the <c>DocumentPreprocessor</c> constructor.</param>
        public static void DemoDP(LexicalizedParser lp, string filename)
        {
            // This option shows loading, sentence-segmenting and tokenizing
            // a file using DocumentPreprocessor.
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // a PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = null;

            // Only ask for a factory when the language pack supports one;
            // gsf stays null otherwise and dependency output is skipped below.
            if (tlp.SupportsGrammaticalStructures())
            {
                gsf = tlp.GrammaticalStructureFactory();
            }
            // You could also create a tokenizer here and pass it
            // to DocumentPreprocessor
            foreach (IList <IHasWord> sentence in new DocumentPreprocessor(filename))
            {
                Tree parse = lp.Apply(sentence);
                parse.PennPrint();
                System.Console.Out.WriteLine();
                if (gsf != null)
                {
                    GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
                    // Typed as the generic list: IList<T> has no implicit
                    // conversion to the non-generic System.Collections.ICollection.
                    IList <TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
                    System.Console.Out.WriteLine(tdl);
                    System.Console.Out.WriteLine();
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Shows further ways of calling the parser: with text that is already
        /// tokenized, and with raw text that is tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// The first parse is printed with <c>PennPrint</c>; for the second one
        /// the CC-processed typed dependencies are written to standard output
        /// and the tree is then printed through a <c>TreePrint</c> object.  The
        /// format string given to <c>TreePrint</c> selects what gets printed.
        /// This code is for English.
        /// </remarks>
        /// <param name="lp">Parser used for both example sentences.</param>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // Variant 1: parse a list of correctly tokenized words.
            string[]          tokens     = { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> labelList  = SentenceUtils.ToCoreLabelList(tokens);
            Tree              parseTree  = lp.Apply(labelList);

            parseTree.PennPrint();
            System.Console.Out.WriteLine();

            // Variant 2: load an explicit tokenizer and run it over raw text.
            string rawText = "This is another sentence.";
            ITokenizerFactory <CoreLabel> factory   = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tokenizer = factory.GetTokenizer(new StringReader(rawText));
            IList <CoreLabel>             tokenized = tokenizer.Tokenize();

            parseTree = lp.Apply(tokenized);

            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory structureFactory = lp.TreebankLanguagePack().GrammaticalStructureFactory();
            IList <TypedDependency>      dependencies     = structureFactory.NewGrammaticalStructure(parseTree).TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(dependencies);
            System.Console.Out.WriteLine();

            // A TreePrint object can print trees and dependencies as well.
            TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.PrintTree(parseTree);
        }
Пример #3
0
        /// <summary>
        /// Configures a parser annotator from the properties whose keys are
        /// prefixed with <paramref name="annotatorName"/>.
        /// </summary>
        /// <remarks>
        /// Keys read, each appended to the annotator name: <c>.model</c>,
        /// <c>.debug</c>, <c>.flags</c>, <c>.maxlen</c>, <c>.treemap</c>,
        /// <c>.maxtime</c>, <c>.kbest</c>, <c>.keepPunct</c>,
        /// <c>.buildgraphs</c>, <c>.originalDependencies</c>,
        /// <c>.nthreads</c> (defaulting to the unprefixed <c>nthreads</c>
        /// key, then to 1), <c>.binaryTrees</c>, <c>.nosquash</c> and
        /// <c>.extradependencies</c>.
        /// </remarks>
        /// <param name="annotatorName">Prefix used to build every property key.</param>
        /// <param name="props">Properties the settings are read from.</param>
        /// <exception cref="ArgumentException">The resolved model location is null.</exception>
        public ParserAnnotator(string annotatorName, Properties props)
        {
            string modelLocation = props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc);

            if (model_IsMissing(modelLocation))
            {
                throw new ArgumentException("No model specified for Parser annotator " + annotatorName);
            }

            this.Verbose = PropertiesUtils.GetBool(props, annotatorName + ".debug", false);
            string[] parserFlags = ConvertFlagsToArray(props.GetProperty(annotatorName + ".flags"));
            this.parser            = LoadModel(modelLocation, Verbose, parserFlags);
            this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", -1);

            // Optional tree transformer, instantiated by class name.
            string treeMapClassName = props.GetProperty(annotatorName + ".treemap");
            if (treeMapClassName != null)
            {
                this.treeMap = ReflectionLoading.LoadByReflection(treeMapClassName, props);
            }
            else
            {
                this.treeMap = null;
            }

            this.maxParseTime = PropertiesUtils.GetLong(props, annotatorName + ".maxtime", -1);
            this.kBest        = PropertiesUtils.GetInt(props, annotatorName + ".kbest", 1);
            this.keepPunct    = PropertiesUtils.GetBool(props, annotatorName + ".keepPunct", true);

            // Dependency graphs are built only when the language parameters
            // support basic dependencies; a request that cannot be honoured
            // is logged and then ignored.
            string buildGraphsKey = annotatorName + ".buildgraphs";
            if (this.parser.GetTLPParams().SupportsBasicDependencies())
            {
                this.BuildGraphs = PropertiesUtils.GetBool(props, buildGraphsKey, true);
            }
            else
            {
                if (PropertiesUtils.GetBool(props, buildGraphsKey))
                {
                    log.Info("WARNING: " + buildGraphsKey + " set to true, but " + this.parser.GetTLPParams().GetType() + " does not support dependencies");
                }
                this.BuildGraphs = false;
            }

            if (!this.BuildGraphs)
            {
                this.gsf = null;
            }
            else
            {
                bool useOriginalDependencies = PropertiesUtils.GetBool(props, annotatorName + ".originalDependencies", false);
                parser.GetTLPParams().SetGenerateOriginalDependencies(useOriginalDependencies);
                ITreebankLanguagePack languagePack      = parser.GetTLPParams().TreebankLanguagePack();
                IPredicate <string>   punctuationFilter = this.keepPunct ? Filters.AcceptFilter() : languagePack.PunctuationWordRejectFilter();
                this.gsf = languagePack.GrammaticalStructureFactory(punctuationFilter, parser.GetTLPParams().TypedDependencyHeadFinder());
            }

            // The annotator-specific thread count falls back to the global "nthreads" value.
            int globalThreads = PropertiesUtils.GetInt(props, "nthreads", 1);
            this.nThreads = PropertiesUtils.GetInt(props, annotatorName + ".nthreads", globalThreads);

            bool binaryTreesDefault = StanfordCoreNLP.UsesBinaryTrees(props);
            this.saveBinaryTrees   = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", binaryTreesDefault);
            this.noSquash          = PropertiesUtils.GetBool(props, annotatorName + ".nosquash", false);
            this.extraDependencies = MetaClass.Cast(props.GetProperty(annotatorName + ".extradependencies", "NONE"), typeof(GrammaticalStructure.Extras));
        }

        /// <summary>Reports whether no model location was resolved.</summary>
        private static bool model_IsMissing(string modelLocation)
        {
            return modelLocation == null;
        }
Пример #4
0
 /// <summary>
 /// Creates an annotator around an already loaded parser, using fixed
 /// values for every setting that is not passed in.
 /// </summary>
 /// <param name="parser">Parser to wrap; its language parameters decide whether dependency graphs are built.</param>
 /// <param name="verbose">Value stored in <c>Verbose</c>.</param>
 /// <param name="maxSent">Value stored as the maximum sentence length.</param>
 /// <param name="treeMap">Tree transformation stored on the annotator; may be null.</param>
 public ParserAnnotator(ParserGrammar parser, bool verbose, int maxSent, Func <Tree, Tree> treeMap)
 {
     // Caller-supplied settings.
     this.parser            = parser;
     this.Verbose           = verbose;
     this.maxSentenceLength = maxSent;
     this.treeMap           = treeMap;

     // Fixed settings for this constructor.
     this.maxParseTime      = 0;
     this.kBest             = 1;
     this.keepPunct         = true;
     this.nThreads          = 1;
     this.saveBinaryTrees   = false;
     this.noSquash          = false;
     this.extraDependencies = GrammaticalStructure.Extras.None;

     // Graphs are built exactly when the language parameters support basic dependencies.
     this.BuildGraphs = parser.GetTLPParams().SupportsBasicDependencies();
     if (!this.BuildGraphs)
     {
         this.gsf = null;
     }
     else
     {
         // NOTE(review): keepPunct is true above, yet the punctuation-reject
         // filter is used here - confirm this combination is intended.
         ITreebankLanguagePack languagePack = parser.GetTLPParams().TreebankLanguagePack();
         this.gsf = languagePack.GrammaticalStructureFactory(languagePack.PunctuationWordRejectFilter(), parser.GetTLPParams().TypedDependencyHeadFinder());
     }
 }
Пример #5
0
        /// <summary>This example shows a few more ways of providing input to a parser.</summary>
        /// <remarks>
        /// Usage: ParserDemo2 [grammar [textFile]]
        /// With a text file argument, its sentences are read through a
        /// <c>DocumentPreprocessor</c>; otherwise three built-in sentences are
        /// used (pre-tokenized, tokenized from a string, and pre-tagged).
        /// For each sentence the parse tree, its CC-processed typed
        /// dependencies, the words of its yield and its tagged yield are
        /// printed.  Finally one sentence is parsed directly from a string.
        /// </remarks>
        /// <param name="args">Optional grammar path, optionally followed by a text file path.</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

            string[]                        options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
            LexicalizedParser               lp      = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
            ITreebankLanguagePack           tlp     = lp.GetOp().Langpack();
            IGrammaticalStructureFactory    gsf     = tlp.GrammaticalStructureFactory();
            IEnumerable <IList <IHasWord> > sentences;

            if (args.Length > 1)
            {
                // Materialize the sentences of the given file into a list.
                DocumentPreprocessor      dp  = new DocumentPreprocessor(args[1]);
                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                foreach (IList <IHasWord> sentence in dp)
                {
                    tmp.Add(sentence);
                }
                sentences = tmp;
            }
            else
            {
                // Showing tokenization and parsing in code a couple of different ways.
                string[]         sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
                IList <IHasWord> sentence = new List <IHasWord>();
                foreach (string word in sent)
                {
                    sentence.Add(new Word(word));
                }
                string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
                // Use the default tokenizer for this TreebankLanguagePack
                ITokenizer <IHasWord> toke      = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
                IList <IHasWord>      sentence2 = toke.Tokenize();
                string[] sent3 = new string[] { "It", "can", "can", "it", "." };
                string[] tag3  = new string[] { "PRP", "MD", "VB", "PRP", "." };
                // Parser gets second "can" wrong without help
                // Declared as IList<IHasWord> rather than IList<TaggedWord>:
                // IList<T> is invariant, so a list of TaggedWord could not be
                // added to the IList<IList<IHasWord>> collection below.
                IList <IHasWord> sentence3 = new List <IHasWord>();
                for (int i = 0; i < sent3.Length; i++)
                {
                    sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
                }
                Tree parse = lp.Parse(sentence3);
                parse.PennPrint();
                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                tmp.Add(sentence);
                tmp.Add(sentence2);
                tmp.Add(sentence3);
                sentences = tmp;
            }
            foreach (IList <IHasWord> sentence_1 in sentences)
            {
                Tree parse = lp.Parse(sentence_1);
                parse.PennPrint();
                System.Console.Out.WriteLine();
                GrammaticalStructure    gs  = gsf.NewGrammaticalStructure(parse);
                IList <TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
                System.Console.Out.WriteLine(tdl);
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine("The words of the sentence:");
                foreach (ILabel lab in parse.Yield())
                {
                    // CoreLabel instances are printed in the value-map format;
                    // any other label is printed as is.
                    if (lab is CoreLabel)
                    {
                        System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
                    }
                    else
                    {
                        System.Console.Out.WriteLine(lab);
                    }
                }
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine(parse.TaggedYield());
                System.Console.Out.WriteLine();
            }
            // This method turns the String into a single sentence using the
            // default tokenizer for the TreebankLanguagePack.
            string sent3_1 = "This is one last test!";

            lp.Parse(sent3_1).PennPrint();
        }