/// <summary>
/// Demonstrates turning a file into tokens and then into parse trees.
/// </summary>
/// <remarks>
/// The file is loaded, sentence-segmented and tokenized by a
/// <c>DocumentPreprocessor</c>. Every sentence it yields is parsed and the
/// resulting tree is printed by calling <c>PennPrint</c> on the tree,
/// followed by an empty line. When the parser's language pack reports that
/// it supports grammatical structures, the CC-processed typed dependencies
/// of the tree are printed as well.
/// This code will work with any supported language.
/// </remarks>
/// <param name="lp">Parser applied to every sentence of the file.</param>
/// <param name="filename">Name handed to <c>DocumentPreprocessor</c>.</param>
public static void DemoDP(LexicalizedParser lp, string filename)
{
    // a PennTreebankLanguagePack for English
    ITreebankLanguagePack tlp = lp.TreebankLanguagePack();

    // Dependencies are only produced for language packs that support them;
    // a null factory means "print trees only".
    IGrammaticalStructureFactory gsf = null;
    if (tlp.SupportsGrammaticalStructures())
    {
        gsf = tlp.GrammaticalStructureFactory();
    }

    // You could also create a tokenizer here and pass it to
    // DocumentPreprocessor.
    foreach (IList<IHasWord> sentence in new DocumentPreprocessor(filename))
    {
        Tree parse = lp.Apply(sentence);
        parse.PennPrint();
        System.Console.Out.WriteLine();

        if (gsf == null)
        {
            continue;
        }

        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        // The result used to be stored in a non-generic ICollection local,
        // while the other demos in this file treat the same call as a
        // generic list of typed dependencies. Passing it straight to
        // WriteLine avoids that mismatched declaration altogether.
        System.Console.Out.WriteLine(gs.TypedDependenciesCCprocessed());
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Demonstrates other ways of calling the parser: with already tokenized
/// text, and with raw text that is tokenized as a single sentence.
/// </summary>
/// <remarks>
/// Three things are shown in order: parsing a hand-tokenized word array,
/// parsing the output of an explicitly created tokenizer and printing its
/// CC-processed typed dependencies, and finally printing the second tree
/// through a <c>TreePrint</c> object. The options string given to
/// <c>TreePrint</c> selects what is printed. This code is for English.
/// </remarks>
/// <param name="lp">Parser used for both example sentences.</param>
public static void DemoAPI(LexicalizedParser lp)
{
    // 1) A list of correctly tokenized words.
    string[] tokens = { "This", "is", "an", "easy", "sentence", "." };
    IList<CoreLabel> tokenLabels = SentenceUtils.ToCoreLabelList(tokens);
    Tree firstParse = lp.Apply(tokenLabels);
    firstParse.PennPrint();
    System.Console.Out.WriteLine();

    // 2) Raw text run through an explicitly constructed tokenizer.
    string rawText = "This is another sentence.";
    ITokenizerFactory<CoreLabel> factory =
        PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
    ITokenizer<CoreLabel> tokenizer = factory.GetTokenizer(new StringReader(rawText));
    IList<CoreLabel> tokenizedLabels = tokenizer.Tokenize();
    Tree secondParse = lp.Apply(tokenizedLabels);

    // PennTreebankLanguagePack for English
    ITreebankLanguagePack languagePack = lp.TreebankLanguagePack();
    IGrammaticalStructureFactory structureFactory = languagePack.GrammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.NewGrammaticalStructure(secondParse);
    IList<TypedDependency> dependencies = structure.TypedDependenciesCCprocessed();
    System.Console.Out.WriteLine(dependencies);
    System.Console.Out.WriteLine();

    // 3) A TreePrint object can print trees and dependencies as well.
    TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");
    printer.PrintTree(secondParse);
}
/// <summary>
/// Builds a parser annotator whose settings are read from
/// <paramref name="props"/>, using <paramref name="annotatorName"/> as the
/// prefix of every property key.
/// </summary>
/// <remarks>
/// Keys read (each prefixed with the annotator name): <c>.model</c>,
/// <c>.debug</c>, <c>.flags</c>, <c>.maxlen</c>, <c>.treemap</c>,
/// <c>.maxtime</c>, <c>.kbest</c>, <c>.keepPunct</c>, <c>.buildgraphs</c>,
/// <c>.originalDependencies</c>, <c>.nthreads</c>, <c>.binaryTrees</c>,
/// <c>.nosquash</c> and <c>.extradependencies</c>. The unprefixed
/// <c>nthreads</c> key supplies the fallback thread count.
/// </remarks>
/// <param name="annotatorName">Prefix for the property keys.</param>
/// <param name="props">Properties holding the configuration.</param>
/// <exception cref="System.ArgumentException">
/// The model property resolves to null.
/// </exception>
public ParserAnnotator(string annotatorName, Properties props)
{
    string modelPath = props.GetProperty(annotatorName + ".model", LexicalizedParser.DefaultParserLoc);
    if (model_IsMissing(modelPath))
    {
        throw new ArgumentException("No model specified for Parser annotator " + annotatorName);
    }

    this.Verbose = PropertiesUtils.GetBool(props, annotatorName + ".debug", false);

    string[] parserFlags = ConvertFlagsToArray(props.GetProperty(annotatorName + ".flags"));
    this.parser = LoadModel(modelPath, Verbose, parserFlags);
    this.maxSentenceLength = PropertiesUtils.GetInt(props, annotatorName + ".maxlen", -1);

    // Optional tree transformation, loaded reflectively by class name.
    string treeMapClassName = props.GetProperty(annotatorName + ".treemap");
    if (treeMapClassName != null)
    {
        this.treeMap = ReflectionLoading.LoadByReflection(treeMapClassName, props);
    }
    else
    {
        this.treeMap = null;
    }

    this.maxParseTime = PropertiesUtils.GetLong(props, annotatorName + ".maxtime", -1);
    this.kBest = PropertiesUtils.GetInt(props, annotatorName + ".kbest", 1);
    this.keepPunct = PropertiesUtils.GetBool(props, annotatorName + ".keepPunct", true);

    // Dependency graphs are built only when the language parameters support
    // basic dependencies; an explicit request that cannot be honoured is
    // logged and then ignored.
    string graphsKey = annotatorName + ".buildgraphs";
    if (this.parser.GetTLPParams().SupportsBasicDependencies())
    {
        this.BuildGraphs = PropertiesUtils.GetBool(props, graphsKey, true);
    }
    else
    {
        if (PropertiesUtils.GetBool(props, graphsKey))
        {
            log.Info("WARNING: " + graphsKey + " set to true, but " + this.parser.GetTLPParams().GetType() + " does not support dependencies");
        }
        this.BuildGraphs = false;
    }

    if (!this.BuildGraphs)
    {
        this.gsf = null;
    }
    else
    {
        bool useOriginalDependencies = PropertiesUtils.GetBool(props, annotatorName + ".originalDependencies", false);
        parser.GetTLPParams().SetGenerateOriginalDependencies(useOriginalDependencies);
        ITreebankLanguagePack languagePack = parser.GetTLPParams().TreebankLanguagePack();

        // Keeping punctuation means accepting every word; otherwise the
        // language pack's punctuation reject filter is used.
        IPredicate<string> wordFilter;
        if (this.keepPunct)
        {
            wordFilter = Filters.AcceptFilter();
        }
        else
        {
            wordFilter = languagePack.PunctuationWordRejectFilter();
        }
        this.gsf = languagePack.GrammaticalStructureFactory(wordFilter, parser.GetTLPParams().TypedDependencyHeadFinder());
    }

    // The annotator-specific thread count falls back to the global one.
    string threadsKey = annotatorName + ".nthreads";
    int globalThreads = PropertiesUtils.GetInt(props, "nthreads", 1);
    this.nThreads = PropertiesUtils.GetInt(props, threadsKey, globalThreads);

    bool binaryTreesByDefault = StanfordCoreNLP.UsesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.GetBool(props, annotatorName + ".binaryTrees", binaryTreesByDefault);
    this.noSquash = PropertiesUtils.GetBool(props, annotatorName + ".nosquash", false);

    string extrasName = props.GetProperty(annotatorName + ".extradependencies", "NONE");
    this.extraDependencies = MetaClass.Cast(extrasName, typeof(GrammaticalStructure.Extras));
}

/// <summary>Tells whether the resolved model location is absent.</summary>
private static bool model_IsMissing(string modelPath)
{
    return modelPath == null;
}
/// <summary>
/// Builds a parser annotator around an already loaded parser, using fixed
/// defaults for every setting that is not passed in.
/// </summary>
/// <remarks>
/// Defaults applied here: parse time limit 0, k-best 1, punctuation kept,
/// one thread, binary trees not saved, squashing allowed and no extra
/// dependencies. Dependency graphs are built exactly when the parser's
/// language parameters support basic dependencies.
/// </remarks>
/// <param name="parser">Parser to wrap.</param>
/// <param name="verbose">Value stored in the verbose flag.</param>
/// <param name="maxSent">Value stored as the maximum sentence length.</param>
/// <param name="treeMap">Tree transformation to store; may be null.</param>
public ParserAnnotator(ParserGrammar parser, bool verbose, int maxSent, Func<Tree, Tree> treeMap)
{
    // Caller-supplied settings.
    this.Verbose = verbose;
    this.BuildGraphs = parser.GetTLPParams().SupportsBasicDependencies();
    this.parser = parser;
    this.maxSentenceLength = maxSent;
    this.treeMap = treeMap;

    // Fixed defaults.
    this.maxParseTime = 0;
    this.kBest = 1;
    this.keepPunct = true;

    if (!this.BuildGraphs)
    {
        this.gsf = null;
    }
    else
    {
        ITreebankLanguagePack languagePack = parser.GetTLPParams().TreebankLanguagePack();
        // NOTE(review): keepPunct is true above, yet the punctuation reject
        // filter is used here, whereas the properties-based constructor
        // picks the accept filter when keepPunct is true. Confirm whether
        // this difference is intended.
        this.gsf = languagePack.GrammaticalStructureFactory(
            languagePack.PunctuationWordRejectFilter(),
            parser.GetTLPParams().TypedDependencyHeadFinder());
    }

    this.nThreads = 1;
    this.saveBinaryTrees = false;
    this.noSquash = false;
    this.extraDependencies = GrammaticalStructure.Extras.None;
}
/// <summary>This example shows a few more ways of providing input to a parser.</summary>
/// <remarks>
/// Usage: ParserDemo2 [grammar [textFile]]
/// <para>
/// With a text file argument, the sentences come from a
/// <c>DocumentPreprocessor</c> over that file. Without one, three sentences
/// are built in code: a list of plain words, the output of the language
/// pack's default tokenizer, and a list of pre-tagged words. Every sentence
/// is parsed and its tree, CC-processed typed dependencies, words and
/// tagged yield are printed. Finally a raw string is parsed directly.
/// </para>
/// </remarks>
/// <param name="args">
/// Optional grammar location (index 0) and optional text file (index 1).
/// </param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string[] options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
    ITreebankLanguagePack tlp = lp.GetOp().Langpack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();

    IEnumerable<IList<IHasWord>> sentences;
    if (args.Length > 1)
    {
        // Collect every sentence of the given file up front.
        DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        foreach (IList<IHasWord> sentence in dp)
        {
            tmp.Add(sentence);
        }
        sentences = tmp;
    }
    else
    {
        // Showing tokenization and parsing in code a couple of different ways.
        string[] sent = new string[] { "This", "is", "an", "easy", "sentence", "." };
        IList<IHasWord> sentence = new List<IHasWord>();
        foreach (string word in sent)
        {
            sentence.Add(new Word(word));
        }

        string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
        // Use the default tokenizer for this TreebankLanguagePack
        ITokenizer<IHasWord> toke = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
        IList<IHasWord> sentence2 = toke.Tokenize();

        string[] sent3 = new string[] { "It", "can", "can", "it", "." };
        string[] tag3 = new string[] { "PRP", "MD", "VB", "PRP", "." };
        // Parser gets second "can" wrong without help
        //
        // The list is typed by the IHasWord interface rather than by
        // TaggedWord: IList<T> is invariant, so a list of TaggedWord could
        // not be added to the list of IHasWord sentences below.
        IList<IHasWord> sentence3 = new List<IHasWord>();
        for (int i = 0; i < sent3.Length; i++)
        {
            sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
        }
        Tree taggedParse = lp.Parse(sentence3);
        taggedParse.PennPrint();

        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        tmp.Add(sentence);
        tmp.Add(sentence2);
        tmp.Add(sentence3);
        sentences = tmp;
    }

    foreach (IList<IHasWord> sentence_1 in sentences)
    {
        Tree parse = lp.Parse(sentence_1);
        parse.PennPrint();
        System.Console.Out.WriteLine();

        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();

        System.Console.Out.WriteLine("The words of the sentence:");
        foreach (ILabel lab in parse.Yield())
        {
            // Core labels are printed in their value-map format; any other
            // label is printed as is.
            CoreLabel coreLabel = lab as CoreLabel;
            if (coreLabel != null)
            {
                System.Console.Out.WriteLine(coreLabel.ToString(CoreLabel.OutputFormat.ValueMap));
            }
            else
            {
                System.Console.Out.WriteLine(lab);
            }
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine(parse.TaggedYield());
        System.Console.Out.WriteLine();
    }

    // This method turns the String into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    string sent3_1 = "This is one last test!";
    lp.Parse(sent3_1).PennPrint();
}