/// <summary>This example shows a few more ways of providing input to a parser.</summary>
/// <remarks>
/// This example shows a few more ways of providing input to a parser.
/// Usage: ParserDemo2 [grammar [textFile]]
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string[] options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
    LexicalizedParser lp = (LexicalizedParser)LexicalizedParser.LoadModel(grammar, options);
    ITreebankLanguagePack tlp = lp.GetOp().Langpack();
    IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
    IEnumerable<IList<IHasWord>> sentences;
    if (args.Length > 1)
    {
        DocumentPreprocessor dp = new DocumentPreprocessor(args[1]);
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        foreach (IList<IHasWord> sentence in dp)
        {
            tmp.Add(sentence);
        }
        sentences = tmp;
    }
    else
    {
        // Showing tokenization and parsing in code a couple of different ways.
        string[] sent = new string[] { "This", "is", "an", "easy", "sentence", "." };
        IList<IHasWord> sentence = new List<IHasWord>();
        foreach (string word in sent)
        {
            sentence.Add(new Word(word));
        }
        string sent2 = "This is a slightly longer and more complex " + "sentence requiring tokenization.";
        // Use the default tokenizer for this TreebankLanguagePack.
        ITokenizer<IHasWord> toke = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
        IList<IHasWord> sentence2 = toke.Tokenize();
        string[] sent3 = new string[] { "It", "can", "can", "it", "." };
        // Parser gets second "can" wrong without help.
        string[] tag3 = new string[] { "PRP", "MD", "VB", "PRP", "." };
        IList<TaggedWord> sentence3 = new List<TaggedWord>();
        for (int i = 0; i < sent3.Length; i++)
        {
            sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
        }
        Tree parse = lp.Parse(sentence3);
        parse.PennPrint();
        IList<IList<IHasWord>> tmp = new List<IList<IHasWord>>();
        tmp.Add(sentence);
        tmp.Add(sentence2);
        tmp.Add(sentence3);
        sentences = tmp;
    }
    foreach (IList<IHasWord> sentence_1 in sentences)
    {
        Tree parse = lp.Parse(sentence_1);
        parse.PennPrint();
        System.Console.Out.WriteLine();
        GrammaticalStructure gs = gsf.NewGrammaticalStructure(parse);
        IList<TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
        System.Console.Out.WriteLine(tdl);
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine("The words of the sentence:");
        foreach (ILabel lab in parse.Yield())
        {
            if (lab is CoreLabel)
            {
                System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
            }
            else
            {
                System.Console.Out.WriteLine(lab);
            }
        }
        System.Console.Out.WriteLine();
        System.Console.Out.WriteLine(parse.TaggedYield());
        System.Console.Out.WriteLine();
    }
    // Parsing a raw string turns it into a single sentence using the
    // default tokenizer for the TreebankLanguagePack.
    string sent3_1 = "This is one last test!";
    lp.Parse(sent3_1).PennPrint();
}
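// A minimal usage sketch, not part of the original demo: DocumentPreprocessor
// implements IEnumerable<IList<IHasWord>>, so a file can also be parsed
// sentence-by-sentence without first copying the sentences into a temporary
// list as Main does above. The method name and its parameters are illustrative.
private static void ParseFileLazily(LexicalizedParser lp, string path)
{
    foreach (IList<IHasWord> sentence in new DocumentPreprocessor(path))
    {
        // Each sentence is parsed and printed as soon as it is segmented.
        lp.Parse(sentence).PennPrint();
    }
}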
/// <summary>Finds the nearest delimiter starting from index start.</summary>
/// <remarks>
/// Finds the nearest delimiter starting from index start. If <tt>seekDir</tt>
/// is SeekForward, finds the nearest delimiter after start. Else, if it is
/// SeekBack, finds the nearest delimiter before start.
/// </remarks>
private int NearestDelimiter(string text, int start, int seekDir)
{
    if (seekDir != SeekBack && seekDir != SeekForward)
    {
        throw new ArgumentException("Unknown seek direction " + seekDir);
    }
    StringReader reader = new StringReader(text);
    DocumentPreprocessor processor = new DocumentPreprocessor(reader);
    ITokenizerFactory<IHasWord> tf = tlp.GetTokenizerFactory();
    processor.SetTokenizerFactory(tf);
    IList<int> boundaries = new List<int>();
    foreach (IList<IHasWord> sentence in processor)
    {
        if (sentence.Count == 0)
        {
            continue;
        }
        if (!(sentence[0] is IHasOffset))
        {
            throw new InvalidCastException("Expected HasOffsets from the DocumentPreprocessor");
        }
        if (boundaries.Count == 0)
        {
            boundaries.Add(0);
        }
        else
        {
            IHasOffset first = (IHasOffset)sentence[0];
            boundaries.Add(first.BeginPosition());
        }
    }
    boundaries.Add(text.Length);
    for (int i = 0; i < boundaries.Count - 1; ++i)
    {
        if (boundaries[i] <= start && start < boundaries[i + 1])
        {
            if (seekDir == SeekBack)
            {
                return boundaries[i] - 1;
            }
            else if (seekDir == SeekForward)
            {
                return boundaries[i + 1] - 1;
            }
        }
    }
    // The cursor position at the end is actually one past the text length.
    // We might as well highlight the last interval in that case.
    if (boundaries.Count >= 2 && start >= text.Length)
    {
        if (seekDir == SeekBack)
        {
            return boundaries[boundaries.Count - 2] - 1;
        }
        else if (seekDir == SeekForward)
        {
            return boundaries[boundaries.Count - 1] - 1;
        }
    }
    return -1;
}
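// An illustrative sketch, not part of the original source: combining the two
// seek directions of NearestDelimiter to recover the span of the sentence
// containing a caret position. SeekBack and SeekForward are the class
// constants validated at the top of the method above; the method name and
// the (start, end) tuple return are assumptions for illustration.
private Tuple<int, int> SentenceSpanAt(string text, int caret)
{
    // NearestDelimiter returns the index one before a sentence boundary,
    // so the containing sentence starts one past the backward delimiter
    // and ends at the forward delimiter.
    int start = NearestDelimiter(text, caret, SeekBack) + 1;
    int end = NearestDelimiter(text, caret, SeekForward);
    return Tuple.Create(start, end);
}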
public virtual void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter)
    where _T0 : IHasWord
{
    DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    if (op.testOptions.verbose)
    {
        if (tokenizerFactory != null)
        {
            pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }
    Timing timer = new Timing();
    // timer.start(); // constructor already starts it.
    // Loop over the files.
    for (int i = argIndex; i < args.Length; i++)
    {
        string filename = args[i];
        DocumentPreprocessor documentPreprocessor;
        if (filename.Equals("-"))
        {
            try
            {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
        else
        {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
        }
        // Unused values are null per the main() method invocation below;
        // null is the default for these properties.
        documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
        documentPreprocessor.SetEscaper(escaper);
        documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.SetTagDelimiter(tagDelimiter);
        documentPreprocessor.SetElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
        {
            documentPreprocessor.SetTokenizerFactory(tokenized ? null : tlp.GetTokenizerFactory());
        }
        else
        {
            documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
        }
        // Set up the output.
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles)
        {
            string normalizedName = filename;
            try
            {
                new URL(normalizedName); // this will throw an exception if it is not a URL
                normalizedName = normalizedName.ReplaceAll("/", "_");
            }
            catch (MalformedURLException)
            {
                // It isn't a URL, so silently ignore.
            }
            string ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            string fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
            {
                string fseparator = Runtime.GetProperty("file.separator");
                if (fseparator == null || fseparator.IsEmpty())
                {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
            }
            try
            {
                pwo = op.tlpParams.Pw(new FileOutputStream(fname));
            }
            catch (IOException ioe)
            {
                throw new RuntimeIOException(ioe);
            }
        }
        treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
        pwErr.Println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        if (op.testOptions.testingThreads != 1)
        {
            MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                wrapper.Put(sentence);
                while (wrapper.Peek())
                {
                    IParserQuery pq = wrapper.Poll();
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }
            wrapper.Join();
            while (wrapper.Peek())
            {
                IParserQuery pq = wrapper.Poll();
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        else
        {
            IParserQuery pq = pqFactory.ParserQuery();
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                pq.ParseAndReport(sentence, pwErr);
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        treePrint.PrintFooter(pwo);
        if (op.testOptions.writeOutputFiles)
        {
            pwo.Close();
        }
        pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.Stop();
    if (summary)
    {
        if (pcfgLL != null)
        {
            pcfgLL.Display(false, pwErr);
        }
        if (depLL != null)
        {
            depLL.Display(false, pwErr);
        }
        if (factLL != null)
        {
            factLL.Display(false, pwErr);
        }
    }
    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }
    double wordspersec = numWords / (((double)millis) / 1000);
    double sentspersec = numSents / (((double)millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
    if (numFallback > 0)
    {
        pwErr.Println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
    {
        pwErr.Println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0)
        {
            pwErr.Println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0)
        {
            pwErr.Println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0)
        {
            pwErr.Println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}