/// <summary>Appends a sentence-final punctuation token to a sentence that appears to lack one.</summary>
/// <remarks>
/// Only the last three items of the sentence are inspected, so that trailing
/// close parentheses and similar tokens do not hide an existing mark.
/// For each inspected item, a non-empty tag is tested with
/// <c>IsSentenceFinalPunctuationTag</c>; if the item stores no tag, its word is
/// tested with <c>IsPunctuationWord</c> instead. When nothing matches, the first
/// entry of the language pack's sentence-final punctuation words is appended,
/// provided the pack supplies at least one.
/// </remarks>
/// <param name="sentence">The sentence to check; a token may be appended to it.</param>
/// <param name="length">The length of the sentence (passed in to avoid recomputation).</param>
/// <returns>
/// <c>false</c> if a punctuation item was found among the inspected items; <c>true</c> otherwise.
/// NOTE(review): <c>true</c> is also returned when the language pack lists no
/// sentence-final word and therefore nothing was appended - confirm callers expect this.
/// </returns>
private bool AddSentenceFinalPunctIfNeeded(IList<IHasWord> sentence, int length)
{
    ITreebankLanguagePack languagePack = op.tlpParams.TreebankLanguagePack();

    // Inspect at most the trailing three positions, clamped to the start of the sentence.
    int windowStart = (length - 3 < 0) ? 0 : length - 3;

    for (int idx = length - 1; idx >= windowStart; --idx)
    {
        IHasWord token = sentence[idx];

        // An object (e.g., CoreLabel) can implement IHasTag without actually storing
        // a tag, so the tag is trusted only when it is present and non-empty.
        // When it is present it is used exclusively, since word tokens can be ambiguous.
        IHasTag tagged = token as IHasTag;
        string tag = (tagged != null) ? tagged.Tag() : null;
        bool tagAvailable = tag != null && !tag.IsEmpty();

        bool punctuationFound = tagAvailable
            ? languagePack.IsSentenceFinalPunctuationTag(tag)
            : languagePack.IsPunctuationWord(token.Word());
        if (punctuationFound)
        {
            return false;
        }
    }

    // Nothing found in the window, so add a mark.
    if (op.testOptions.verbose)
    {
        log.Info("Adding missing final punctuation to sentence.");
    }

    string[] finalPunctWords = languagePack.SentenceFinalPunctuationWords();
    if (finalPunctWords.Length > 0)
    {
        sentence.Add(new Word(finalPunctWords[0]));
    }
    return true;
}
/// <summary>
/// Creates a file parser that uses the given options, tree printer and parser.
/// </summary>
/// <remarks>
/// The treebank language pack and the output/error writers are obtained from
/// <c>op.tlpParams</c>. The evaluation flags <c>runningAverages</c>, <c>summary</c>,
/// <c>pcfgLL</c>, <c>depLL</c> and <c>factLL</c> are read from <c>op.testOptions.evals</c>;
/// each of the three score evaluators is created only when its flag parses as true
/// and is left <c>null</c> otherwise.
/// </remarks>
/// <param name="op">Parser options supplying language-pack parameters and test options.</param>
/// <param name="treePrint">Printer stored for writing parse output.</param>
/// <param name="pqFactory">Parser stored for creating parser queries.</param>
public ParseFiles(Options op, TreePrint treePrint, LexicalizedParser pqFactory)
{
    this.op = op;
    this.pqFactory = pqFactory;
    this.treePrint = treePrint;

    var langParams = op.tlpParams;
    this.tlp = langParams.TreebankLanguagePack();
    this.pwOut = langParams.Pw();
    this.pwErr = langParams.Pw(System.Console.Error);

    if (op.testOptions.verbose)
    {
        pwErr.Println("Sentence final words are: " + Arrays.AsList(tlp.SentenceFinalPunctuationWords()));
        pwErr.Println("File encoding is: " + langParams.GetInputEncoding());
    }

    // Evaluation setup: runningAverages must be assigned first because the
    // score evaluators below take it as a constructor argument.
    var evalFlags = op.testOptions.evals;
    this.runningAverages = bool.ParseBoolean(evalFlags.GetProperty("runningAverages"));
    this.summary = bool.ParseBoolean(evalFlags.GetProperty("summary"));

    this.pcfgLL = bool.ParseBoolean(evalFlags.GetProperty("pcfgLL"))
        ? new AbstractEval.ScoreEval("pcfgLL", runningAverages)
        : null;
    this.depLL = bool.ParseBoolean(evalFlags.GetProperty("depLL"))
        ? new AbstractEval.ScoreEval("depLL", runningAverages)
        : null;
    this.factLL = bool.ParseBoolean(evalFlags.GetProperty("factLL"))
        ? new AbstractEval.ScoreEval("factLL", runningAverages)
        : null;
}
/// <summary>
/// Parses every input named in <paramref name="args"/> from <paramref name="argIndex"/>
/// onward, prints the parses, and finally reports timing and failure statistics
/// on the error writer.
/// </summary>
/// <remarks>
/// For each input a <c>DocumentPreprocessor</c> is configured with the supplied
/// delimiters, escaper and tokenizer factory. Sentences are parsed either through a
/// <c>MulticoreWrapper</c> (when <c>op.testOptions.testingThreads</c> is not 1) or with
/// a single parser query, and each result is handed to <c>ProcessResults</c>.
/// When <c>op.testOptions.writeOutputFiles</c> is set, output goes to a per-input file
/// which is closed even if parsing that input throws.
/// </remarks>
/// <typeparam name="_T0">Token type produced by <paramref name="tokenizerFactory"/>.</typeparam>
/// <param name="args">Input names; the value <c>"-"</c> selects standard input.</param>
/// <param name="argIndex">Index of the first element of <paramref name="args"/> to process.</param>
/// <param name="tokenized">
/// Consulted only when <paramref name="tokenizerFactory"/> is null: if true the preprocessor's
/// tokenizer factory is set to null, otherwise the language pack's tokenizer factory is used.
/// </param>
/// <param name="tokenizerFactory">Tokenizer factory to use, or null to fall back as described above.</param>
/// <param name="elementDelimiter">
/// Passed to the preprocessor; null selects <c>DocType.Plain</c>, any other value <c>DocType.Xml</c>.
/// </param>
/// <param name="sentenceDelimiter">Passed to the preprocessor unchanged.</param>
/// <param name="escaper">Passed to the preprocessor unchanged.</param>
/// <param name="tagDelimiter">Passed to the preprocessor unchanged.</param>
/// <exception cref="RuntimeIOException">
/// Wraps an <c>IOException</c> raised while opening standard input or the output file.
/// </exception>
public virtual void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter)
    where _T0 : IHasWord
{
    DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;

    if (op.testOptions.verbose)
    {
        if (tokenizerFactory != null)
        {
            pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }

    // The Timing constructor already starts the timer.
    Timing timer = new Timing();

    // Loop over the files
    for (int i = argIndex; i < args.Length; i++)
    {
        string filename = args[i];

        DocumentPreprocessor documentPreprocessor;
        if (filename.Equals("-"))
        {
            try
            {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
        else
        {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
        }

        // Unused values are null per the main() method invocation;
        // null is the default for these properties.
        documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
        documentPreprocessor.SetEscaper(escaper);
        documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.SetTagDelimiter(tagDelimiter);
        documentPreprocessor.SetElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
        {
            documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
        }
        else
        {
            documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
        }

        // Set up the output. By default everything goes to the shared writer,
        // which this method must never close.
        PrintWriter pwo = pwOut;
        bool closeWhenDone = false;
        if (op.testOptions.writeOutputFiles)
        {
            string normalizedName = filename;
            try
            {
                new URL(normalizedName); // this will throw if not a URL
                normalizedName = normalizedName.ReplaceAll("/", "_");
            }
            catch (MalformedURLException)
            {
                // It isn't a URL, so deliberately keep the name unchanged.
            }

            string ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            string fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
            {
                string fseparator = Runtime.GetProperty("file.separator");
                if (fseparator == null || fseparator.IsEmpty())
                {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
            }

            try
            {
                pwo = op.tlpParams.Pw(new FileOutputStream(fname));
            }
            catch (IOException ioe)
            {
                throw new RuntimeIOException(ioe);
            }
            // Only a writer opened here is owned (and later closed) by this method.
            closeWhenDone = true;
        }

        int num = 0;
        int numProcessed = 0;
        try
        {
            treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());

            pwErr.Println("Parsing file: " + filename);
            if (op.testOptions.testingThreads != 1)
            {
                MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));

                foreach (IList<IHasWord> sentence in documentPreprocessor)
                {
                    num++;
                    numSents++;
                    int len = sentence.Count;
                    numWords += len;
                    pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));

                    wrapper.Put(sentence);
                    // Drain whatever results are already available.
                    while (wrapper.Peek())
                    {
                        IParserQuery pq = wrapper.Poll();
                        ProcessResults(pq, numProcessed++, pwo);
                    }
                }

                // Wait for outstanding work, then drain the remaining results.
                wrapper.Join();
                while (wrapper.Peek())
                {
                    IParserQuery pq = wrapper.Poll();
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }
            else
            {
                IParserQuery pq = pqFactory.ParserQuery();
                foreach (IList<IHasWord> sentence in documentPreprocessor)
                {
                    num++;
                    numSents++;
                    int len = sentence.Count;
                    numWords += len;
                    pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                    pq.ParseAndReport(sentence, pwErr);
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }

            treePrint.PrintFooter(pwo);
        }
        finally
        {
            // Release the per-file writer even when parsing fails part-way through,
            // so the underlying file handle is not leaked.
            if (closeWhenDone)
            {
                pwo.Close();
            }
        }

        pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
    }

    long millis = timer.Stop();

    if (summary)
    {
        if (pcfgLL != null)
        {
            pcfgLL.Display(false, pwErr);
        }
        if (depLL != null)
        {
            depLL.Display(false, pwErr);
        }
        if (factLL != null)
        {
            factLL.Display(false, pwErr);
        }
    }

    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }

    double wordspersec = numWords / (((double)millis) / 1000);
    double sentspersec = numSents / (((double)millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00"); // easier way!
    pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
    if (numFallback > 0)
    {
        pwErr.Println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
    {
        pwErr.Println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0)
        {
            pwErr.Println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0)
        {
            pwErr.Println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0)
        {
            pwErr.Println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}