/// <summary>
/// Constructs an analyzer over the document at <paramref name="path"/> and opens it immediately.
/// </summary>
/// <param name="path">Location of the document to analyze.</param>
/// <param name="docType">Document type forwarded to the preprocessor; may be null.</param>
/// <param name="ignore">Text/pattern to ignore during analysis (empty by default).</param>
/// <param name="punctuation">Punctuation pattern; when null, the class default <c>PunctuationPatterns</c> is used.</param>
/// <param name="options">Behavior flags for the analyzer.</param>
public Analyzer(string path, DocumentPreprocessor.DocType docType = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None) : this()
{
    _path = path;
    _docType = docType;
    _ignore = ignore;
    _options = options;
    // Caller-supplied punctuation wins; otherwise fall back to the class-level default patterns.
    _punctuation = punctuation ?? PunctuationPatterns;
    // NOTE(review): the constructor performs work via Open() — presumably it opens the document
    // at _path; confirm against the Open() implementation (not visible in this chunk).
    Open();
}
/// <summary>
/// Constructs a preprocessor from a file at a path, which can be either
/// a filesystem location, a classpath entry, or a URL.
/// </summary>
/// <param name="docPath">The path</param>
/// <param name="t">The document type (plain text or XML)</param>
/// <param name="encoding">The character encoding used by Readers</param>
/// <exception cref="ArgumentException">If <paramref name="docPath"/> is null.</exception>
/// <exception cref="RuntimeIOException">If the path cannot be opened for reading.</exception>
public DocumentPreprocessor(string docPath, DocumentPreprocessor.DocType t, string encoding)
{
    if (docPath == null)
    {
        throw new ArgumentException("Cannot open null document path!");
    }
    docType = t;
    try
    {
        inputReader = IOUtils.ReaderFromString(docPath, encoding);
    }
    catch (IOException ioe)
    {
        // BUG FIX: the original message used Java-style "%s" placeholders, which .NET
        // string.Format does not substitute — the exception text was emitted verbatim
        // ("%s: Could not open path %s") with the type name and path silently dropped.
        // Use .NET composite-format placeholders so the message carries its arguments.
        throw new RuntimeIOException(string.Format("{0}: Could not open path {1}", this.GetType().FullName, docPath), ioe);
    }
}
/// <summary>
/// Constructs a preprocessor that reads from an already-open <c>Reader</c>.
/// </summary>
/// <param name="input">The reader supplying the document text; must not be null.</param>
/// <param name="t">The document type (plain text or XML).</param>
/// <exception cref="ArgumentException">If <paramref name="input"/> is null.</exception>
public DocumentPreprocessor(Reader input, DocumentPreprocessor.DocType t)
{
    // todo [cdm 2017]: This class is used in all our parsers, but we should probably work to
    // move over to WordToSetenceProcessor, which has been used in CoreNLP and has been developed more.
    // NOTE: inputReader is shared in a fairly yucky way between the XMLIterator and the
    // PlainTextIterator; consider making the inner classes static and passing state explicitly.
    // Fail fast on a missing source rather than deferring to a NullReferenceException later.
    if (input == null)
    {
        throw new ArgumentException("Cannot read from null object!");
    }
    inputReader = input;
    docType = t;
}
/// <summary>A simple, deterministic sentence-splitter.</summary>
/// <remarks>
/// A simple, deterministic sentence-splitter. This method only supports the English
/// tokenizer, so for other languages you should run the tokenizer first and then
/// run this sentence splitter with the "-whitespaceTokenization" option.
/// </remarks>
/// <param name="args">Command-line arguments</param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());
    if (options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    // Command-line flags
    string encoding = options.GetProperty("encoding", "utf-8");
    bool printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
    // -xml <element> switches the reader into XML mode, pulling text from that element.
    string xmlElementDelimiter = options.GetProperty("xml", null);
    DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    // -noTokenization: treat each input line as one already-tokenized sentence.
    string sentenceDelimiter = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
    string sDelim = options.GetProperty("sentenceDelimiter");
    if (sDelim != null)
    {
        // "newline" is accepted (case-insensitively) as a symbolic name for "\n".
        if (Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline"))
        {
            sentenceDelimiter = "\n";
        }
        else
        {
            sentenceDelimiter = sDelim;
        }
    }
    string tagDelimiter = options.GetProperty("tag", null);
    string[] sentenceDelims = null;
    // Setup the TokenizerFactory. The four tokenizer flags are mutually exclusive;
    // count how many were given so we can reject conflicting combinations below.
    int numFactoryFlags = 0;
    bool suppressEscaping = options.Contains("suppressEscaping");
    if (suppressEscaping)
    {
        numFactoryFlags += 1;
    }
    bool customTokenizer = options.Contains("tokenizerOptions");
    if (customTokenizer)
    {
        numFactoryFlags += 1;
    }
    bool printOriginalText = options.Contains("printOriginalText");
    if (printOriginalText)
    {
        numFactoryFlags += 1;
    }
    bool whitespaceTokenization = options.Contains("whitespaceTokenization");
    if (whitespaceTokenization)
    {
        numFactoryFlags += 1;
    }
    if (numFactoryFlags > 1)
    {
        log.Info("Only one tokenizer flag allowed at a time: ");
        log.Info(" -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
        return;
    }
    ITokenizerFactory<IHasWord> tf = null;
    if (suppressEscaping)
    {
        tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
    }
    else
    {
        if (customTokenizer)
        {
            tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
        }
        else
        {
            if (printOriginalText)
            {
                // invertible=true keeps the Before/OriginalText/After annotations needed
                // to reproduce the original surface text in the output loop below.
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
            }
            else
            {
                if (whitespaceTokenization)
                {
                    // Whitespace mode: no tokenizer factory; instead extend the default
                    // sentence-final delimiters with the whitespace lexer's newline token.
                    IList<string> whitespaceDelims = new List<string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
                    whitespaceDelims.Add(WhitespaceLexer.Newline);
                    sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
                }
                else
                {
                    tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
                }
            }
        }
    }
    string fileList = options.GetProperty(string.Empty, null);
    // No file arguments -> a single null entry, which selects stdin in the loop below.
    string[] files = fileList == null ? new string[1] : fileList.Split("\\s+");
    int numSents = 0;
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);
    foreach (string file in files)
    {
        DocumentPreprocessor docPreprocessor;
        if (file == null || file.IsEmpty())
        {
            docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
        }
        else
        {
            docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
        }
        if (docType == DocumentPreprocessor.DocType.Xml)
        {
            docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
        }
        docPreprocessor.SetTokenizerFactory(tf);
        if (sentenceDelimiter != null)
        {
            docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        }
        if (tagDelimiter != null)
        {
            docPreprocessor.SetTagDelimiter(tagDelimiter);
        }
        if (sentenceDelims != null)
        {
            docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
        }
        // Iterating the preprocessor yields one tokenized sentence at a time.
        foreach (IList<IHasWord> sentence in docPreprocessor)
        {
            numSents++;
            if (printSentenceLengths)
            {
                System.Console.Error.Printf("Length: %d%n", sentence.Count);
            }
            bool printSpace = false;
            foreach (IHasWord word in sentence)
            {
                if (printOriginalText)
                {
                    // Reconstruct the original surface text from the invertible
                    // tokenizer's Before/OriginalText/After annotations.
                    CoreLabel cl = (CoreLabel)word;
                    if (!printSpace)
                    {
                        pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                        printSpace = true;
                    }
                    pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                    pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                }
                else
                {
                    // Plain mode: single space between tokens, none before the first.
                    if (printSpace)
                    {
                        pw.Print(" ");
                    }
                    printSpace = true;
                    pw.Print(word.Word());
                }
            }
            pw.Println();
        }
    }
    pw.Close();
    System.Console.Error.Printf("Read in %d sentences.%n", numSents);
}
/// <summary>
/// Constructs a preprocessor from a file at a path, using UTF-8 as the character encoding.
/// </summary>
/// <param name="docPath">The path to the document.</param>
/// <param name="t">The document type (plain text or XML).</param>
public DocumentPreprocessor(string docPath, DocumentPreprocessor.DocType t) : this(docPath, t, "UTF-8")
{
}
/// <summary>
/// Parses each file named in <paramref name="args"/> from <paramref name="argIndex"/> onward
/// ("-" means stdin), printing parses via the configured tree printer and reporting
/// per-sentence progress plus final timing/failure statistics to <c>pwErr</c>.
/// </summary>
/// <param name="args">Argument array; positions argIndex onward are file names (or "-").</param>
/// <param name="argIndex">Index of the first file name in args.</param>
/// <param name="tokenized">True if the input is pre-tokenized (used only when tokenizerFactory is null).</param>
/// <param name="tokenizerFactory">Tokenizer factory to use; null selects the language-pack default.</param>
/// <param name="elementDelimiter">XML element to extract text from; null means plain text.</param>
/// <param name="sentenceDelimiter">Sentence-delimiting token; may be null.</param>
/// <param name="escaper">Post-tokenization escaper applied per sentence; may be null.</param>
/// <param name="tagDelimiter">Delimiter between word and POS tag in the input; may be null.</param>
public virtual void ParseFiles<_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory<_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction<IList<IHasWord>, IList<IHasWord>> escaper, string tagDelimiter)
    where _T0 : IHasWord
{
    DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
    if (op.testOptions.verbose)
    {
        if (tokenizerFactory != null)
        {
            pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
        }
    }
    Timing timer = new Timing();
    // timer.start(); // constructor already starts it.
    //Loop over the files
    for (int i = argIndex; i < args.Length; i++)
    {
        string filename = args[i];
        DocumentPreprocessor documentPreprocessor;
        if (filename.Equals("-"))
        {
            // "-" means read the document from standard input.
            try
            {
                documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
            }
            catch (IOException e)
            {
                throw new RuntimeIOException(e);
            }
        }
        else
        {
            documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
        }
        //Unused values are null per the main() method invocation below
        //null is the default for these properties
        documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
        documentPreprocessor.SetEscaper(escaper);
        documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
        documentPreprocessor.SetTagDelimiter(tagDelimiter);
        documentPreprocessor.SetElementDelimiter(elementDelimiter);
        if (tokenizerFactory == null)
        {
            // A null factory with tokenized==true leaves the input untokenized.
            documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
        }
        else
        {
            documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
        }
        //Setup the output
        PrintWriter pwo = pwOut;
        if (op.testOptions.writeOutputFiles)
        {
            string normalizedName = filename;
            try
            {
                new URL(normalizedName);
                // this will exception if not a URL
                // URLs become filenames by flattening path separators to underscores.
                normalizedName = normalizedName.ReplaceAll("/", "_");
            }
            catch (MalformedURLException)
            {
            }
            //It isn't a URL, so silently ignore
            string ext = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
            string fname = normalizedName + '.' + ext;
            if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
            {
                string fseparator = Runtime.GetProperty("file.separator");
                if (fseparator == null || fseparator.IsEmpty())
                {
                    fseparator = "/";
                }
                File fnameFile = new File(fname);
                fname = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
            }
            try
            {
                pwo = op.tlpParams.Pw(new FileOutputStream(fname));
            }
            catch (IOException ioe)
            {
                throw new RuntimeIOException(ioe);
            }
        }
        treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
        pwErr.Println("Parsing file: " + filename);
        int num = 0;
        int numProcessed = 0;
        if (op.testOptions.testingThreads != 1)
        {
            // Multi-threaded path: feed sentences to a MulticoreWrapper and drain
            // completed parses as they become available, preserving input order.
            MulticoreWrapper<IList<IHasWord>, IParserQuery> wrapper = new MulticoreWrapper<IList<IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                wrapper.Put(sentence);
                while (wrapper.Peek())
                {
                    IParserQuery pq = wrapper.Poll();
                    ProcessResults(pq, numProcessed++, pwo);
                }
            }
            // Wait for in-flight work, then drain any remaining results.
            wrapper.Join();
            while (wrapper.Peek())
            {
                IParserQuery pq = wrapper.Poll();
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        else
        {
            // Single-threaded path: one parser query reused for every sentence.
            IParserQuery pq = pqFactory.ParserQuery();
            foreach (IList<IHasWord> sentence in documentPreprocessor)
            {
                num++;
                numSents++;
                int len = sentence.Count;
                numWords += len;
                pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                pq.ParseAndReport(sentence, pwErr);
                ProcessResults(pq, numProcessed++, pwo);
            }
        }
        treePrint.PrintFooter(pwo);
        if (op.testOptions.writeOutputFiles)
        {
            pwo.Close();
        }
        pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
    }
    long millis = timer.Stop();
    if (summary)
    {
        // Per-model likelihood summaries, when the corresponding scorers exist.
        if (pcfgLL != null)
        {
            pcfgLL.Display(false, pwErr);
        }
        if (depLL != null)
        {
            depLL.Display(false, pwErr);
        }
        if (factLL != null)
        {
            factLL.Display(false, pwErr);
        }
    }
    if (saidMemMessage)
    {
        ParserUtils.PrintOutOfMemory(pwErr);
    }
    double wordspersec = numWords / (((double)millis) / 1000);
    double sentspersec = numSents / (((double)millis) / 1000);
    NumberFormat nf = new DecimalFormat("0.00");
    // easier way!
    pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
    if (numFallback > 0)
    {
        pwErr.Println(" " + numFallback + " sentences were parsed by fallback to PCFG.");
    }
    if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
    {
        pwErr.Println(" " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
        if (numUnparsable > 0)
        {
            pwErr.Println(" " + numUnparsable + " were not parsable with non-zero probability.");
        }
        if (numNoMemory > 0)
        {
            pwErr.Println(" " + numNoMemory + " were skipped because of insufficient memory.");
        }
        if (numSkipped > 0)
        {
            pwErr.Println(" " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
        }
    }
}