Exemple #1
0
        /// <summary>
        /// Creates an analyzer after running the parameterless constructor: stores the
        /// supplied settings and then calls <c>Open()</c>.
        /// </summary>
        /// <param name="path">Value stored in <c>_path</c>.</param>
        /// <param name="docType">Value stored in <c>_docType</c>.</param>
        /// <param name="ignore">Value stored in <c>_ignore</c>.</param>
        /// <param name="punctuation">
        /// Value stored in <c>_punctuation</c>; when null, <c>PunctuationPatterns</c> is used instead.
        /// </param>
        /// <param name="options">Value stored in <c>_options</c>.</param>
        public Analyzer(string path, DocumentPreprocessor.DocType docType = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
            : this()
        {
            _options = options;
            _ignore  = ignore;
            _docType = docType;
            _path    = path;

            // Fall back to the default patterns when the caller supplies none.
            if (punctuation == null)
            {
                _punctuation = PunctuationPatterns;
            }
            else
            {
                _punctuation = punctuation;
            }

            Open();
        }
Exemple #2
0
 /// <summary>
 /// Constructs a preprocessor from a file at a path, which can be either
 /// a filesystem location, a classpath entry, or a URL.
 /// </summary>
 /// <param name="docPath">The path; must not be null</param>
 /// <param name="t">The document type, stored unchanged in <c>docType</c></param>
 /// <param name="encoding">The character encoding used by Readers</param>
 /// <exception cref="System.ArgumentException">Thrown when <paramref name="docPath"/> is null</exception>
 /// <exception cref="RuntimeIOException">
 /// Thrown when opening the reader raises an <see cref="System.IO.IOException"/>; the original
 /// exception is passed along as the cause.
 /// </exception>
 public DocumentPreprocessor(string docPath, DocumentPreprocessor.DocType t, string encoding)
 {
     if (docPath == null)
     {
         throw new ArgumentException("Cannot open null document path!");
     }
     docType = t;
     try
     {
         inputReader = IOUtils.ReaderFromString(docPath, encoding);
     }
     catch (IOException ioe)
     {
         // .NET composite formatting uses indexed placeholders ({0}, {1}); Java-style
         // "%s" specifiers are not substituted and would appear verbatim in the message.
         throw new RuntimeIOException(string.Format("{0}: Could not open path {1}", this.GetType().FullName, docPath), ioe);
     }
 }
Exemple #3
0
 /// <summary>
 /// Constructs a preprocessor that reads from an already-open reader.
 /// </summary>
 /// <remarks>
 /// Notes carried over from the original source:
 /// todo [cdm 2017]: this class is used in all our parsers, but we should probably move over to
 /// WordToSentenceProcessor, which has been used in CoreNLP and has been developed more.
 /// todo: sentence-final punctuation should probably become a regex; some multi-character
 /// punctuation was added in the meantime.
 /// inputReader is currently used in a fairly awkward way to communicate from an XMLIterator
 /// across to a PlainTextIterator; consider making the inner classes static and passing
 /// state explicitly.
 /// </remarks>
 /// <param name="input">The reader to consume; must not be null</param>
 /// <param name="t">The document type, stored unchanged in <c>docType</c></param>
 /// <exception cref="System.ArgumentException">Thrown when <paramref name="input"/> is null</exception>
 public DocumentPreprocessor(Reader input, DocumentPreprocessor.DocType t)
 {
     // Reject a missing reader before touching any state.
     if (input == null)
     {
         throw new ArgumentException("Cannot read from null object!");
     }

     this.docType     = t;
     this.inputReader = input;
 }
Exemple #4
0
        /// <summary>A simple, deterministic sentence-splitter.</summary>
        /// <remarks>
        /// This method only supports the English tokenizer, so for other languages you
        /// should run the tokenizer first and then run this sentence splitter with the
        /// "-whitespaceTokenization" option.
        /// At most one of -suppressEscaping, -tokenizerOptions, -printOriginalText and
        /// -whitespaceTokenization may be given; otherwise a message is logged and the
        /// method returns without processing anything.
        /// Input files are taken from the unnamed option (key <c>""</c>), split on runs of
        /// whitespace; when it is absent, or an entry is empty, input is read from
        /// <c>Runtime.@in</c>. Tokens are written to standard output and the sentence count
        /// to standard error.
        /// </remarks>
        /// <param name="args">Command-line arguments</param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, ArgOptionDefs());

            if (options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            // Command-line flags
            string encoding             = options.GetProperty("encoding", "utf-8");
            bool   printSentenceLengths = PropertiesUtils.GetBool(options, "printSentenceLengths", false);
            string xmlElementDelimiter  = options.GetProperty("xml", null);

            DocumentPreprocessor.DocType docType = xmlElementDelimiter == null ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
            string sentenceDelimiter             = options.Contains("noTokenization") ? Runtime.GetProperty("line.separator") : null;
            string sDelim = options.GetProperty("sentenceDelimiter");

            // An explicit -sentenceDelimiter overrides the -noTokenization default;
            // the keyword "newline" (any case) stands for "\n".
            if (sDelim != null)
            {
                sentenceDelimiter = Sharpen.Runtime.EqualsIgnoreCase(sDelim, "newline") ? "\n" : sDelim;
            }
            string tagDelimiter = options.GetProperty("tag", null);

            string[] sentenceDelims = null;
            // Setup the TokenizerFactory
            int  numFactoryFlags  = 0;
            bool suppressEscaping = options.Contains("suppressEscaping");

            if (suppressEscaping)
            {
                numFactoryFlags += 1;
            }
            bool customTokenizer = options.Contains("tokenizerOptions");

            if (customTokenizer)
            {
                numFactoryFlags += 1;
            }
            bool printOriginalText = options.Contains("printOriginalText");

            if (printOriginalText)
            {
                numFactoryFlags += 1;
            }
            bool whitespaceTokenization = options.Contains("whitespaceTokenization");

            if (whitespaceTokenization)
            {
                numFactoryFlags += 1;
            }
            if (numFactoryFlags > 1)
            {
                log.Info("Only one tokenizer flag allowed at a time: ");
                log.Info("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
                return;
            }
            // tf deliberately stays null for -whitespaceTokenization; in that case only the
            // sentence-final delimiter list is extended with the whitespace lexer's newline token.
            ITokenizerFactory <IHasWord> tf = null;

            if (suppressEscaping)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
            }
            else if (customTokenizer)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), options.GetProperty("tokenizerOptions"));
            }
            else if (printOriginalText)
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), "invertible=true");
            }
            else if (whitespaceTokenization)
            {
                IList <string> whitespaceDelims = new List <string>(Arrays.AsList(DocumentPreprocessor.DefaultSentenceDelims));
                whitespaceDelims.Add(WhitespaceLexer.Newline);
                sentenceDelims = Sharpen.Collections.ToArray(whitespaceDelims, new string[whitespaceDelims.Count]);
            }
            else
            {
                tf = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            }
            string fileList = options.GetProperty(string.Empty, null);

            // The pattern is a regular expression (runs of whitespace). Calling string.Split
            // with it would, on runtimes that have a Split(string) instance overload, split on
            // the literal text "\s+" and treat the whole list as a single path, so the regex
            // split is made explicit. Trailing whitespace is trimmed first so that it does not
            // produce an empty trailing entry (an empty entry means "read standard input").
            string[]    files    = fileList == null
                ? new string[1]
                : System.Text.RegularExpressions.Regex.Split(fileList.TrimEnd(), "\\s+");
            int         numSents = 0;
            PrintWriter pw       = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);

            foreach (string file in files)
            {
                DocumentPreprocessor docPreprocessor;
                if (file == null || file.IsEmpty())
                {
                    // No file name: read from the runtime's standard input.
                    docPreprocessor = new DocumentPreprocessor(new InputStreamReader(Runtime.@in, encoding));
                }
                else
                {
                    docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
                }
                if (docType == DocumentPreprocessor.DocType.Xml)
                {
                    docPreprocessor.SetElementDelimiter(xmlElementDelimiter);
                }
                docPreprocessor.SetTokenizerFactory(tf);
                if (sentenceDelimiter != null)
                {
                    docPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
                }
                if (tagDelimiter != null)
                {
                    docPreprocessor.SetTagDelimiter(tagDelimiter);
                }
                if (sentenceDelims != null)
                {
                    docPreprocessor.SetSentenceFinalPuncWords(sentenceDelims);
                }
                foreach (IList <IHasWord> sentence in docPreprocessor)
                {
                    numSents++;
                    if (printSentenceLengths)
                    {
                        System.Console.Error.Printf("Length: %d%n", sentence.Count);
                    }
                    // In original-text mode this flag means "the leading Before annotation has been
                    // printed"; otherwise it means "a separating space is needed before the next word".
                    bool printSpace = false;
                    foreach (IHasWord word in sentence)
                    {
                        if (printOriginalText)
                        {
                            CoreLabel cl = (CoreLabel)word;
                            if (!printSpace)
                            {
                                pw.Print(cl.Get(typeof(CoreAnnotations.BeforeAnnotation)));
                                printSpace = true;
                            }
                            pw.Print(cl.Get(typeof(CoreAnnotations.OriginalTextAnnotation)));
                            pw.Print(cl.Get(typeof(CoreAnnotations.AfterAnnotation)));
                        }
                        else
                        {
                            if (printSpace)
                            {
                                pw.Print(" ");
                            }
                            printSpace = true;
                            pw.Print(word.Word());
                        }
                    }
                    pw.Println();
                }
            }
            pw.Close();
            System.Console.Error.Printf("Read in %d sentences.%n", numSents);
        }
Exemple #5
0
 /// <summary>
 /// Constructs a preprocessor for the document at <paramref name="docPath"/> by delegating
 /// to the three-argument constructor with the encoding fixed to "UTF-8".
 /// </summary>
 /// <param name="docPath">Path forwarded to the three-argument constructor</param>
 /// <param name="t">Document type forwarded to the three-argument constructor</param>
 public DocumentPreprocessor(string docPath, DocumentPreprocessor.DocType t) : this(docPath, t, "UTF-8") { }
 /// <summary>
 /// Parses every input named in <paramref name="args"/> from index <paramref name="argIndex"/>
 /// onward, sentence by sentence, reporting progress on <c>pwErr</c> and handing each parser
 /// query to <c>ProcessResults</c>; afterwards prints timing and failure statistics to <c>pwErr</c>.
 /// </summary>
 /// <param name="args">Array whose entries from <paramref name="argIndex"/> on are input names; the entry "-" selects standard input</param>
 /// <param name="argIndex">Index of the first entry of <paramref name="args"/> to process</param>
 /// <param name="tokenized">
 /// Only consulted when <paramref name="tokenizerFactory"/> is null: if true, a null tokenizer
 /// factory is set on the preprocessor; otherwise <c>tlp.GetTokenizerFactory()</c> is used
 /// </param>
 /// <param name="tokenizerFactory">Tokenizer factory to set on each preprocessor; may be null (see <paramref name="tokenized"/>)</param>
 /// <param name="elementDelimiter">Passed to <c>SetElementDelimiter</c>; a non-null value selects <c>DocType.Xml</c>, null selects <c>DocType.Plain</c></param>
 /// <param name="sentenceDelimiter">Passed to <c>SetSentenceDelimiter</c></param>
 /// <param name="escaper">Passed to <c>SetEscaper</c></param>
 /// <param name="tagDelimiter">Passed to <c>SetTagDelimiter</c></param>
 /// <exception cref="RuntimeIOException">Wraps an <c>IOException</c> raised while opening standard input or the output file</exception>
 public virtual void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter
                                      )
 where _T0 : IHasWord
     {
      DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
      if (op.testOptions.verbose)
     {
         if (tokenizerFactory != null)
         {
             pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
         }
     }
      Timing timer = new Timing();
      // timer.start(); // constructor already starts it.
      //Loop over the files
      for (int i = argIndex; i < args.Length; i++)
     {
         string filename = args[i];
         DocumentPreprocessor documentPreprocessor;
         // "-" means: read the document from standard input instead of a named path.
         if (filename.Equals("-"))
         {
             try
             {
                 documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
             }
             catch (IOException e)
             {
                 throw new RuntimeIOException(e);
             }
         }
         else
         {
             documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
         }
         //Unused values are null per the main() method invocation below
         //null is the default for these properties
         documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
         documentPreprocessor.SetEscaper(escaper);
         documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
         documentPreprocessor.SetTagDelimiter(tagDelimiter);
         documentPreprocessor.SetElementDelimiter(elementDelimiter);
         // Without an explicit factory: pre-tokenized input gets a null factory,
         // anything else gets the language pack's factory.
         if (tokenizerFactory == null)
         {
             documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
         }
         else
         {
             documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
         }
         //Setup the output
         // Default to the shared writer; replaced by a per-input file writer below when
         // op.testOptions.writeOutputFiles is set.
         PrintWriter pwo = pwOut;
         if (op.testOptions.writeOutputFiles)
         {
             string normalizedName = filename;
             try
             {
                 // The URL object is constructed only to find out whether the name is a URL.
                 new URL(normalizedName);
                 // this will exception if not a URL
                 normalizedName = normalizedName.ReplaceAll("/", "_");
             }
             catch (MalformedURLException)
             {
             }
             //It isn't a URL, so silently ignore
             // Output name is "<input name>.<ext>", with "stp" as the extension when none is configured.
             string ext   = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
             string fname = normalizedName + '.' + ext;
             if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
             {
                 string fseparator = Runtime.GetProperty("file.separator");
                 if (fseparator == null || fseparator.IsEmpty())
                 {
                     fseparator = "/";
                 }
                 // Only the last name component is kept when redirecting into the output directory.
                 File fnameFile = new File(fname);
                 fname          = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
             }
             try
             {
                 pwo = op.tlpParams.Pw(new FileOutputStream(fname));
             }
             catch (IOException ioe)
             {
                 throw new RuntimeIOException(ioe);
             }
         }
         treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
         pwErr.Println("Parsing file: " + filename);
         // num counts sentences read from this input; numProcessed counts results handed to ProcessResults.
         int num          = 0;
         int numProcessed = 0;
         if (op.testOptions.testingThreads != 1)
         {
             // Any thread setting other than 1 goes through the multicore wrapper.
             MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 wrapper.Put(sentence);
                 // Drain results between submissions.
                 // NOTE(review): presumably Peek() reports whether a finished result is available — confirm against MulticoreWrapper.
                 while (wrapper.Peek())
                 {
                     IParserQuery pq = wrapper.Poll();
                     ProcessResults(pq, numProcessed++, pwo);
                 }
             }
             // After the last sentence has been submitted, join and drain the remaining results.
             wrapper.Join();
             while (wrapper.Peek())
             {
                 IParserQuery pq = wrapper.Poll();
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         else
         {
             // Single-threaded path: one parser query object is reused for every sentence.
             IParserQuery pq = pqFactory.ParserQuery();
             foreach (IList <IHasWord> sentence in documentPreprocessor)
             {
                 num++;
                 numSents++;
                 int len   = sentence.Count;
                 numWords += len;
                 pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                 pq.ParseAndReport(sentence, pwErr);
                 ProcessResults(pq, numProcessed++, pwo);
             }
         }
         treePrint.PrintFooter(pwo);
         // Only the per-input file writer is closed here; the shared pwOut is left open.
         // NOTE(review): the file writer is not closed if parsing throws before this point.
         if (op.testOptions.writeOutputFiles)
         {
             pwo.Close();
         }
         pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
     }
      long millis = timer.Stop();
      // Optional summary: display each statistic object that is present.
      if (summary)
     {
         if (pcfgLL != null)
         {
             pcfgLL.Display(false, pwErr);
         }
         if (depLL != null)
         {
             depLL.Display(false, pwErr);
         }
         if (factLL != null)
         {
             factLL.Display(false, pwErr);
         }
     }
      if (saidMemMessage)
     {
         ParserUtils.PrintOutOfMemory(pwErr);
     }
      // Throughput uses floating-point division, so an elapsed time of 0 ms yields
      // infinity or NaN rather than throwing.
      double wordspersec = numWords / (((double)millis) / 1000);
      double sentspersec = numSents / (((double)millis) / 1000);
      NumberFormat nf    = new DecimalFormat("0.00");
      // easier way!
      pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
      if (numFallback > 0)
     {
         pwErr.Println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
     }
      // Break down the sentences that produced no parse, by cause.
      if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
     {
         pwErr.Println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
         if (numUnparsable > 0)
         {
             pwErr.Println("    " + numUnparsable + " were not parsable with non-zero probability.");
         }
         if (numNoMemory > 0)
         {
             pwErr.Println("    " + numNoMemory + " were skipped because of insufficient memory.");
         }
         if (numSkipped > 0)
         {
             pwErr.Println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
         }
     }
     }
Exemple #7
0
        /// <summary>
        /// Initializes the analyzer via the parameterless constructor, records the given
        /// settings in the corresponding private fields, and then calls <c>Open()</c>.
        /// </summary>
        /// <param name="path">Stored in <c>_path</c>.</param>
        /// <param name="docType">Stored in <c>_docType</c>.</param>
        /// <param name="ignore">Stored in <c>_ignore</c>.</param>
        /// <param name="punctuation">
        /// Stored in <c>_punctuation</c>; a null value is replaced by <c>PunctuationPatterns</c>.
        /// </param>
        /// <param name="options">Stored in <c>_options</c>.</param>
        public Analyzer(string path, DocumentPreprocessor.DocType docType = null, string ignore = "", string punctuation = null, AnalyzerOptions options = AnalyzerOptions.None)
            : this()
        {
            _path    = path;
            _docType = docType;
            _ignore  = ignore;
            _options = options;

            // Use the caller's patterns when given, the defaults otherwise.
            _punctuation = (punctuation != null) ? punctuation : PunctuationPatterns;

            Open();
        }