コード例 #1
0
        /// <summary>This example shows a few more ways of providing input to a parser.</summary>
        /// <remarks>
        /// Usage: ParserDemo2 [grammar [textFile]]
        /// <para>
        /// When a text file is given, its sentences are read through a
        /// <c>DocumentPreprocessor</c>. Otherwise three built-in sentences are used:
        /// one built from pre-split words, one produced by the language pack's
        /// tokenizer, and one built from words paired with part-of-speech tags.
        /// </para>
        /// </remarks>
        /// <param name="args">
        /// Optional arguments: <c>args[0]</c> is the grammar to load (a default English
        /// PCFG model path is used when absent) and <c>args[1]</c> is a text file to parse.
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string grammar = args.Length > 0 ? args[0] : "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

            string[]                        options = new string[] { "-maxLength", "80", "-retainTmpSubcategories" };
            LexicalizedParser               lp      = ((LexicalizedParser)LexicalizedParser.LoadModel(grammar, options));
            ITreebankLanguagePack           tlp     = lp.GetOp().Langpack();
            IGrammaticalStructureFactory    gsf     = tlp.GrammaticalStructureFactory();
            IEnumerable <IList <IHasWord> > sentences;

            if (args.Length > 1)
            {
                // Read every sentence of the given file up front.
                DocumentPreprocessor      dp  = new DocumentPreprocessor(args[1]);
                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                foreach (IList <IHasWord> sentence in dp)
                {
                    tmp.Add(sentence);
                }
                sentences = tmp;
            }
            else
            {
                // Showing tokenization and parsing in code a couple of different ways.

                // 1) A sentence assembled from already-split words.
                string[]         sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
                IList <IHasWord> sentence = new List <IHasWord>();
                foreach (string word in sent)
                {
                    sentence.Add(new Word(word));
                }

                // 2) A raw string run through the default tokenizer for this TreebankLanguagePack.
                string sent2 = ("This is a slightly longer and more complex " + "sentence requiring tokenization.");
                ITokenizer <IHasWord> toke      = tlp.GetTokenizerFactory().GetTokenizer(new StringReader(sent2));
                IList <IHasWord>      sentence2 = toke.Tokenize();

                // 3) Words supplied together with their tags.
                // Parser gets second "can" wrong without help
                string[] sent3 = new string[] { "It", "can", "can", "it", "." };
                string[] tag3  = new string[] { "PRP", "MD", "VB", "PRP", "." };
                // Declared as IList<IHasWord> rather than IList<TaggedWord>: IList<T> is
                // invariant, so a list of TaggedWord could not be added to the
                // IList<IList<IHasWord>> below. The elements are still TaggedWord instances.
                IList <IHasWord> sentence3 = new List <IHasWord>();
                for (int i = 0; i < sent3.Length; i++)
                {
                    sentence3.Add(new TaggedWord(sent3[i], tag3[i]));
                }
                Tree parse = lp.Parse(sentence3);
                parse.PennPrint();

                IList <IList <IHasWord> > tmp = new List <IList <IHasWord> >();
                tmp.Add(sentence);
                tmp.Add(sentence2);
                tmp.Add(sentence3);
                sentences = tmp;
            }

            // Parse each sentence and print its tree, dependencies, words and tagged yield.
            foreach (IList <IHasWord> sentence_1 in sentences)
            {
                Tree parse = lp.Parse(sentence_1);
                parse.PennPrint();
                System.Console.Out.WriteLine();
                GrammaticalStructure    gs  = gsf.NewGrammaticalStructure(parse);
                IList <TypedDependency> tdl = gs.TypedDependenciesCCprocessed();
                System.Console.Out.WriteLine(tdl);
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine("The words of the sentence:");
                foreach (ILabel lab in parse.Yield())
                {
                    if (lab is CoreLabel)
                    {
                        // CoreLabels are printed in the value-map output format.
                        System.Console.Out.WriteLine(((CoreLabel)lab).ToString(CoreLabel.OutputFormat.ValueMap));
                    }
                    else
                    {
                        System.Console.Out.WriteLine(lab);
                    }
                }
                System.Console.Out.WriteLine();
                System.Console.Out.WriteLine(parse.TaggedYield());
                System.Console.Out.WriteLine();
            }
            // This method turns the String into a single sentence using the
            // default tokenizer for the TreebankLanguagePack.
            string sent3_1 = "This is one last test!";

            lp.Parse(sent3_1).PennPrint();
        }
コード例 #2
0
        /// <summary>Locates the sentence boundary nearest to a position in the text.</summary>
        /// <remarks>
        /// The text is split into sentences by a <c>DocumentPreprocessor</c> that uses the
        /// tokenizer factory of <c>tlp</c>. The start offsets of those sentences (with the
        /// first one pinned to 0) plus the text length form a list of intervals. For the
        /// interval containing <c>start</c>, <c>SeekBack</c> yields the interval's start
        /// minus one and <c>SeekForward</c> yields the interval's end minus one.
        /// </remarks>
        /// <param name="text">The text to split into sentences.</param>
        /// <param name="start">The position to search from.</param>
        /// <param name="seekDir">Either <c>SeekBack</c> or <c>SeekForward</c>.</param>
        /// <returns>The computed delimiter index, or -1 when no interval applies.</returns>
        /// <exception cref="System.ArgumentException">If <c>seekDir</c> is neither direction.</exception>
        /// <exception cref="System.InvalidCastException">
        /// If the first token of a sentence does not implement <c>IHasOffset</c>.
        /// </exception>
        private int NearestDelimiter(string text, int start, int seekDir)
        {
            bool backwards = seekDir == SeekBack;
            if (!backwards && seekDir != SeekForward)
            {
                throw new ArgumentException("Unknown seek direction " + seekDir);
            }

            // From here on seekDir is known to be one of the two directions,
            // so "not backwards" always means forwards.
            DocumentPreprocessor         splitter = new DocumentPreprocessor(new StringReader(text));
            ITokenizerFactory <IHasWord> factory  = tlp.GetTokenizerFactory();
            splitter.SetTokenizerFactory(factory);

            // Collect the start offset of every non-empty sentence.
            List <int> starts = new List <int>();
            foreach (IList <IHasWord> tokens in splitter)
            {
                if (tokens.Count == 0)
                {
                    continue;
                }
                IHasOffset head = tokens[0] as IHasOffset;
                if (head == null)
                {
                    throw new InvalidCastException("Expected HasOffsets from the " + "DocumentPreprocessor");
                }
                // The first sentence always starts the first interval at offset 0.
                starts.Add(starts.Count == 0 ? 0 : head.BeginPosition());
            }
            // Close the last interval at the end of the text.
            starts.Add(text.Length);

            int count = starts.Count;
            for (int idx = 0; idx + 1 < count; ++idx)
            {
                int lower = starts[idx];
                int upper = starts[idx + 1];
                if (start < lower || start >= upper)
                {
                    continue;
                }
                return((backwards ? lower : upper) - 1);
            }

            // The cursor position at the end is actually one past the text length.
            // We might as well highlight the last interval in that case.
            if (count < 2 || start < text.Length)
            {
                return(-1);
            }
            return(starts[backwards ? count - 2 : count - 1] - 1);
        }
コード例 #3
0
 /// <summary>
 /// Parses every input named in <c>args</c> from <c>argIndex</c> onward and prints the results,
 /// followed by a throughput summary on the error writer.
 /// </summary>
 /// <remarks>
 /// An argument of "-" is read from standard input; any other argument is handed to
 /// <c>DocumentPreprocessor</c> as a file name. When <c>op.testOptions.writeOutputFiles</c>
 /// is set, each input gets its own output writer, which is closed once that input has been
 /// processed, even if parsing fails part-way through.
 /// </remarks>
 /// <param name="args">Argument array whose entries from <c>argIndex</c> onward name the inputs.</param>
 /// <param name="argIndex">Index of the first input name in <c>args</c>.</param>
 /// <param name="tokenized">
 /// Only consulted when <c>tokenizerFactory</c> is null: if true no tokenizer factory is set,
 /// otherwise the language pack's default factory is used.
 /// </param>
 /// <param name="tokenizerFactory">Tokenizer factory to use, or null.</param>
 /// <param name="elementDelimiter">
 /// Element delimiter passed to the preprocessor; a non-null value selects the XML document type.
 /// </param>
 /// <param name="sentenceDelimiter">Sentence delimiter passed to the preprocessor.</param>
 /// <param name="escaper">Escaper passed to the preprocessor.</param>
 /// <param name="tagDelimiter">Tag delimiter passed to the preprocessor.</param>
 /// <exception cref="RuntimeIOException">
 /// If standard input cannot be opened or an output file cannot be created.
 /// </exception>
 public virtual void ParseFiles <_T0>(string[] args, int argIndex, bool tokenized, ITokenizerFactory <_T0> tokenizerFactory, string elementDelimiter, string sentenceDelimiter, IFunction <IList <IHasWord>, IList <IHasWord> > escaper, string tagDelimiter)
 where _T0 : IHasWord
 {
     DocumentPreprocessor.DocType docType = (elementDelimiter == null) ? DocumentPreprocessor.DocType.Plain : DocumentPreprocessor.DocType.Xml;
     if (op.testOptions.verbose)
     {
         if (tokenizerFactory != null)
         {
             pwErr.Println("parseFiles: Tokenizer factory is: " + tokenizerFactory);
         }
     }
     Timing timer = new Timing();
     // timer.start(); // constructor already starts it.
     //Loop over the files
     for (int i = argIndex; i < args.Length; i++)
     {
         string filename = args[i];
         DocumentPreprocessor documentPreprocessor;
         if (filename.Equals("-"))
         {
             try
             {
                 documentPreprocessor = new DocumentPreprocessor(IOUtils.ReaderFromStdin(op.tlpParams.GetInputEncoding()), docType);
             }
             catch (IOException e)
             {
                 throw new RuntimeIOException(e);
             }
         }
         else
         {
             documentPreprocessor = new DocumentPreprocessor(filename, docType, op.tlpParams.GetInputEncoding());
         }
         //Unused values are null per the main() method invocation below
         //null is the default for these properties
         documentPreprocessor.SetSentenceFinalPuncWords(tlp.SentenceFinalPunctuationWords());
         documentPreprocessor.SetEscaper(escaper);
         documentPreprocessor.SetSentenceDelimiter(sentenceDelimiter);
         documentPreprocessor.SetTagDelimiter(tagDelimiter);
         documentPreprocessor.SetElementDelimiter(elementDelimiter);
         if (tokenizerFactory == null)
         {
             documentPreprocessor.SetTokenizerFactory((tokenized) ? null : tlp.GetTokenizerFactory());
         }
         else
         {
             documentPreprocessor.SetTokenizerFactory(tokenizerFactory);
         }
         //Setup the output
         PrintWriter pwo = pwOut;
         // True only when this iteration opened its own writer; the shared pwOut is never closed here.
         bool ownsWriter = false;
         if (op.testOptions.writeOutputFiles)
         {
             string normalizedName = filename;
             try
             {
                 new URL(normalizedName);
                 // this will exception if not a URL
                 normalizedName = normalizedName.ReplaceAll("/", "_");
             }
             catch (MalformedURLException)
             {
                 //It isn't a URL, so silently ignore
             }
             string ext   = (op.testOptions.outputFilesExtension == null) ? "stp" : op.testOptions.outputFilesExtension;
             string fname = normalizedName + '.' + ext;
             if (op.testOptions.outputFilesDirectory != null && !op.testOptions.outputFilesDirectory.IsEmpty())
             {
                 string fseparator = Runtime.GetProperty("file.separator");
                 if (fseparator == null || fseparator.IsEmpty())
                 {
                     fseparator = "/";
                 }
                 // Keep only the file's own name so the output lands directly in the output directory.
                 File fnameFile = new File(fname);
                 fname          = op.testOptions.outputFilesDirectory + fseparator + fnameFile.GetName();
             }
             try
             {
                 pwo = op.tlpParams.Pw(new FileOutputStream(fname));
             }
             catch (IOException ioe)
             {
                 throw new RuntimeIOException(ioe);
             }
             ownsWriter = true;
         }
         int num = 0;
         try
         {
             treePrint.PrintHeader(pwo, op.tlpParams.GetOutputEncoding());
             pwErr.Println("Parsing file: " + filename);
             int numProcessed = 0;
             if (op.testOptions.testingThreads != 1)
             {
                 MulticoreWrapper <IList <IHasWord>, IParserQuery> wrapper = new MulticoreWrapper <IList <IHasWord>, IParserQuery>(op.testOptions.testingThreads, new ParsingThreadsafeProcessor(pqFactory, pwErr));
                 foreach (IList <IHasWord> sentence in documentPreprocessor)
                 {
                     num++;
                     numSents++;
                     int len   = sentence.Count;
                     numWords += len;
                     pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                     wrapper.Put(sentence);
                     // Emit whatever results are already available while more input is queued.
                     while (wrapper.Peek())
                     {
                         IParserQuery pq = wrapper.Poll();
                         ProcessResults(pq, numProcessed++, pwo);
                     }
                 }
                 wrapper.Join();
                 // Emit the results that became available after the join.
                 while (wrapper.Peek())
                 {
                     IParserQuery pq = wrapper.Poll();
                     ProcessResults(pq, numProcessed++, pwo);
                 }
             }
             else
             {
                 IParserQuery pq = pqFactory.ParserQuery();
                 foreach (IList <IHasWord> sentence in documentPreprocessor)
                 {
                     num++;
                     numSents++;
                     int len   = sentence.Count;
                     numWords += len;
                     pwErr.Println("Parsing [sent. " + num + " len. " + len + "]: " + SentenceUtils.ListToString(sentence, true));
                     pq.ParseAndReport(sentence, pwErr);
                     ProcessResults(pq, numProcessed++, pwo);
                 }
             }
             treePrint.PrintFooter(pwo);
         }
         finally
         {
             // Close the per-file writer even when parsing throws, so the output file is not leaked.
             if (ownsWriter)
             {
                 pwo.Close();
             }
         }
         pwErr.Println("Parsed file: " + filename + " [" + num + " sentences].");
     }
     long millis = timer.Stop();
     if (summary)
     {
         if (pcfgLL != null)
         {
             pcfgLL.Display(false, pwErr);
         }
         if (depLL != null)
         {
             depLL.Display(false, pwErr);
         }
         if (factLL != null)
         {
             factLL.Display(false, pwErr);
         }
     }
     if (saidMemMessage)
     {
         ParserUtils.PrintOutOfMemory(pwErr);
     }
     // Throughput over the whole run; millis is converted to seconds first.
     double wordspersec = numWords / (((double)millis) / 1000);
     double sentspersec = numSents / (((double)millis) / 1000);
     NumberFormat nf    = new DecimalFormat("0.00");
     // easier way!
     pwErr.Println("Parsed " + numWords + " words in " + numSents + " sentences (" + nf.Format(wordspersec) + " wds/sec; " + nf.Format(sentspersec) + " sents/sec).");
     if (numFallback > 0)
     {
         pwErr.Println("  " + numFallback + " sentences were parsed by fallback to PCFG.");
     }
     if (numUnparsable > 0 || numNoMemory > 0 || numSkipped > 0)
     {
         pwErr.Println("  " + (numUnparsable + numNoMemory + numSkipped) + " sentences were not parsed:");
         if (numUnparsable > 0)
         {
             pwErr.Println("    " + numUnparsable + " were not parsable with non-zero probability.");
         }
         if (numNoMemory > 0)
         {
             pwErr.Println("    " + numNoMemory + " were skipped because of insufficient memory.");
         }
         if (numSkipped > 0)
         {
             pwErr.Println("    " + numSkipped + " were skipped as length 0 or greater than " + op.testOptions.maxLength);
         }
     }
 }