/// <summary>
/// Runs a CoreNLP pipeline over <paramref name="text"/> and serializes the resulting
/// annotation to <paramref name="processedFile"/>, unless that file already exists.
/// </summary>
/// <param name="processedFile">Destination file; when it already exists nothing is done.</param>
/// <param name="coreNLPProps">Properties used to construct the <c>StanfordCoreNLP</c> pipeline.</param>
/// <param name="text">Raw text to annotate.</param>
/// <remarks>
/// An <c>IOException</c> raised while writing is not propagated; its stack trace is printed instead.
/// </remarks>
public static void ProcessCoreNLPIfDoesNotExist(File processedFile, Properties coreNLPProps, string text)
 {
     // Nothing to do when a previous run has already produced the file.
     if (processedFile.Exists())
     {
         return;
     }
     try
     {
         StanfordCoreNLP coreNLP             = new StanfordCoreNLP(coreNLPProps);
         Annotation      processedAnnotation = coreNLP.Process(text);
         //this document holds the split for paragraphs.
         ProtobufAnnotationSerializer pas = new ProtobufAnnotationSerializer(true);
         OutputStream fos = new BufferedOutputStream(new FileOutputStream(processedFile.GetAbsolutePath()));
         try
         {
             pas.Write(processedAnnotation, fos);
         }
         finally
         {
             // Closing the buffered stream pushes any buffered bytes to the file and
             // releases the underlying handle, even when Write throws.
             fos.Close();
         }
     }
     catch (IOException e)
     {
         Sharpen.Runtime.PrintStackTrace(e);
     }
 }
Ejemplo n.º 2
0
        // TODO: turn this into a real unit test.
        /// <summary>
        /// Exercises the quote-attribution annotator: builds a pipeline from the supplied
        /// resource paths, processes the text held in <c>test</c> (declared outside this
        /// method), and prints each quotation with its predicted mention and speaker.
        /// </summary>
        /// <param name="familyFile">Value for <c>quoteattribution.familyWordsFile</c>.</param>
        /// <param name="animateFile">Value for <c>quoteattribution.animacyWordsFile</c>.</param>
        /// <param name="genderFile">Value for <c>quoteattribution.genderNamesFile</c>.</param>
        /// <param name="charactersFile">Value for <c>quoteattribution.charactersPath</c>.</param>
        /// <param name="modelFile">Value for <c>quoteattribution.modelPath</c>.</param>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        public static void TestPP(string familyFile, string animateFile, string genderFile, string charactersFile, string modelFile)
        {
            Properties pipelineConfig = new Properties();
            pipelineConfig.SetProperty("annotators", "tokenize, ssplit, pos, lemma, ner, depparse, quote, quoteattribution");
            pipelineConfig.SetProperty("quoteattribution.familyWordsFile", familyFile);
            pipelineConfig.SetProperty("quoteattribution.animacyWordsFile", animateFile);
            pipelineConfig.SetProperty("quoteattribution.genderNamesFile", genderFile);
            pipelineConfig.SetProperty("quoteattribution.charactersPath", charactersFile);
            pipelineConfig.SetProperty("quoteattribution.modelPath", modelFile);

            StanfordCoreNLP  nlp         = new StanfordCoreNLP(pipelineConfig);
            Annotation       annotated   = nlp.Process(test);
            IList <ICoreMap> foundQuotes = annotated.Get(typeof(CoreAnnotations.QuotationsAnnotation));

            foreach (ICoreMap q in foundQuotes)
            {
                System.Console.Out.WriteLine("Quote: " + q.Get(typeof(CoreAnnotations.TextAnnotation)));

                // Mention: print the prediction together with the sieve that produced it, or "none".
                object mention = q.Get(typeof(QuoteAttributionAnnotator.MentionAnnotation));
                System.Console.Out.WriteLine(mention != null
                    ? "Predicted Mention: " + mention + " Predictor: " + q.Get(typeof(QuoteAttributionAnnotator.MentionSieveAnnotation))
                    : "Predicted Mention: none");

                // Speaker: same layout as the mention line.
                object speaker = q.Get(typeof(QuoteAttributionAnnotator.SpeakerAnnotation));
                System.Console.Out.WriteLine(speaker != null
                    ? "Predicted Speaker: " + speaker + " Predictor: " + q.Get(typeof(QuoteAttributionAnnotator.SpeakerSieveAnnotation))
                    : "Predicted Speaker: none");

                System.Console.Out.WriteLine("====");
            }
            System.Console.Out.WriteLine("Finished");
        }
Ejemplo n.º 3
0
        /// <summary>Runs the tree-based sentiment model on some text.</summary>
        /// <remarks>
        /// Recognised options (matched case-insensitively):
        /// <c>-sentimentModel &lt;path&gt;</c>, <c>-parserModel &lt;path&gt;</c>,
        /// <c>-file &lt;path&gt;</c>, <c>-fileList &lt;a,b,...&gt;</c>, <c>-stdin</c>,
        /// <c>-input &lt;format&gt;</c>, <c>-output &lt;fmt,fmt,...&gt;</c>,
        /// <c>-filterUnknown</c>, <c>-tlppClass &lt;class&gt;</c> and <c>-help</c>.
        /// Exactly one of <c>-file</c>, <c>-fileList</c> or <c>-stdin</c> must be given.
        /// With <c>-file</c> and <c>-stdin</c> results go to standard output; with
        /// <c>-fileList</c> each input file gets a sibling <c>&lt;file&gt;.out</c>.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.ArgumentException">
        /// An unknown option was supplied, or zero / more than one input source was selected.
        /// </exception>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel    = null;
            string sentimentModel = null;
            string filename       = null;
            string fileList       = null;
            bool   stdin          = false;
            bool   filterUnknown  = false;
            IList <SentimentPipeline.Output> outputFormats = Java.Util.Collections.SingletonList(SentimentPipeline.Output.Root);

            SentimentPipeline.Input inputFormat = SentimentPipeline.Input.Text;
            string tlppClass = DefaultTlppClass;

            // Each branch advances argIndex by the number of tokens it consumed.
            for (int argIndex = 0; argIndex < args.Length;)
            {
                string arg = args[argIndex];

                if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-sentimentModel"))
                {
                    sentimentModel = args[argIndex + 1];
                    argIndex      += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-parserModel"))
                {
                    parserModel = args[argIndex + 1];
                    argIndex   += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-file"))
                {
                    filename  = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-fileList"))
                {
                    fileList  = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-stdin"))
                {
                    stdin = true;
                    argIndex++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-input"))
                {
                    // Invariant upper-casing keeps enum-name lookup independent of the current culture.
                    inputFormat = SentimentPipeline.Input.ValueOf(args[argIndex + 1].ToUpperInvariant());
                    argIndex   += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-output"))
                {
                    string[] formats = args[argIndex + 1].Split(",");
                    outputFormats = new List <SentimentPipeline.Output>();
                    foreach (string format in formats)
                    {
                        outputFormats.Add(SentimentPipeline.Output.ValueOf(format.ToUpperInvariant()));
                    }
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-filterUnknown"))
                {
                    filterUnknown = true;
                    argIndex++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-tlppClass"))
                {
                    tlppClass = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(arg, "-help"))
                {
                    Help();
                    System.Environment.Exit(0);
                }
                else
                {
                    // Report the offending token itself. Indexing argIndex + 1 named the wrong
                    // argument and ran off the end of the array when the unknown flag came last.
                    log.Info("Unknown argument " + arg);
                    Help();
                    throw new ArgumentException("Unknown argument " + arg);
                }
            }
            // We construct two pipelines.  One handles tokenization, if
            // necessary.  The other takes tokenized sentences and converts
            // them to sentiment trees.
            Properties pipelineProps  = new Properties();
            Properties tokenizerProps = null;

            if (sentimentModel != null)
            {
                pipelineProps.SetProperty("sentiment.model", sentimentModel);
            }
            if (parserModel != null)
            {
                pipelineProps.SetProperty("parse.model", parserModel);
            }
            if (inputFormat == SentimentPipeline.Input.Trees)
            {
                // Input is already parsed: only binarize the trees and score them.
                pipelineProps.SetProperty("annotators", "binarizer, sentiment");
                pipelineProps.SetProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator");
                pipelineProps.SetProperty("binarizer.tlppClass", tlppClass);
                pipelineProps.SetProperty("enforceRequirements", "false");
            }
            else
            {
                pipelineProps.SetProperty("annotators", "parse, sentiment");
                pipelineProps.SetProperty("parse.binaryTrees", "true");
                pipelineProps.SetProperty("parse.buildgraphs", "false");
                pipelineProps.SetProperty("enforceRequirements", "false");
                tokenizerProps = new Properties();
                tokenizerProps.SetProperty("annotators", "tokenize, ssplit");
            }
            if (stdin && tokenizerProps != null)
            {
                tokenizerProps.SetProperty(StanfordCoreNLP.NewlineSplitterProperty, "true");
            }

            // Exactly one input source must have been selected.
            int count = 0;

            if (filename != null)
            {
                count++;
            }
            if (fileList != null)
            {
                count++;
            }
            if (stdin)
            {
                count++;
            }
            if (count > 1)
            {
                throw new ArgumentException("Please only specify one of -file, -fileList or -stdin");
            }
            if (count == 0)
            {
                throw new ArgumentException("Please specify either -file, -fileList or -stdin");
            }
            StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
            StanfordCoreNLP pipeline  = new StanfordCoreNLP(pipelineProps);

            if (filename != null)
            {
                // Process a file.  The pipeline will do tokenization, which
                // means it will split it into sentences as best as possible
                // with the tokenizer.
                IList <Annotation> annotations = GetAnnotations(tokenizer, inputFormat, filename, filterUnknown);
                foreach (Annotation annotation in annotations)
                {
                    pipeline.Annotate(annotation);
                    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                    {
                        System.Console.Out.WriteLine(sentence);
                        OutputTree(System.Console.Out, sentence, outputFormats);
                    }
                }
            }
            else if (fileList != null)
            {
                // Process multiple files.  The pipeline will do tokenization,
                // which means it will split it into sentences as best as
                // possible with the tokenizer.  Output will go to filename.out
                // for each file.
                foreach (string file in fileList.Split(","))
                {
                    IList <Annotation> annotations = GetAnnotations(tokenizer, inputFormat, file, filterUnknown);
                    FileOutputStream   fout        = new FileOutputStream(file + ".out");
                    try
                    {
                        TextWriter pout = new TextWriter(fout);
                        foreach (Annotation annotation in annotations)
                        {
                            pipeline.Annotate(annotation);
                            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                            {
                                pout.WriteLine(sentence);
                                OutputTree(pout, sentence, outputFormats);
                            }
                        }
                        pout.Flush();
                    }
                    finally
                    {
                        // Release the file handle even when annotation or output fails part-way.
                        fout.Close();
                    }
                }
            }
            else
            {
                // Process stdin.  Each line will be treated as a single sentence.
                // NOTE(review): with "-input trees" no tokenizer is built, so tokenizer is null
                // here and the first non-blank line would fail -- confirm whether -stdin is
                // meant to be supported for tree input.
                log.Info("Reading in text from stdin.");
                log.Info("Please enter one sentence per line.");
                log.Info("Processing will end when EOF is reached.");
                BufferedReader reader = IOUtils.ReaderFromStdin("utf-8");
                for (string line; (line = reader.ReadLine()) != null;)
                {
                    line = line.Trim();
                    if (!line.IsEmpty())
                    {
                        Annotation annotation = tokenizer.Process(line);
                        pipeline.Annotate(annotation);
                        foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                        {
                            OutputTree(System.Console.Out, sentence, outputFormats);
                        }
                    }
                    else
                    {
                        // Output blank lines for blank lines so the tool can be
                        // used for line-by-line text processing
                        System.Console.Out.WriteLine();
                    }
                }
            }
        }