Esempio n. 1
0
        /// <summary>
        /// Loads <paramref name="filename"/> and returns a list of annotations, each of which
        /// carries exactly one sentence under <c>CoreAnnotations.SentencesAnnotation</c>.
        /// </summary>
        /// <remarks>
        /// For text input the whole file is annotated with <paramref name="tokenizer"/> and every
        /// resulting sentence is re-wrapped in its own annotation.  For tree input each tree read
        /// from the file becomes a sentence with the tree attached under
        /// <c>TreeCoreAnnotations.TreeAnnotation</c>; <paramref name="tokenizer"/> is not used.
        /// </remarks>
        /// <param name="tokenizer">Pipeline used to annotate raw text; only consulted for text input.</param>
        /// <param name="inputFormat">Selects how the file content is interpreted.</param>
        /// <param name="filename">File to read.</param>
        /// <param name="filterUnknown">
        /// Tree input only: when true the trees are read with gold labels and then passed through
        /// <c>SentimentUtils.FilterUnknownRoots</c>; when false they are loaded through a
        /// <c>MemoryTreebank</c>.
        /// </param>
        /// <returns>One annotation per sentence (text input) or per tree (tree input).</returns>
        /// <exception cref="System.ArgumentException">The input format is neither text nor trees.</exception>
        public static IList <Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
        {
            if (inputFormat == SentimentPipeline.Input.Text)
            {
                // Annotate the complete document once, then hand out its sentences one by one.
                Annotation document = new Annotation(IOUtils.SlurpFileNoExceptions(filename));
                tokenizer.Annotate(document);

                IList <Annotation> perSentence = Generics.NewArrayList();
                foreach (ICoreMap docSentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    Annotation wrapper = new Annotation(docSentence.Get(typeof(CoreAnnotations.TextAnnotation)));
                    wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(docSentence));
                    perSentence.Add(wrapper);
                }
                return perSentence;
            }

            if (inputFormat == SentimentPipeline.Input.Trees)
            {
                IList <Tree> loadedTrees;
                if (!filterUnknown)
                {
                    // Plain load: take every tree found under the path.
                    MemoryTreebank bank = new MemoryTreebank("utf-8");
                    bank.LoadPath(filename, null);
                    loadedTrees = new List <Tree>(bank);
                }
                else
                {
                    loadedTrees = SentimentUtils.ReadTreesWithGoldLabels(filename);
                    loadedTrees = SentimentUtils.FilterUnknownRoots(loadedTrees);
                }

                IList <Annotation> perTree = Generics.NewArrayList();
                foreach (Tree current in loadedTrees)
                {
                    // The sentence text is rebuilt from the tree's yield; the tree itself rides along.
                    ICoreMap treeSentence = new Annotation(SentenceUtils.ListToString(current.Yield()));
                    treeSentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), current);

                    IList <ICoreMap> single  = Java.Util.Collections.SingletonList(treeSentence);
                    Annotation       wrapper = new Annotation(string.Empty);
                    wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), single);
                    perTree.Add(wrapper);
                }
                return perTree;
            }

            throw new ArgumentException("Unknown format " + inputFormat);
        }
Esempio n. 2
0
        /// <summary>Runs the tree-based sentiment model on some text.</summary>
        /// <remarks>
        /// Flags are matched case-insensitively.  Flags taking a value: <c>-sentimentModel</c>,
        /// <c>-parserModel</c>, <c>-file</c>, <c>-fileList</c> (comma separated), <c>-input</c>,
        /// <c>-output</c> (comma separated) and <c>-tlppClass</c>.  Flags without a value:
        /// <c>-stdin</c>, <c>-filterUnknown</c> and <c>-help</c>.
        /// Exactly one of <c>-file</c>, <c>-fileList</c> or <c>-stdin</c> must be supplied.
        /// With <c>-fileList</c>, results for each file are written to <c>&lt;file&gt;.out</c>;
        /// otherwise they are written to standard output.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.ArgumentException">
        /// An argument is unknown, a flag is missing its value, the input source is missing or
        /// ambiguous, or <c>-stdin</c> is combined with tree input.
        /// </exception>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            string parserModel    = null;
            string sentimentModel = null;
            string filename       = null;
            string fileList       = null;
            bool   stdin          = false;
            bool   filterUnknown  = false;
            IList <SentimentPipeline.Output> outputFormats = Java.Util.Collections.SingletonList(SentimentPipeline.Output.Root);

            SentimentPipeline.Input inputFormat = SentimentPipeline.Input.Text;
            string tlppClass = DefaultTlppClass;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag = args[argIndex];
                if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel"))
                {
                    sentimentModel = RequireArgumentValue(args, argIndex);
                    argIndex      += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel"))
                {
                    parserModel = RequireArgumentValue(args, argIndex);
                    argIndex   += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-file"))
                {
                    filename  = RequireArgumentValue(args, argIndex);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-fileList"))
                {
                    fileList  = RequireArgumentValue(args, argIndex);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-stdin"))
                {
                    stdin = true;
                    argIndex++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-input"))
                {
                    // Invariant upper-casing: the result is used as an identifier lookup, so it
                    // must not depend on the current culture (e.g. Turkish dotted/dotless i).
                    inputFormat = SentimentPipeline.Input.ValueOf(RequireArgumentValue(args, argIndex).ToUpperInvariant());
                    argIndex   += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-output"))
                {
                    string[] formats = RequireArgumentValue(args, argIndex).Split(",");
                    outputFormats = new List <SentimentPipeline.Output>();
                    foreach (string format in formats)
                    {
                        outputFormats.Add(SentimentPipeline.Output.ValueOf(format.ToUpperInvariant()));
                    }
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-filterUnknown"))
                {
                    filterUnknown = true;
                    argIndex++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlppClass"))
                {
                    tlppClass = RequireArgumentValue(args, argIndex);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-help"))
                {
                    Help();
                    System.Environment.Exit(0);
                    return;
                }
                else
                {
                    // Report the offending token itself, not the one after it (which may not exist).
                    log.Info("Unknown argument " + flag);
                    Help();
                    throw new ArgumentException("Unknown argument " + flag);
                }
            }

            // We construct two pipelines.  One handles tokenization, if
            // necessary.  The other takes tokenized sentences and converts
            // them to sentiment trees.
            Properties pipelineProps  = new Properties();
            Properties tokenizerProps = null;

            if (sentimentModel != null)
            {
                pipelineProps.SetProperty("sentiment.model", sentimentModel);
            }
            if (parserModel != null)
            {
                pipelineProps.SetProperty("parse.model", parserModel);
            }
            if (inputFormat == SentimentPipeline.Input.Trees)
            {
                // Trees are already parsed: only binarize them and run sentiment.
                pipelineProps.SetProperty("annotators", "binarizer, sentiment");
                pipelineProps.SetProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator");
                pipelineProps.SetProperty("binarizer.tlppClass", tlppClass);
                pipelineProps.SetProperty("enforceRequirements", "false");
            }
            else
            {
                pipelineProps.SetProperty("annotators", "parse, sentiment");
                pipelineProps.SetProperty("parse.binaryTrees", "true");
                pipelineProps.SetProperty("parse.buildgraphs", "false");
                pipelineProps.SetProperty("enforceRequirements", "false");
                tokenizerProps = new Properties();
                tokenizerProps.SetProperty("annotators", "tokenize, ssplit");
            }
            if (stdin && tokenizerProps != null)
            {
                tokenizerProps.SetProperty(StanfordCoreNLP.NewlineSplitterProperty, "true");
            }

            int count = 0;

            if (filename != null)
            {
                count++;
            }
            if (fileList != null)
            {
                count++;
            }
            if (stdin)
            {
                count++;
            }
            if (count > 1)
            {
                throw new ArgumentException("Please only specify one of -file, -fileList or -stdin");
            }
            if (count == 0)
            {
                throw new ArgumentException("Please specify either -file, -fileList or -stdin");
            }
            if (stdin && tokenizerProps == null)
            {
                // No tokenizer pipeline is built for tree input, and the stdin loop needs one.
                throw new ArgumentException("-stdin can only be used with text input, not with tree input");
            }

            StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
            StanfordCoreNLP pipeline  = new StanfordCoreNLP(pipelineProps);

            if (filename != null)
            {
                // Process a file.  The pipeline will do tokenization, which
                // means it will split it into sentences as best as possible
                // with the tokenizer.
                IList <Annotation> annotations = GetAnnotations(tokenizer, inputFormat, filename, filterUnknown);
                foreach (Annotation annotation in annotations)
                {
                    pipeline.Annotate(annotation);
                    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                    {
                        System.Console.Out.WriteLine(sentence);
                        OutputTree(System.Console.Out, sentence, outputFormats);
                    }
                }
            }
            else if (fileList != null)
            {
                // Process multiple files.  The pipeline will do tokenization,
                // which means it will split it into sentences as best as
                // possible with the tokenizer.  Output will go to filename.out
                // for each file.
                foreach (string file in fileList.Split(","))
                {
                    IList <Annotation> annotations = GetAnnotations(tokenizer, inputFormat, file, filterUnknown);
                    FileOutputStream   fout        = new FileOutputStream(file + ".out");
                    try
                    {
                        TextWriter pout = new TextWriter(fout);
                        foreach (Annotation annotation in annotations)
                        {
                            pipeline.Annotate(annotation);
                            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                            {
                                pout.WriteLine(sentence);
                                OutputTree(pout, sentence, outputFormats);
                            }
                        }
                        pout.Flush();
                    }
                    finally
                    {
                        // Release the file even when annotation or output fails part-way.
                        fout.Close();
                    }
                }
            }
            else
            {
                // Process stdin.  Each line will be treated as a single sentence.
                log.Info("Reading in text from stdin.");
                log.Info("Please enter one sentence per line.");
                log.Info("Processing will end when EOF is reached.");
                BufferedReader reader = IOUtils.ReaderFromStdin("utf-8");
                for (string line; (line = reader.ReadLine()) != null;)
                {
                    line = line.Trim();
                    if (!line.IsEmpty())
                    {
                        Annotation annotation = tokenizer.Process(line);
                        pipeline.Annotate(annotation);
                        foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                        {
                            OutputTree(System.Console.Out, sentence, outputFormats);
                        }
                    }
                    else
                    {
                        // Output blank lines for blank lines so the tool can be
                        // used for line-by-line text processing
                        System.Console.Out.WriteLine();
                    }
                }
            }
        }

        /// <summary>Returns the value following the flag at <paramref name="flagIndex"/>.</summary>
        /// <param name="args">Full command-line argument array.</param>
        /// <param name="flagIndex">Index of the flag whose value is requested.</param>
        /// <returns>The element of <paramref name="args"/> directly after the flag.</returns>
        /// <exception cref="System.ArgumentException">The flag is the last element, so it has no value.</exception>
        private static string RequireArgumentValue(string[] args, int flagIndex)
        {
            if (flagIndex + 1 >= args.Length)
            {
                throw new ArgumentException("Missing value for argument " + args[flagIndex]);
            }
            return args[flagIndex + 1];
        }