/// <summary>
/// Builds the annotations for <paramref name="filename"/>, interpreting the file
/// according to <paramref name="inputFormat"/>.
/// </summary>
/// <param name="tokenizer">Pipeline applied to the raw text; only used for text input.</param>
/// <param name="inputFormat">How the contents of the file are to be interpreted.</param>
/// <param name="filename">Path of the file to read.</param>
/// <param name="filterUnknown">
/// For tree input only: when true the trees are read with
/// <c>SentimentUtils.ReadTreesWithGoldLabels</c> and passed through
/// <c>SentimentUtils.FilterUnknownRoots</c>; otherwise they are loaded through a
/// <c>MemoryTreebank</c>.
/// </param>
/// <returns>
/// One annotation per sentence (text input) or per tree (tree input); each carries a
/// singleton sentence list under <c>CoreAnnotations.SentencesAnnotation</c>.
/// </returns>
/// <exception cref="System.ArgumentException">The input format is neither Text nor Trees.</exception>
public static IList<Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
{
    if (inputFormat == SentimentPipeline.Input.Text)
    {
        // Tokenize / sentence-split the whole file, then re-wrap every sentence
        // in its own single-sentence annotation.
        string contents = IOUtils.SlurpFileNoExceptions(filename);
        Annotation document = new Annotation(contents);
        tokenizer.Annotate(document);

        IList<Annotation> perSentence = Generics.NewArrayList();
        foreach (ICoreMap sent in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            Annotation wrapper = new Annotation(sent.Get(typeof(CoreAnnotations.TextAnnotation)));
            wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sent));
            perSentence.Add(wrapper);
        }
        return perSentence;
    }

    if (inputFormat == SentimentPipeline.Input.Trees)
    {
        IList<Tree> parsed;
        if (!filterUnknown)
        {
            MemoryTreebank bank = new MemoryTreebank("utf-8");
            bank.LoadPath(filename, null);
            parsed = new List<Tree>(bank);
        }
        else
        {
            parsed = SentimentUtils.ReadTreesWithGoldLabels(filename);
            parsed = SentimentUtils.FilterUnknownRoots(parsed);
        }

        IList<Annotation> perTree = Generics.NewArrayList();
        foreach (Tree t in parsed)
        {
            // The sentence text is rebuilt from the tree's yield and the tree itself
            // is attached to the sentence.
            ICoreMap sent = new Annotation(SentenceUtils.ListToString(t.Yield()));
            sent.Set(typeof(TreeCoreAnnotations.TreeAnnotation), t);
            IList<ICoreMap> single = Java.Util.Collections.SingletonList(sent);

            Annotation wrapper = new Annotation(string.Empty);
            wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), single);
            perTree.Add(wrapper);
        }
        return perTree;
    }

    throw new ArgumentException("Unknown format " + inputFormat);
}
/// <summary>Runs the tree-based sentiment model on some text.</summary>
/// <remarks>
/// Recognized flags (matched case-insensitively): <c>-sentimentModel</c>,
/// <c>-parserModel</c>, <c>-file</c>, <c>-fileList</c>, <c>-stdin</c>, <c>-input</c>,
/// <c>-output</c> (comma-separated), <c>-filterUnknown</c>, <c>-tlppClass</c> and
/// <c>-help</c>. Exactly one of <c>-file</c>, <c>-fileList</c> or <c>-stdin</c> must
/// be given.
/// </remarks>
/// <param name="args">Command-line arguments.</param>
/// <exception cref="System.ArgumentException">
/// An argument is unknown or lacks its value, the number of input sources is not
/// exactly one, or <c>-stdin</c> is combined with tree input.
/// </exception>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    string parserModel = null;
    string sentimentModel = null;
    string filename = null;
    string fileList = null;
    bool stdin = false;
    bool filterUnknown = false;
    IList<SentimentPipeline.Output> outputFormats = Java.Util.Collections.SingletonList(SentimentPipeline.Output.Root);
    SentimentPipeline.Input inputFormat = SentimentPipeline.Input.Text;
    string tlppClass = DefaultTlppClass;

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];
        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel"))
        {
            sentimentModel = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel"))
        {
            parserModel = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-file"))
        {
            filename = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-fileList"))
        {
            fileList = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-stdin"))
        {
            stdin = true;
            argIndex++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-input"))
        {
            // Invariant upper-casing: the enum names must not depend on the current culture.
            inputFormat = SentimentPipeline.Input.ValueOf(RequireArgumentValue(args, argIndex).ToUpperInvariant());
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-output"))
        {
            string[] formats = RequireArgumentValue(args, argIndex).Split(',');
            outputFormats = new List<SentimentPipeline.Output>();
            foreach (string format in formats)
            {
                outputFormats.Add(SentimentPipeline.Output.ValueOf(format.ToUpperInvariant()));
            }
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-filterUnknown"))
        {
            filterUnknown = true;
            argIndex++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tlppClass"))
        {
            tlppClass = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-help"))
        {
            Help();
            System.Environment.Exit(0);
        }
        else
        {
            // Report the offending token itself (not the one after it, which may not exist).
            log.Info("Unknown argument " + flag);
            Help();
            throw new ArgumentException("Unknown argument " + flag);
        }
    }

    // We construct two pipelines. One handles tokenization, if
    // necessary. The other takes tokenized sentences and converts
    // them to sentiment trees.
    Properties pipelineProps = new Properties();
    Properties tokenizerProps = null;
    if (sentimentModel != null)
    {
        pipelineProps.SetProperty("sentiment.model", sentimentModel);
    }
    if (parserModel != null)
    {
        pipelineProps.SetProperty("parse.model", parserModel);
    }
    if (inputFormat == SentimentPipeline.Input.Trees)
    {
        pipelineProps.SetProperty("annotators", "binarizer, sentiment");
        pipelineProps.SetProperty("customAnnotatorClass.binarizer", "edu.stanford.nlp.pipeline.BinarizerAnnotator");
        pipelineProps.SetProperty("binarizer.tlppClass", tlppClass);
        pipelineProps.SetProperty("enforceRequirements", "false");
    }
    else
    {
        pipelineProps.SetProperty("annotators", "parse, sentiment");
        pipelineProps.SetProperty("parse.binaryTrees", "true");
        pipelineProps.SetProperty("parse.buildgraphs", "false");
        pipelineProps.SetProperty("enforceRequirements", "false");
        tokenizerProps = new Properties();
        tokenizerProps.SetProperty("annotators", "tokenize, ssplit");
    }
    if (stdin && tokenizerProps != null)
    {
        tokenizerProps.SetProperty(StanfordCoreNLP.NewlineSplitterProperty, "true");
    }

    int count = 0;
    if (filename != null)
    {
        count++;
    }
    if (fileList != null)
    {
        count++;
    }
    if (stdin)
    {
        count++;
    }
    if (count > 1)
    {
        throw new ArgumentException("Please only specify one of -file, -fileList or -stdin");
    }
    if (count == 0)
    {
        throw new ArgumentException("Please specify either -file, -fileList or -stdin");
    }
    if (stdin && tokenizerProps == null)
    {
        // The stdin loop below feeds every line through the tokenizer, which is only
        // built for text input; fail early instead of dereferencing null later.
        throw new ArgumentException("-stdin requires text input and cannot be combined with -input TREES");
    }

    StanfordCoreNLP tokenizer = (tokenizerProps == null) ? null : new StanfordCoreNLP(tokenizerProps);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(pipelineProps);

    if (filename != null)
    {
        // Process a file. The pipeline will do tokenization, which
        // means it will split it into sentences as best as possible
        // with the tokenizer.
        IList<Annotation> annotations = GetAnnotations(tokenizer, inputFormat, filename, filterUnknown);
        foreach (Annotation annotation in annotations)
        {
            pipeline.Annotate(annotation);
            foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                System.Console.Out.WriteLine(sentence);
                OutputTree(System.Console.Out, sentence, outputFormats);
            }
        }
    }
    else if (fileList != null)
    {
        // Process multiple files. The pipeline will do tokenization,
        // which means it will split it into sentences as best as
        // possible with the tokenizer. Output will go to filename.out
        // for each file.
        foreach (string file in fileList.Split(','))
        {
            IList<Annotation> annotations = GetAnnotations(tokenizer, inputFormat, file, filterUnknown);
            FileOutputStream fout = new FileOutputStream(file + ".out");
            try
            {
                TextWriter pout = new TextWriter(fout);
                foreach (Annotation annotation in annotations)
                {
                    pipeline.Annotate(annotation);
                    foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                    {
                        pout.WriteLine(sentence);
                        OutputTree(pout, sentence, outputFormats);
                    }
                }
                pout.Flush();
            }
            finally
            {
                // Release the output file even when annotation or writing fails.
                fout.Close();
            }
        }
    }
    else
    {
        // Process stdin. Each line will be treated as a single sentence.
        log.Info("Reading in text from stdin.");
        log.Info("Please enter one sentence per line.");
        log.Info("Processing will end when EOF is reached.");
        BufferedReader reader = IOUtils.ReaderFromStdin("utf-8");
        for (string line; (line = reader.ReadLine()) != null;)
        {
            line = line.Trim();
            if (!line.IsEmpty())
            {
                Annotation annotation = tokenizer.Process(line);
                pipeline.Annotate(annotation);
                foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    OutputTree(System.Console.Out, sentence, outputFormats);
                }
            }
            else
            {
                // Output blank lines for blank lines so the tool can be
                // used for line-by-line text processing
                System.Console.Out.WriteLine();
            }
        }
    }
}

/// <summary>Returns the value following the flag at <paramref name="index"/>.</summary>
/// <param name="args">Command-line arguments.</param>
/// <param name="index">Position of the flag whose value is wanted.</param>
/// <returns>The element at <c>index + 1</c>.</returns>
/// <exception cref="System.ArgumentException">The flag is the last argument, so no value follows it.</exception>
private static string RequireArgumentValue(string[] args, int index)
{
    if (index + 1 >= args.Length)
    {
        throw new ArgumentException("Missing value for argument " + args[index]);
    }
    return args[index + 1];
}