Beispiel #1
0
        /// <summary>
        /// Command-line entry point: loads a serialized sentiment model, evaluates it
        /// on a treebank with gold labels and prints a summary of the results.
        /// </summary>
        /// <remarks>
        /// Expected arguments are <c>-model model -treebank treebank</c>.
        /// <br />
        /// For example <br />
        /// <code>
        /// java edu.stanford.nlp.sentiment.Evaluate
        /// -model edu/stanford/nlp/models/sentiment/sentiment.ser.gz
        /// -treebank /u/nlp/data/sentiment/trees/dev.txt
        /// </code>
        /// The optional <c>-filterUnknown</c> flag passes the trees through
        /// <c>SentimentUtils.FilterUnknownRoots</c> before evaluation.
        /// <br />
        /// Every other argument is handed to the options of the loaded model
        /// (<c>model.op.SetOption</c>), for example <c>-numClasses</c>.
        /// See RNNOptions, RNNTestOptions and RNNTrainOptions for more arguments.
        /// Flag names are matched case-insensitively.
        /// </remarks>
        /// <param name="args">The command-line arguments described above.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when <c>-model</c> or <c>-treebank</c> is missing, when either of
        /// them is given without a value, or when an argument is not recognized by
        /// the model options.
        /// </exception>
        public static void Main(string[] args)
        {
            string         modelPath     = null;
            string         treePath      = null;
            bool           filterUnknown = false;
            IList <string> remainingArgs = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag       = args[argIndex];
                bool   isModel    = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
                bool   isTreebank = !isModel && Sharpen.Runtime.EqualsIgnoreCase(flag, "-treebank");

                if (isModel || isTreebank)
                {
                    // Both flags consume the following argument; fail with a clear
                    // message instead of indexing past the end of the array.
                    if (argIndex + 1 >= args.Length)
                    {
                        throw new ArgumentException("Missing value for argument " + flag);
                    }
                    if (isModel)
                    {
                        modelPath = args[argIndex + 1];
                    }
                    else
                    {
                        treePath = args[argIndex + 1];
                    }
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-filterUnknown"))
                {
                    filterUnknown = true;
                    argIndex++;
                }
                else
                {
                    // Not handled here: keep it for the model options below.
                    remainingArgs.Add(flag);
                    argIndex++;
                }
            }
            if (modelPath == null)
            {
                throw new ArgumentException("Must specify model file with -model");
            }
            if (treePath == null)
            {
                throw new ArgumentException("Must specify treebank file with -treebank");
            }
            string[] newArgs = new string[remainingArgs.Count];
            Sharpen.Collections.ToArray(remainingArgs, newArgs);
            SentimentModel model = SentimentModel.LoadSerialized(modelPath);

            for (int argIndex_1 = 0; argIndex_1 < newArgs.Length;)
            {
                // SetOption returns the index after the arguments it consumed;
                // an unchanged index means the argument was not recognized.
                int newIndex = model.op.SetOption(newArgs, argIndex_1);
                if (argIndex_1 == newIndex)
                {
                    log.Info("Unknown argument " + newArgs[argIndex_1]);
                    throw new ArgumentException("Unknown argument " + newArgs[argIndex_1]);
                }
                argIndex_1 = newIndex;
            }
            IList <Tree> trees = SentimentUtils.ReadTreesWithGoldLabels(treePath);

            if (filterUnknown)
            {
                trees = SentimentUtils.FilterUnknownRoots(trees);
            }
            Edu.Stanford.Nlp.Sentiment.Evaluate eval = new Edu.Stanford.Nlp.Sentiment.Evaluate(model);
            eval.Eval(trees);
            eval.PrintSummary();
        }
        /// <summary>
        /// Turns a text file into trees for use in a RNTN classifier such as
        /// the treebank used in the Sentiment project, printing one tree per
        /// sentence block to standard output.
        /// </summary>
        /// <remarks>
        /// The expected input file consists of blocks separated by blank lines.
        /// The first line of a block has the main label of the sentence followed
        /// by the full sentence. Lines after the first line but before the blank
        /// line are treated as labeled sub-phrases: each starts with the label
        /// and then contains the list of tokens the label applies to.
        /// When no sentiment model is given, all phrases that do not have their
        /// own label take on the main sentence label.
        /// For example:
        /// <br />
        /// <code>
        /// 1 Today is not a good day.<br />
        /// 3 good<br />
        /// 3 good day <br />
        /// 3 a good day <br />
        /// <br />
        /// (next block starts here) <br />
        /// </code>
        /// By default the englishPCFG parser is used.  This can be changed
        /// with the <c>-parserModel</c> flag.  Specify an input file with
        /// <c>-input</c>.
        /// <br />
        /// If a sentiment model is provided with <c>-sentimentModel</c>, that model
        /// will be used to prelabel the sentences.  Any spans with given
        /// labels will then be used to adjust those labels.
        /// <br />
        /// An unrecognized argument is logged and the process exits with code 2.
        /// </remarks>
        /// <param name="args">The command-line arguments described above.</param>
        /// <exception cref="ArgumentException">
        /// Thrown when <c>-input</c> is not given or when a flag that needs a
        /// value is the last argument.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// Thrown when the first line of a block yields no tokens.
        /// </exception>
        public static void Main(string[] args)
        {
            CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
            string         parserModel           = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
            string         inputPath             = null;
            string         sentimentModelPath    = null;
            SentimentModel sentimentModel        = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag             = args[argIndex];
                bool   isInput          = Sharpen.Runtime.EqualsIgnoreCase(flag, "-input");
                bool   isParserModel    = !isInput && Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel");
                bool   isSentimentModel = !isInput && !isParserModel && Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel");

                if (isInput || isParserModel || isSentimentModel)
                {
                    // All three flags consume the following argument; fail with a
                    // clear message instead of indexing past the end of the array.
                    if (argIndex + 1 >= args.Length)
                    {
                        throw new ArgumentException("Missing value for argument " + flag);
                    }
                    string value = args[argIndex + 1];
                    if (isInput)
                    {
                        inputPath = value;
                    }
                    else if (isParserModel)
                    {
                        parserModel = value;
                    }
                    else
                    {
                        sentimentModelPath = value;
                    }
                    argIndex += 2;
                }
                else
                {
                    log.Info("Unknown argument " + args[argIndex]);
                    System.Environment.Exit(2);
                }
            }
            if (inputPath == null)
            {
                throw new ArgumentException("Must specify input file with -input");
            }
            LexicalizedParser parser    = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
            TreeBinarizer     binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());

            if (sentimentModelPath != null)
            {
                sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
            }
            string text = IOUtils.SlurpFileNoExceptions(inputPath);

            // The separator is a regular expression (one or more blank lines), so it
            // must go through Regex.Split: string.Split would look for the literal
            // characters of the pattern and never split anything.
            string[] chunks = System.Text.RegularExpressions.Regex.Split(text, "\\n\\s*\\n+");
            // need blank line to make a new chunk
            foreach (string chunk in chunks)
            {
                string trimmedChunk = chunk.Trim();
                if (trimmedChunk.Length == 0)
                {
                    continue;
                }
                // The expected format is that line 0 will be the text of the
                // sentence, and each subsequent line, if any, will be a value
                // followed by the sequence of tokens that get that value.
                // Here we take the first line and tokenize it as one sentence.
                string[]             lines    = trimmedChunk.Split('\n');
                string               sentence = lines[0];
                StringReader         sin      = new StringReader(sentence);
                DocumentPreprocessor document = new DocumentPreprocessor(sin);
                document.SetSentenceFinalPuncWords(new string[] { "\n" });
                // Take the first sentence produced by the preprocessor. The
                // enumerator has to be advanced before Current is valid, which
                // foreach does for us.
                IList <IHasWord> tokens = null;
                foreach (IList <IHasWord> sentenceTokens in document)
                {
                    tokens = sentenceTokens;
                    break;
                }
                if (tokens == null || tokens.Count == 0)
                {
                    throw new InvalidOperationException("No tokens found in sentence line: " + sentence);
                }
                // The first token of the sentence line is its main label.
                int mainLabel = System.Convert.ToInt32(tokens[0].Word(), System.Globalization.CultureInfo.InvariantCulture);
                tokens = tokens.SubList(1, tokens.Count);
                IDictionary <Pair <int, int>, string> spanToLabels = Generics.NewHashMap();
                for (int i = 1; i < lines.Length; ++i)
                {
                    ExtractLabels(spanToLabels, tokens, lines[i]);
                }
                // TODO: add an option which treats the spans as constraints when parsing
                Tree tree           = parser.Apply(tokens);
                Tree binarized      = binarizer.TransformTree(tree);
                Tree collapsedUnary = transformer.TransformTree(binarized);
                // if there is a sentiment model for use in prelabeling, we
                // label here and then use the user given labels to adjust
                if (sentimentModel != null)
                {
                    Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                    SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
                    scorer.ForwardPropagateTree(collapsedUnary);
                    SetPredictedLabels(collapsedUnary);
                }
                else
                {
                    SetUnknownLabels(collapsedUnary, mainLabel);
                }
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                collapsedUnary.IndexSpans();
                // Apply the explicitly labeled spans on top of the default labels.
                foreach (KeyValuePair <Pair <int, int>, string> pairStringEntry in spanToLabels)
                {
                    SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
                }
                System.Console.Out.WriteLine(collapsedUnary);
            }
        }