예제 #1
0
        /// <summary>
        /// Evaluates a serialized sentiment model against a treebank that carries gold labels
        /// and prints a summary of the results.
        /// </summary>
        /// <remarks>
        /// Expected arguments are <c>-model model -treebank treebank</c>.
        /// <br />
        /// For example <br />
        /// <code>
        /// java edu.stanford.nlp.sentiment.Evaluate
        /// -model edu/stanford/nlp/models/sentiment/sentiment.ser.gz
        /// -treebank /u/nlp/data/sentiment/trees/dev.txt
        /// </code>
        /// <c>-filterUnknown</c> passes the trees through SentimentUtils.FilterUnknownRoots
        /// before they are evaluated.
        /// Every other argument is handed to the options object of the loaded model,
        /// for example <c>-numClasses</c> or <c>-equivalenceClasses</c>.
        /// See RNNOptions, RNNTestOptions and RNNTrainOptions for more arguments.
        /// </remarks>
        /// <param name="args">Command line arguments</param>
        /// <exception cref="ArgumentException">
        /// A flag that needs a value is the last argument, <c>-model</c> or <c>-treebank</c>
        /// is absent, or an argument is not recognized by the model options.
        /// </exception>
        public static void Main(string[] args)
        {
            string         modelPath     = null;
            string         treePath      = null;
            bool           filterUnknown = false;
            IList <string> remainingArgs = Generics.NewArrayList();

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag       = args[argIndex];
                bool   isModel    = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");
                bool   isTreebank = Sharpen.Runtime.EqualsIgnoreCase(flag, "-treebank");

                // Both flags consume the following argument; without this check a
                // trailing flag would index past the end of the array.
                if ((isModel || isTreebank) && argIndex + 1 >= args.Length)
                {
                    log.Info("Missing value for argument " + flag);
                    throw new ArgumentException("Missing value for argument " + flag);
                }

                if (isModel)
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (isTreebank)
                {
                    treePath  = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-filterUnknown"))
                {
                    filterUnknown = true;
                    argIndex++;
                }
                else
                {
                    // Not ours: keep it for the model options, which can only be
                    // consulted once the model has been loaded.
                    remainingArgs.Add(flag);
                    argIndex++;
                }
            }

            // Reject missing required paths up front instead of passing null on
            // to the model loader or the tree reader.
            if (modelPath == null)
            {
                log.Info("No model specified; expected -model <path>");
                throw new ArgumentException("No model specified; expected -model <path>");
            }
            if (treePath == null)
            {
                log.Info("No treebank specified; expected -treebank <path>");
                throw new ArgumentException("No treebank specified; expected -treebank <path>");
            }

            string[] newArgs = new string[remainingArgs.Count];
            Sharpen.Collections.ToArray(remainingArgs, newArgs);
            SentimentModel model = SentimentModel.LoadSerialized(modelPath);

            for (int argIndex_1 = 0; argIndex_1 < newArgs.Length;)
            {
                // SetOption returns the index after the arguments it consumed;
                // an unchanged index means the argument was not recognized.
                int newIndex = model.op.SetOption(newArgs, argIndex_1);
                if (argIndex_1 == newIndex)
                {
                    log.Info("Unknown argument " + newArgs[argIndex_1]);
                    throw new ArgumentException("Unknown argument " + newArgs[argIndex_1]);
                }
                argIndex_1 = newIndex;
            }
            IList <Tree> trees = SentimentUtils.ReadTreesWithGoldLabels(treePath);

            if (filterUnknown)
            {
                trees = SentimentUtils.FilterUnknownRoots(trees);
            }
            Edu.Stanford.Nlp.Sentiment.Evaluate eval = new Edu.Stanford.Nlp.Sentiment.Evaluate(model);
            eval.Eval(trees);
            eval.PrintSummary();
        }
예제 #2
0
        /// <summary>
        /// Reads <paramref name="filename"/> in the requested input format and returns
        /// one annotation per sentence (text input) or per tree (tree input).
        /// </summary>
        /// <remarks>
        /// Text input: the file contents are annotated with <paramref name="tokenizer"/>,
        /// and every resulting sentence is wrapped in its own annotation whose sentence
        /// list holds just that sentence.
        /// Tree input: every tree becomes a one-sentence annotation with the tree stored
        /// under TreeCoreAnnotations.TreeAnnotation. When <paramref name="filterUnknown"/>
        /// is set, the trees are read with gold labels and passed through
        /// SentimentUtils.FilterUnknownRoots; otherwise they are loaded through a
        /// MemoryTreebank constructed with "utf-8".
        /// </remarks>
        /// <param name="tokenizer">Pipeline used to annotate raw text; only used for text input.</param>
        /// <param name="inputFormat">How the file should be interpreted.</param>
        /// <param name="filename">Path of the file to read.</param>
        /// <param name="filterUnknown">Only consulted for tree input; see remarks.</param>
        /// <returns>The list of annotations built from the file.</returns>
        /// <exception cref="ArgumentException">The input format is neither Text nor Trees.</exception>
        public static IList <Annotation> GetAnnotations(StanfordCoreNLP tokenizer, SentimentPipeline.Input inputFormat, string filename, bool filterUnknown)
        {
            if (inputFormat == SentimentPipeline.Input.Text)
            {
                string     contents = IOUtils.SlurpFileNoExceptions(filename);
                Annotation document = new Annotation(contents);
                tokenizer.Annotate(document);

                IList <Annotation> perSentence = Generics.NewArrayList();
                foreach (ICoreMap sentence in document.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    // Re-wrap each sentence so that it can be processed on its own.
                    Annotation wrapped = new Annotation(sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
                    wrapped.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
                    perSentence.Add(wrapped);
                }
                return perSentence;
            }

            if (inputFormat == SentimentPipeline.Input.Trees)
            {
                IList <Tree> loadedTrees;
                if (!filterUnknown)
                {
                    MemoryTreebank bank = new MemoryTreebank("utf-8");
                    bank.LoadPath(filename, null);
                    loadedTrees = new List <Tree>(bank);
                }
                else
                {
                    loadedTrees = SentimentUtils.FilterUnknownRoots(SentimentUtils.ReadTreesWithGoldLabels(filename));
                }

                IList <Annotation> perTree = Generics.NewArrayList();
                foreach (Tree tree in loadedTrees)
                {
                    // The sentence text is rebuilt from the tree's yield; the
                    // enclosing annotation itself carries no text.
                    ICoreMap treeSentence = new Annotation(SentenceUtils.ListToString(tree.Yield()));
                    treeSentence.Set(typeof(TreeCoreAnnotations.TreeAnnotation), tree);

                    IList <ICoreMap> single  = Java.Util.Collections.SingletonList(treeSentence);
                    Annotation       wrapper = new Annotation(string.Empty);
                    wrapper.Set(typeof(CoreAnnotations.SentencesAnnotation), single);
                    perTree.Add(wrapper);
                }
                return perTree;
            }

            throw new ArgumentException("Unknown format " + inputFormat);
        }
        /// <summary>
        /// Evaluates externally produced sentiment predictions against gold-labeled trees
        /// and prints a summary of the results.
        /// </summary>
        /// <remarks>
        /// Expected arguments are <c>-gold gold -predicted predicted</c>.
        /// <br />
        /// For example <br />
        /// <c>java edu.stanford.nlp.sentiment.ExternalEvaluate -gold annotatedTrees.txt -predicted predictedTrees.txt</c>
        /// <br />
        /// Every other argument is handed to a fresh RNNOptions instance via SetOption.
        /// If either path is missing, a message is logged and the process exits with code -1.
        /// </remarks>
        /// <param name="args">Command line arguments</param>
        /// <exception cref="ArgumentException">
        /// <c>-gold</c> or <c>-predicted</c> is the last argument and therefore has no value,
        /// or an argument is not recognized by RNNOptions.
        /// </exception>
        public static void Main(string[] args)
        {
            RNNOptions curOptions    = new RNNOptions();
            string     goldPath      = null;
            string     predictedPath = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag        = args[argIndex];
                bool   isGold      = Sharpen.Runtime.EqualsIgnoreCase(flag, "-gold");
                bool   isPredicted = Sharpen.Runtime.EqualsIgnoreCase(flag, "-predicted");

                // Both flags consume the following argument; without this check a
                // trailing flag would index past the end of the array.
                if ((isGold || isPredicted) && argIndex + 1 >= args.Length)
                {
                    throw new ArgumentException("Missing value for argument " + flag);
                }

                if (isGold)
                {
                    goldPath  = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (isPredicted)
                {
                    predictedPath = args[argIndex + 1];
                    argIndex     += 2;
                }
                else
                {
                    // SetOption returns the index after the arguments it consumed;
                    // an unchanged index means the argument was not recognized.
                    int newArgIndex = curOptions.SetOption(args, argIndex);
                    if (newArgIndex == argIndex)
                    {
                        throw new ArgumentException("Unknown argument " + args[argIndex]);
                    }
                    argIndex = newArgIndex;
                }
            }
            if (goldPath == null)
            {
                log.Info("goldPath not set. Exit.");
                System.Environment.Exit(-1);
            }
            if (predictedPath == null)
            {
                log.Info("predictedPath not set. Exit.");
                System.Environment.Exit(-1);
            }
            // filterUnknown not supported because I'd need to know which sentences
            // are removed to remove them from predicted
            IList <Tree> goldTrees      = SentimentUtils.ReadTreesWithGoldLabels(goldPath);
            IList <Tree> predictedTrees = SentimentUtils.ReadTreesWithPredictedLabels(predictedPath);

            Edu.Stanford.Nlp.Sentiment.ExternalEvaluate evaluator = new Edu.Stanford.Nlp.Sentiment.ExternalEvaluate(curOptions, predictedTrees);
            evaluator.Eval(goldTrees);
            evaluator.PrintSummary();
        }
        /// <summary>Trains a sentiment model.</summary>
        /// <remarks>
        /// The -trainPath argument points to a labeled sentiment treebank.
        /// The trees in this data will be used to train the model parameters (also to seed the model vocabulary).
        /// When it is omitted, "sentimentTreesDebug.txt" is used.
        /// The -devPath argument points to a second labeled sentiment treebank.
        /// The trees in this data will be used to periodically evaluate the performance of the model.
        /// We won't train on this data; it will only be used to test how well the model generalizes to unseen data.
        /// The -model argument specifies where to save the learned sentiment model.
        /// The -train flag runs training and then saves the model; it requires -model.
        /// The -gradientcheck flag runs a gradient check on the training trees.
        /// The -filterUnknown flag passes the training and dev trees through
        /// SentimentUtils.FilterUnknownRoots before they are used.
        /// Every other argument is handed to RNNOptions via SetOption.
        /// </remarks>
        /// <param name="args">Command line arguments</param>
        /// <exception cref="ArgumentException">
        /// A flag that needs a value is the last argument, -train is given without -model,
        /// or an argument is not recognized by RNNOptions.
        /// </exception>
        public static void Main(string[] args)
        {
            RNNOptions op               = new RNNOptions();
            string     trainPath        = "sentimentTreesDebug.txt";
            string     devPath          = null;
            bool       runGradientCheck = false;
            bool       runTraining      = false;
            bool       filterUnknown    = false;
            string     modelPath        = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag        = args[argIndex];
                bool   isTrainPath = Sharpen.Runtime.EqualsIgnoreCase(flag, "-trainpath");
                bool   isDevPath   = Sharpen.Runtime.EqualsIgnoreCase(flag, "-devpath");
                bool   isModel     = Sharpen.Runtime.EqualsIgnoreCase(flag, "-model");

                // These three flags consume the following argument; without this
                // check a trailing flag would index past the end of the array.
                if ((isTrainPath || isDevPath || isModel) && argIndex + 1 >= args.Length)
                {
                    throw new ArgumentException("Missing value for argument " + flag);
                }

                if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-train"))
                {
                    runTraining = true;
                    argIndex++;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-gradientcheck"))
                {
                    runGradientCheck = true;
                    argIndex++;
                }
                else if (isTrainPath)
                {
                    trainPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (isDevPath)
                {
                    devPath   = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (isModel)
                {
                    modelPath = args[argIndex + 1];
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-filterUnknown"))
                {
                    filterUnknown = true;
                    argIndex++;
                }
                else
                {
                    // SetOption returns the index after the arguments it consumed;
                    // an unchanged index means the argument was not recognized.
                    int newArgIndex = op.SetOption(args, argIndex);
                    if (newArgIndex == argIndex)
                    {
                        throw new ArgumentException("Unknown argument " + args[argIndex]);
                    }
                    argIndex = newArgIndex;
                }
            }

            // The trained model is saved to modelPath, so reject a missing path
            // before any training work is done rather than after it.
            if (runTraining && modelPath == null)
            {
                throw new ArgumentException("-train requires -model to specify where to save the model");
            }

            // read in the trees
            IList <Tree> trainingTrees = SentimentUtils.ReadTreesWithGoldLabels(trainPath);

            log.Info("Read in " + trainingTrees.Count + " training trees");
            if (filterUnknown)
            {
                trainingTrees = SentimentUtils.FilterUnknownRoots(trainingTrees);
                log.Info("Filtered training trees: " + trainingTrees.Count);
            }
            IList <Tree> devTrees = null;

            if (devPath != null)
            {
                devTrees = SentimentUtils.ReadTreesWithGoldLabels(devPath);
                log.Info("Read in " + devTrees.Count + " dev trees");
                if (filterUnknown)
                {
                    devTrees = SentimentUtils.FilterUnknownRoots(devTrees);
                    log.Info("Filtered dev trees: " + devTrees.Count);
                }
            }
            // TODO: binarize the trees, then collapse the unary chains.
            // Collapsed unary chains always have the label of the top node in
            // the chain
            // Note: the sentiment training data already has this done.
            // However, when we handle trees given to us from the Stanford Parser,
            // we will have to perform this step
            // build an uninitialized SentimentModel from the binary productions
            log.Info("Sentiment model options:\n" + op);
            SentimentModel model = new SentimentModel(op, trainingTrees);

            if (op.trainOptions.initialMatrixLogPath != null)
            {
                StringUtils.PrintToFile(new File(op.trainOptions.initialMatrixLogPath), model.ToString(), false, false, "utf-8");
            }
            // TODO: need to handle unk rules somehow... at test time the tree
            // structures might have something that we never saw at training
            // time.  for example, we could put a threshold on all of the
            // rules at training time and anything that doesn't meet that
            // threshold goes into the unk.  perhaps we could also use some
            // component of the accepted training rules to build up the "unk"
            // parameter in case there are no rules that don't meet the
            // threshold
            if (runGradientCheck)
            {
                RunGradientCheck(model, trainingTrees);
            }
            if (runTraining)
            {
                Train(model, modelPath, trainingTrees, devTrees);
                model.SaveSerialized(modelPath);
            }
        }