Example #1
        /// <summary>
        /// demoAPI demonstrates other ways of calling the parser with
        /// already tokenized text, or in some cases, raw text that needs to
        /// be tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// Two inputs are parsed in turn: a pre-tokenized word array that is
        /// converted to a list of core labels, and a raw string that is
        /// tokenized through an explicitly created PTBTokenizer factory.
        /// Typed dependencies are then derived from the second parse via the
        /// parser's treebank language pack, and a TreePrint object is
        /// constructed.  Note that the options used when creating the
        /// TreePrint can determine what results to print out.  Once again,
        /// one can capture the output by passing a PrintWriter to
        /// TreePrint.printTree. This code is for English.
        /// <para>
        /// NOTE(review): this listing contains no braces and seems to end
        /// right after the TreePrint is created; confirm against the full
        /// source before relying on it.
        /// </para>
        /// </remarks>
        /// <param name="lp">Parser used to parse both example sentences; also the source of the treebank language pack.</param>
        public static void DemoAPI(LexicalizedParser lp)
            // This option shows parsing a list of correctly tokenized words
            string[]          sent     = new string[] { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> rawWords = SentenceUtils.ToCoreLabelList(sent);
            // NOTE(review): this first parse is overwritten below without being
            // read anywhere in the visible lines.
            Tree parse = lp.Apply(rawWords);

            // This option shows loading and using an explicit tokenizer
            string sent2 = "This is another sentence.";
            // The tokenizer factory is created with an empty options string.
            ITokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tok       = tokenizerFactory.GetTokenizer(new StringReader(sent2));
            IList <CoreLabel>             rawWords2 = tok.Tokenize();

            // Replace the earlier parse with the parse of the tokenized raw string.
            parse = lp.Apply(rawWords2);
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // PennTreebankLanguagePack for English
            IGrammaticalStructureFactory gsf = tlp.GrammaticalStructureFactory();
            // Build the grammatical structure of the second parse and request
            // its CC-processed typed dependencies.
            GrammaticalStructure         gs  = gsf.NewGrammaticalStructure(parse);
            IList <TypedDependency>      tdl = gs.TypedDependenciesCCprocessed();

            // You can also use a TreePrint object to print trees and dependencies
            // NOTE(review): neither tdl nor tp is used in the visible lines;
            // presumably the print calls were lost from this listing.
            TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");

Example #2
        /// <summary>
        /// demoDP demonstrates turning a file into tokens and then parse
        /// trees.
        /// </summary>
        /// <remarks>
        /// The file is read through a DocumentPreprocessor, which yields one
        /// list of words per sentence; each sentence is parsed with
        /// <paramref name="lp"/>.  Typed dependencies are computed for a parse
        /// only when the parser's treebank language pack reports that it
        /// supports grammatical structures.
        /// <para>
        /// NOTE(review): this listing contains no braces, and the visible
        /// lines never print or return the parse or the dependencies;
        /// presumably output statements were lost — confirm against the full
        /// source.
        /// </para>
        /// </remarks>
        /// <param name="lp">Parser applied to every sentence of the file.</param>
        /// <param name="filename">Value passed to the DocumentPreprocessor constructor to identify the input.</param>
        public static void DemoDP(LexicalizedParser lp, string filename)
            // This option shows loading, sentence-segmenting and tokenizing
            // a file using DocumentPreprocessor.
            ITreebankLanguagePack tlp = lp.TreebankLanguagePack();
            // a PennTreebankLanguagePack for English
            // gsf stays null when the language pack has no grammatical
            // structure support; the loop below checks for that.
            IGrammaticalStructureFactory gsf = null;

            if (tlp.SupportsGrammaticalStructures())
                gsf = tlp.GrammaticalStructureFactory();
            // You could also create a tokenizer here (as below) and pass it
            // to DocumentPreprocessor
            foreach (IList <IHasWord> sentence in new DocumentPreprocessor(filename))
                Tree parse = lp.Apply(sentence);
                // Dependencies are only derived when a factory is available.
                if (gsf != null)
                    GrammaticalStructure gs  = gsf.NewGrammaticalStructure(parse);
                    // NOTE(review): declared as the non-generic ICollection here,
                    // whereas DemoAPI stores the same call's result in
                    // IList<TypedDependency>.
                    ICollection          tdl = gs.TypedDependenciesCCprocessed();
        /// <summary>
        /// Turns a text file into trees for use in a RNTN classifier such as
        /// the treebank used in the Sentiment project.
        /// </summary>
        /// <remarks>
        /// The input is split into chunks separated by blank lines.  The
        /// first line of a chunk is the sentence, whose first token is its
        /// label; every following line is a label followed by the tokens of
        /// the span that receives that label.
        /// For example:
        /// <br />
        /// <code>
        /// 1 Today is not a good day.<br />
        /// 3 good<br />
        /// 3 good day <br />
        /// 3 a good day <br />
        /// <br />
        /// (next block starts here) <br />
        /// </code>
        /// By default the englishPCFG parser is used.  This can be changed
        /// with the <c>-parserModel</c> flag.  Specify an input file with
        /// <c>-input</c>.
        /// <br />
        /// If a sentiment model is provided with -sentimentModel, that model
        /// will be used to prelabel the sentences.  Any spans with given
        /// labels will then be used to adjust those labels.
        /// <para>
        /// NOTE(review): this listing contains no braces, several
        /// <c>else</c>/<c>continue</c> keywords appear to be missing, and the
        /// method may continue past the last visible line.  Treat the control
        /// flow described in the inline notes as presumed, not confirmed.
        /// </para>
        /// </remarks>
        /// <param name="args">Command-line arguments; the flags compared against are <c>-input</c>, <c>-parserModel</c> and <c>-sentimentModel</c>, each followed by its value.</param>
        /// <exception cref="ArgumentException">Thrown when no input path was supplied with <c>-input</c>.</exception>
        public static void Main(string[] args)
            CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
            // Default parser model; replaced when -parserModel is given.
            string         parserModel           = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
            string         inputPath             = null;
            string         sentimentModelPath    = null;
            SentimentModel sentimentModel        = null;

            // The loop header has no increment: each recognized flag advances
            // argIndex by two (flag plus value) inside the body.
            // NOTE(review): args[argIndex + 1] is read without a bounds check, so
            // a trailing flag with no value would index past the end of args.
            for (int argIndex = 0; argIndex < args.Length;)
                if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-input"))
                    inputPath = args[argIndex + 1];
                    argIndex += 2;
                    // NOTE(review): presumably the else-branch of the -input check;
                    // the else keyword is absent from this listing — confirm.
                    if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parserModel"))
                        parserModel = args[argIndex + 1];
                        argIndex   += 2;
                        // NOTE(review): presumably another else-branch — confirm.
                        if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-sentimentModel"))
                            sentimentModelPath = args[argIndex + 1];
                            argIndex          += 2;
                            // NOTE(review): presumably the final else-branch for an
                            // unrecognized flag.  No statement advancing argIndex
                            // is visible on this path — verify the loop terminates.
                            log.Info("Unknown argument " + args[argIndex]);
            // The input file is mandatory.
            if (inputPath == null)
                throw new ArgumentException("Must specify input file with -input");
            LexicalizedParser parser    = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
            // The binarizer is built from the loaded parser's head finder and
            // treebank language pack.
            TreeBinarizer     binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());

            // The sentiment model is optional and only loaded when a path was given.
            if (sentimentModelPath != null)
                sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
            string text = IOUtils.SlurpFileNoExceptions(inputPath);

            // NOTE(review): the separator is written as a regular expression
            // (blank-line boundary).  The built-in .NET string.Split treats
            // separators literally, so this only works if Split resolves to a
            // regex-aware helper — confirm.
            string[] chunks = text.Split("\\n\\s*\\n+");
            // need blank line to make a new chunk
            foreach (string chunk in chunks)
                // NOTE(review): no statement is visible under this empty-chunk
                // check; presumably a continue was lost — confirm.
                if (chunk.Trim().IsEmpty())
                // The expected format is that line 0 will be the text of the
                // sentence, and each subsequence line, if any, will be a value
                // followed by the sequence of tokens that get that value.
                // Here we take the first line and tokenize it as one sentence.
                // NOTE(review): same regex-versus-literal concern as the chunk split above.
                string[]             lines    = chunk.Trim().Split("\\n");
                string               sentence = lines[0];
                StringReader         sin      = new StringReader(sentence);
                DocumentPreprocessor document = new DocumentPreprocessor(sin);
                // Only a newline is registered as sentence-final, so the line is
                // not split at ordinary punctuation.
                document.SetSentenceFinalPuncWords(new string[] { "\n" });
                // NOTE(review): Current is read without a preceding MoveNext();
                // for a standard .NET enumerator that is undefined before the
                // first MoveNext — confirm the enumerator's contract.
                IList <IHasWord> tokens = document.GetEnumerator().Current;
                // The first token of the sentence line is parsed as its integer label.
                int mainLabel           = System.Convert.ToInt32(tokens[0].Word());
                //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
                // Drop the label token so only the sentence words remain.
                tokens = tokens.SubList(1, tokens.Count);
                IDictionary <Pair <int, int>, string> spanToLabels = Generics.NewHashMap();
                // Every line after the first contributes span labels.
                for (int i = 1; i < lines.Length; ++i)
                    ExtractLabels(spanToLabels, tokens, lines[i]);
                // TODO: add an option which treats the spans as constraints when parsing
                // Parse, then binarize, then apply the collapse-unary transformer.
                Tree tree           = parser.Apply(tokens);
                Tree binarized      = binarizer.TransformTree(tree);
                Tree collapsedUnary = transformer.TransformTree(binarized);
                // if there is a sentiment model for use in prelabeling, we
                // label here and then use the user given labels to adjust
                // NOTE(review): scorer is never used in the visible lines, and the
                // indentation places SetUnknownLabels under the same condition;
                // an else branch and further statements may have been lost —
                // confirm against the full source.
                if (sentimentModel != null)
                    SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
                    SetUnknownLabels(collapsedUnary, mainLabel);
                // Apply each extracted span label to the transformed tree.
                foreach (KeyValuePair <Pair <int, int>, string> pairStringEntry in spanToLabels)
                    SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);