/// <summary>
/// Loads a serialized parser model and prints summary statistics about it.
/// </summary>
/// <remarks>
/// Printed, in order: the number and list of basic tag categories, the size
/// of the tag index, the number and list of basic state categories, the
/// size of the state index, and the number of unary and binary grammar
/// rules. Basic categories are obtained from the parser's treebank
/// language pack and collected in sets created by Generics.NewTreeSet.
/// If no model was given, a message is logged and the process exits with
/// code 2.
/// </remarks>
/// <param name="args">
/// Command line arguments. The only recognised flag is
/// <c>-model</c>, which must be followed by the model path.
/// </param>
/// <exception cref="ArgumentException">
/// An unknown argument was given, or <c>-model</c> has no value after it.
/// </exception>
public static void Main(string[] args)
        {
            string parserFile = null;

            // Every recognised flag consumes exactly two entries: the flag and its value.
            for (int argIndex = 0; argIndex < args.Length; argIndex += 2)
            {
                string flag = args[argIndex];

                if (!Sharpen.Runtime.EqualsIgnoreCase(flag, "-model"))
                {
                    string unknownArgument = "Unknown argument " + flag;
                    log.Info(unknownArgument);
                    throw new ArgumentException(unknownArgument);
                }
                // Guard against "-model" being the final argument; reading
                // args[argIndex + 1] blindly would index past the array.
                if (argIndex + 1 >= args.Length)
                {
                    string missingValue = "Missing value for argument " + flag;
                    log.Info(missingValue);
                    throw new ArgumentException(missingValue);
                }
                parserFile = args[argIndex + 1];
            }
            if (parserFile == null)
            {
                log.Info("Must specify a model file with -model");
                System.Environment.Exit(2);
            }
            LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserFile));

            // Tags reduced to their basic category.
            ICollection <string> tags = Generics.NewTreeSet();

            foreach (string tag in parser.tagIndex)
            {
                tags.Add(parser.TreebankLanguagePack().BasicCategory(tag));
            }
            System.Console.Out.WriteLine("Basic tags: " + tags.Count);
            foreach (string basicTag in tags)
            {
                System.Console.Out.Write("  " + basicTag);
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All tags size: " + parser.tagIndex.Size());

            // States reduced to their basic category.
            ICollection <string> states = Generics.NewTreeSet();

            foreach (string state in parser.stateIndex)
            {
                states.Add(parser.TreebankLanguagePack().BasicCategory(state));
            }
            System.Console.Out.WriteLine("Basic states: " + states.Count);
            foreach (string basicState in states)
            {
                System.Console.Out.Write("  " + basicState);
            }
            System.Console.Out.WriteLine();
            System.Console.Out.WriteLine("All states size: " + parser.stateIndex.Size());
            System.Console.Out.WriteLine("Unary grammar size: " + parser.ug.NumRules());
            System.Console.Out.WriteLine("Binary grammar size: " + parser.bg.NumRules());
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Demonstrates reading a file, turning it into tokenized sentences
        /// with a DocumentPreprocessor, and parsing every sentence.
        /// </summary>
        /// <remarks>
        /// Each parse tree is printed by calling PennPrint on the Tree object.
        /// When the parser's language pack supports grammatical structures,
        /// the CC-processed typed dependencies of each tree are printed too.
        /// The original notes say a PrintWriter can be passed to pennPrint to
        /// capture the output, and that this code works with any supported
        /// language.
        /// </remarks>
        /// <param name="lp">Parser applied to each sentence.</param>
        /// <param name="filename">Value handed to the DocumentPreprocessor constructor.</param>
        public static void DemoDP(LexicalizedParser lp, string filename)
        {
            ITreebankLanguagePack languagePack = lp.TreebankLanguagePack();

            // Only language packs that support grammatical structures give us
            // a factory; otherwise dependency printing is skipped below.
            IGrammaticalStructureFactory structureFactory =
                languagePack.SupportsGrammaticalStructures()
                    ? languagePack.GrammaticalStructureFactory()
                    : null;

            // DocumentPreprocessor handles loading, sentence segmentation and
            // tokenization; a custom tokenizer could be supplied to it instead.
            DocumentPreprocessor preprocessor = new DocumentPreprocessor(filename);

            foreach (IList <IHasWord> words in preprocessor)
            {
                Tree tree = lp.Apply(words);
                tree.PennPrint();
                System.Console.Out.WriteLine();

                if (structureFactory == null)
                {
                    continue;
                }
                GrammaticalStructure structure    = structureFactory.NewGrammaticalStructure(tree);
                ICollection          dependencies = structure.TypedDependenciesCCprocessed();
                System.Console.Out.WriteLine(dependencies);
                System.Console.Out.WriteLine();
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Demonstrates calling the parser on text that is already tokenized,
        /// and on raw text tokenized as a single sentence.
        /// </summary>
        /// <remarks>
        /// Three things are shown: parsing a pre-tokenized word array, parsing
        /// the output of an explicit PTBTokenizer and printing its CC-processed
        /// typed dependencies, and printing the second tree through a TreePrint
        /// object. The options passed to TreePrint determine what is printed.
        /// The original notes say this code is for English.
        /// </remarks>
        /// <param name="lp">Parser used for both example sentences.</param>
        public static void DemoAPI(LexicalizedParser lp)
        {
            // 1) Parse a list of correctly tokenized words.
            string[]          tokenizedWords = { "This", "is", "an", "easy", "sentence", "." };
            IList <CoreLabel> firstLabels    = SentenceUtils.ToCoreLabelList(tokenizedWords);
            Tree firstTree = lp.Apply(firstLabels);

            firstTree.PennPrint();
            System.Console.Out.WriteLine();

            // 2) Tokenize raw text with an explicit tokenizer, then parse it.
            string rawSentence = "This is another sentence.";
            ITokenizerFactory <CoreLabel> factory      = PTBTokenizer.Factory(new CoreLabelTokenFactory(), string.Empty);
            ITokenizer <CoreLabel>        tokenizer    = factory.GetTokenizer(new StringReader(rawSentence));
            IList <CoreLabel>             secondLabels = tokenizer.Tokenize();
            Tree secondTree = lp.Apply(secondLabels);

            // Typed dependencies of the second tree.
            ITreebankLanguagePack        languagePack     = lp.TreebankLanguagePack();
            IGrammaticalStructureFactory structureFactory = languagePack.GrammaticalStructureFactory();
            GrammaticalStructure         structure        = structureFactory.NewGrammaticalStructure(secondTree);
            IList <TypedDependency>      dependencies     = structure.TypedDependenciesCCprocessed();

            System.Console.Out.WriteLine(dependencies);
            System.Console.Out.WriteLine();

            // 3) A TreePrint object can print both the tree and its dependencies.
            TreePrint printer = new TreePrint("penn,typedDependenciesCollapsed");

            printer.PrintTree(secondTree);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Builds the tree transformer and tree filter from the given parser.
 /// </summary>
 /// <param name="parser">
 /// Parser whose treebank language pack feeds the basic-category
 /// transformer and which is passed to the confusing-rules filter.
 /// </param>
 public CacheParseHypotheses(LexicalizedParser parser)
 {
     var languagePack = parser.TreebankLanguagePack();

     treeBasicCategories = new BasicCategoryTreeTransformer(languagePack);
     treeFilter          = new FilterConfusingRules(parser);
 }
        /// <summary>
        /// Turns a text file into trees for use in a RNTN classifier such as
        /// the treebank used in the Sentiment project.
        /// </summary>
        /// <remarks>
        /// The input file consists of blocks separated by blank lines. The
        /// first line of a block holds the main label of the sentence followed
        /// by the full sentence. Any further lines in the block are labeled
        /// sub-phrases: a label followed by the tokens the label applies to.
        /// Phrases without their own label take on the main sentence label
        /// (unless a sentiment model is used for prelabeling, see below).
        /// For example:
        /// <br />
        /// <code>
        /// 1 Today is not a good day.<br />
        /// 3 good<br />
        /// 3 good day <br />
        /// 3 a good day <br />
        /// <br />
        /// (next block starts here) <br />
        /// </code>
        /// By default the englishPCFG parser is used; this can be changed with
        /// the <c>-parserModel</c> flag. The input file is given with
        /// <c>-input</c>.
        /// <br />
        /// If a sentiment model is provided with <c>-sentimentModel</c>, that
        /// model is used to prelabel the sentences, and the spans with given
        /// labels are then used to adjust those labels.
        /// <br />
        /// Each resulting tree is written to standard output. An unknown
        /// argument is logged and terminates the process with exit code 2.
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: <c>-input</c>, <c>-parserModel</c> and
        /// <c>-sentimentModel</c>, each followed by a value.
        /// </param>
        /// <exception cref="ArgumentException">
        /// No <c>-input</c> was given, or a flag has no value after it.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The first line of a block produced no tokens.
        /// </exception>
        public static void Main(string[] args)
        {
            CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
            string         parserModel           = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
            string         inputPath             = null;
            string         sentimentModelPath    = null;
            SentimentModel sentimentModel        = null;

            // Every recognised flag consumes exactly two entries: the flag and its value.
            for (int argIndex = 0; argIndex < args.Length; argIndex += 2)
            {
                string flag     = args[argIndex];
                bool   hasValue = argIndex + 1 < args.Length;
                string value    = hasValue ? args[argIndex + 1] : null;

                if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-input"))
                {
                    inputPath = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel"))
                {
                    parserModel = value;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel"))
                {
                    sentimentModelPath = value;
                }
                else
                {
                    log.Info("Unknown argument " + flag);
                    System.Environment.Exit(2);
                }
                // A known flag given as the very last argument has no value;
                // report that instead of indexing past the end of the array.
                if (!hasValue)
                {
                    throw new ArgumentException("Missing value for argument " + flag);
                }
            }
            if (inputPath == null)
            {
                throw new ArgumentException("Must specify input file with -input");
            }
            LexicalizedParser parser    = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
            TreeBinarizer     binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());

            if (sentimentModelPath != null)
            {
                sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
            }
            string text = IOUtils.SlurpFileNoExceptions(inputPath);

            // A blank line (possibly containing whitespace) starts a new chunk.
            // The separator is a regular expression, so Regex.Split is required;
            // string.Split would treat the pattern as literal text.
            string[] chunks = System.Text.RegularExpressions.Regex.Split(text, "\\n\\s*\\n+");
            foreach (string chunk in chunks)
            {
                string trimmedChunk = chunk.Trim();
                if (trimmedChunk.Length == 0)
                {
                    continue;
                }
                // The expected format is that line 0 will be the text of the
                // sentence, and each subsequent line, if any, will be a value
                // followed by the sequence of tokens that get that value.
                // Here we take the first line and tokenize it as one sentence.
                string[]             lines    = trimmedChunk.Split('\n');
                string               sentence = lines[0];
                StringReader         sin      = new StringReader(sentence);
                DocumentPreprocessor document = new DocumentPreprocessor(sin);
                document.SetSentenceFinalPuncWords(new string[] { "\n" });

                // An enumerator has no current element until MoveNext() succeeds.
                var sentences = document.GetEnumerator();
                if (!sentences.MoveNext())
                {
                    throw new InvalidOperationException("No tokens found in sentence line: " + sentence);
                }
                IList <IHasWord> tokens = sentences.Current;

                // The first token is the main label; it is machine-readable
                // data, so parse it independently of the current culture.
                int mainLabel = System.Convert.ToInt32(tokens[0].Word(), System.Globalization.CultureInfo.InvariantCulture);
                tokens = tokens.SubList(1, tokens.Count);

                IDictionary <Pair <int, int>, string> spanToLabels = Generics.NewHashMap();
                for (int i = 1; i < lines.Length; ++i)
                {
                    ExtractLabels(spanToLabels, tokens, lines[i]);
                }
                // TODO: add an option which treats the spans as constraints when parsing
                Tree tree           = parser.Apply(tokens);
                Tree binarized      = binarizer.TransformTree(tree);
                Tree collapsedUnary = transformer.TransformTree(binarized);
                // if there is a sentiment model for use in prelabeling, we
                // label here and then use the user given labels to adjust
                if (sentimentModel != null)
                {
                    Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                    SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
                    scorer.ForwardPropagateTree(collapsedUnary);
                    SetPredictedLabels(collapsedUnary);
                }
                else
                {
                    SetUnknownLabels(collapsedUnary, mainLabel);
                }
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                collapsedUnary.IndexSpans();
                // Explicit span labels from the input override the defaults set above.
                foreach (KeyValuePair <Pair <int, int>, string> pairStringEntry in spanToLabels)
                {
                    SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
                }
                System.Console.Out.WriteLine(collapsedUnary);
            }
        }