예제 #1
0
        /// <summary>
        /// Post-processes a parsed sentence: optionally maps every tree through
        /// <c>treeMap</c>, fills in the parse annotations, optionally stores a
        /// binarized copy of the first tree, and back-fills missing sentence
        /// indices on the collapsed dependency graph.
        /// </summary>
        /// <param name="sentence">Sentence annotation that receives the parse results.</param>
        /// <param name="trees">
        /// Parse trees for the sentence; the first one is binarized when
        /// <c>saveBinaryTrees</c> is set.
        /// </param>
        private void FinishSentence(ICoreMap sentence, IList <Tree> trees)
        {
            // Apply the optional tree mapping to every tree before annotating.
            if (treeMap != null)
            {
                IList <Tree> remapped = Generics.NewLinkedList();
                foreach (Tree original in trees)
                {
                    remapped.Add(treeMap.Apply(original));
                }
                trees = remapped;
            }

            ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, sentence, trees, extraDependencies);

            if (saveBinaryTrees)
            {
                TreeBinarizer treeBinarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
                Tree          binaryTree    = treeBinarizer.TransformTree(trees[0]);
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binaryTree);
                sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), binaryTree);
            }

            // In some corner cases graph nodes end up without a sentence index,
            // so make one pass and copy the sentence's index onto any node that
            // lacks one.
            SemanticGraph dependencies = sentence.Get(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation));
            if (dependencies == null)
            {
                return;
            }

            foreach (IndexedWord vertex in dependencies.VertexSet())
            {
                if (vertex.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null)
                {
                    // This node already carries an index; leave it alone.
                    continue;
                }
                if (sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null)
                {
                    vertex.SetSentIndex(sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Creates the annotator from configuration properties. The language
        /// parameters class name is read from the property named
        /// <paramref name="annotatorName"/> followed by <c>.tlppClass</c>
        /// (falling back to <c>DefaultTlppClass</c>), loaded by reflection, and
        /// used to build the tree binarizer.
        /// </summary>
        /// <param name="annotatorName">Prefix of the property keys belonging to this annotator.</param>
        /// <param name="props">Properties to read the configuration from.</param>
        public BinarizerAnnotator(string annotatorName, Properties props)
        {
            string tlppKey = annotatorName + ".tlppClass";

            this.tlppClass = props.GetProperty(tlppKey, DefaultTlppClass);

            ITreebankLangParserParams languageParams = ReflectionLoading.LoadByReflection(tlppClass);
            this.binarizer = TreeBinarizer.SimpleTreeBinarizer(languageParams.HeadFinder(), languageParams.TreebankLanguagePack());
        }
 /// <summary>
 /// Creates a parser server for the given port, using
 /// <paramref name="parser"/> both for parsing and to configure the tree
 /// binarizer.
 /// </summary>
 /// <param name="port">Port number handed to the server socket.</param>
 /// <param name="parser">Grammar used to parse and to derive the binarizer settings.</param>
 /// <exception cref="System.IO.IOException"/>
 public LexicalizedParserServer(int port, ParserGrammar parser)
 {
     this.port      = port;
     this.parser    = parser;
     this.binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
     // Open the socket last: if anything above throws (for example a null
     // parser), no socket has been created yet, so none is left open with no
     // owner to close it.
     // NOTE(review): assumes ServerSocket acquires the port in its
     // constructor, as java.net.ServerSocket does - confirm for this port.
     this.serverSocket = new ServerSocket(port);
 }
예제 #4
0
        /// <summary>
        /// Runs every tree of <paramref name="treebank"/> through a binarizer
        /// followed by a <c>BasicCategoryTreeTransformer</c>, then converts the
        /// labels to core labels, percolates head annotations and indexes the
        /// leaves starting from 1.
        /// </summary>
        /// <param name="treebank">Trees to binarize.</param>
        /// <param name="op">Options supplying the language parameters and language pack.</param>
        /// <returns>The transformed trees, in treebank iteration order.</returns>
        public static IList <Tree> BinarizeTreebank(Treebank treebank, Options op)
        {
            // Transformation pipeline: binarizer first, basic-category transformer second.
            CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
            pipeline.AddTransformer(TreeBinarizer.SimpleTreeBinarizer(op.tlpParams.HeadFinder(), op.tlpParams.TreebankLanguagePack()));
            pipeline.AddTransformer(new BasicCategoryTreeTransformer(op.Langpack()));

            Treebank    transformed = treebank.Transform(pipeline);
            IHeadFinder headFinder  = new BinaryHeadFinder(op.tlpParams.HeadFinder());

            IList <Tree> result = Generics.NewArrayList();
            foreach (Tree binaryTree in transformed)
            {
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binaryTree);
                binaryTree.PercolateHeadAnnotations(headFinder);
                // Leaves are indexed from 1 because downstream tools expect
                // 1-based indices; code internal to the srparser renormalizes
                // the indices itself.
                binaryTree.IndexLeaves(1, true);
                result.Add(binaryTree);
            }
            return result;
        }
        /// <summary>
        /// Command-line entry point: parses the flags, loads the parser, reads
        /// the label map and the sentences, parses the sentences, applies the
        /// labels and writes the resulting trees to the output file.
        /// </summary>
        /// <remarks>
        /// Flags are matched case-insensitively.
        /// <br />
        /// Flags that take a value: <c>-output</c>, <c>-sentences</c>,
        /// <c>-labels</c>, <c>-parser</c>, <c>-tagger</c>, <c>-missing</c>,
        /// <c>-separator</c>, <c>-default</c>, <c>-saveUnknowns</c>,
        /// <c>-remapLabels</c>.
        /// <br />
        /// Switches: <c>-binarize</c> / <c>-nobinarize</c> (binarizing is the
        /// default) and <c>-useLabelKeys</c> / <c>-nouseLabelKeys</c>.
        /// <br />
        /// <c>-output</c> and <c>-labels</c> are required, and exactly one of
        /// <c>-sentences</c> or <c>-useLabelKeys</c> must be supplied.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.ArgumentException">
        /// An argument is unknown, a flag is missing its value, or a required
        /// flag is absent.
        /// </exception>
        public static void Main(string[] args)
        {
            // TODO: rather than always rolling our own arg parser, we should
            // find a library which does it for us nicely
            string outputFile    = null;
            string sentencesFile = null;
            string labelsFile    = null;
            string parserFile    = LexicalizedParser.DefaultParserLoc;
            string taggerFile    = null;

            ParseAndSetLabels.MissingLabels missing = ParseAndSetLabels.MissingLabels.Default;
            string defaultLabel     = "-1";
            string separator        = "\\t+";
            string saveUnknownsFile = null;
            string remapLabels      = null;
            int    argIndex         = 0;
            bool   binarize         = true;
            bool   useLabelKeys     = false;

            while (argIndex < args.Length)
            {
                string flag = args[argIndex];

                if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-output"))
                {
                    outputFile = RequireFlagValue(args, argIndex);
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentences"))
                {
                    sentencesFile = RequireFlagValue(args, argIndex);
                    argIndex     += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-labels"))
                {
                    labelsFile = RequireFlagValue(args, argIndex);
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-parser"))
                {
                    parserFile = RequireFlagValue(args, argIndex);
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tagger"))
                {
                    taggerFile = RequireFlagValue(args, argIndex);
                    argIndex  += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-missing"))
                {
                    missing   = ParseAndSetLabels.MissingLabels.ValueOf(RequireFlagValue(args, argIndex));
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-separator"))
                {
                    separator = RequireFlagValue(args, argIndex);
                    argIndex += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-default"))
                {
                    defaultLabel = RequireFlagValue(args, argIndex);
                    argIndex    += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-saveUnknowns"))
                {
                    saveUnknownsFile = RequireFlagValue(args, argIndex);
                    argIndex        += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-remapLabels"))
                {
                    remapLabels = RequireFlagValue(args, argIndex);
                    argIndex   += 2;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-binarize"))
                {
                    binarize  = true;
                    argIndex += 1;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-nobinarize"))
                {
                    binarize  = false;
                    argIndex += 1;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-useLabelKeys"))
                {
                    useLabelKeys = true;
                    argIndex    += 1;
                }
                else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-nouseLabelKeys"))
                {
                    useLabelKeys = false;
                    argIndex    += 1;
                }
                else
                {
                    throw new ArgumentException("Unknown argument " + flag);
                }
            }

            if (outputFile == null)
            {
                throw new ArgumentException("-output is required");
            }
            if (sentencesFile == null && !useLabelKeys)
            {
                throw new ArgumentException("-sentences or -useLabelKeys is required");
            }
            if (sentencesFile != null && useLabelKeys)
            {
                throw new ArgumentException("Use only one of -sentences or -useLabelKeys");
            }
            if (labelsFile == null)
            {
                throw new ArgumentException("-labels is required");
            }

            ParserGrammar parser    = LoadParser(parserFile, taggerFile);
            TreeBinarizer binarizer = null;

            if (binarize)
            {
                binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
            }

            IDictionary <string, string> labelMap = ReadLabelMap(labelsFile, separator, remapLabels);
            IList <string> sentences;

            if (sentencesFile != null)
            {
                sentences = ReadSentences(sentencesFile);
            }
            else
            {
                // -useLabelKeys: the keys of the label map are the sentences.
                sentences = new List <string>(labelMap.Keys);
            }

            IList <Tree>         trees    = ParseSentences(sentences, parser, binarizer);
            ICollection <string> unknowns = SetLabels(trees, labelMap, missing, defaultLabel);

            WriteTrees(trees, outputFile);

            // -saveUnknowns: write the collection returned by SetLabels, one
            // entry per line. Done after the trees so that a failure here does
            // not prevent the main output from being written.
            if (saveUnknownsFile != null && unknowns != null)
            {
                System.IO.File.WriteAllLines(saveUnknownsFile, unknowns);
            }
        }

        /// <summary>
        /// Returns the value that follows the flag at
        /// <paramref name="flagIndex"/>.
        /// </summary>
        /// <param name="args">Full command-line argument array.</param>
        /// <param name="flagIndex">Index of the flag whose value is wanted.</param>
        /// <returns>The argument immediately after the flag.</returns>
        /// <exception cref="System.ArgumentException">The flag is the last argument.</exception>
        private static string RequireFlagValue(string[] args, int flagIndex)
        {
            if (flagIndex + 1 >= args.Length)
            {
                throw new ArgumentException("Missing value for argument " + args[flagIndex]);
            }
            return args[flagIndex + 1];
        }
        /// <summary>
        /// Parses each sentence with <paramref name="parser"/> and, when a
        /// binarizer is supplied, binarizes the resulting tree. Progress is
        /// logged after every 1000 trees.
        /// </summary>
        /// <param name="sentences">Sentences to parse, one tree is produced per entry.</param>
        /// <param name="parser">Parser used to produce the trees.</param>
        /// <param name="binarizer">Optional binarizer; pass <c>null</c> to keep the trees as parsed.</param>
        /// <returns>The trees in the same order as <paramref name="sentences"/>.</returns>
        public static IList <Tree> ParseSentences(IList <string> sentences, ParserGrammar parser, TreeBinarizer binarizer)
        {
            logger.Info("Parsing sentences");

            bool         shouldBinarize = binarizer != null;
            IList <Tree> results        = new List <Tree>();

            foreach (string text in sentences)
            {
                Tree parsed = parser.Parse(text);
                results.Add(shouldBinarize ? binarizer.TransformTree(parsed) : parsed);

                int done = results.Count;
                if (done % 1000 != 0)
                {
                    continue;
                }
                logger.Info("  Parsed " + done + " trees");
            }
            return results;
        }
        /// <summary>
        /// Turns a text file into trees for use in a RNTN classifier such as
        /// the treebank used in the Sentiment project.
        /// </summary>
        /// <remarks>
        /// The expected input file is one sentence per line, with sentences
        /// separated by blank lines. The first line has the main label of the
        /// sentence together with the full sentence. Lines after the first
        /// sentence line but before the blank line will be treated as labeled
        /// sub-phrases. Each of those lines should start with the label and
        /// then contain the list of tokens the label applies to. All phrases
        /// that do not have their own label will take on the main sentence
        /// label!
        /// For example:
        /// <br />
        /// <code>
        /// 1 Today is not a good day.<br />
        /// 3 good<br />
        /// 3 good day <br />
        /// 3 a good day <br />
        /// <br />
        /// (next block starts here) <br />
        /// </code>
        /// By default the englishPCFG parser is used.  This can be changed
        /// with the <c>-parserModel</c> flag.  Specify an input file with
        /// <c>-input</c>.
        /// <br />
        /// If a sentiment model is provided with <c>-sentimentModel</c>, that
        /// model will be used to prelabel the sentences.  Any spans with given
        /// labels will then be used to adjust those labels.
        /// <br />
        /// An unknown flag is logged and terminates the process with exit
        /// code 2. Each resulting tree is written to standard output.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        /// <exception cref="System.ArgumentException">
        /// <c>-input</c> was not given, or a flag is missing its value.
        /// </exception>
        /// <exception cref="System.InvalidOperationException">
        /// The first line of a block produced no tokens.
        /// </exception>
        public static void Main(string[] args)
        {
            CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
            string         parserModel           = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
            string         inputPath             = null;
            string         sentimentModelPath    = null;
            SentimentModel sentimentModel        = null;

            for (int argIndex = 0; argIndex < args.Length;)
            {
                string flag             = args[argIndex];
                bool   isInput          = Sharpen.Runtime.EqualsIgnoreCase(flag, "-input");
                bool   isParserModel    = Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel");
                bool   isSentimentModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel");

                if (!isInput && !isParserModel && !isSentimentModel)
                {
                    log.Info("Unknown argument " + flag);
                    System.Environment.Exit(2);
                }
                // Every recognized flag takes a value; report a usage error
                // rather than indexing past the end of the array.
                if (argIndex + 1 >= args.Length)
                {
                    throw new ArgumentException("Missing value for argument " + flag);
                }

                string value = args[argIndex + 1];
                if (isInput)
                {
                    inputPath = value;
                }
                else if (isParserModel)
                {
                    parserModel = value;
                }
                else
                {
                    sentimentModelPath = value;
                }
                argIndex += 2;
            }
            if (inputPath == null)
            {
                throw new ArgumentException("Must specify input file with -input");
            }
            LexicalizedParser parser    = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
            TreeBinarizer     binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());

            if (sentimentModelPath != null)
            {
                sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
            }
            string text = IOUtils.SlurpFileNoExceptions(inputPath);

            // A blank line starts a new chunk. The separator is a regular
            // expression, so it must go through Regex.Split: string.Split
            // would treat the pattern as literal text.
            string[] chunks = System.Text.RegularExpressions.Regex.Split(text, "\\n\\s*\\n+");
            foreach (string chunk in chunks)
            {
                string trimmedChunk = chunk.Trim();
                if (trimmedChunk.Length == 0)
                {
                    continue;
                }
                // The expected format is that line 0 will be the text of the
                // sentence, and each subsequent line, if any, will be a value
                // followed by the sequence of tokens that get that value.
                // Here we take the first line and tokenize it as one sentence.
                string[]             lines    = trimmedChunk.Split('\n');
                string               sentence = lines[0];
                StringReader         sin      = new StringReader(sentence);
                DocumentPreprocessor document = new DocumentPreprocessor(sin);
                document.SetSentenceFinalPuncWords(new string[] { "\n" });

                // An enumerator must be advanced before Current is read.
                var sentenceIterator = document.GetEnumerator();
                if (!sentenceIterator.MoveNext())
                {
                    throw new System.InvalidOperationException("No tokens found in sentence line: " + sentence);
                }
                IList <IHasWord> tokens = sentenceIterator.Current;

                // The first token is the main label; the rest is the sentence.
                int mainLabel = System.Convert.ToInt32(tokens[0].Word());
                tokens = tokens.SubList(1, tokens.Count);

                IDictionary <Pair <int, int>, string> spanToLabels = Generics.NewHashMap();
                for (int i = 1; i < lines.Length; ++i)
                {
                    ExtractLabels(spanToLabels, tokens, lines[i]);
                }
                // TODO: add an option which treats the spans as constraints when parsing
                Tree tree           = parser.Apply(tokens);
                Tree binarized      = binarizer.TransformTree(tree);
                Tree collapsedUnary = transformer.TransformTree(binarized);
                // if there is a sentiment model for use in prelabeling, we
                // label here and then use the user given labels to adjust
                if (sentimentModel != null)
                {
                    Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                    SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
                    scorer.ForwardPropagateTree(collapsedUnary);
                    SetPredictedLabels(collapsedUnary);
                }
                else
                {
                    SetUnknownLabels(collapsedUnary, mainLabel);
                }
                Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
                collapsedUnary.IndexSpans();
                // Explicitly labeled spans override whatever label was assigned above.
                foreach (KeyValuePair <Pair <int, int>, string> pairStringEntry in spanToLabels)
                {
                    SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
                }
                System.Console.Out.WriteLine(collapsedUnary);
            }
        }