/// <summary>
/// Attaches the parse results in <paramref name="trees"/> to
/// <paramref name="sentence"/>.
/// </summary>
/// <remarks>
/// If a tree map is configured, every tree is passed through it before the
/// parse annotations are filled in. If binary trees are being saved, the
/// first tree is additionally binarized, converted to core labels and stored
/// under <c>TreeCoreAnnotations.BinarizedTreeAnnotation</c>.
/// </remarks>
/// <param name="sentence">The sentence that receives the annotations.</param>
/// <param name="trees">The parse trees produced for the sentence.</param>
private void FinishSentence(ICoreMap sentence, IList<Tree> trees)
{
    if (treeMap != null)
    {
        IList<Tree> remapped = Generics.NewLinkedList();
        foreach (Tree unmapped in trees)
        {
            remapped.Add(treeMap.Apply(unmapped));
        }
        trees = remapped;
    }

    ParserAnnotatorUtils.FillInParseAnnotations(Verbose, BuildGraphs, gsf, sentence, trees, extraDependencies);

    if (saveBinaryTrees)
    {
        TreeBinarizer treeBinarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
        Tree binaryTree = treeBinarizer.TransformTree(trees[0]);
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binaryTree);
        sentence.Set(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation), binaryTree);
    }

    // For some reason, in some corner cases nodes do not have sentenceIndex
    // set, so make a pass over the collapsed dependencies and fill it in
    // wherever the sentence itself carries an index.
    SemanticGraph dependencies = sentence.Get(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation));
    if (dependencies == null)
    {
        return;
    }

    foreach (IndexedWord vertex in dependencies.VertexSet())
    {
        if (vertex.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) != null || sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)) == null)
        {
            continue;
        }
        vertex.SetSentIndex(sentence.Get(typeof(CoreAnnotations.SentenceIndexAnnotation)));
    }
}
/// <summary>
/// Creates a binarizer annotator configured from <paramref name="props"/>.
/// </summary>
/// <remarks>
/// The parser-params class name is read from the
/// <c>{annotatorName}.tlppClass</c> property, falling back to
/// <c>DefaultTlppClass</c>; that class is loaded by reflection and supplies
/// the head finder and treebank language pack used to build the binarizer.
/// </remarks>
/// <param name="annotatorName">Prefix used to look up this annotator's properties.</param>
/// <param name="props">The properties to read the configuration from.</param>
public BinarizerAnnotator(string annotatorName, Properties props)
{
    string tlppClassKey = annotatorName + ".tlppClass";
    tlppClass = props.GetProperty(tlppClassKey, DefaultTlppClass);

    ITreebankLangParserParams langParams = ReflectionLoading.LoadByReflection(tlppClass);
    binarizer = TreeBinarizer.SimpleTreeBinarizer(langParams.HeadFinder(), langParams.TreebankLanguagePack());
}
/// <summary>
/// Creates a parser server: opens a server socket on <paramref name="port"/>
/// and builds a tree binarizer from the parser's head finder and treebank
/// language pack.
/// </summary>
/// <param name="port">The port the server socket is opened on.</param>
/// <param name="parser">The parser grammar this server uses.</param>
/// <exception cref="System.IO.IOException"/>
public LexicalizedParserServer(int port, ParserGrammar parser)
{
    this.port = port;
    serverSocket = new ServerSocket(port);
    this.parser = parser;
    binarizer = TreeBinarizer.SimpleTreeBinarizer(
        parser.GetTLPParams().HeadFinder(),
        parser.TreebankLanguagePack());
}
/// <summary>
/// Binarizes every tree of <paramref name="treebank"/> and returns the
/// results as a list.
/// </summary>
/// <remarks>
/// The treebank is transformed by a binarizer followed by a basic-category
/// transformer. Each resulting tree is then converted to core labels, has
/// head annotations percolated using a binary head finder, and has its
/// leaves indexed starting from 1.
/// </remarks>
/// <param name="treebank">The trees to binarize.</param>
/// <param name="op">Options supplying the language pack and parser params.</param>
/// <returns>The binarized trees, in treebank iteration order.</returns>
public static IList<Tree> BinarizeTreebank(Treebank treebank, Options op)
{
    TreeBinarizer treeBinarizer = TreeBinarizer.SimpleTreeBinarizer(op.tlpParams.HeadFinder(), op.tlpParams.TreebankLanguagePack());
    BasicCategoryTreeTransformer categoryTransformer = new BasicCategoryTreeTransformer(op.Langpack());

    CompositeTreeTransformer pipeline = new CompositeTreeTransformer();
    pipeline.AddTransformer(treeBinarizer);
    pipeline.AddTransformer(categoryTransformer);
    Treebank transformed = treebank.Transform(pipeline);

    IHeadFinder headFinder = new BinaryHeadFinder(op.tlpParams.HeadFinder());
    IList<Tree> result = Generics.NewArrayList();
    foreach (Tree binarizedTree in transformed)
    {
        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(binarizedTree);
        binarizedTree.PercolateHeadAnnotations(headFinder);
        // Index from 1. Tools downstream expect index from 1, so for
        // uses internal to the srparser we have to renormalize the
        // indices, with the result that here we have to index from 1.
        binarizedTree.IndexLeaves(1, true);
        result.Add(binarizedTree);
    }
    return result;
}
/// <summary>
/// Command-line entry point: parses sentences, assigns labels read from a
/// label file to the resulting trees, and writes the trees to an output file.
/// </summary>
/// <remarks>
/// Flags taking a value: <c>-output</c> (required), <c>-sentences</c>,
/// <c>-labels</c> (required), <c>-parser</c>, <c>-tagger</c>,
/// <c>-missing</c>, <c>-separator</c>, <c>-default</c>,
/// <c>-saveUnknowns</c>, <c>-remapLabels</c>.
/// Switches: <c>-binarize</c> / <c>-nobinarize</c> (default on) and
/// <c>-useLabelKeys</c> / <c>-nouseLabelKeys</c> (default off).
/// Exactly one of <c>-sentences</c> or <c>-useLabelKeys</c> must be given;
/// with <c>-useLabelKeys</c> the keys of the label map are the sentences.
/// Flag names are matched case-insensitively.
/// </remarks>
/// <param name="args">The command-line arguments.</param>
/// <exception cref="System.ArgumentException">
/// An argument is unknown, a flag is missing its value, a required flag is
/// absent, or both <c>-sentences</c> and <c>-useLabelKeys</c> are given.
/// </exception>
public static void Main(string[] args)
{
    // TODO: rather than always rolling our own arg parser, we should
    // find a library which does it for us nicely
    string outputFile = null;
    string sentencesFile = null;
    string labelsFile = null;
    string parserFile = LexicalizedParser.DefaultParserLoc;
    string taggerFile = null;
    ParseAndSetLabels.MissingLabels missing = ParseAndSetLabels.MissingLabels.Default;
    string defaultLabel = "-1";
    string separator = "\\t+";
    // NOTE(review): -saveUnknowns is accepted, but neither this value nor the
    // collection returned by SetLabels below is used further in this method.
    string saveUnknownsFile = null;
    string remapLabels = null;
    int argIndex = 0;
    bool binarize = true;
    bool useLabelKeys = false;

    while (argIndex < args.Length)
    {
        string flag = args[argIndex];
        if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-output"))
        {
            outputFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentences"))
        {
            sentencesFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-labels"))
        {
            labelsFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-parser"))
        {
            parserFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-tagger"))
        {
            taggerFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-missing"))
        {
            missing = ParseAndSetLabels.MissingLabels.ValueOf(RequireArgumentValue(args, argIndex));
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-separator"))
        {
            separator = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-default"))
        {
            defaultLabel = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-saveUnknowns"))
        {
            saveUnknownsFile = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-remapLabels"))
        {
            remapLabels = RequireArgumentValue(args, argIndex);
            argIndex += 2;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-binarize"))
        {
            binarize = true;
            argIndex += 1;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-nobinarize"))
        {
            binarize = false;
            argIndex += 1;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-useLabelKeys"))
        {
            useLabelKeys = true;
            argIndex += 1;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(flag, "-nouseLabelKeys"))
        {
            useLabelKeys = false;
            argIndex += 1;
        }
        else
        {
            throw new ArgumentException("Unknown argument " + flag);
        }
    }

    if (outputFile == null)
    {
        throw new ArgumentException("-output is required");
    }
    if (sentencesFile == null && !useLabelKeys)
    {
        throw new ArgumentException("-sentences or -useLabelKeys is required");
    }
    if (sentencesFile != null && useLabelKeys)
    {
        throw new ArgumentException("Use only one of -sentences or -useLabelKeys");
    }
    if (labelsFile == null)
    {
        throw new ArgumentException("-labels is required");
    }

    ParserGrammar parser = LoadParser(parserFile, taggerFile);
    TreeBinarizer binarizer = null;
    if (binarize)
    {
        binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
    }

    IDictionary<string, string> labelMap = ReadLabelMap(labelsFile, separator, remapLabels);
    IList<string> sentences;
    if (sentencesFile != null)
    {
        sentences = ReadSentences(sentencesFile);
    }
    else
    {
        sentences = new List<string>(labelMap.Keys);
    }

    IList<Tree> trees = ParseSentences(sentences, parser, binarizer);
    ICollection<string> unknowns = SetLabels(trees, labelMap, missing, defaultLabel);
    WriteTrees(trees, outputFile);
}

/// <summary>
/// Returns the value following the flag at <paramref name="flagIndex"/>,
/// failing with a descriptive error when the flag is the last argument.
/// </summary>
/// <param name="args">The full command-line argument array.</param>
/// <param name="flagIndex">Index of the flag whose value is wanted.</param>
/// <returns>The argument immediately after the flag.</returns>
/// <exception cref="System.ArgumentException">No argument follows the flag.</exception>
private static string RequireArgumentValue(string[] args, int flagIndex)
{
    if (flagIndex + 1 >= args.Length)
    {
        throw new ArgumentException("Missing value for argument " + args[flagIndex]);
    }
    return args[flagIndex + 1];
}
/// <summary>
/// Parses each sentence with <paramref name="parser"/> and returns the trees
/// in input order.
/// </summary>
/// <remarks>
/// A progress message is logged after every 1000 trees.
/// </remarks>
/// <param name="sentences">The sentences to parse.</param>
/// <param name="parser">The parser applied to each sentence.</param>
/// <param name="binarizer">
/// If not null, each parsed tree is passed through this binarizer before
/// being added to the result.
/// </param>
/// <returns>One tree per input sentence.</returns>
public static IList<Tree> ParseSentences(IList<string> sentences, ParserGrammar parser, TreeBinarizer binarizer)
{
    logger.Info("Parsing sentences");

    IList<Tree> parsed = new List<Tree>();
    foreach (string text in sentences)
    {
        Tree parse = parser.Parse(text);
        parsed.Add(binarizer != null ? binarizer.TransformTree(parse) : parse);

        if (parsed.Count % 1000 != 0)
        {
            continue;
        }
        logger.Info(" Parsed " + parsed.Count + " trees");
    }
    return parsed;
}
/// <summary>
/// Turns a text file into trees for use in a RNTN classifier such as
/// the treebank used in the Sentiment project.
/// </summary>
/// <remarks>
/// The expected input file is one sentence per line, with sentences
/// separated by blank lines. The first line has the main label of the
/// sentence together with the full sentence. Lines after the first
/// sentence line but before the blank line will be treated as labeled
/// sub-phrases. The labels should start with the label and then contain
/// a list of tokens the label applies to. All phrases that do not have
/// their own label will take on the main sentence label!
/// For example:
/// <br />
/// <code>
/// 1 Today is not a good day.<br />
/// 3 good<br />
/// 3 good day <br />
/// 3 a good day <br />
/// <br />
/// (next block starts here) <br />
/// </code>
/// By default the englishPCFG parser is used. This can be changed
/// with the <c>-parserModel</c> flag. Specify an input file with
/// <c>-input</c>.
/// <br />
/// If a sentiment model is provided with <c>-sentimentModel</c>, that model
/// will be used to prelabel the sentences. Any spans with given
/// labels will then be used to adjust those labels.
/// <br />
/// An unknown argument is logged and the process exits with code 2.
/// </remarks>
/// <param name="args">The command-line arguments.</param>
/// <exception cref="System.ArgumentException">
/// <c>-input</c> was not given, a flag is missing its value, or a block of
/// the input yields no tokens.
/// </exception>
public static void Main(string[] args)
{
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    string parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    string inputPath = null;
    string sentimentModelPath = null;
    SentimentModel sentimentModel = null;

    for (int argIndex = 0; argIndex < args.Length;)
    {
        string flag = args[argIndex];
        bool isInput = Sharpen.Runtime.EqualsIgnoreCase(flag, "-input");
        bool isParserModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-parserModel");
        bool isSentimentModel = Sharpen.Runtime.EqualsIgnoreCase(flag, "-sentimentModel");

        if (!isInput && !isParserModel && !isSentimentModel)
        {
            log.Info("Unknown argument " + flag);
            System.Environment.Exit(2);
        }
        else if (argIndex + 1 >= args.Length)
        {
            // Every recognized flag takes a value; fail clearly instead of
            // indexing past the end of the array.
            throw new ArgumentException("Missing value for argument " + flag);
        }
        else
        {
            string value = args[argIndex + 1];
            if (isInput)
            {
                inputPath = value;
            }
            else if (isParserModel)
            {
                parserModel = value;
            }
            else
            {
                sentimentModelPath = value;
            }
            argIndex += 2;
        }
    }

    if (inputPath == null)
    {
        throw new ArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = ((LexicalizedParser)LexicalizedParser.LoadModel(parserModel));
    TreeBinarizer binarizer = TreeBinarizer.SimpleTreeBinarizer(parser.GetTLPParams().HeadFinder(), parser.TreebankLanguagePack());
    if (sentimentModelPath != null)
    {
        sentimentModel = SentimentModel.LoadSerialized(sentimentModelPath);
    }

    string text = IOUtils.SlurpFileNoExceptions(inputPath);
    // A blank line starts a new chunk. The separator is a regular expression,
    // so it must go through Regex.Split; string.Split would look for the
    // pattern as literal text and never find it.
    string[] chunks = System.Text.RegularExpressions.Regex.Split(text, "\\n\\s*\\n+");
    foreach (string chunk in chunks)
    {
        string trimmedChunk = chunk.Trim();
        if (trimmedChunk.Length == 0)
        {
            continue;
        }

        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.
        // Here we take the first line and tokenize it as one sentence.
        string[] lines = trimmedChunk.Split('\n');
        string sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.SetSentenceFinalPuncWords(new string[] { "\n" });

        // Take only the first tokenized sentence. The enumerator has to be
        // advanced before Current is valid, which foreach does for us.
        IList<IHasWord> tokens = null;
        foreach (IList<IHasWord> firstSentence in document)
        {
            tokens = firstSentence;
            break;
        }
        if (tokens == null || tokens.Count == 0)
        {
            throw new ArgumentException("No tokens found in line: " + sentence);
        }

        // The first token is the main label; the rest is the sentence itself.
        int mainLabel = System.Convert.ToInt32(tokens[0].Word());
        tokens = tokens.SubList(1, tokens.Count);

        IDictionary<Pair<int, int>, string> spanToLabels = Generics.NewHashMap();
        for (int i = 1; i < lines.Length; ++i)
        {
            ExtractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.Apply(tokens);
        Tree binarized = binarizer.TransformTree(tree);
        Tree collapsedUnary = transformer.TransformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null)
        {
            Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.ForwardPropagateTree(collapsedUnary);
            SetPredictedLabels(collapsedUnary);
        }
        else
        {
            SetUnknownLabels(collapsedUnary, mainLabel);
        }

        Edu.Stanford.Nlp.Trees.Trees.ConvertToCoreLabels(collapsedUnary);
        collapsedUnary.IndexSpans();
        foreach (KeyValuePair<Pair<int, int>, string> pairStringEntry in spanToLabels)
        {
            SetSpanLabel(collapsedUnary, pairStringEntry.Key, pairStringEntry.Value);
        }

        System.Console.Out.WriteLine(collapsedUnary);
    }
}