// static class public static Tree ConvertTree(IList <int> parentPointers, IList <string> sentence, IDictionary <IList <string>, int> phraseIds, IDictionary <int, double> sentimentScores, PTBEscapingProcessor escaper, int numClasses) { int maxNode = 0; foreach (int parent in parentPointers) { maxNode = Math.Max(maxNode, parent); } Tree[] subtrees = new Tree[maxNode + 1]; for (int i = 0; i < sentence.Count; ++i) { CoreLabel word = new CoreLabel(); word.SetValue(sentence[i]); Tree leaf = new LabeledScoredTreeNode(word); subtrees[i] = new LabeledScoredTreeNode(new CoreLabel()); subtrees[i].AddChild(leaf); } for (int i_1 = sentence.Count; i_1 <= maxNode; ++i_1) { subtrees[i_1] = new LabeledScoredTreeNode(new CoreLabel()); } bool[] connected = new bool[maxNode + 1]; Tree root = null; for (int index = 0; index < parentPointers.Count; ++index) { if (parentPointers[index] == -1) { if (root != null) { throw new Exception("Found two roots for sentence " + sentence); } root = subtrees[index]; } else { // Walk up the tree structure to make sure that leftmost // phrases are added first. Otherwise, if the numbers are // inverted, we might get the right phrase added to a parent // first, resulting in "case zero in this", for example, // instead of "in this case zero" // Note that because we keep track of which ones are already // connected, we process this at most once per parent, so the // overall construction time is still efficient. Connect(parentPointers, subtrees, connected, index); } } for (int i_2 = 0; i_2 <= maxNode; ++i_2) { IList <Tree> leaves = subtrees[i_2].GetLeaves(); IList <string> words = CollectionUtils.TransformAsList(leaves, TransformTreeToWord); // First we look for a copy of the phrase with -LRB- -RRB- // instead of (). The sentiment trees sometimes have both, and // the escaped versions seem to have more reasonable scores. // If a particular phrase doesn't have -LRB- -RRB- we fall back // to the unescaped versions. int phraseId = phraseIds[CollectionUtils.TransformAsList(words, TransformParens)]; if (phraseId == null) { phraseId = phraseIds[words]; } if (phraseId == null) { throw new Exception("Could not find phrase id for phrase " + sentence); } // TODO: should we make this an option? Perhaps we want cases // where the trees have the phrase id and not their class double score = sentimentScores[phraseId]; if (score == null) { throw new Exception("Could not find sentiment score for phrase id " + phraseId); } // TODO: make this a numClasses option int classLabel = Math.Round((float)Math.Floor(score * (float)numClasses)); if (classLabel > numClasses - 1) { classLabel = numClasses - 1; } subtrees[i_2].Label().SetValue(int.ToString(classLabel)); } for (int i_3 = 0; i_3 < sentence.Count; ++i_3) { Tree leaf = subtrees[i_3].Children()[0]; leaf.Label().SetValue(escaper.EscapeString(leaf.Label().Value())); } for (int i_4 = 0; i_4 < tregexPatterns.Length; ++i_4) { root = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(tregexPatterns[i_4], tsurgeonPatterns[i_4], root); } return(root); }
/// <summary> /// This program converts the format of the Sentiment data set /// prepared by Richard, Jean, etc. /// </summary> /// <remarks> /// This program converts the format of the Sentiment data set /// prepared by Richard, Jean, etc. into trees readable with the /// normal TreeReaders. /// <br /> /// An example command line is /// <br /> /// <code>java edu.stanford.nlp.sentiment.ReadSentimentDataset -dictionary stanfordSentimentTreebank/dictionary.txt -sentiment stanfordSentimentTreebank/sentiment_labels.txt -tokens stanfordSentimentTreebank/SOStr.txt -parse stanfordSentimentTreebank/STree.txt -split stanfordSentimentTreebank/datasetSplit.txt -train train.txt -dev dev.txt -test test.txt</code> /// <br /> /// The arguments are as follows: <br /> /// <code>-dictionary</code>, <code>-sentiment</code>, /// <code>-tokens</code>, <code>-parse</code>, <code>-split</code> /// Path to the corresponding files from the dataset <br /> /// <code>-train</code>, <code>-dev</code>, <code>-test</code> /// Paths for saving the corresponding output files <br /> /// Each of these arguments is required. /// <br /> /// Macro arguments exist in -inputDir and -outputDir, so you can for example run <br /> /// <code>java edu.stanford.nlp.sentiment.ReadSentimentDataset -inputDir ../data/sentiment/stanfordSentimentTreebank -outputDir .</code> /// </remarks> public static void Main(string[] args) { string dictionaryFilename = null; string sentimentFilename = null; string tokensFilename = null; string parseFilename = null; string splitFilename = null; string trainFilename = null; string devFilename = null; string testFilename = null; int numClasses = 5; int argIndex = 0; while (argIndex < args.Length) { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dictionary")) { dictionaryFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-sentiment")) { sentimentFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-tokens")) { tokensFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-parse")) { parseFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-split")) { splitFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-inputDir") || Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-inputDirectory")) { dictionaryFilename = args[argIndex + 1] + "/dictionary.txt"; sentimentFilename = args[argIndex + 1] + "/sentiment_labels.txt"; tokensFilename = args[argIndex + 1] + "/SOStr.txt"; parseFilename = args[argIndex + 1] + "/STree.txt"; splitFilename = args[argIndex + 1] + "/datasetSplit.txt"; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-train")) { trainFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-dev")) { devFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-test")) { testFilename = args[argIndex + 1]; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-outputDir") || Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-outputDirectory")) { trainFilename = args[argIndex + 1] + "/train.txt"; devFilename = args[argIndex + 1] + "/dev.txt"; testFilename = args[argIndex + 1] + "/test.txt"; argIndex += 2; } else { if (Sharpen.Runtime.EqualsIgnoreCase(args[argIndex], "-numClasses")) { numClasses = System.Convert.ToInt32(args[argIndex + 1]); argIndex += 2; } else { log.Info("Unknown argument " + args[argIndex]); System.Environment.Exit(2); } } } } } } } } } } } } // Sentence file is formatted // w1|w2|w3... IList <IList <string> > sentences = Generics.NewArrayList(); foreach (string line in IOUtils.ReadLines(tokensFilename, "utf-8")) { string[] sentence = line.Split("\\|"); sentences.Add(Arrays.AsList(sentence)); } // Split and read the phrase ids file. This file is in the format // w1 w2 w3 ... | id IDictionary <IList <string>, int> phraseIds = Generics.NewHashMap(); foreach (string line_1 in IOUtils.ReadLines(dictionaryFilename, "utf-8")) { string[] pieces = line_1.Split("\\|"); string[] sentence = pieces[0].Split(" "); int id = int.Parse(pieces[1]); phraseIds[Arrays.AsList(sentence)] = id; } // Split and read the sentiment scores file. Each line of this // file is of the format: // phrasenum | score IDictionary <int, double> sentimentScores = Generics.NewHashMap(); foreach (string line_2 in IOUtils.ReadLines(sentimentFilename, "utf-8")) { if (line_2.StartsWith("phrase")) { continue; } string[] pieces = line_2.Split("\\|"); int id = int.Parse(pieces[0]); double score = double.ValueOf(pieces[1]); sentimentScores[id] = score; } // Read lines from the tree structure file. This is a file of parent pointers for each tree. int index = 0; PTBEscapingProcessor escaper = new PTBEscapingProcessor(); IList <Tree> trees = Generics.NewArrayList(); foreach (string line_3 in IOUtils.ReadLines(parseFilename, "utf-8")) { string[] pieces = line_3.Split("\\|"); IList <int> parentPointers = CollectionUtils.TransformAsList(Arrays.AsList(pieces), null); Tree tree = ConvertTree(parentPointers, sentences[index], phraseIds, sentimentScores, escaper, numClasses); ++index; trees.Add(tree); } IDictionary <int, IList <int> > splits = Generics.NewHashMap(); splits[1] = Generics.NewArrayList <int>(); splits[2] = Generics.NewArrayList <int>(); splits[3] = Generics.NewArrayList <int>(); foreach (string line_4 in IOUtils.ReadLines(splitFilename, "utf-8")) { if (line_4.StartsWith("sentence_index")) { continue; } string[] pieces = line_4.Split(","); int treeId = int.Parse(pieces[0]) - 1; int fileId = int.Parse(pieces[1]); splits[fileId].Add(treeId); } WriteTrees(trainFilename, trees, splits[1]); WriteTrees(testFilename, trees, splits[2]); WriteTrees(devFilename, trees, splits[3]); }