/// <summary>Loads a treebank and prints it.</summary>
/// <remarks>
/// Loads a treebank and prints it.
/// All files below the designated
/// <c>filePath</c>
/// within the given
/// number range, if any, are loaded. You can normalize the trees or not
/// (English-specific) and print trees one per line up to a certain length
/// (for EVALB).
/// <para>
/// Usage:
/// <c>java edu.stanford.nlp.trees.Treebanks [-maxLength n|-normalized|-treeReaderFactory class] filePath [numberRanges]</c>
/// </para>
/// <para>
/// Options are read from the front of <c>args</c> for as long as the current
/// argument starts with <c>-</c>. The options
/// <c>-maxLength</c>, <c>-minLength</c>, <c>-tlp</c>, <c>-treeReaderFactory</c>/<c>-trf</c>,
/// <c>-suffix</c>, <c>-decimate</c>, <c>-encoding</c>, <c>-annotate</c> and <c>-filter</c>
/// each consume the following argument as their value; if that value is missing,
/// a message is logged, the usage text is printed and the method returns.
/// Unrecognised options are logged and skipped.
/// </para>
/// </remarks>
/// <param name="args">Array of command-line arguments</param>
/// <exception cref="System.IO.IOException">If there is a treebank file access problem</exception>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        PrintUsage();
        return;
    }

    int i = 0;
    int maxLength;
    int minLength;
    int maxL = int.MaxValue;
    int minL = -1;
    bool normalized = false;
    bool decimate = false;
    bool pennPrintTrees = false;
    bool oneLinePrint = false;
    bool printTaggedWords = false;
    bool printWords = false;
    bool correct = false;
    string annotationOptions = null;
    bool summary = false;
    bool timing = false;
    bool yield = false;
    bool punct = false;
    bool sentenceLengths = false;
    bool countTaggings = false;
    bool removeCodeTrees = false;
    string decimatePrefix = null;
    string encoding = TreebankLanguagePackConstants.DefaultEncoding;
    string suffix = Treebank.DefaultTreeFileSuffix;
    ITreeReaderFactory trf = null;
    ITreebankLanguagePack tlp = null;
    IList<IPredicate<Tree>> filters = new List<IPredicate<Tree>>();

    // ---- Option parsing: consume leading "-xxx" arguments. ----
    // Ordinal comparison: option names are identifiers, not linguistic text.
    while (i < args.Length && args[i].StartsWith("-", System.StringComparison.Ordinal))
    {
        string option = args[i];

        // Options that read args[i + 1]. Checking once up front avoids an
        // out-of-range access when such an option is the last argument.
        bool takesValue = option.Equals("-maxLength")
            || option.Equals("-minLength")
            || Sharpen.Runtime.EqualsIgnoreCase(option, "-tlp")
            || option.Equals("-treeReaderFactory")
            || option.Equals("-trf")
            || option.Equals("-suffix")
            || option.Equals("-decimate")
            || option.Equals("-encoding")
            || option.Equals("-annotate")
            || option.Equals("-filter");
        if (takesValue && i + 1 >= args.Length)
        {
            log.Info("Missing value for option: " + option);
            PrintUsage();
            return;
        }

        if (option.Equals("-maxLength"))
        {
            maxL = System.Convert.ToInt32(args[i + 1]);
            i += 2;
        }
        else if (option.Equals("-minLength"))
        {
            minL = System.Convert.ToInt32(args[i + 1]);
            i += 2;
        }
        else if (option.Equals("-h") || option.Equals("-help"))
        {
            // Usage is printed but parsing continues with the next argument.
            PrintUsage();
            i++;
        }
        else if (option.Equals("-normalized"))
        {
            normalized = true;
            i += 1;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(option, "-tlp"))
        {
            try
            {
                object o = Sharpen.Runtime.GetType(args[i + 1]).GetDeclaredConstructor().NewInstance();
                tlp = (ITreebankLanguagePack)o;
                // The language pack also supplies the tree reader factory.
                trf = tlp.TreeReaderFactory();
            }
            catch (Exception)
            {
                log.Info("Couldn't instantiate as TreebankLanguagePack: " + args[i + 1]);
                return;
            }
            i += 2;
        }
        else if (option.Equals("-treeReaderFactory") || option.Equals("-trf"))
        {
            try
            {
                object o = Sharpen.Runtime.GetType(args[i + 1]).GetDeclaredConstructor().NewInstance();
                trf = (ITreeReaderFactory)o;
            }
            catch (Exception)
            {
                log.Info("Couldn't instantiate as TreeReaderFactory: " + args[i + 1]);
                return;
            }
            i += 2;
        }
        else if (option.Equals("-suffix"))
        {
            suffix = args[i + 1];
            i += 2;
        }
        else if (option.Equals("-decimate"))
        {
            decimate = true;
            decimatePrefix = args[i + 1];
            i += 2;
        }
        else if (option.Equals("-encoding"))
        {
            encoding = args[i + 1];
            i += 2;
        }
        else if (option.Equals("-correct"))
        {
            correct = true;
            i += 1;
        }
        else if (option.Equals("-summary"))
        {
            summary = true;
            i += 1;
        }
        else if (option.Equals("-yield"))
        {
            yield = true;
            i += 1;
        }
        else if (option.Equals("-punct"))
        {
            punct = true;
            i += 1;
        }
        else if (option.Equals("-pennPrint"))
        {
            pennPrintTrees = true;
            i++;
        }
        else if (option.Equals("-oneLine"))
        {
            oneLinePrint = true;
            i++;
        }
        else if (option.Equals("-taggedWords"))
        {
            printTaggedWords = true;
            i++;
        }
        else if (option.Equals("-words"))
        {
            printWords = true;
            i++;
        }
        else if (option.Equals("-annotate"))
        {
            annotationOptions = args[i + 1];
            i += 2;
        }
        else if (option.Equals("-timing"))
        {
            timing = true;
            i++;
        }
        else if (option.Equals("-countTaggings"))
        {
            countTaggings = true;
            i++;
        }
        else if (option.Equals("-sentenceLengths"))
        {
            sentenceLengths = true;
            i++;
        }
        else if (option.Equals("-removeCodeTrees"))
        {
            removeCodeTrees = true;
            i++;
        }
        else if (option.Equals("-filter"))
        {
            IPredicate<Tree> filter = ReflectionLoading.LoadByReflection(args[i + 1]);
            filters.Add(filter);
            i += 2;
        }
        else
        {
            log.Info("Unknown option: " + option);
            i++;
        }
    }

    // NOTE(review): maxLength/minLength are assigned but never read in this method;
    // presumably they were captured by the tree visitors that are now passed as
    // null below - confirm before removing.
    maxLength = maxL;
    minLength = minL;

    // ---- Build the treebank and load the requested path. ----
    // NOTE(review): trf may still be null here. The original contained the no-op
    // "if (trf == null) { trf = null; }", which presumably once installed a default
    // reader factory - TODO confirm and restore.
    Treebank treebank;
    if (normalized)
    {
        treebank = new DiskTreebank();
    }
    else
    {
        treebank = new DiskTreebank(trf, encoding);
    }
    // Each -filter wraps the treebank in another filtering layer, in command-line order.
    foreach (IPredicate<Tree> filter_1 in filters)
    {
        treebank = new FilteringTreebank(treebank, filter_1);
    }
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);

    // args[i] is the path; an optional args[i + 1] is a number-range spec.
    if (i + 1 < args.Length)
    {
        treebank.LoadPath(args[i], new NumberRangesFileFilter(args[i + 1], true));
    }
    else if (i < args.Length)
    {
        treebank.LoadPath(args[i], suffix, true);
    }
    else
    {
        PrintUsage();
        return;
    }

    // ---- Run the requested actions. ----
    if (annotationOptions != null)
    {
        // todo Not yet implemented
        log.Info("annotationOptions not yet implemented");
    }
    if (summary)
    {
        System.Console.Out.WriteLine(treebank.TextualSummary());
    }
    if (sentenceLengths)
    {
        SentenceLengths(treebank, args[i], ((i + 1) < args.Length ? args[i + 1] : null), pw);
    }
    if (punct)
    {
        PrintPunct(treebank, tlp, pw);
    }
    if (correct)
    {
        treebank = new EnglishPTBTreebankCorrector().TransformTrees(treebank);
    }

    // NOTE(review): every Apply call below passes null as the visitor; presumably
    // the per-tree printing callbacks were lost in conversion - confirm and restore.
    if (pennPrintTrees)
    {
        treebank.Apply(null);
    }
    if (oneLinePrint)
    {
        treebank.Apply(null);
    }
    if (printWords)
    {
        TreeNormalizer tn = new BobChrisTreeNormalizer();
        treebank.Apply(null);
    }
    if (printTaggedWords)
    {
        TreeNormalizer tn = new BobChrisTreeNormalizer();
        treebank.Apply(null);
    }
    if (countTaggings)
    {
        CountTaggings(treebank, pw);
    }
    if (yield)
    {
        treebank.Apply(null);
    }
    if (decimate)
    {
        // Writes to <prefix>-train.txt, <prefix>-dev.txt and <prefix>-test.txt.
        // The writers are closed here even if opening a later file or Decimate
        // itself throws, so no file handle is leaked.
        TextWriter w1 = null;
        TextWriter w2 = null;
        TextWriter w3 = null;
        try
        {
            w1 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-train.txt"), encoding));
            w2 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-dev.txt"), encoding));
            w3 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-test.txt"), encoding));
            treebank.Decimate(w1, w2, w3);
        }
        finally
        {
            // Nested so that a failure closing one writer does not skip the others.
            try
            {
                if (w1 != null)
                {
                    w1.Close();
                }
            }
            finally
            {
                try
                {
                    if (w2 != null)
                    {
                        w2.Close();
                    }
                }
                finally
                {
                    if (w3 != null)
                    {
                        w3.Close();
                    }
                }
            }
        }
    }
    if (timing)
    {
        RunTiming(treebank);
    }
    if (removeCodeTrees)
    {
        // this is a bit of a hack. It only works on an individual file
        if (new File(args[i]).IsDirectory())
        {
            throw new Exception("-removeCodeTrees only works on a single file");
        }
        // Read the whole file, strip "( (CODE <...>))" trees, and overwrite it in place.
        string treebankStr = IOUtils.SlurpFile(args[i]);
        treebankStr = treebankStr.ReplaceAll("\\( \\(CODE <[^>]+>\\)\\)", string.Empty);
        TextWriter w = new OutputStreamWriter(new FileOutputStream(args[i]), encoding);
        try
        {
            w.Write(treebankStr);
        }
        finally
        {
            // Close even when Write fails so the file handle is released.
            w.Close();
        }
    }
}