/// <summary>
/// Builds a record by copying every argument into the field of the same name.
/// No validation or defaulting is performed here.
/// </summary>
/// <param name="file">Value stored in <c>file</c>.</param>
/// <param name="format">Value stored in <c>format</c>.</param>
/// <param name="encoding">Value stored in <c>encoding</c>.</param>
/// <param name="tagSeparator">Value stored in <c>tagSeparator</c>.</param>
/// <param name="treeTransformer">Value stored in <c>treeTransformer</c>.</param>
/// <param name="treeNormalizer">Value stored in <c>treeNormalizer</c>.</param>
/// <param name="trf">Value stored in <c>trf</c>.</param>
/// <param name="treeRange">Value stored in <c>treeRange</c>.</param>
/// <param name="treeFilter">Value stored in <c>treeFilter</c>.</param>
/// <param name="wordColumn">Value stored in <c>wordColumn</c>.</param>
/// <param name="tagColumn">Value stored in <c>tagColumn</c>.</param>
private TaggedFileRecord(string file, TaggedFileRecord.Format format, string encoding, string tagSeparator, ITreeTransformer treeTransformer, TreeNormalizer treeNormalizer, ITreeReaderFactory trf, NumberRangesFileFilter treeRange, IPredicate<Tree> treeFilter, int wordColumn, int tagColumn)
{
    // NOTE(review): three comments were attached to this constructor:
    //   "represents a tokenized file separated by text",
    //   "represents a tsv file such as a conll file",
    //   "represents a file in PTB format".
    // They presumably describe the Format values rather than this code - confirm.

    // Where the data lives and how it is decoded.
    this.file = file;
    this.format = format;
    this.encoding = encoding;
    this.tagSeparator = tagSeparator;

    // Column settings.
    this.wordColumn = wordColumn;
    this.tagColumn = tagColumn;

    // Tree-related settings.
    this.trf = trf;
    this.treeTransformer = treeTransformer;
    this.treeNormalizer = treeNormalizer;
    this.treeRange = treeRange;
    this.treeFilter = treeFilter;
}
/// <summary>
/// Creates a reader over the trees described by <paramref name="record"/>.
/// </summary>
/// <remarks>
/// Copies the file name, transformer, normalizer and filter from the record,
/// builds a <c>DiskTreebank</c> with the record's encoding, loads the file
/// (restricted to <c>record.treeRange</c> when one is set), obtains the
/// treebank's enumerator and finally calls <c>FindNext()</c>.
/// </remarks>
/// <param name="record">Record supplying the file name and reader settings.</param>
public TreeTaggedFileReader(TaggedFileRecord record)
{
    filename = record.file;
    transformer = record.treeTransformer;
    normalizer = record.treeNormalizer;
    treeFilter = record.treeFilter;

    // Fall back to a LabeledScoredTreeReaderFactory when the record has none.
    trf = record.trf ?? new LabeledScoredTreeReaderFactory();

    treebank = new DiskTreebank(trf, record.encoding);
    if (record.treeRange == null)
    {
        treebank.LoadPath(filename);
    }
    else
    {
        treebank.LoadPath(filename, record.treeRange);
    }

    treeIterator = treebank.GetEnumerator();
    FindNext();
}
/// <summary>
/// Parses a comma-separated description into a <c>TaggedFileRecord</c>.
/// </summary>
/// <remarks>
/// A description without any comma is taken as a file name and yields a
/// <c>Format.Text</c> record using the encoding and tag separator obtained
/// from <paramref name="config"/>. Otherwise the last comma-separated piece
/// is the file name and every earlier piece must have the form
/// <c>key=value</c>; keys are compared case-insensitively against the
/// option-name constants of this class.
/// </remarks>
/// <param name="config">Passed to <c>GetEncoding</c> and <c>GetTagSeparator</c> for the defaults.</param>
/// <param name="description">The comma-separated description to parse.</param>
/// <returns>The record built from the description.</returns>
/// <exception cref="ArgumentException">
/// An option piece does not split into exactly two parts on <c>=</c>, or its key is not recognised.
/// </exception>
public static Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord CreateRecord(Properties config, string description)
{
    string[] segments = description.Split(",");
    if (segments.Length == 1)
    {
        return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(description, TaggedFileRecord.Format.Text, GetEncoding(config), GetTagSeparator(config), null, null, null, null, null, null, null);
    }

    // Everything except the last segment is an option; the last one is the file.
    int optionCount = segments.Length - 1;
    string[] options = new string[optionCount];
    System.Array.Copy(segments, 0, options, 0, optionCount);
    string file = segments[optionCount];

    TaggedFileRecord.Format format = TaggedFileRecord.Format.Text;
    string encoding = GetEncoding(config);
    string tagSeparator = GetTagSeparator(config);
    ITreeTransformer treeTransformer = null;
    TreeNormalizer treeNormalizer = null;
    ITreeReaderFactory trf = null;
    NumberRangesFileFilter treeRange = null;
    IPredicate<Tree> treeFilter = null;
    // NOTE(review): null is assigned to int locals here (and passed for the int
    // constructor parameters above); this is only valid if the intended type is
    // nullable (int?) - confirm against the constructor and field declarations.
    int wordColumn = null;
    int tagColumn = null;

    foreach (string option in options)
    {
        string[] keyValue = option.Split("=", 2);
        if (keyValue.Length != 2)
        {
            throw new ArgumentException("TaggedFileRecord argument " + option + " has an unexpected number of =s");
        }

        string key = keyValue[0];
        string value = keyValue[1];

        if (Sharpen.Runtime.EqualsIgnoreCase(key, Format))
        {
            format = TaggedFileRecord.Format.ValueOf(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, Encoding))
        {
            encoding = value;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagSeparator))
        {
            tagSeparator = value;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeTransformer))
        {
            treeTransformer = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeNormalizer))
        {
            treeNormalizer = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeReader))
        {
            trf = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeRange))
        {
            // Ranges are written with ':' in the description because ',' separates options.
            string range = value.ReplaceAll(":", ",");
            treeRange = new NumberRangesFileFilter(range, true);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TreeFilter))
        {
            treeFilter = ReflectionLoading.LoadByReflection(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, WordColumn))
        {
            wordColumn = int.Parse(value);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(key, TagColumn))
        {
            tagColumn = int.Parse(value);
        }
        else
        {
            throw new ArgumentException("TaggedFileRecord argument " + key + " is unknown");
        }
    }

    return new Edu.Stanford.Nlp.Tagger.IO.TaggedFileRecord(file, format, encoding, tagSeparator, treeTransformer, treeNormalizer, trf, treeRange, treeFilter, wordColumn, tagColumn);
}
/// <summary>
/// Creates a TreeTokenizerFactory backed by the given TreeReaderFactory.
/// </summary>
/// <param name="trf">The tree reader factory kept in the <c>trf</c> field.</param>
public TreeTokenizerFactory(ITreeReaderFactory trf)
{
    this.trf = trf;
}
/// <summary>
/// Creates a new Treebank that uses the given tree reader factory.
/// </summary>
/// <param name="initialCapacity">
/// The initial size of the underlying collection, for implementations that
/// provide collection-based storage. This constructor itself does not read it.
/// </param>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public Treebank(int initialCapacity, ITreeReaderFactory trf)
{
    this.trf = trf;
}
/// <summary>
/// Creates a new Treebank with a tree reader factory and a file encoding.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
/// <param name="encoding">The charset encoding to use when decoding treebank files.</param>
public Treebank(ITreeReaderFactory trf, string encoding)
{
    this.encoding = encoding;
    this.trf = trf;
}
/// <summary>
/// Creates a new Treebank that uses the given tree reader factory.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public Treebank(ITreeReaderFactory trf)
{
    this.trf = trf;
}
/// <summary>
/// Creates a new DiskTreebank; both arguments are forwarded to the base constructor.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
/// <param name="encoding">The charset encoding to use when decoding treebank files.</param>
public DiskTreebank(ITreeReaderFactory trf, string encoding)
    : base(trf, encoding)
{
}
/// <summary>
/// Creates a new TransformingTreebank; the factory is forwarded to the base constructor.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public TransformingTreebank(ITreeReaderFactory trf)
    : base(trf)
{
}
/// <summary>
/// Creates a new tree bank that starts with an empty list of trees.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
/// <param name="encoding">The encoding to use for file access.</param>
public MemoryTreebank(ITreeReaderFactory trf, string encoding)
    : base(trf, encoding)
{
    parseTrees = new List<Tree>();
}
/// <summary>
/// Creates a new tree bank that starts with an empty list of trees.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public MemoryTreebank(ITreeReaderFactory trf)
    : base(trf)
{
    parseTrees = new List<Tree>();
}
/// <summary>
/// Creates a new tree bank whose tree list is pre-sized.
/// </summary>
/// <param name="initialCapacity">
/// The initial capacity of the underlying list; it is also forwarded to the base constructor.
/// </param>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public MemoryTreebank(int initialCapacity, ITreeReaderFactory trf)
    : base(initialCapacity, trf)
{
    parseTrees = new List<Tree>(initialCapacity);
}
/// <summary>
/// Creates a new tree bank around an existing list of trees.
/// </summary>
/// <remarks>
/// The list passed in is used directly as the treebank's storage; it is not
/// copied, so later changes to it are visible through the treebank.
/// </remarks>
/// <param name="trees">The trees to put in the treebank.</param>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
/// <param name="encoding">The encoding to use for file access.</param>
public MemoryTreebank(IList<Tree> trees, ITreeReaderFactory trf, string encoding)
    : base(trf, encoding)
{
    parseTrees = trees;
}
/// <summary>Create a new Treebank.</summary> /// <param name="initialCapacity"> /// The initial size of the underlying Collection, /// For a <code>DiskTreebank</code>, this parameter is ignored. /// </param> /// <param name="trf"> /// the factory class to be called to create a new /// <code>TreeReader</code> /// </param> public DiskTreebank(int initialCapacity, ITreeReaderFactory trf) : this(trf) { }
/// <summary>Loads a treebank and runs the requested reporting actions on it.</summary>
/// <remarks>
/// All files below the designated <c>filePath</c> (within the given number
/// range, if any) are loaded.
/// <p>
/// Usage:
/// <c>java edu.stanford.nlp.trees.Treebanks [-maxLength n|-normalized|-treeReaderFactory class] filePath [numberRanges]</c>
/// <p>
/// Leading arguments that start with <c>-</c> are parsed as options. An
/// option that needs a value but is the last argument is reported as an
/// unknown option (the same treatment <c>-maxLength</c> and
/// <c>-minLength</c> receive) instead of indexing past the end of
/// <paramref name="args"/>.
/// </remarks>
/// <param name="args">Array of command-line arguments</param>
/// <exception cref="System.IO.IOException">If there is a treebank file access problem</exception>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        PrintUsage();
        return;
    }

    int i = 0;
    // maxL / minL are parsed (so malformed numbers still fail fast) but nothing
    // in this method reads them afterwards.
    int maxL = int.MaxValue;
    int minL = -1;
    bool normalized = false;
    bool decimate = false;
    bool pennPrintTrees = false;
    bool oneLinePrint = false;
    bool printTaggedWords = false;
    bool printWords = false;
    bool correct = false;
    string annotationOptions = null;
    bool summary = false;
    bool timing = false;
    bool printYield = false;
    bool punct = false;
    bool sentenceLengths = false;
    bool countTaggings = false;
    bool removeCodeTrees = false;
    string decimatePrefix = null;
    string encoding = TreebankLanguagePackConstants.DefaultEncoding;
    string suffix = Treebank.DefaultTreeFileSuffix;
    ITreeReaderFactory trf = null;
    ITreebankLanguagePack tlp = null;
    IList<IPredicate<Tree>> filters = new List<IPredicate<Tree>>();

    while (i < args.Length && args[i].StartsWith("-"))
    {
        string opt = args[i];
        // True when there is a following argument to serve as this option's value.
        bool hasValue = i + 1 < args.Length;

        if (opt.Equals("-maxLength") && hasValue)
        {
            maxL = System.Convert.ToInt32(args[i + 1]);
            i += 2;
        }
        else if (opt.Equals("-minLength") && hasValue)
        {
            minL = System.Convert.ToInt32(args[i + 1]);
            i += 2;
        }
        else if (opt.Equals("-h") || opt.Equals("-help"))
        {
            // Usage is printed but option parsing continues.
            PrintUsage();
            i++;
        }
        else if (opt.Equals("-normalized"))
        {
            normalized = true;
            i += 1;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(opt, "-tlp") && hasValue)
        {
            try
            {
                object o = Sharpen.Runtime.GetType(args[i + 1]).GetDeclaredConstructor().NewInstance();
                tlp = (ITreebankLanguagePack)o;
                trf = tlp.TreeReaderFactory();
            }
            catch (Exception)
            {
                log.Info("Couldn't instantiate as TreebankLanguagePack: " + args[i + 1]);
                return;
            }
            i += 2;
        }
        else if ((opt.Equals("-treeReaderFactory") || opt.Equals("-trf")) && hasValue)
        {
            try
            {
                object o = Sharpen.Runtime.GetType(args[i + 1]).GetDeclaredConstructor().NewInstance();
                trf = (ITreeReaderFactory)o;
            }
            catch (Exception)
            {
                log.Info("Couldn't instantiate as TreeReaderFactory: " + args[i + 1]);
                return;
            }
            i += 2;
        }
        else if (opt.Equals("-suffix") && hasValue)
        {
            suffix = args[i + 1];
            i += 2;
        }
        else if (opt.Equals("-decimate") && hasValue)
        {
            decimate = true;
            decimatePrefix = args[i + 1];
            i += 2;
        }
        else if (opt.Equals("-encoding") && hasValue)
        {
            encoding = args[i + 1];
            i += 2;
        }
        else if (opt.Equals("-correct"))
        {
            correct = true;
            i += 1;
        }
        else if (opt.Equals("-summary"))
        {
            summary = true;
            i += 1;
        }
        else if (opt.Equals("-yield"))
        {
            printYield = true;
            i += 1;
        }
        else if (opt.Equals("-punct"))
        {
            punct = true;
            i += 1;
        }
        else if (opt.Equals("-pennPrint"))
        {
            pennPrintTrees = true;
            i++;
        }
        else if (opt.Equals("-oneLine"))
        {
            oneLinePrint = true;
            i++;
        }
        else if (opt.Equals("-taggedWords"))
        {
            printTaggedWords = true;
            i++;
        }
        else if (opt.Equals("-words"))
        {
            printWords = true;
            i++;
        }
        else if (opt.Equals("-annotate") && hasValue)
        {
            annotationOptions = args[i + 1];
            i += 2;
        }
        else if (opt.Equals("-timing"))
        {
            timing = true;
            i++;
        }
        else if (opt.Equals("-countTaggings"))
        {
            countTaggings = true;
            i++;
        }
        else if (opt.Equals("-sentenceLengths"))
        {
            sentenceLengths = true;
            i++;
        }
        else if (opt.Equals("-removeCodeTrees"))
        {
            removeCodeTrees = true;
            i++;
        }
        else if (opt.Equals("-filter") && hasValue)
        {
            IPredicate<Tree> filter = ReflectionLoading.LoadByReflection(args[i + 1]);
            filters.Add(filter);
            i += 2;
        }
        else
        {
            log.Info("Unknown option: " + opt);
            i++;
        }
    }

    // NOTE(review): no default reader factory is installed when trf is still
    // null at this point, so null is handed to DiskTreebank - confirm that a
    // null factory is acceptable there.
    Treebank treebank;
    if (normalized)
    {
        treebank = new DiskTreebank();
    }
    else
    {
        treebank = new DiskTreebank(trf, encoding);
    }

    // Each filter wraps the treebank built so far.
    foreach (IPredicate<Tree> treeFilter in filters)
    {
        treebank = new FilteringTreebank(treebank, treeFilter);
    }

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, encoding), true);

    if (i + 1 < args.Length)
    {
        // filePath followed by a number-range specification.
        treebank.LoadPath(args[i], new NumberRangesFileFilter(args[i + 1], true));
    }
    else if (i < args.Length)
    {
        treebank.LoadPath(args[i], suffix, true);
    }
    else
    {
        PrintUsage();
        return;
    }

    if (annotationOptions != null)
    {
        // todo Not yet implemented
        log.Info("annotationOptions not yet implemented");
    }
    if (summary)
    {
        System.Console.Out.WriteLine(treebank.TextualSummary());
    }
    if (sentenceLengths)
    {
        SentenceLengths(treebank, args[i], ((i + 1) < args.Length ? args[i + 1] : null), pw);
    }
    if (punct)
    {
        PrintPunct(treebank, tlp, pw);
    }
    if (correct)
    {
        treebank = new EnglishPTBTreebankCorrector().TransformTrees(treebank);
    }

    // NOTE(review): every print mode below calls Apply with a null argument, so
    // no per-tree printing logic is visible in this method - presumably a
    // visitor is meant to be supplied for each mode; confirm.
    if (pennPrintTrees)
    {
        treebank.Apply(null);
    }
    if (oneLinePrint)
    {
        treebank.Apply(null);
    }
    if (printWords)
    {
        treebank.Apply(null);
    }
    if (printTaggedWords)
    {
        treebank.Apply(null);
    }
    if (countTaggings)
    {
        CountTaggings(treebank, pw);
    }
    if (printYield)
    {
        treebank.Apply(null);
    }

    if (decimate)
    {
        TextWriter w1 = null;
        TextWriter w2 = null;
        TextWriter w3 = null;
        try
        {
            w1 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-train.txt"), encoding));
            w2 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-dev.txt"), encoding));
            w3 = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(decimatePrefix + "-test.txt"), encoding));
            treebank.Decimate(w1, w2, w3);
        }
        finally
        {
            // Close whatever was opened, even when a later open or Decimate fails.
            if (w3 != null)
            {
                w3.Close();
            }
            if (w2 != null)
            {
                w2.Close();
            }
            if (w1 != null)
            {
                w1.Close();
            }
        }
    }

    if (timing)
    {
        RunTiming(treebank);
    }

    if (removeCodeTrees)
    {
        // this is a bit of a hack. It only works on an individual file
        if (new File(args[i]).IsDirectory())
        {
            throw new Exception("-removeCodeTrees only works on a single file");
        }
        string treebankStr = IOUtils.SlurpFile(args[i]);
        treebankStr = treebankStr.ReplaceAll("\\( \\(CODE <[^>]+>\\)\\)", string.Empty);
        // The input file is overwritten in place with the stripped text.
        TextWriter w = new OutputStreamWriter(new FileOutputStream(args[i]), encoding);
        try
        {
            w.Write(treebankStr);
        }
        finally
        {
            w.Close();
        }
    }
}
/// <summary>Lets you test out the TreeBinarizer on the command line.</summary>
/// <remarks>
/// This main method doesn't yet handle as many flags as one would like.
/// The flags it recognises (case-insensitively) are:
/// <ul>
/// <li>-tlp TreebankLanguagePack
/// <li>-tlpp TreebankLangParserParams
/// <li>-insideFactor
/// <li>-markovOrder int
/// <li>-simpleLabels
/// <li>-noRebinarization
/// </ul>
/// For every tree loaded from the treebank path, the original tree and its
/// binarized form are printed to standard output.
/// </remarks>
/// <param name="args">
/// Command line arguments: flags as above, followed by treebankPath
/// </param>
/// <exception cref="System.Exception">
/// The class named after <c>-tlp</c> or <c>-tlpp</c> could not be instantiated;
/// the underlying failure is available as the inner exception.
/// </exception>
public static void Main(string[] args)
{
    ITreebankLangParserParams tlpp = null;
    // NOTE(review): trf is never assigned a factory in this method, so
    // DiskTreebank below always receives null. An earlier comment here read
    // "Looks like it must build CategoryWordTagFactory!!" - confirm which
    // reader factory is intended.
    ITreeReaderFactory trf = null;
    string fileExt = "mrg";
    IHeadFinder hf = new ModCollinsHeadFinder();
    ITreebankLanguagePack tlp = new PennTreebankLanguagePack();
    bool insideFactor = false;
    bool mf = false;
    int mo = 1;
    bool uwl = false;
    bool uat = false;
    double sst = 20.0;
    bool mfs = false;
    bool simpleLabels = false;
    bool noRebinarization = false;

    int i = 0;
    while (i < args.Length && args[i].StartsWith("-"))
    {
        // True when there is a following argument to serve as this flag's value.
        bool hasValue = i + 1 < args.Length;

        if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlp") && hasValue)
        {
            try
            {
                tlp = (ITreebankLanguagePack)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                // Exception has no (Exception) constructor; wrap with a message and keep the cause.
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++; // skip the value; the shared i++ below skips the flag
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-tlpp") && hasValue)
        {
            try
            {
                tlpp = (ITreebankLangParserParams)System.Activator.CreateInstance(Sharpen.Runtime.GetType(args[i + 1]));
            }
            catch (Exception e)
            {
                log.Info("Couldn't instantiate: " + args[i + 1]);
                throw new Exception("Couldn't instantiate: " + args[i + 1], e);
            }
            i++;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-insideFactor"))
        {
            insideFactor = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-markovOrder") && hasValue)
        {
            i++;
            mo = System.Convert.ToInt32(args[i]);
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-simpleLabels"))
        {
            simpleLabels = true;
        }
        else if (Sharpen.Runtime.EqualsIgnoreCase(args[i], "-noRebinarization"))
        {
            noRebinarization = true;
        }
        else
        {
            log.Info("Unknown option:" + args[i]);
        }
        i++;
    }

    if (i >= args.Length)
    {
        // No treebank path was given.
        log.Info("usage: java TreeBinarizer [-tlpp class|-markovOrder int|...] treebankPath");
        System.Environment.Exit(0);
    }

    Treebank treebank;
    if (tlpp != null)
    {
        // Parser params override the language pack, file extension and head finder.
        treebank = tlpp.MemoryTreebank();
        tlp = tlpp.TreebankLanguagePack();
        fileExt = tlp.TreebankFileExtension();
        hf = tlpp.HeadFinder();
    }
    else
    {
        treebank = new DiskTreebank(trf);
    }
    treebank.LoadPath(args[i], fileExt, true);

    ITreeTransformer tt = new Edu.Stanford.Nlp.Parser.Lexparser.TreeBinarizer(hf, tlp, insideFactor, mf, mo, uwl, uat, sst, mfs, simpleLabels, noRebinarization);
    foreach (Tree t in treebank)
    {
        Tree newT = tt.TransformTree(t);
        System.Console.Out.WriteLine("Original tree:");
        t.PennPrint();
        System.Console.Out.WriteLine("Binarized tree:");
        newT.PennPrint();
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Creates a new DiskTreebank; the factory is forwarded to the base constructor.
/// </summary>
/// <param name="trf">
/// The factory to be called to create a new <c>TreeReader</c>.
/// </param>
public DiskTreebank(ITreeReaderFactory trf)
    : base(trf)
{
}