/// <summary>
/// Rewrites the leaves of each tree in a tree file using the matching Morfette analysis
/// and prints the resulting trees to standard output.
/// </summary>
/// <remarks>
/// Trees and analyses are consumed in lock-step. Each leaf value is replaced by the token,
/// the morphology mark, the lemma, the lemma mark and the analysis tag, in that order.
/// A warning is logged whenever one input runs out before the other.
/// </remarks>
/// <param name="args">Exactly two entries: the tree file, then the Morfette TnT file.</param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    string morfetteFile = args[1];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    ITreeReader tr = null;
    try
    {
        tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        IEnumerator<IList<CoreLabel>> morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
        bool uneven = false;
        Tree tree;
        while ((tree = tr.ReadTree()) != null)
        {
            if (!morfetteItr.MoveNext())
            {
                // A tree was read that has no analysis. Record it here: reading
                // another tree afterwards would hide a single surplus tree.
                uneven = true;
                break;
            }
            IList<CoreLabel> analysis = morfetteItr.Current;
            IList<ILabel> yield = tree.Yield();
            System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
            int yieldLen = yield.Count;
            for (int i = 0; i < yieldLen; ++i)
            {
                CoreLabel tokenAnalysis = analysis[i];
                ILabel token = yield[i];
                string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma());
                string newLeaf = string.Format("%s%s%s%s%s", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag());
                ((CoreLabel)token).SetValue(newLeaf);
            }
            System.Console.Out.WriteLine(tree.ToString());
        }
        // The trees ran out first: any analysis still pending means the inputs differ in length.
        if (!uneven && morfetteItr.MoveNext())
        {
            uneven = true;
        }
        if (uneven)
        {
            log.Info("WARNING: Uneven input files!");
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        // Close the reader on every path, not only after a clean run.
        if (tr != null)
        {
            try
            {
                tr.Close();
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
    }
}
/// <summary>Reads every parse tree available from a reader and adds it to this treebank.</summary>
/// <remarks>
/// Each tree may optionally be encased in parens to allow for Penn Treebank style trees.
/// Labels that implement <code>IHasIndex</code> are stamped with the document ID (when one
/// is given) and the zero-based position of the tree within the input. An
/// <code>IOException</code> is logged rather than rethrown.
/// </remarks>
/// <param name="r">
/// The reader to read trees from. It is used as supplied, so buffer it beforehand if
/// buffering is wanted.
/// </param>
/// <param name="id">
/// An arbitrary ID describing where the trees come from, such as a filename.
/// May be <code>null</code> to record none.
/// </param>
public void Load(Reader r, string id)
{
    try
    {
        // Creating the reader or reading from it could throw an IO exception.
        ITreeReader reader = TreeReaderFactory().NewTreeReader(r);
        int position = 0;
        Tree current = reader.ReadTree();
        while (current != null)
        {
            // Record provenance so the tree can be traced back to its source.
            IHasIndex indexed = current.Label() as IHasIndex;
            if (indexed != null)
            {
                if (id != null)
                {
                    indexed.SetDocID(id);
                }
                indexed.SetSentIndex(position);
            }
            parseTrees.Add(current);
            position++;
            current = reader.ReadTree();
        }
    }
    catch (IOException e)
    {
        log.Info("load IO Exception: " + e);
    }
}
/// <summary>
/// Reads every file in <code>fileList</code> and updates the unigram tagger with the
/// word/tag pair of each leaf.
/// </summary>
/// <remarks>
/// Leaves whose tag equals <code>SpanishTreeNormalizer.MwTag</code> are skipped.
/// The tree reader opened for a file is closed before the next file is opened.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public virtual void Process()
{
    SpanishXMLTreeReaderFactory trf = new SpanishXMLTreeReaderFactory();
    foreach (File file in fileList)
    {
        Reader @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), AncoraEncoding));
        ITreeReader tr = trf.NewTreeReader(@in);
        try
        {
            // Tree reading will implicitly perform tree normalization for us
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                // Update tagger with this tree
                IList<CoreLabel> yield = t.TaggedLabeledYield();
                foreach (CoreLabel leafLabel in yield)
                {
                    if (leafLabel.Tag().Equals(SpanishTreeNormalizer.MwTag))
                    {
                        continue;
                    }
                    unigramTagger.IncrementCount(leafLabel.Word(), leafLabel.Tag());
                }
            }
        }
        finally
        {
            // Without this, one reader stays open for every file in the list.
            tr.Close();
        }
    }
}
/// <summary>Reads trees with a Negra Penn tree reader and pretty-prints each one.</summary>
/// <remarks>Prints a usage line to standard output and returns when no argument is given.</remarks>
/// <param name="args">File to run on</param>
public static void Main(string[] args)
{
    bool missingInput = args.Length < 1;
    if (missingInput)
    {
        System.Console.Out.Printf("Usage: java %s tree_file%n", typeof(Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory).FullName);
        return;
    }

    ITreebankLanguagePack tlp = new NegraPennLanguagePack();
    ITreeReaderFactory trf = new Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory(2, false, false, tlp);
    try
    {
        ITreeReader reader = trf.NewTreeReader(IOUtils.ReaderFromString(args[0], tlp.GetEncoding()));
        Tree current = reader.ReadTree();
        while (current != null)
        {
            current.PennPrint();
            current = reader.ReadTree();
        }
        reader.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Runs <code>TraverseAndFix</code> over every tree in a file and writes the results to a
/// sibling file whose name is the input name plus ".fixed".
/// </summary>
/// <remarks>
/// I/O failures are reported with a stack trace rather than rethrown. The output writer and
/// the tree reader are closed on every path, including when processing is interrupted.
/// </remarks>
/// <param name="treeFile">Input tree file, read as UTF-8.</param>
/// <param name="pretermLabel">Counter passed through to <code>TraverseAndFix</code>.</param>
/// <param name="unigramTagger">Counter passed through to <code>TraverseAndFix</code>.</param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string> unigramTagger)
{
    ITreeReader tr = null;
    PrintWriter pw = null;
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        tr = trf.NewTreeReader(br);
        pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
        int nTrees = 0;
        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
        {
            TraverseAndFix(t, pretermLabel, unigramTagger);
            pw.Println(t.ToString());
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        // Close the writer first so trees already written reach the file even after a failure.
        if (pw != null)
        {
            pw.Close();
        }
        if (tr != null)
        {
            try
            {
                tr.Close();
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
    }
}
/// <summary>Reads one AnCora XML tree file, splitting trees where required.</summary>
/// <remarks>
/// Every tree read is passed repeatedly through <code>FindSplitPoint</code> and
/// <code>Split</code> until no further split point is found. Each resulting piece is
/// collected and fed to <code>UpdateTagger</code>.
/// </remarks>
/// <param name="file">File to read.</param>
/// <param name="trf">Factory used to create the tree reader.</param>
/// <param name="encoding">Character encoding of the file.</param>
/// <returns>
/// The tagger statistics paired with the list of trees, or <code>null</code> when an
/// <code>IOException</code> occurs (its stack trace is printed).
/// </returns>
private static Pair<TwoDimensionalCounter<string, string>, IList<Tree>> ProcessTreeFile(File file, SpanishXMLTreeReaderFactory trf, string encoding)
{
    TwoDimensionalCounter<string, string> tagger = new TwoDimensionalCounter<string, string>();
    try
    {
        Reader input = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
        ITreeReader reader = trf.NewTreeReader(file.GetPath(), input);
        IList<Tree> collected = new List<Tree>();
        Tree remainder;
        while ((remainder = reader.ReadTree()) != null)
        {
            for (;;)
            {
                // The current tree may need to be cut into several parts. When no split
                // point exists, Split is called with a null split point (a no-op
                // according to the original note) and the loop ends after this pass.
                Tree cut = FindSplitPoint(remainder);
                Pair<Tree, Tree> halves = Split(remainder, cut);
                Tree piece = halves.First();
                remainder = halves.Second();
                collected.Add(piece);
                UpdateTagger(tagger, piece);
                if (cut == null)
                {
                    break;
                }
            }
        }
        reader.Close();
        return new Pair<TwoDimensionalCounter<string, string>, IList<Tree>>(tagger, collected);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
        return null;
    }
}
/// <summary>Splits a Hebrew tree file into dev, train and test files.</summary>
/// <remarks>
/// Trees are assigned by their zero-based position in the input: the first 483 go to
/// "&lt;input&gt;.clean.dev", positions 483 up to (but excluding) 5724 go to
/// "&lt;input&gt;.clean.train", and everything after goes to "&lt;input&gt;.clean.test".
/// The reader and all three writers are closed on every path.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to split.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(SplitMaker).FullName);
        System.Environment.Exit(-1);
    }
    // Split boundaries, expressed as zero-based tree positions.
    const int devEnd = 483;
    const int trainEnd = 5724;
    ITreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
    string inputFile = args[0];
    File treeFile = new File(inputFile);
    ITreeReader tr = null;
    PrintWriter pwDev = null;
    PrintWriter pwTrain = null;
    PrintWriter pwTest = null;
    try
    {
        ITreeReaderFactory trf = new HebrewTreeReaderFactory();
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.GetEncoding()));
        tr = trf.NewTreeReader(br);
        pwDev = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.dev"), false, tlp.GetEncoding()));
        pwTrain = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.train"), false, tlp.GetEncoding()));
        pwTest = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.test"), false, tlp.GetEncoding()));
        int numTrees = 0;
        for (Tree t; (t = tr.ReadTree()) != null; numTrees++)
        {
            if (numTrees < devEnd)
            {
                pwDev.Println(t.ToString());
            }
            else if (numTrees < trainEnd)
            {
                pwTrain.Println(t.ToString());
            }
            else
            {
                pwTest.Println(t.ToString());
            }
        }
        System.Console.Error.Printf("Processed %d trees.%n", numTrees);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        // Close the writers even after a failure so partial output is not lost.
        if (pwDev != null)
        {
            pwDev.Close();
        }
        if (pwTrain != null)
        {
            pwTrain.Close();
        }
        if (pwTest != null)
        {
            pwTest.Close();
        }
        if (tr != null)
        {
            try
            {
                tr.Close();
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
    }
}
/// <summary>For debugging.</summary>
/// <remarks>
/// Reads each given French XML tree file and prints one line per tree to standard output,
/// prefixed with the file name (minus its extension) and the tree's sentence ID.
/// Per-file and total tree counts go to standard error. A file name without an extension
/// is used unchanged as the prefix.
/// </remarks>
/// <param name="args">One or more tree files.</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file(s)%n%n", typeof(Edu.Stanford.Nlp.Trees.International.French.FrenchXMLTreeReader).FullName);
        System.Environment.Exit(-1);
    }
    IList<File> fileList = new List<File>();
    foreach (string arg in args)
    {
        fileList.Add(new File(arg));
    }
    ITreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
    int totalTrees = 0;
    ICollection<string> morphAnalyses = Generics.NewHashSet();
    try
    {
        foreach (File file in fileList)
        {
            ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));
            Tree t;
            int numTrees;
            // LastIndexOf yields -1 when the name has no '.', which would make the
            // substring call fail; fall back to the whole name in that case.
            string fileName = file.GetName();
            int dot = fileName.LastIndexOf('.');
            string canonicalFileName = dot < 0 ? fileName : Sharpen.Runtime.Substring(fileName, 0, dot);
            for (numTrees = 0; (t = tr.ReadTree()) != null; numTrees++)
            {
                string ftbID = ((CoreLabel)t.Label()).Get(typeof(CoreAnnotations.SentenceIDAnnotation));
                System.Console.Out.Printf("%s-%s\t%s%n", canonicalFileName, ftbID, t.ToString());
                IList<ILabel> leaves = t.Yield();
                foreach (ILabel label in leaves)
                {
                    if (label is CoreLabel)
                    {
                        morphAnalyses.Add(((CoreLabel)label).OriginalText());
                    }
                }
            }
            tr.Close();
            System.Console.Error.Printf("%s: %d trees%n", file.GetName(), numTrees);
            totalTrees += numTrees;
        }
        //wsg2011: Print out the observed morphological analyses
        // for(String analysis : morphAnalyses)
        //   log.info(analysis);
        System.Console.Error.Printf("%nRead %d trees%n", totalTrees);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Reads the next tree accepted by the filter.</summary>
/// <remarks>Trees rejected by <code>f.Test</code> are read and skipped.</remarks>
/// <returns>The next accepted tree, or <code>null</code> at end of file.</returns>
/// <exception cref="System.IO.IOException"/>
public virtual Tree ReadTree()
{
    for (;;)
    {
        Tree candidate = tr.ReadTree();
        // Stop at end of input or at the first tree the filter accepts.
        if (candidate == null || f.Test(candidate))
        {
            return candidate;
        }
    }
}
//Delete sentence-initial punctuation
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to either . if it is not [.!?]
//Delete medial, sentence-final punctuation
//Now move the sentence-final mark under SENT
//For those trees that lack a sentence-final punc, add one.
//Finally, delete these punctuation marks, which I can't seem to kill otherwise...
//A bad MWADV tree in the training set
// Not sure why this got a label of X. Similar trees suggest it
// should be A instead
// This also seems to be mislabeled
/// <summary>
/// Applies the FTB corrector to every tree in a file and prints the corrected trees.
/// </summary>
/// <remarks>
/// Trees matching either bad-tree pattern are logged and dropped. The count reported at
/// the end covers only the trees actually written, not the discarded ones.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to correct.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }
    ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
    File f = new File(args[0]);
    try
    {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.Compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        // Counts written trees only; discarded trees must not inflate the "Wrote" total.
        int nWritten = 0;
        for (Tree t; (t = tr.ReadTree()) != null;)
        {
            TregexMatcher m = pBadTree.Matcher(t);
            TregexMatcher m2 = pBadTree2.Matcher(t);
            if (m.Find() || m2.Find())
            {
                log.Info("Discarding tree: " + t.ToString());
            }
            else
            {
                Tree fixedT = tt.TransformTree(t);
                System.Console.Out.WriteLine(fixedT.ToString());
                nWritten++;
            }
        }
        tr.Close();
        System.Console.Error.Printf("Wrote %d trees%n", nWritten);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Collects MWE statistics from a French tree file, writes them to CSV files, and then
/// resolves DUMMY tags in the same file.
/// </summary>
/// <remarks>
/// Four counters are written through <code>PrintCounter</code>: "label_term.csv",
/// "term_label.csv", "label_pos.csv" and "pos_label.csv". Summary counts are printed to
/// standard output at the end.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to process.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
        System.Environment.Exit(-1);
    }

    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> labelTerm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> termLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> labelPreterm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> pretermLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();

    try
    {
        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader reader = trf.NewTreeReader(input);

        // First pass: gather statistics from every tree.
        Tree current = reader.ReadTree();
        while (current != null)
        {
            CountMWEStatistics(current, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            current = reader.ReadTree();
        }
        reader.Close(); //Closes the underlying reader

        System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
        PrintCounter(labelTerm, "label_term.csv");

        System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
        PrintCounter(termLabel, "term_label.csv");

        System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
        PrintCounter(labelPreterm, "label_pos.csv");

        System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
        PrintCounter(pretermLabel, "pos_label.csv");

        // Second pass over the same file.
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, pretermLabel, unigramTagger);

        System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
        System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
        System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Prints every token of a French tree file as a "word lemma morph" line, with a blank
/// line after each tree.
/// </summary>
/// <remarks>
/// Lemma and morphology come from <code>MorphoFeatureSpecification.SplitMorphString</code>.
/// When the morphology is null, empty or "XXX", the value of the token's pre-terminal
/// label is printed instead.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to convert.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
        System.Environment.Exit(-1);
    }

    string treeFile = args[0];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader reader = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        Tree sentence = reader.ReadTree();
        while (sentence != null)
        {
            IList<ILabel> tags = sentence.PreTerminalYield();
            IList<ILabel> leaves = sentence.Yield();
            int leafCount = leaves.Count;
            for (int pos = 0; pos < leafCount; ++pos)
            {
                CoreLabel leaf = (CoreLabel)leaves[pos];
                string word = leaf.Value();
                string rawMorph = leaf.OriginalText();
                Pair<string, string> parts = MorphoFeatureSpecification.SplitMorphString(word, rawMorph);
                string lemma = parts.First();
                string morph = parts.Second();
                // Fall back to the pre-terminal label when no usable analysis exists.
                bool unusable = string.IsNullOrEmpty(morph) || morph.Equals("XXX");
                if (unusable)
                {
                    morph = ((CoreLabel)tags[pos]).Value();
                }
                System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
            }
            System.Console.Out.WriteLine();
            sentence = reader.ReadTree();
        }
        reader.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Builds a unigram tagger from a Spanish tree file, then resolves DUMMY tags in the
/// same file and prints summary statistics.
/// </summary>
/// <remarks>
/// Options read from the arguments: "ner" (default false), "normalize" (default true) and
/// "help". The tree file is the option stored under the empty key. Usage is logged when
/// that option is absent or "help" is present. The "fixed" percentages are reported as 0
/// when nothing was missing, instead of dividing by zero.
/// </remarks>
/// <param name="args">Command-line arguments, parsed with <code>argOptionDefs</code>.</param>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);
    if (!options.Contains(string.Empty) || options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
    bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
    File treeFile = new File(options.GetProperty(string.Empty));
    TwoDimensionalCounter<string, string> labelTerm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> termLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> labelPreterm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> pretermLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        for (Tree t; (t = tr.ReadTree()) != null;)
        {
            UpdateTagger(unigramTagger, t);
        }
        tr.Close(); //Closes the underlying reader
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
        System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
        // A zero denominator would yield NaN in the report; show 0 instead.
        double fixedPosPercent = nMissingPOS == 0 ? 0.0 : (double)nFixedPOS / nMissingPOS * 100;
        double fixedPhrasalPercent = nMissingPhrasal == 0 ? 0.0 : (double)nFixedPhrasal / nMissingPhrasal * 100;
        System.Console.Out.WriteLine(string.Format("#Missing POS: %d (fixed: %d, %.2f%%)", nMissingPOS, nFixedPOS, fixedPosPercent));
        System.Console.Out.WriteLine(string.Format("#Missing Phrasal: %d (fixed: %d, %.2f%%)", nMissingPhrasal, nFixedPhrasal, fixedPhrasalPercent));
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Fixes, expands and optionally normalizes every tree in a file, writing the results to
/// a sibling file whose name is the input name plus ".fixed".
/// </summary>
/// <remarks>
/// I/O failures are reported with a stack trace rather than rethrown. The output writer and
/// the tree reader are closed on every path, including when processing is interrupted.
/// </remarks>
/// <param name="treeFile">Input tree file, read as UTF-8.</param>
/// <param name="unigramTagger">Counter passed through to <code>TraverseAndFix</code>.</param>
/// <param name="retainNER">Flag passed through to <code>TraverseAndFix</code>.</param>
/// <param name="tn">Normalizer applied to each tree; may be <code>null</code> to skip whole-tree normalization.</param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
{
    ITreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    ITreeReader tr = null;
    PrintWriter pw = null;
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        tr = trf.NewTreeReader(br);
        pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
        int nTrees = 0;
        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
        {
            TraverseAndFix(t, null, unigramTagger, retainNER);
            // Now "decompress" further the expanded trees formed by
            // multiword token splitting
            t = expander.ExpandPhrases(t, tn, tf);
            if (tn != null)
            {
                t = tn.NormalizeWholeTree(t, tf);
            }
            pw.Println(t.ToString());
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    finally
    {
        // Close the writer first so trees already written reach the file even after a failure.
        if (pw != null)
        {
            pw.Close();
        }
        if (tr != null)
        {
            try
            {
                tr.Close();
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
    }
}
/// <summary>
/// Read trees from the given file and output their processed forms to
/// standard output.
/// </summary>
/// <remarks>
/// Only trees accepted by <code>ShouldPrintTree</code> are printed. Each line is prefixed
/// with the file name (minus its extension) and the tree's sentence ID. A file name
/// without an extension is used unchanged. Read and printed counts go to standard error.
/// </remarks>
/// <param name="file">File the trees come from; only its name is used here.</param>
/// <param name="tr">Reader supplying the trees.</param>
/// <param name="posPattern">Pattern passed to <code>ShouldPrintTree</code>.</param>
/// <param name="wordPattern">Pattern passed to <code>ShouldPrintTree</code>.</param>
/// <param name="plainPrint">Flag passed to <code>ToString(Tree, bool)</code>.</param>
/// <exception cref="System.IO.IOException"/>
public static void Process(File file, ITreeReader tr, Pattern posPattern, Pattern wordPattern, bool plainPrint)
{
    Tree t;
    int numTrees = 0;
    int numTreesRetained = 0;
    // LastIndexOf yields -1 when the name has no '.', which would make the
    // substring call fail; fall back to the whole name in that case.
    string fileName = file.GetName();
    int dot = fileName.LastIndexOf('.');
    string canonicalFileName = dot < 0 ? fileName : Sharpen.Runtime.Substring(fileName, 0, dot);
    while ((t = tr.ReadTree()) != null)
    {
        numTrees++;
        if (!ShouldPrintTree(t, posPattern, wordPattern))
        {
            continue;
        }
        numTreesRetained++;
        string ftbID = ((CoreLabel)t.Label()).Get(typeof(CoreAnnotations.SentenceIDAnnotation));
        string output = ToString(t, plainPrint);
        System.Console.Out.Printf("%s-%s\t%s%n", canonicalFileName, ftbID, output);
    }
    System.Console.Error.Printf("%s: %d trees, %d matched and printed%n", file.GetName(), numTrees, numTreesRetained);
}
/// <summary>For debugging.</summary>
/// <remarks>
/// Runs the external MWE tree visitor over every tree of an ATB tree file and prints each
/// visited tree to standard output. A tree whose value is "ROOT" is replaced by its first
/// child before being visited and printed.
/// </remarks>
/// <param name="args">Exactly one entry: the ATB tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal).FullName);
        System.Environment.Exit(-1);
    }

    ITreeReaderFactory trf = new ArabicTreeReaderFactory();
    try
    {
        ITreeReader reader = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")));
        ITreeVisitor visitor = new Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal();
        int processed = 0;
        Tree current = reader.ReadTree();
        while (current != null)
        {
            // Skip over the ROOT tag
            Tree target = current.Value().Equals("ROOT") ? current.FirstChild() : current;
            visitor.VisitTree(target);
            System.Console.Out.WriteLine(target.ToString());
            ++processed;
            current = reader.ReadTree();
        }
        reader.Close();
        System.Console.Error.Printf("Processed %d trees.%n", processed);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Loads the next tag sequence into <code>nextYield</code>.</summary>
/// <remarks>
/// When a tree reader is available, the tags of the next tree's leaves are used.
/// Otherwise the next line of the file reader is split with the "\\s+" separator.
/// <code>nextYield</code> is set to <code>null</code> at end of input or when an
/// <code>IOException</code> occurs (its stack trace is printed).
/// </remarks>
private void PrimeNext()
{
    try
    {
        if (treeReader == null)
        {
            // Plain-text source: one sequence per line.
            string line = fileReader.ReadLine();
            if (line != null)
            {
                nextYield = Arrays.AsList(line.Split("\\s+"));
            }
            else
            {
                nextYield = null;
            }
            return;
        }

        Tree tree = treeReader.ReadTree();
        if (tree == null)
        {
            nextYield = null;
            return;
        }

        IList<CoreLabel> leaves = tree.TaggedLabeledYield();
        List<string> tags = new List<string>(leaves.Count);
        foreach (CoreLabel leaf in leaves)
        {
            tags.Add(leaf.Tag());
        }
        nextYield = tags;
    }
    catch (IOException e)
    {
        nextYield = null;
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
//Delete sentence-initial punctuation
//Delete sentence-initial punctuation (again)
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to . if it is not [.!?]
//Delete medial, sentence-final punctuation
//    ("@PUNC=punc <: /[!\\.\\?]+/ $. __\n"
//        + "prune punc\n"
//        + "\n") +
//Now move the sentence-final mark under the top-level node
//For those trees that lack a sentence-final punc, add one.
//    ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n"
//        + "insert (PUNC .) $- loc\n"
//        + "\n");
/// <summary>
/// Applies the ATB corrector to every tree in a file and prints the transformed trees to
/// standard output.
/// </summary>
/// <remarks>The number of trees is reported on standard error once the input is exhausted.</remarks>
/// <param name="args">Exactly one entry: the tree file to correct.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }

    ITreeTransformer corrector = new Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector();
    File source = new File(args[0]);
    try
    {
        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(source), "UTF-8"));
        ITreeReaderFactory trf = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
        ITreeReader reader = trf.NewTreeReader(input);
        int written = 0;
        Tree current = reader.ReadTree();
        while (current != null)
        {
            Tree corrected = corrector.TransformTree(current);
            System.Console.Out.WriteLine(corrected.ToString());
            written++;
            current = reader.ReadTree();
        }
        reader.Close();
        System.Console.Error.Printf("Wrote %d trees%n", written);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Reads a Hebrew tree file and echoes every tree to standard output.</summary>
/// <remarks>
/// The file is decoded with the encoding reported by the Hebrew treebank language pack.
/// The number of trees is reported on standard error.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to read.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file > trees%n", typeof(HebrewTreeReaderFactory).FullName);
        System.Environment.Exit(-1);
    }

    ITreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
    File treeFile = new File(args[0]);
    try
    {
        ITreeReaderFactory trf = new HebrewTreeReaderFactory();
        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.GetEncoding()));
        ITreeReader reader = trf.NewTreeReader(input);
        int count = 0;
        Tree current = reader.ReadTree();
        while (current != null)
        {
            System.Console.Out.WriteLine(current.ToString());
            count++;
            current = reader.ReadTree();
        }
        reader.Close();
        System.Console.Error.Printf("Processed %d trees.%n", count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Loads the parse trees of one file into this treebank.</summary>
/// <remarks>
/// Each tree may optionally be encased in parens to allow for Penn Treebank style trees.
/// This method implements the <code>FileProcessor</code> interface.
/// Labels that implement <code>IHasIndex</code> receive the file name as document ID and
/// the zero-based position of the tree as sentence index. When SRL entries are registered
/// for a suffix of the file's absolute path, each entry for a sentence marks the predicate
/// terminal and records the argument type on the addressed constituents.
/// An <code>IOException</code> is rethrown as a <code>RuntimeIOException</code>; the reader
/// is always closed.
/// </remarks>
/// <param name="file">file to load a tree from</param>
public void ProcessFile(File file)
{
    ITreeReader reader = null;

    // SRL stuff: pick the entries registered for a suffix of this file's path.
    CollectionValuedMap<int, string> fileSrl = null;
    if (this.srlMap != null)
    {
        // there must be a better way ...
        string absolutePath = file.GetAbsolutePath();
        foreach (string suffix in this.srlMap.Keys)
        {
            if (absolutePath.EndsWith(suffix))
            {
                fileSrl = this.srlMap[suffix];
                break;
            }
        }
        if (fileSrl == null)
        {
            log.Info("could not find SRL entries for file: " + file);
        }
    }

    try
    {
        // maybe print file name to stdout to get some feedback
        // could throw an IO exception if can't open for reading
        reader = TreeReaderFactory().NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), Encoding())));
        int sentIndex = 0;
        for (Tree pt = reader.ReadTree(); pt != null; pt = reader.ReadTree())
        {
            if (pt.Label() is IHasIndex)
            {
                // so we can trace where this tree came from
                IHasIndex indexed = (IHasIndex)pt.Label();
                indexed.SetDocID(file.GetName());
                indexed.SetSentIndex(sentIndex);
            }

            if (fileSrl == null)
            {
                parseTrees.Add(pt);
            }
            else
            {
                ICollection<string> entries = fileSrl[sentIndex];
                parseTrees.Add(pt);
                if (!entries.IsEmpty())
                {
                    foreach (string entry in entries)
                    {
                        string[] fields = entry.Split("\\s+");
                        int verbIndex = System.Convert.ToInt32(fields[0]);
                        // Parsed as in the original code although the value is not used afterwards.
                        string predicateLemma = fields[2].Split("\\.")[0];
                        Tree verb = Edu.Stanford.Nlp.Trees.Trees.GetTerminal(pt, verbIndex);
                        ((CoreLabel)verb.Label()).Set(typeof(CoreAnnotations.CoNLLPredicateAnnotation), true);

                        // Argument descriptors start at the fifth field.
                        for (int f = 4; f < fields.Length; f++)
                        {
                            string[] argParts = fields[f].Split("-");
                            string locations = argParts[0];
                            string argType = argParts[1];
                            if (argType.Equals("rel"))
                            {
                                continue;
                            }
                            foreach (string location in locations.Split("[*,]"))
                            {
                                string[] coords = location.Split(":");
                                int terminal = System.Convert.ToInt32(coords[0]);
                                int height = System.Convert.ToInt32(coords[1]);
                                // Start at the pre-terminal and climb `height` levels.
                                Tree node = Edu.Stanford.Nlp.Trees.Trees.GetPreTerminal(pt, terminal);
                                for (int level = 0; level < height; level++)
                                {
                                    node = node.Parent(pt);
                                }
                                CoreLabel nodeLabel = (CoreLabel)node.Label();
                                IDictionary<int, string> roles = nodeLabel.Get(typeof(CoreAnnotations.CoNLLSRLAnnotation));
                                if (roles == null)
                                {
                                    roles = Generics.NewHashMap();
                                    nodeLabel.Set(typeof(CoreAnnotations.CoNLLSRLAnnotation), roles);
                                }
                                roles[verbIndex] = argType;
                            }
                        }
                    }
                }
            }
            // NOTE: an earlier, disabled variant worked on a deep copy of each tree and
            // tagged labels with SRLIDAnnotation values (REL / ARG / NO / ALL_NO).
            sentIndex++;
        }
    }
    catch (IOException e)
    {
        throw new RuntimeIOException("MemoryTreebank.processFile IOException in file " + file, e);
    }
    finally
    {
        IOUtils.CloseIgnoringExceptions(reader);
    }
}
/// <summary>
/// Converts a Spanish tree file to tab-separated "word, entity type" lines, with a blank
/// line after each tree, and prints the result to standard output.
/// </summary>
/// <remarks>
/// For every pre-terminal, the type character is taken from the node one level above it
/// when that node's value starts with "grup.nom." (the character right after the prefix),
/// otherwise from the last character of a POS tag matching "np0000?". The character maps
/// to PERS ('p'), LUG ('l'), ORG ('o'), OTROS ('0') or O (anything else). A missing
/// ancestor, or a value that is exactly the bare prefix, yields O instead of failing.
/// </remarks>
/// <param name="args">At least one entry; the first is the tree file.</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToTSV).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        StringBuilder sb = new StringBuilder();
        string nl = Runtime.GetProperty("line.separator");
        Pattern nePattern = Pattern.Compile("^grup\\.nom\\.");
        Pattern npPattern = Pattern.Compile("^np0000.$");
        for (Tree tree; (tree = tr.ReadTree()) != null;)
        {
            foreach (Tree t in tree)
            {
                if (!t.IsPreTerminal())
                {
                    continue;
                }
                char type = 'O';
                Tree grandma = t.Ancestor(1, tree);
                // No ancestor at that level: nothing to inspect above the pre-terminal.
                string grandmaValue = grandma == null ? null : ((CoreLabel)grandma.Label()).Value();
                // grup.nom.x
                if (grandmaValue != null && nePattern.Matcher(grandmaValue).Find())
                {
                    // The prefix is 9 characters long; only read the type character
                    // when something actually follows it.
                    if (grandmaValue.Length > 9)
                    {
                        type = grandmaValue[9];
                    }
                }
                else
                {
                    // else check the pos for np0000x or not
                    string pos = ((CoreLabel)t.Label()).Value();
                    if (npPattern.Matcher(pos).Find())
                    {
                        type = pos[6];
                    }
                }
                Tree wordNode = t.FirstChild();
                string word = ((CoreLabel)wordNode.Label()).Value();
                sb.Append(word).Append("\t");
                switch (type)
                {
                    case 'p':
                    {
                        sb.Append("PERS");
                        break;
                    }

                    case 'l':
                    {
                        sb.Append("LUG");
                        break;
                    }

                    case 'o':
                    {
                        sb.Append("ORG");
                        break;
                    }

                    case '0':
                    {
                        sb.Append("OTROS");
                        break;
                    }

                    default:
                    {
                        sb.Append("O");
                        break;
                    }
                }
                sb.Append(nl);
            }
            sb.Append(nl);
        }
        System.Console.Out.Write(sb.ToString());
        tr.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>Prints a frequency table of multi-word expressions found in a French tree file.</summary>
/// <remarks>
/// Every node whose label matches "/^MW/" is counted by label and by its space-joined
/// word sequence. For each label, the table lists the total count, the number of word
/// sequences seen exactly once, and two percentages. A totals row, a token count and
/// the number of distinct POS sequences follow.
/// </remarks>
/// <param name="args">Exactly one entry: the tree file to analyse.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
        System.Environment.Exit(-1);
    }

    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> mweLabelToString = new TwoDimensionalCounter<string, string>();
    ICollection<string> uniquePOSSequences = Generics.NewHashSet();
    try
    {
        BufferedReader input = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader reader = trf.NewTreeReader(input);
        TregexPattern mwePattern = TregexPattern.Compile("/^MW/");

        Tree current = reader.ReadTree();
        while (current != null)
        {
            //Count MWE statistics
            TregexMatcher matcher = mwePattern.Matcher(current);
            while (matcher.FindNextMatchingNode())
            {
                Tree hit = matcher.GetMatch();
                string label = hit.Value();
                IList<CoreLabel> leaves = hit.TaggedLabeledYield();
                StringBuilder words = new StringBuilder();
                StringBuilder tags = new StringBuilder();
                foreach (CoreLabel leaf in leaves)
                {
                    words.Append(leaf.Word()).Append(" ");
                    tags.Append(leaf.Tag()).Append(" ");
                }
                mweLabelToString.IncrementCount(label, words.ToString().Trim());
                uniquePOSSequences.Add(tags.ToString().Trim());
            }
            current = reader.ReadTree();
        }
        reader.Close(); //Closes the underlying reader

        System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.TotalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        foreach (string mweLabel in mweLabelToString.FirstKeySet())
        {
            double totalCount = mweLabelToString.TotalCount(mweLabel);
            ICounter<string> perTerm = mweLabelToString.GetCounter(mweLabel);
            int nSingletons = 0;
            foreach (string term in perTerm.KeySet())
            {
                double occurrences = perTerm.GetCount(term);
                if (occurrences == 1.0)
                {
                    nSingletons++;
                }
                int wordsInTerm = term.Split("\\s+").Length;
                nTokens += wordsInTerm * (int)occurrences;
            }
            nAllSingletons += nSingletons;
            System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.Console.Out.WriteLine("#tokens = " + nTokens);
        System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}