/// <summary>
/// Re-reads every tree in <paramref name="treeFile"/> (decoded as UTF-8), passes each
/// tree through <c>TraverseAndFix</c>, and prints the result, one tree per line, to a
/// file whose path is the input path with ".fixed" appended. The number of trees
/// processed is reported on standard output.
/// </summary>
/// <param name="treeFile">The tree file to read.</param>
/// <param name="pretermLabel">Counter forwarded unchanged to <c>TraverseAndFix</c>.</param>
/// <param name="unigramTagger">Counter forwarded unchanged to <c>TraverseAndFix</c>.</param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string> unigramTagger)
{
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        int nTrees = 0;
        try
        {
            PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
            try
            {
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, pretermLabel, unigramTagger);
                    pw.Println(t.ToString());
                }
            }
            finally
            {
                // Close the writer even when a tree fails to read or fix, so the
                // output produced so far is not left dangling on an open handle.
                pw.Close();
            }
        }
        finally
        {
            // Same ordering as before (writer first, then reader), but now also
            // executed on the failure path.
            tr.Close();
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
// NOTE(review): the comments below are not attached to any code visible here; they
// look like leftovers from a list of tree-correction operations defined elsewhere
// in this class. They are kept verbatim.
//Delete sentence-initial punctuation
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to either . if it is not [.!?]
//Delete medial, sentence-final punctuation
//Now move the sentence-final mark under SENT
//For those trees that lack a sentence-final punc, add one.
//Finally, delete these punctuation marks, which I can't seem to kill otherwise...
//A bad MWADV tree in the training set
// Not sure why this got a label of X. Similar trees suggest it
// should be A instead
// This also seems to be mislabeled

/// <summary>
/// Reads trees from the file named by the single command-line argument (UTF-8),
/// discards trees matching either of two "punctuation only" patterns, runs every
/// other tree through an <c>FTBCorrector</c> transformer and prints the transformed
/// tree to standard output. The number of trees written is reported on standard error.
/// </summary>
/// <param name="args">Exactly one element: the path of the tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }
    ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
    File f = new File(args[0]);
    try
    {
        //These bad trees in the Candito training set should be thrown out:
        // (ROOT (SENT (" ") (. .)))
        // (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.Compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        // Counts only trees that were actually printed; discarded trees used to be
        // included, which made the final "Wrote ..." message over-report.
        int nWritten = 0;
        try
        {
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                TregexMatcher m = pBadTree.Matcher(t);
                TregexMatcher m2 = pBadTree2.Matcher(t);
                if (m.Find() || m2.Find())
                {
                    log.Info("Discarding tree: " + t.ToString());
                }
                else
                {
                    Tree fixedT = tt.TransformTree(t);
                    System.Console.Out.WriteLine(fixedT.ToString());
                    nWritten++;
                }
            }
        }
        finally
        {
            // Release the reader even when reading or transforming a tree throws.
            tr.Close();
        }
        System.Console.Error.Printf("Wrote %d trees%n", nWritten);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Entry point of the MWE preprocessor. Reads all trees from the file named by the
/// single command-line argument (UTF-8), feeds each tree to <c>CountMWEStatistics</c>
/// to fill five counters, hands four of those counters to <c>PrintCounter</c> together
/// with the names "label_term.csv", "term_label.csv", "label_pos.csv" and
/// "pos_label.csv", then calls <c>ResolveDummyTags</c> on the same file and prints
/// summary counts to standard output.
/// </summary>
/// <param name="args">Exactly one element: the path of the tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
        System.Environment.Exit(-1);
    }
    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> labelTerm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> termLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> labelPreterm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> pretermLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                CountMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            }
        }
        finally
        {
            // Closes the underlying reader; done in a finally block so the file is
            // released even if reading or counting throws part-way through.
            tr.Close();
        }
        System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
        PrintCounter(labelTerm, "label_term.csv");
        System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
        PrintCounter(termLabel, "term_label.csv");
        System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
        PrintCounter(labelPreterm, "label_pos.csv");
        System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
        PrintCounter(pretermLabel, "pos_label.csv");
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, pretermLabel, unigramTagger);
        System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
        System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
        System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Walks a tree file and a Morfette analysis file in lock-step. For every tree, each
/// leaf value is replaced by the concatenation
/// <c>word + MorphoMark + lemma + LemmaMark + tag</c>, where the lemma comes from
/// <c>GetLemma</c> and the tag from the matching Morfette token, and the rewritten
/// tree is printed to standard output. A warning is logged when one input runs out
/// before the other.
/// </summary>
/// <param name="args">
/// Exactly two elements: the tree file path and the Morfette analysis file path.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    string morfetteFile = args[1];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        try
        {
            IEnumerator<IList<CoreLabel>> morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
            while (true)
            {
                // Advance both inputs explicitly. The previous loop condition consumed
                // a tree before discovering that the analyses were exhausted, so a tree
                // file exactly one tree longer than the analysis file went unreported.
                Tree tree = tr.ReadTree();
                bool hasAnalysis = morfetteItr.MoveNext();
                if (tree == null || !hasAnalysis)
                {
                    if (tree != null || hasAnalysis)
                    {
                        log.Info("WARNING: Uneven input files!");
                    }
                    break;
                }

                IList<CoreLabel> analysis = morfetteItr.Current;
                IList<ILabel> leaves = tree.Yield();
                System.Diagnostics.Debug.Assert(analysis.Count == leaves.Count);
                int yieldLen = leaves.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    CoreLabel tokenAnalysis = analysis[i];
                    ILabel token = leaves[i];
                    string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma());
                    // Plain concatenation: .NET string.Format does not understand the
                    // Java-style "%s" placeholders and would emit them literally.
                    string newLeaf = token.Value() + MorphoFeatureSpecification.MorphoMark + lemma + MorphoFeatureSpecification.LemmaMark + tokenAnalysis.Tag();
                    ((CoreLabel)token).SetValue(newLeaf);
                }
                System.Console.Out.WriteLine(tree.ToString());
            }
        }
        finally
        {
            // Release the tree reader on both the success and the failure path.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Prints the tokens of every tree in a tree file as "word lemma morph" lines, with
/// an empty line after each tree. The lemma and morphological string come from
/// <c>MorphoFeatureSpecification.SplitMorphString</c> applied to the leaf's value and
/// original text; when the morphological string is null, empty or "XXX", the value of
/// the corresponding preterminal is printed in its place.
/// </summary>
/// <param name="args">Exactly one element: the path of the tree file (read as UTF-8).</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        try
        {
            Tree tree1;
            while ((tree1 = tr.ReadTree()) != null)
            {
                IList<ILabel> pretermYield = tree1.PreTerminalYield();
                IList<ILabel> leaves = tree1.Yield();
                int yieldLen = leaves.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    CoreLabel rawToken = (CoreLabel)leaves[i];
                    string word = rawToken.Value();
                    string morphStr = rawToken.OriginalText();
                    Pair<string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(word, morphStr);
                    string lemma = lemmaMorph.First();
                    string morph = lemmaMorph.Second();
                    if (string.IsNullOrEmpty(morph) || morph.Equals("XXX"))
                    {
                        // No usable morphological analysis: fall back to the
                        // preterminal's value at the same position.
                        morph = ((CoreLabel)pretermYield[i]).Value();
                    }
                    System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
                }
                System.Console.Out.WriteLine();
            }
        }
        finally
        {
            // Release the reader even when a tree cannot be read or converted.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Prints a frequency table of multi-word expressions found in a tree file. Every
/// node whose label matches the Tregex pattern <c>/^MW/</c> is counted under its
/// label, keyed by its space-joined word yield. For each label the table shows the
/// total count, the number of yields seen exactly once, and two percentages; a TOTAL
/// row, the token count and the number of distinct POS sequences follow.
/// </summary>
/// <param name="args">Exactly one element: the path of the tree file (read as UTF-8).</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
        System.Environment.Exit(-1);
    }
    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> mweLabelToString = new TwoDimensionalCounter<string, string>();
    ICollection<string> uniquePOSSequences = Generics.NewHashSet();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            TregexPattern pMWE = TregexPattern.Compile("/^MW/");
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                //Count MWE statistics
                TregexMatcher m = pMWE.Matcher(t);
                while (m.FindNextMatchingNode())
                {
                    Tree match = m.GetMatch();
                    string label = match.Value();
                    IList<CoreLabel> taggedYield = match.TaggedLabeledYield();
                    StringBuilder termYield = new StringBuilder();
                    StringBuilder posYield = new StringBuilder();
                    foreach (CoreLabel cl in taggedYield)
                    {
                        termYield.Append(cl.Word()).Append(" ");
                        posYield.Append(cl.Tag()).Append(" ");
                    }
                    mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                    uniquePOSSequences.Add(posYield.ToString().Trim());
                }
            }
        }
        finally
        {
            // Closes the underlying reader, also when reading or matching throws.
            tr.Close();
        }

        System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.TotalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        foreach (string mweLabel in mweLabelToString.FirstKeySet())
        {
            int nSingletons = 0;
            double totalCount = mweLabelToString.TotalCount(mweLabel);
            ICounter<string> mc = mweLabelToString.GetCounter(mweLabel);
            foreach (string term in mc.KeySet())
            {
                if (mc.GetCount(term) == 1.0)
                {
                    nSingletons++;
                }
                // Split on runs of whitespace with a real regular expression.
                // string.Split would treat "\\s+" as a literal separator, so every
                // term would be counted as a single token.
                int wordsInTerm = System.Text.RegularExpressions.Regex.Split(term, "\\s+").Length;
                nTokens += wordsInTerm * (int)mc.GetCount(term);
            }
            nAllSingletons += nSingletons;
            System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.Console.Out.WriteLine("#tokens = " + nTokens);
        System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}