/// <summary>Processes a single file containing AnCora XML trees.</summary>
/// <remarks>
/// Reads every tree from the file, repeatedly splits it at the point reported by
/// <c>FindSplitPoint</c>, and passes each resulting piece to <c>UpdateTagger</c>.
/// Returns MWE statistics for the trees in the file and the actual parsed trees.
/// </remarks>
/// <param name="file">Tree file to read.</param>
/// <param name="trf">Factory used to create the tree reader for the file.</param>
/// <param name="encoding">Character encoding used to decode the file.</param>
/// <returns>
/// The tagger counter paired with the list of (possibly split) trees, or
/// <c>null</c> if an <c>IOException</c> is raised while reading.
/// </returns>
private static Pair<TwoDimensionalCounter<string, string>, IList<Tree>> ProcessTreeFile(File file, SpanishXMLTreeReaderFactory trf, string encoding)
{
    TwoDimensionalCounter<string, string> tagger = new TwoDimensionalCounter<string, string>();
    try
    {
        Reader @in = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
        ITreeReader tr = trf.NewTreeReader(file.GetPath(), @in);
        try
        {
            IList<Tree> trees = new List<Tree>();
            Tree t;
            while ((t = tr.ReadTree()) != null)
            {
                Tree splitPoint;
                do
                {
                    // We may need to split the current tree into multiple parts.
                    // (If not, a call to `split` with a `null` split-point is a
                    // no-op.)
                    splitPoint = FindSplitPoint(t);
                    Pair<Tree, Tree> split = Split(t, splitPoint);
                    Tree toAdd = split.First();
                    t = split.Second();
                    trees.Add(toAdd);
                    UpdateTagger(tagger, toAdd);
                }
                while (splitPoint != null);
            }
            return new Pair<TwoDimensionalCounter<string, string>, IList<Tree>>(tagger, trees);
        }
        finally
        {
            // Release the reader even when reading or splitting fails part-way.
            tr.Close();
        }
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
        return null;
    }
}
/// <summary>
/// Builds a Negra Penn tree reader over <c>args[0]</c> and pretty-prints every
/// tree it yields.
/// </summary>
/// <param name="args">File to run on</param>
public static void Main(string[] args)
{
    bool missingInput = args.Length < 1;
    if (missingInput)
    {
        System.Console.Out.Printf("Usage: java %s tree_file%n", typeof(Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory).FullName);
        return;
    }

    ITreebankLanguagePack languagePack = new NegraPennLanguagePack();
    ITreeReaderFactory readerFactory = new Edu.Stanford.Nlp.Trees.International.Negra.NegraPennTreeReaderFactory(2, false, false, languagePack);
    try
    {
        ITreeReader reader = readerFactory.NewTreeReader(IOUtils.ReaderFromString(args[0], languagePack.GetEncoding()));

        // Pull trees one at a time until the reader is exhausted.
        Tree current = reader.ReadTree();
        while (current != null)
        {
            current.PennPrint();
            current = reader.ReadTree();
        }

        reader.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Re-reads the tree file, runs <c>TraverseAndFix</c> on every tree and writes
/// the result, one tree per line, to a sibling file with the suffix
/// <c>.fixed</c>.
/// </summary>
/// <param name="treeFile">Tree file to read (decoded as UTF-8).</param>
/// <param name="pretermLabel">Counter passed through to <c>TraverseAndFix</c>.</param>
/// <param name="unigramTagger">Counter passed through to <c>TraverseAndFix</c>.</param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string> unigramTagger)
{
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        int nTrees = 0;
        try
        {
            PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
            try
            {
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, pretermLabel, unigramTagger);
                    pw.Println(t.ToString());
                }
            }
            finally
            {
                // Close the output even if a tree fails to read or fix.
                pw.Close();
            }
        }
        finally
        {
            tr.Close();
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Splits a Hebrew tree file into dev, train and test files according to the
/// position of each tree in the input.
/// </summary>
/// <remarks>
/// Trees are written to the input path with the suffixes <c>.clean.dev</c>,
/// <c>.clean.train</c> and <c>.clean.test</c>. The number of trees read is
/// reported on standard error.
/// </remarks>
/// <param name="args">Exactly one element: the path of the tree file to split.</param>
public static void Main(string[] args)
{
    // Zero-based tree index at which the dev and train partitions end (exclusive).
    const int DevEnd = 483;
    const int TrainEnd = 5724;

    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(SplitMaker).FullName);
        System.Environment.Exit(-1);
    }
    ITreebankLanguagePack tlp = new HebrewTreebankLanguagePack();
    string inputFile = args[0];
    File treeFile = new File(inputFile);
    try
    {
        ITreeReader tr = null;
        PrintWriter pwDev = null;
        PrintWriter pwTrain = null;
        PrintWriter pwTest = null;
        int numTrees = 0;
        try
        {
            ITreeReaderFactory trf = new HebrewTreeReaderFactory();
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), tlp.GetEncoding()));
            tr = trf.NewTreeReader(br);
            pwDev = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.dev"), false, tlp.GetEncoding()));
            pwTrain = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.train"), false, tlp.GetEncoding()));
            pwTest = new PrintWriter(new TextWriter(new FileOutputStream(inputFile + ".clean.test"), false, tlp.GetEncoding()));
            for (Tree t; (t = tr.ReadTree()) != null; numTrees++)
            {
                string line = t.ToString();
                if (numTrees < DevEnd)
                {
                    pwDev.Println(line);
                }
                else if (numTrees < TrainEnd)
                {
                    pwTrain.Println(line);
                }
                else
                {
                    pwTest.Println(line);
                }
            }
        }
        finally
        {
            // Writers first: closing the reader is the call that may throw.
            if (pwDev != null)
            {
                pwDev.Close();
            }
            if (pwTrain != null)
            {
                pwTrain.Close();
            }
            if (pwTest != null)
            {
                pwTest.Close();
            }
            if (tr != null)
            {
                tr.Close();
            }
        }
        System.Console.Error.Printf("Processed %d trees.%n", numTrees);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>For debugging.</summary>
/// <remarks>
/// Reads each given French XML tree file and prints every tree to standard
/// output, prefixed by the file name (without its extension) and the tree's
/// sentence id. Per-file and total tree counts go to standard error.
/// </remarks>
/// <param name="args">One or more tree file paths.</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file(s)%n%n", typeof(Edu.Stanford.Nlp.Trees.International.French.FrenchXMLTreeReader).FullName);
        System.Environment.Exit(-1);
    }
    IList<File> fileList = new List<File>();
    foreach (string arg in args)
    {
        fileList.Add(new File(arg));
    }
    ITreeReaderFactory trf = new FrenchXMLTreeReaderFactory(false);
    int totalTrees = 0;
    ICollection<string> morphAnalyses = Generics.NewHashSet();
    try
    {
        foreach (File file in fileList)
        {
            ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")));
            int numTrees = 0;
            try
            {
                // Strip the extension, if any. A name without '.' is used as-is;
                // LastIndexOf returns -1 there, which is not a valid end index.
                string fileName = file.GetName();
                int dot = fileName.LastIndexOf('.');
                string canonicalFileName = dot < 0 ? fileName : Sharpen.Runtime.Substring(fileName, 0, dot);
                Tree t;
                for (; (t = tr.ReadTree()) != null; numTrees++)
                {
                    string ftbID = ((CoreLabel)t.Label()).Get(typeof(CoreAnnotations.SentenceIDAnnotation));
                    System.Console.Out.Printf("%s-%s\t%s%n", canonicalFileName, ftbID, t.ToString());
                    IList<ILabel> leaves = t.Yield();
                    foreach (ILabel label in leaves)
                    {
                        if (label is CoreLabel)
                        {
                            morphAnalyses.Add(((CoreLabel)label).OriginalText());
                        }
                    }
                }
            }
            finally
            {
                tr.Close();
            }
            System.Console.Error.Printf("%s: %d trees%n", file.GetName(), numTrees);
            totalTrees += numTrees;
        }
        //wsg2011: Print out the observed morphological analyses
        //      for(String analysis : morphAnalyses)
        //        log.info(analysis);
        System.Console.Error.Printf("%nRead %d trees%n", totalTrees);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
//Delete sentence-initial punctuation
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to either . if it is not [.!?]
//Delete medial, sentence-final punctuation
//Now move the sentence-final mark under SENT
//For those trees that lack a sentence-final punc, add one.
//Finally, delete these punctuation marks, which I can't seem to kill otherwise...
//A bad MWADV tree in the training set
// Not sure why this got a label of X.  Similar trees suggest it
// should be A instead
// This also seems to be mislabeled

/// <summary>
/// Runs the FTB corrector over every tree in a file and prints the corrected
/// trees to standard output.
/// </summary>
/// <remarks>
/// Trees matching either "bad tree" pattern are logged and skipped. The count
/// reported on standard error covers only the trees that were actually printed.
/// </remarks>
/// <param name="args">Exactly one element: the tree file to correct.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }
    ITreeTransformer tt = new Edu.Stanford.Nlp.International.French.Pipeline.FTBCorrector();
    File f = new File(args[0]);
    try
    {
        //These bad trees in the Candito training set should be thrown out:
        //  (ROOT (SENT (" ") (. .)))
        //  (ROOT (SENT (. .)))
        TregexPattern pBadTree = TregexPattern.Compile("@SENT <: @PUNC");
        TregexPattern pBadTree2 = TregexPattern.Compile("@SENT <1 @PUNC <2 @PUNC !<3 __");
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        int nWritten = 0;
        try
        {
            for (Tree t; (t = tr.ReadTree()) != null;)
            {
                TregexMatcher m = pBadTree.Matcher(t);
                TregexMatcher m2 = pBadTree2.Matcher(t);
                if (m.Find() || m2.Find())
                {
                    log.Info("Discarding tree: " + t.ToString());
                }
                else
                {
                    Tree fixedT = tt.TransformTree(t);
                    System.Console.Out.WriteLine(fixedT.ToString());
                    // Count only emitted trees so the summary matches the output.
                    nWritten++;
                }
            }
        }
        finally
        {
            tr.Close();
        }
        System.Console.Error.Printf("Wrote %d trees%n", nWritten);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Collects MWE statistics from a French tree file, dumps four counters to CSV
/// files via <c>PrintCounter</c>, then calls <c>ResolveDummyTags</c> on the same
/// file and prints summary counts.
/// </summary>
/// <param name="args">Exactly one element: the tree file to process.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(MWEPreprocessor).FullName);
        System.Environment.Exit(-1);
    }

    File inputTrees = new File(args[0]);

    // One counter per direction of each mapping that is later written out.
    TwoDimensionalCounter<string, string> labelTerm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> termLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> labelPreterm = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> pretermLabel = new TwoDimensionalCounter<string, string>();
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();

    try
    {
        BufferedReader source = new BufferedReader(new InputStreamReader(new FileInputStream(inputTrees), "UTF-8"));
        ITreeReaderFactory readerFactory = new FrenchTreeReaderFactory();
        ITreeReader reader = readerFactory.NewTreeReader(source);

        Tree current = reader.ReadTree();
        while (current != null)
        {
            CountMWEStatistics(current, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            current = reader.ReadTree();
        }

        // Closing the tree reader also closes the reader it wraps.
        reader.Close();

        System.Console.Out.WriteLine("Generating {MWE Type -> Terminal}");
        PrintCounter(labelTerm, "label_term.csv");

        System.Console.Out.WriteLine("Generating {Terminal -> MWE Type}");
        PrintCounter(termLabel, "term_label.csv");

        System.Console.Out.WriteLine("Generating {MWE Type -> POS sequence}");
        PrintCounter(labelPreterm, "label_pos.csv");

        System.Console.Out.WriteLine("Generating {POS sequence -> MWE Type}");
        PrintCounter(pretermLabel, "pos_label.csv");

        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(inputTrees, pretermLabel, unigramTagger);

        System.Console.Out.WriteLine("#Unknown Word Types: " + MWEPreprocessor.ManualUWModel.nUnknownWordTypes);
        System.Console.Out.WriteLine("#Missing POS: " + nMissingPOS);
        System.Console.Out.WriteLine("#Missing Phrasal: " + nMissingPhrasal);
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Rewrites the leaves of each tree with the matching Morfette analysis and
/// prints the modified trees to standard output.
/// </summary>
/// <remarks>
/// Trees and analyses are consumed in lock-step. Each leaf value becomes
/// word + <c>MorphoMark</c> + lemma + <c>LemmaMark</c> + tag. A warning is
/// logged when one input runs out before the other.
/// </remarks>
/// <param name="args">Two elements: the tree file and the Morfette TnT file.</param>
public static void Main(string[] args)
{
    if (args.Length != 2)
    {
        System.Console.Error.Printf("Usage: java %s tree_file morfette_tnt_file%n", typeof(MungeTreesWithMorfetteAnalyses).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    string morfetteFile = args[1];
    ITreeReaderFactory trf = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader tr = trf.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8")));
        try
        {
            IEnumerator<IList<CoreLabel>> morfetteItr = new MungeTreesWithMorfetteAnalyses.MorfetteFileIterator(morfetteFile);
            bool uneven = false;
            while (true)
            {
                // Advance both inputs exactly once per round so that neither
                // side silently loses an item when the other one ends first.
                Tree tree = tr.ReadTree();
                bool hasAnalysis = morfetteItr.MoveNext();
                if (tree == null || !hasAnalysis)
                {
                    uneven = tree != null || hasAnalysis;
                    break;
                }
                IList<CoreLabel> analysis = morfetteItr.Current;
                IList<ILabel> yield = tree.Yield();
                System.Diagnostics.Debug.Assert(analysis.Count == yield.Count);
                int yieldLen = yield.Count;
                for (int i = 0; i < yieldLen; ++i)
                {
                    CoreLabel tokenAnalysis = analysis[i];
                    ILabel token = yield[i];
                    string lemma = GetLemma(token.Value(), tokenAnalysis.Lemma());
                    // .NET composite formatting uses {n}; Java-style %s would be emitted literally.
                    string newLeaf = string.Format("{0}{1}{2}{3}{4}", token.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, tokenAnalysis.Tag());
                    ((CoreLabel)token).SetValue(newLeaf);
                }
                System.Console.Out.WriteLine(tree.ToString());
            }
            if (uneven)
            {
                log.Info("WARNING: Uneven input files!");
            }
        }
        finally
        {
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Builds a unigram tagger from a Spanish tree file, then calls
/// <c>ResolveDummyTags</c> on the same file and prints summary statistics.
/// </summary>
/// <remarks>
/// Options are parsed with <c>StringUtils.ArgsToProperties</c>. The tree file is
/// the value stored under the empty key; <c>ner</c> (default false) and
/// <c>normalize</c> (default true) are forwarded to <c>ResolveDummyTags</c>.
/// Usage is logged when no file is given or <c>help</c> is present.
/// </remarks>
/// <param name="args">Command-line arguments.</param>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);
    if (!options.Contains(string.Empty) || options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }
    bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
    bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
    File treeFile = new File(options.GetProperty(string.Empty));
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            for (Tree t; (t = tr.ReadTree()) != null;)
            {
                UpdateTagger(unigramTagger, t);
            }
        }
        finally
        {
            //Closes the underlying reader
            tr.Close();
        }
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
        System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
        // .NET composite formatting uses {n}; Java-style %d / %.2f would be printed verbatim.
        System.Console.Out.WriteLine(string.Format("#Missing POS: {0} (fixed: {1}, {2:F2}%)", nMissingPOS, nFixedPOS, (double)nFixedPOS / nMissingPOS * 100));
        System.Console.Out.WriteLine(string.Format("#Missing Phrasal: {0} (fixed: {1}, {2:F2}%)", nMissingPhrasal, nFixedPhrasal, (double)nFixedPhrasal / nMissingPhrasal * 100));
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Prints one "word lemma morph" line per leaf of every tree in a French tree
/// file, with a blank line after each tree.
/// </summary>
/// <remarks>
/// The lemma and morphology come from splitting the leaf's original text with
/// <c>MorphoFeatureSpecification.SplitMorphString</c>. When the morphology is
/// null, empty or "XXX", the value of the matching pre-terminal is used instead.
/// </remarks>
/// <param name="args">Exactly one element: the tree file to convert.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToMorfette).FullName);
        System.Environment.Exit(-1);
    }

    string inputPath = args[0];
    ITreeReaderFactory readerFactory = new FrenchTreeReaderFactory();
    try
    {
        ITreeReader reader = readerFactory.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(inputPath), "UTF-8")));

        Tree sentence;
        while ((sentence = reader.ReadTree()) != null)
        {
            IList<ILabel> tags = sentence.PreTerminalYield();
            IList<ILabel> leaves = sentence.Yield();
            int leafCount = leaves.Count;

            for (int idx = 0; idx < leafCount; ++idx)
            {
                CoreLabel leaf = (CoreLabel)leaves[idx];
                string word = leaf.Value();
                Pair<string, string> parts = MorphoFeatureSpecification.SplitMorphString(word, leaf.OriginalText());
                string lemma = parts.First();
                string morph = parts.Second();

                // No usable analysis: fall back to the pre-terminal's value.
                bool unusable = string.IsNullOrEmpty(morph) || morph.Equals("XXX");
                if (unusable)
                {
                    morph = ((CoreLabel)tags[idx]).Value();
                }

                System.Console.Out.Printf("%s %s %s%n", word, lemma, morph);
            }

            System.Console.Out.WriteLine();
        }

        reader.Close();
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Re-reads the tree file, fixes and expands every tree, optionally normalizes
/// it, and writes the result, one tree per line, to a sibling file with the
/// suffix <c>.fixed</c>.
/// </summary>
/// <param name="treeFile">Tree file to read (decoded as UTF-8).</param>
/// <param name="unigramTagger">Counter passed through to <c>TraverseAndFix</c>.</param>
/// <param name="retainNER">Flag passed through to <c>TraverseAndFix</c>.</param>
/// <param name="tn">
/// Normalizer handed to the phrase expander and, when not <c>null</c>, applied
/// to each whole tree afterwards.
/// </param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
{
    ITreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        int nTrees = 0;
        try
        {
            PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
            try
            {
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, null, unigramTagger, retainNER);
                    // Now "decompress" further the expanded trees formed by
                    // multiword token splitting
                    t = expander.ExpandPhrases(t, tn, tf);
                    if (tn != null)
                    {
                        t = tn.NormalizeWholeTree(t, tf);
                    }
                    pw.Println(t.ToString());
                }
            }
            finally
            {
                // Close the output even if a tree fails to read or fix.
                pw.Close();
            }
        }
        finally
        {
            tr.Close();
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>For debugging.</summary>
/// <remarks>
/// Applies a <c>MWETreeVisitorExternal</c> to every tree of an ATB tree file and
/// prints each visited tree to standard output. A tree whose value is "ROOT" is
/// replaced by its first child before visiting. The tree count goes to
/// standard error.
/// </remarks>
/// <param name="args">Exactly one element: the ATB tree file.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s atb_tree_file > atb_tree_file.out%n", typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal).FullName);
        System.Environment.Exit(-1);
    }

    ITreeReaderFactory readerFactory = new ArabicTreeReaderFactory();
    try
    {
        ITreeReader reader = readerFactory.NewTreeReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")));
        ITreeVisitor mweVisitor = new Edu.Stanford.Nlp.International.Arabic.Pipeline.MWETreeVisitorExternal();

        int processed = 0;
        Tree read;
        while ((read = reader.ReadTree()) != null)
        {
            // Skip over the ROOT tag
            Tree target = read.Value().Equals("ROOT") ? read.FirstChild() : read;
            mweVisitor.VisitTree(target);
            System.Console.Out.WriteLine(target.ToString());
            ++processed;
        }

        reader.Close();
        System.Console.Error.Printf("Processed %d trees.%n", processed);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
//Delete sentence-initial punctuation
//Delete sentence-initial punctuation (again)
//Delete sentence final punctuation that is preceded by punctuation (first time)
//Delete sentence final punctuation that is preceded by punctuation (second time)
//Convert remaining sentence-final punctuation to . if it is not [.!?]
//Delete medial, sentence-final punctuation
//    ("@PUNC=punc <: /[!\\.\\?]+/ $. __\n"
//    + "prune punc\n"
//    + "\n") +
//Now move the sentence-final mark under the top-level node
//For those trees that lack a sentence-final punc, add one.
//    ("/^[^\\.!\\?]$/ >>- (__ > @ROOT <- __=loc) <: __\n"
//    + "insert (PUNC .) $- loc\n"
//    + "\n");

/// <summary>
/// Runs the ATB corrector over every tree in a file and prints each transformed
/// tree to standard output; the tree count goes to standard error.
/// </summary>
/// <param name="args">Exactly one element: the tree file to correct.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        log.Info("Usage: java " + typeof(Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector).FullName + " filename\n");
        System.Environment.Exit(-1);
    }

    ITreeTransformer corrector = new Edu.Stanford.Nlp.International.Arabic.Pipeline.ATBCorrector();
    File input = new File(args[0]);
    try
    {
        BufferedReader source = new BufferedReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
        ITreeReaderFactory readerFactory = new ArabicTreeReaderFactory.ArabicRawTreeReaderFactory();
        ITreeReader reader = readerFactory.NewTreeReader(source);

        int written = 0;
        Tree raw = reader.ReadTree();
        while (raw != null)
        {
            Tree corrected = corrector.TransformTree(raw);
            System.Console.Out.WriteLine(corrected.ToString());
            written++;
            raw = reader.ReadTree();
        }

        reader.Close();
        System.Console.Error.Printf("Wrote %d trees%n", written);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Reads every tree from a Hebrew tree file and prints it to standard output;
/// the tree count goes to standard error.
/// </summary>
/// <param name="args">Exactly one element: the tree file to read.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file > trees%n", typeof(HebrewTreeReaderFactory).FullName);
        System.Environment.Exit(-1);
    }

    ITreebankLanguagePack languagePack = new HebrewTreebankLanguagePack();
    File input = new File(args[0]);
    try
    {
        ITreeReaderFactory readerFactory = new HebrewTreeReaderFactory();
        BufferedReader source = new BufferedReader(new InputStreamReader(new FileInputStream(input), languagePack.GetEncoding()));
        ITreeReader reader = readerFactory.NewTreeReader(source);

        int count = 0;
        Tree current;
        while ((current = reader.ReadTree()) != null)
        {
            System.Console.Out.WriteLine(current.ToString());
            count++;
        }

        reader.Close();
        System.Console.Error.Printf("Processed %d trees.%n", count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Closes this <c>TreeReader</c> by delegating to the wrapped reader's
/// <c>Close</c> method.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void Close()
{
    // Pure delegation: this reader holds nothing else to release here.
    tr.Close();
}
/// <summary>
/// Converts a Spanish tree file to tab-separated "word, entity label" lines on
/// standard output, with a blank line after each tree.
/// </summary>
/// <remarks>
/// For each pre-terminal, the entity type character is taken from the label of
/// <c>t.Ancestor(1, tree)</c> when it starts with <c>grup.nom.</c> (the
/// character following that prefix); otherwise from the last character of a
/// POS tag of the form <c>np0000?</c>. The characters p, l, o and 0 map to
/// PERS, LUG, ORG and OTROS; anything else maps to O.
/// </remarks>
/// <param name="args">At least one element; the first is the tree file to read.</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToTSV).FullName);
        System.Environment.Exit(-1);
    }
    string treeFile = args[0];
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            StringBuilder sb = new StringBuilder();
            string nl = Runtime.GetProperty("line.separator");
            Pattern nePattern = Pattern.Compile("^grup\\.nom\\.");
            Pattern npPattern = Pattern.Compile("^np0000.$");
            for (Tree tree; (tree = tr.ReadTree()) != null;)
            {
                foreach (Tree t in tree)
                {
                    if (!t.IsPreTerminal())
                    {
                        continue;
                    }
                    char type = 'O';
                    Tree grandma = t.Ancestor(1, tree);
                    // Guard against a missing ancestor or label value.
                    string grandmaValue = grandma == null ? null : ((CoreLabel)grandma.Label()).Value();
                    // grup.nom.x -- the prefix is 9 characters long, so index 9
                    // exists only when the label is longer than the bare prefix.
                    if (grandmaValue != null && grandmaValue.Length > 9 && nePattern.Matcher(grandmaValue).Find())
                    {
                        type = grandmaValue[9];
                    }
                    else
                    {
                        // else check the pos for np0000x or not
                        string pos = ((CoreLabel)t.Label()).Value();
                        if (npPattern.Matcher(pos).Find())
                        {
                            type = pos[6];
                        }
                    }
                    Tree wordNode = t.FirstChild();
                    string word = ((CoreLabel)wordNode.Label()).Value();
                    sb.Append(word).Append("\t");
                    switch (type)
                    {
                        case 'p':
                        {
                            sb.Append("PERS");
                            break;
                        }

                        case 'l':
                        {
                            sb.Append("LUG");
                            break;
                        }

                        case 'o':
                        {
                            sb.Append("ORG");
                            break;
                        }

                        case '0':
                        {
                            sb.Append("OTROS");
                            break;
                        }

                        default:
                        {
                            sb.Append("O");
                            break;
                        }
                    }
                    sb.Append(nl);
                }
                sb.Append(nl);
            }
            System.Console.Out.Write(sb.ToString());
        }
        finally
        {
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Prints a frequency table of multi-word expressions (nodes whose label
/// matches <c>/^MW/</c>) found in a French tree file.
/// </summary>
/// <remarks>
/// For each MWE label the table lists the total count, the number of yields seen
/// exactly once, and two percentages, followed by a TOTAL row, the overall token
/// count and the number of distinct POS sequences.
/// </remarks>
/// <param name="args">Exactly one element: the tree file to analyse.</param>
public static void Main(string[] args)
{
    if (args.Length != 1)
    {
        System.Console.Error.Printf("Usage: java %s file%n", typeof(Edu.Stanford.Nlp.International.French.Scripts.MWEFrequencyDist).FullName);
        System.Environment.Exit(-1);
    }
    File treeFile = new File(args[0]);
    TwoDimensionalCounter<string, string> mweLabelToString = new TwoDimensionalCounter<string, string>();
    ICollection<string> uniquePOSSequences = Generics.NewHashSet();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new FrenchTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            TregexPattern pMWE = TregexPattern.Compile("/^MW/");
            for (Tree t; (t = tr.ReadTree()) != null;)
            {
                //Count MWE statistics
                TregexMatcher m = pMWE.Matcher(t);
                while (m.FindNextMatchingNode())
                {
                    Tree match = m.GetMatch();
                    string label = match.Value();
                    IList<CoreLabel> yield = match.TaggedLabeledYield();
                    StringBuilder termYield = new StringBuilder();
                    StringBuilder posYield = new StringBuilder();
                    foreach (CoreLabel cl in yield)
                    {
                        termYield.Append(cl.Word()).Append(" ");
                        posYield.Append(cl.Tag()).Append(" ");
                    }
                    mweLabelToString.IncrementCount(label, termYield.ToString().Trim());
                    uniquePOSSequences.Add(posYield.ToString().Trim());
                }
            }
        }
        finally
        {
            //Closes the underlying reader
            tr.Close();
        }
        System.Console.Out.Printf("Type\t#Type\t#Single\t%%Single\t%%Total%n");
        double nMWEs = mweLabelToString.TotalCount();
        int nAllSingletons = 0;
        int nTokens = 0;
        foreach (string mweLabel in mweLabelToString.FirstKeySet())
        {
            int nSingletons = 0;
            double totalCount = mweLabelToString.TotalCount(mweLabel);
            ICounter<string> mc = mweLabelToString.GetCounter(mweLabel);
            foreach (string term in mc.KeySet())
            {
                double termCount = mc.GetCount(term);
                if (termCount == 1.0)
                {
                    nSingletons++;
                }
                // string.Split treats its argument as literal text, so a regex
                // split is needed to count whitespace-separated tokens.
                int termTokens = System.Text.RegularExpressions.Regex.Split(term, "\\s+").Length;
                nTokens += termTokens * (int)termCount;
            }
            nAllSingletons += nSingletons;
            System.Console.Out.Printf("%s\t%d\t%d\t%.2f\t%.2f%n", mweLabel, (int)totalCount, nSingletons, 100.0 * nSingletons / totalCount, 100.0 * totalCount / nMWEs);
        }
        System.Console.Out.Printf("TOTAL:\t%d\t%d\t%.2f%n", (int)nMWEs, nAllSingletons, 100.0 * nAllSingletons / nMWEs);
        System.Console.Out.WriteLine("#tokens = " + nTokens);
        System.Console.Out.WriteLine("#unique MWE POS sequences = " + uniquePOSSequences.Count);
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (TregexParseException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}