/// <param name="args">
/// Command-line arguments, parsed with <c>StringUtils.ArgsToProperties</c>. The tree file path is
/// read from the value stored under the empty-string key (presumably the positional argument —
/// confirm against <c>ArgsToProperties</c>). The boolean options "ner" (default false) and
/// "normalize" (default true) are honoured; "help" prints the usage text and returns.
/// </param>
public static void Main(string[] args)
{
    Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);
    // Show usage when nothing is stored under the empty key or when help was requested.
    if (!options.Contains(string.Empty) || options.Contains("help"))
    {
        log.Info(Usage());
        return;
    }

    bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
    bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
    File treeFile = new File(options.GetProperty(string.Empty));
    TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();

    try
    {
        // First pass: feed every tree in the file to UpdateTagger.
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            for (Tree t; (t = tr.ReadTree()) != null;)
            {
                UpdateTagger(unigramTagger, t);
            }
        }
        finally
        {
            // Closes the underlying reader; done in finally so a failed read does not leak it,
            // and before ResolveDummyTags reopens the same file.
            tr.Close();
        }

        // Second pass: re-read the file and write the fixed trees.
        System.Console.Out.WriteLine("Resolving DUMMY tags");
        ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);

        System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);
        // .NET composite format items replace the Java-style %d / %.2f%% specifiers, which
        // string.Format would otherwise print verbatim without substituting the values.
        System.Console.Out.WriteLine(string.Format("#Missing POS: {0} (fixed: {1}, {2:F2}%)", nMissingPOS, nFixedPOS, (double)nFixedPOS / nMissingPOS * 100));
        System.Console.Out.WriteLine(string.Format("#Missing Phrasal: {0} (fixed: {1}, {2:F2}%)", nMissingPhrasal, nFixedPhrasal, (double)nFixedPhrasal / nMissingPhrasal * 100));
        System.Console.Out.WriteLine("Done!");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Reads every tree from <paramref name="treeFile"/>, applies <c>TraverseAndFix</c>, expands
/// multiword phrases, optionally normalizes the result, and writes one tree per line to a
/// sibling file whose name is the input path with ".fixed" appended.
/// </summary>
/// <param name="treeFile">Input tree file, read as UTF-8.</param>
/// <param name="unigramTagger">Counter passed through to <c>TraverseAndFix</c>.</param>
/// <param name="retainNER">Flag passed through to <c>TraverseAndFix</c>.</param>
/// <param name="tn">
/// Normalizer handed to the phrase expander and, when non-null, applied to the whole tree.
/// May be null, in which case whole-tree normalization is skipped.
/// </param>
private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter<string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
{
    ITreeFactory tf = new LabeledScoredTreeFactory();
    MultiWordTreeExpander expander = new MultiWordTreeExpander();
    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        int nTrees = 0;
        try
        {
            PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
            try
            {
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, null, unigramTagger, retainNER);

                    // Now "decompress" further the expanded trees formed by
                    // multiword token splitting
                    t = expander.ExpandPhrases(t, tn, tf);
                    if (tn != null)
                    {
                        t = tn.NormalizeWholeTree(t, tf);
                    }
                    pw.Println(t.ToString());
                }
            }
            finally
            {
                // Close the writer even when a tree fails to process, so output written
                // so far is not left dangling on an open handle.
                pw.Close();
            }
        }
        finally
        {
            // Same close order as before (writer first, then reader), now also on failure.
            tr.Close();
        }
        System.Console.Out.WriteLine("Processed " + nTrees + " trees");
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}
/// <summary>
/// Reads trees from the file named by the first argument and writes one "word TAB label" line
/// per preterminal to standard output, with an empty line after each tree. The label is one of
/// PERS, LUG, ORG, OTROS or O, chosen from a type character taken either from the parent
/// node's value (when it starts with "grup.nom.") or from the preterminal's own value (when it
/// matches "np0000" followed by one character).
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the tree file path.</param>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        // Composite formatting replaces the Java-style Printf("%s ... %n") call.
        System.Console.Error.WriteLine("Usage: java {0} tree_file", typeof(TreeToTSV).FullName);
        System.Environment.Exit(-1);
    }

    string treeFile = args[0];
    // Index of the type character that follows the 9-character "grup.nom." prefix.
    const int neTypeIndex = 9;
    // Index of the last character of a 7-character "np0000x" tag.
    const int posTypeIndex = 6;

    try
    {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
        ITreeReaderFactory trf = new SpanishTreeReaderFactory();
        ITreeReader tr = trf.NewTreeReader(br);
        try
        {
            StringBuilder sb = new StringBuilder();
            string nl = Runtime.GetProperty("line.separator");
            Pattern nePattern = Pattern.Compile("^grup\\.nom\\.");
            Pattern npPattern = Pattern.Compile("^np0000.$");

            for (Tree tree; (tree = tr.ReadTree()) != null;)
            {
                foreach (Tree t in tree)
                {
                    if (!t.IsPreTerminal())
                    {
                        continue;
                    }

                    char type = 'O';
                    Tree grandma = t.Ancestor(1, tree);
                    // NOTE(review): Ancestor presumably yields null when t has no such ancestor
                    // inside tree (e.g. t is the root) — guarded defensively; confirm.
                    string grandmaValue = grandma == null ? null : ((CoreLabel)grandma.Label()).Value();

                    // grup.nom.x — only usable when a character actually follows the prefix;
                    // a bare "grup.nom." would otherwise index past the end of the string.
                    if (grandmaValue != null && grandmaValue.Length > neTypeIndex && nePattern.Matcher(grandmaValue).Find())
                    {
                        type = grandmaValue[neTypeIndex];
                    }
                    else
                    {
                        // else check the pos for np0000x or not
                        string pos = ((CoreLabel)t.Label()).Value();
                        if (npPattern.Matcher(pos).Find())
                        {
                            type = pos[posTypeIndex];
                        }
                    }

                    Tree wordNode = t.FirstChild();
                    string word = ((CoreLabel)wordNode.Label()).Value();
                    sb.Append(word).Append("\t");
                    switch (type)
                    {
                        case 'p':
                        {
                            sb.Append("PERS");
                            break;
                        }

                        case 'l':
                        {
                            sb.Append("LUG");
                            break;
                        }

                        case 'o':
                        {
                            sb.Append("ORG");
                            break;
                        }

                        case '0':
                        {
                            sb.Append("OTROS");
                            break;
                        }

                        default:
                        {
                            sb.Append("O");
                            break;
                        }
                    }
                    sb.Append(nl);
                }
                // Empty line after each tree.
                sb.Append(nl);
            }
            System.Console.Out.Write(sb.ToString());
        }
        finally
        {
            // Release the reader even when reading or label extraction fails.
            tr.Close();
        }
    }
    catch (UnsupportedEncodingException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (FileNotFoundException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
    catch (IOException e)
    {
        Sharpen.Runtime.PrintStackTrace(e);
    }
}