Exemplo n.º 1
0
        /// <param name="args"/>
        /// <remarks>
        /// Parses the command-line options, prints the usage text and returns when
        /// the unnamed option (the tree file) is absent or "help" is present, feeds
        /// every tree of the file to <c>UpdateTagger</c>, then calls
        /// <c>ResolveDummyTags</c> and prints the unknown-word, missing-POS and
        /// missing-phrasal counters to standard output.
        /// </remarks>
        public static void Main(string[] args)
        {
            Properties options = StringUtils.ArgsToProperties(args, argOptionDefs);

            // The tree file is stored under the empty key.
            if (!options.Contains(string.Empty) || options.Contains("help"))
            {
                log.Info(Usage());
                return;
            }
            bool retainNER = PropertiesUtils.GetBool(options, "ner", false);
            bool normalize = PropertiesUtils.GetBool(options, "normalize", true);
            File treeFile  = new File(options.GetProperty(string.Empty));
            TwoDimensionalCounter<string, string> unigramTagger = new TwoDimensionalCounter<string, string>();

            try
            {
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReader        tr  = trf.NewTreeReader(br);
                try
                {
                    Tree t;
                    while ((t = tr.ReadTree()) != null)
                    {
                        UpdateTagger(unigramTagger, t);
                    }
                }
                finally
                {
                    // Closes the underlying reader too; done in finally so a failure
                    // while reading does not leak the file handle.
                    tr.Close();
                }

                System.Console.Out.WriteLine("Resolving DUMMY tags");
                ResolveDummyTags(treeFile, unigramTagger, retainNER, normalize ? new SpanishTreeNormalizer(true, false, false) : null);
                System.Console.Out.WriteLine("#Unknown Word Types: " + MultiWordPreprocessor.ManualUWModel.nUnknownWordTypes);

                // Report 0% rather than NaN when nothing was missing.
                double fixedPosPercent     = nMissingPOS == 0 ? 0.0 : (double)nFixedPOS / nMissingPOS * 100;
                double fixedPhrasalPercent = nMissingPhrasal == 0 ? 0.0 : (double)nFixedPhrasal / nMissingPhrasal * 100;

                // .NET composite formatting uses {index[:format]} items; Java-style
                // %d / %.2f specifiers would be printed literally.
                System.Console.Out.WriteLine(string.Format("#Missing POS: {0} (fixed: {1}, {2:F2}%)", nMissingPOS, nFixedPOS, fixedPosPercent));
                System.Console.Out.WriteLine(string.Format("#Missing Phrasal: {0} (fixed: {1}, {2:F2}%)", nMissingPhrasal, nFixedPhrasal, fixedPhrasalPercent));
                System.Console.Out.WriteLine("Done!");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Re-reads every tree from <paramref name="treeFile"/>, runs
        /// <c>TraverseAndFix</c> and multi-word phrase expansion on it, optionally
        /// normalizes it, and writes the result (one tree per line, UTF-8) to a
        /// sibling file named <c>treeFile + ".fixed"</c>.
        /// </summary>
        /// <param name="treeFile">Input tree file, read as UTF-8.</param>
        /// <param name="unigramTagger">Counter handed unchanged to <c>TraverseAndFix</c>.</param>
        /// <param name="retainNER">Flag handed unchanged to <c>TraverseAndFix</c>.</param>
        /// <param name="tn">
        /// Normalizer passed to the expander and, when non-null, applied to each
        /// whole tree; <c>null</c> skips the whole-tree normalization step.
        /// </param>
        private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
        {
            ITreeFactory          tf       = new LabeledScoredTreeFactory();
            MultiWordTreeExpander expander = new MultiWordTreeExpander();

            try
            {
                ITreeReaderFactory trf    = new SpanishTreeReaderFactory();
                BufferedReader     br     = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReader        tr     = trf.NewTreeReader(br);
                int                nTrees = 0;
                try
                {
                    PrintWriter pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
                    try
                    {
                        for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                        {
                            TraverseAndFix(t, null, unigramTagger, retainNER);
                            // Now "decompress" further the expanded trees formed by
                            // multiword token splitting
                            t = expander.ExpandPhrases(t, tn, tf);
                            if (tn != null)
                            {
                                t = tn.NormalizeWholeTree(t, tf);
                            }
                            pw.Println(t.ToString());
                        }
                    }
                    finally
                    {
                        // Close the writer even when a tree fails to process, so
                        // the output handle is not leaked.
                        pw.Close();
                    }
                }
                finally
                {
                    // Same for the reader: release the input file on every path.
                    tr.Close();
                }
                System.Console.Out.WriteLine("Processed " + nTrees + " trees");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }
        /// <summary>
        /// Reads the tree file named by the first argument and prints, for every
        /// pre-terminal node, a line of the form <c>word TAB label</c> to standard
        /// output, followed by an empty line after each tree.
        /// </summary>
        /// <remarks>
        /// The label is derived from a single type character: the character after
        /// the <c>grup.nom.</c> prefix of the parent label when present, otherwise
        /// the last character of an <c>np0000x</c> POS tag. <c>p</c>, <c>l</c>,
        /// <c>o</c> and <c>0</c> map to PERS, LUG, ORG and OTROS; anything else
        /// is printed as O.
        /// </remarks>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the tree file path.</param>
        public static void Main(string[] args)
        {
            if (args.Length < 1)
            {
                System.Console.Error.Printf("Usage: java %s tree_file%n", typeof(TreeToTSV).FullName);
                System.Environment.Exit(-1);
            }
            string treeFile = args[0];

            // Index of the type character in "grup.nom.x" (right after the prefix).
            const int neTypeIndex = 9;
            // Index of the type character in "np0000x" (the last of seven characters).
            const int npTypeIndex = 6;

            try
            {
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReader        tr  = trf.NewTreeReader(br);
                try
                {
                    StringBuilder sb        = new StringBuilder();
                    string        nl        = Runtime.GetProperty("line.separator");
                    Pattern       nePattern = Pattern.Compile("^grup\\.nom\\.");
                    Pattern       npPattern = Pattern.Compile("^np0000.$");
                    for (Tree tree; (tree = tr.ReadTree()) != null;)
                    {
                        foreach (Tree t in tree)
                        {
                            if (!t.IsPreTerminal())
                            {
                                continue;
                            }
                            char type    = 'O';
                            Tree grandma = t.Ancestor(1, tree);
                            // NOTE(review): presumably Ancestor yields null when the
                            // pre-terminal is the root itself; guard instead of crashing.
                            string grandmaValue = grandma == null ? null : ((CoreLabel)grandma.Label()).Value();
                            // grup.nom.x -- the length check protects against a bare
                            // "grup.nom." label, which has no type character to read.
                            if (grandmaValue != null && grandmaValue.Length > neTypeIndex && nePattern.Matcher(grandmaValue).Find())
                            {
                                type = grandmaValue[neTypeIndex];
                            }
                            else
                            {
                                // else check the pos for np0000x or not
                                string pos = ((CoreLabel)t.Label()).Value();
                                if (npPattern.Matcher(pos).Find())
                                {
                                    type = pos[npTypeIndex];
                                }
                            }
                            Tree   wordNode = t.FirstChild();
                            string word     = ((CoreLabel)wordNode.Label()).Value();
                            sb.Append(word).Append("\t");
                            switch (type)
                            {
                            case 'p':
                            {
                                sb.Append("PERS");
                                break;
                            }

                            case 'l':
                            {
                                sb.Append("LUG");
                                break;
                            }

                            case 'o':
                            {
                                sb.Append("ORG");
                                break;
                            }

                            case '0':
                            {
                                sb.Append("OTROS");
                                break;
                            }

                            default:
                            {
                                sb.Append("O");
                                break;
                            }
                            }
                            sb.Append(nl);
                        }
                        // Blank line separates consecutive trees.
                        sb.Append(nl);
                    }
                    System.Console.Out.Write(sb.ToString());
                }
                finally
                {
                    // Release the input file even when reading or conversion fails.
                    tr.Close();
                }
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
        }