/// <summary>
        /// Repairs tree structure, phrasal categories and part-of-speech labels in
        /// multi-word tokens that have just been expanded.
        /// </summary>
        /// <remarks>
        /// The input list is partitioned into folds; each fold is submitted to a
        /// <c>MulticoreWrapper</c> driving an <c>AnCoraProcessor.MultiWordProcessor</c>, and
        /// every result the wrapper reports as available is appended to the returned list.
        /// </remarks>
        /// <param name="trees">Trees to process.</param>
        /// <returns>A new list holding all trees produced by the processor.</returns>
        /// <exception cref="System.Exception"/>
        /// <exception cref="Java.Util.Concurrent.ExecutionException"/>
        private IList <Tree> FixMultiWordTokens(IList <Tree> trees)
        {
            bool retainNer = PropertiesUtils.GetBool(options, "ner", false);

            // Resources handed to the processor
            IFactory <TreeNormalizer> normalizerFactory = new _IFactory_389();
            ITreeFactory treeFactory = new LabeledScoredTreeFactory();
            IThreadsafeProcessor <ICollection <Tree>, ICollection <Tree> > worker =
                new AnCoraProcessor.MultiWordProcessor(this, normalizerFactory, treeFactory, retainNer);

            int cpuCount = Runtime.GetRuntime().AvailableProcessors();
            MulticoreWrapper <ICollection <Tree>, ICollection <Tree> > pool =
                new MulticoreWrapper <ICollection <Tree>, ICollection <Tree> >(cpuCount, worker, false);

            // Split the work into many folds so that parallelization is actually worth it
            int foldCount = cpuCount * 20;
            IList <Tree> collected = new List <Tree>();

            foreach (ICollection <Tree> fold in CollectionUtils.PartitionIntoFolds(trees, foldCount))
            {
                pool.Put(fold);
                // Collect whatever has already finished before submitting the next fold
                for (; pool.Peek();)
                {
                    Sharpen.Collections.AddAll(collected, pool.Poll());
                }
            }

            // Wait for the outstanding folds, then collect what is left
            pool.Join();
            for (; pool.Peek();)
            {
                Sharpen.Collections.AddAll(collected, pool.Poll());
            }
            return(collected);
        }
        /// <summary>
        /// Command-line entry point. Loads the treebank files named on the command line and
        /// prints every tree, either stripping the top-level start-symbol bracket (option
        /// <c>b</c>) or adding one when it is missing.
        /// </summary>
        /// <param name="args">Options followed by one or more treebank paths.</param>
        public static void Main(string[] args)
        {
            if (args.Length < minArgs)
            {
                System.Console.Out.WriteLine(Usage());
                System.Environment.Exit(-1);
            }
            Properties options             = StringUtils.ArgsToProperties(args, ArgDefs());
            Language   language            = PropertiesUtils.Get(options, "l", Language.English, typeof(Language));
            ITreebankLangParserParams tlpp = language.@params;
            // "l" is the language option read just above; reading the encoding from the same
            // key handed a language name to SetInputEncoding/SetOutputEncoding.
            // NOTE(review): "e" is assumed to be the encoding flag -- confirm against ArgDefs()/Usage().
            string encoding      = options.GetProperty("e", "UTF-8");
            bool   removeBracket = PropertiesUtils.GetBool(options, "b", false);

            tlpp.SetInputEncoding(encoding);
            tlpp.SetOutputEncoding(encoding);
            DiskTreebank tb = tlpp.DiskTreebank();

            // Split on runs of whitespace and drop empty entries so that a missing file list
            // really yields an empty array (a plain Split always returns at least one element,
            // which made the usage branch below unreachable).
            string[] files = options.GetProperty(string.Empty, string.Empty)
                             .Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
            if (files.Length == 0)
            {
                log.Info(Usage());
                System.Environment.Exit(-1);
            }
            foreach (string filename in files)
            {
                tb.LoadPath(filename);
            }

            PrintWriter  pwo         = tlpp.Pw();
            string       startSymbol = tlpp.TreebankLanguagePack().StartSymbol();
            ITreeFactory tf          = new LabeledScoredTreeFactory();
            int          nTrees      = 0;

            try
            {
                foreach (Tree t in tb)
                {
                    // A foreach iteration variable cannot be assigned to in C#, so the
                    // adjusted tree is kept in a separate local.
                    Tree output     = t;
                    bool hasBracket = t.Value().Equals(startSymbol);
                    if (removeBracket)
                    {
                        if (hasBracket)
                        {
                            output = t.FirstChild();
                        }
                    }
                    else if (!hasBracket)
                    {
                        // Add a bracket if it isn't already there
                        output = tf.NewTreeNode(startSymbol, Java.Util.Collections.SingletonList(t));
                    }
                    pwo.Println(output.ToString());
                    nTrees++;
                }
            }
            finally
            {
                // Close the writer even when reading or printing a tree throws
                pwo.Close();
            }
            System.Console.Error.WriteLine("Processed {0} trees.", nTrees);
        }
        // Example #3
        // 0
        /// <summary>
        /// Reads every tree from <paramref name="treeFile"/>, runs <c>TraverseAndFix</c> on it,
        /// expands its phrases with a <c>MultiWordTreeExpander</c>, optionally normalizes it, and
        /// writes the result to a sibling file named <c>treeFile + ".fixed"</c>.
        /// </summary>
        /// <param name="treeFile">Input tree file, read as UTF-8.</param>
        /// <param name="unigramTagger">Counter passed through to <c>TraverseAndFix</c>.</param>
        /// <param name="retainNER">Flag passed through to <c>TraverseAndFix</c>.</param>
        /// <param name="tn">
        /// Normalizer passed to the expander; when non-null it is also applied to each whole tree.
        /// </param>
        /// <remarks>
        /// I/O failures are reported by printing the stack trace; they are not rethrown.
        /// </remarks>
        private static void ResolveDummyTags(File treeFile, TwoDimensionalCounter <string, string> unigramTagger, bool retainNER, TreeNormalizer tn)
        {
            ITreeFactory          tf       = new LabeledScoredTreeFactory();
            MultiWordTreeExpander expander = new MultiWordTreeExpander();

            // Declared outside the try block so the finally clause can release them
            ITreeReader tr = null;
            PrintWriter pw = null;

            try
            {
                BufferedReader     br  = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), "UTF-8"));
                ITreeReaderFactory trf = new SpanishTreeReaderFactory();
                tr = trf.NewTreeReader(br);
                pw = new PrintWriter(new TextWriter(new FileOutputStream(new File(treeFile + ".fixed")), false, "UTF-8"));
                int nTrees = 0;
                for (Tree t; (t = tr.ReadTree()) != null; nTrees++)
                {
                    TraverseAndFix(t, null, unigramTagger, retainNER);
                    // Now "decompress" further the expanded trees formed by
                    // multiword token splitting
                    t = expander.ExpandPhrases(t, tn, tf);
                    if (tn != null)
                    {
                        t = tn.NormalizeWholeTree(t, tf);
                    }
                    pw.Println(t.ToString());
                }
                System.Console.Out.WriteLine("Processed " + nTrees + " trees");
            }
            catch (UnsupportedEncodingException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (FileNotFoundException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            catch (IOException e)
            {
                Sharpen.Runtime.PrintStackTrace(e);
            }
            finally
            {
                // Release both handles on every path; previously an exception inside the loop
                // skipped the Close() calls and left the reader and writer open.
                if (pw != null)
                {
                    pw.Close();
                }
                if (tr != null)
                {
                    try
                    {
                        tr.Close();
                    }
                    catch (IOException e)
                    {
                        // Same handling a failing Close() received when it sat inside the try block
                        Sharpen.Runtime.PrintStackTrace(e);
                    }
                }
            }
        }
        // static methods
        /// <summary>Builds a flat fallback tree for a sentence that could not be parsed.</summary>
        /// <param name="words">Words of the sentence that didn't parse.</param>
        /// <returns>
        /// A tree rooted at "X" with one preterminal per word, each dominating a leaf for that word.
        /// A preterminal is labelled with the word's tag when the word implements
        /// <c>IHasTag</c> and its tag is non-null, and with "XX" otherwise.
        /// </returns>
        public static Tree XTree <_T0>(IList <_T0> words)
            where _T0 : IHasWord
        {
            ITreeFactory factory      = new LabeledScoredTreeFactory();
            IList <Tree> preterminals = new List <Tree>();

            foreach (IHasWord word in words)
            {
                Tree leaf = factory.NewLeaf(word.Word());

                // Use the word's own tag when it carries one, otherwise the placeholder
                IHasTag tagged = word as IHasTag;
                string  label  = tagged == null ? null : tagged.Tag();
                if (label == null)
                {
                    label = "XX";
                }
                preterminals.Add(factory.NewTreeNode(label, Java.Util.Collections.SingletonList(leaf)));
            }
            return(factory.NewTreeNode("X", preterminals));
        }