Пример #1
0
        /// <summary>
        /// Recursively rebuilds <paramref name="tree"/> with simplified node labels,
        /// optionally dropping punctuation.
        /// </summary>
        /// <param name="tree">The (sub)tree to transform.</param>
        /// <param name="isRoot">
        /// True when <paramref name="tree"/> should be kept even if all of its
        /// children were removed.
        /// </param>
        /// <returns>
        /// The rebuilt subtree, or null when the node was removed (punctuation
        /// while <c>deletePunct</c> is set, or a non-root node left with no children).
        /// </returns>
        private Tree TransformTree(Tree tree, bool isRoot)
        {
            string nodeLabel = tree.Label().Value();

            if (tree.IsLeaf())
            {
                // Leaves are either dropped as punctuation or copied verbatim.
                bool dropLeaf = deletePunct && ctlp.IsPunctuationWord(nodeLabel);
                return dropLeaf ? null : tf.NewLeaf(new StringLabel(nodeLabel));
            }

            if (tree.IsPreTerminal() && deletePunct && ctlp.IsPunctuationTag(nodeLabel))
            {
                return null;
            }

            // A unary ROOT node is skipped: its only child takes over the root role.
            // Non-unary roots are kept for now.
            if (nodeLabel.Matches("ROOT.*") && tree.NumChildren() == 1)
            {
                return TransformTree(tree.Children()[0], true);
            }

            // Cut the label at the first character outside A-Z (removes functional and
            // machine-generated annotations), then merge parentheticals with adverb phrases.
            nodeLabel = nodeLabel.ReplaceFirst("[^A-Z].*$", string.Empty).ReplaceFirst("PRN", "ADVP");

            IList <Tree> keptChildren = new List <Tree>();
            foreach (Tree child in tree.Children())
            {
                Tree transformed = TransformTree(child, false);
                if (transformed != null)
                {
                    keptChildren.Add(transformed);
                }
            }

            // The root is never deleted: some trees in the Chinese treebank
            // contain nothing but punctuation.
            if (!isRoot && keptChildren.IsEmpty())
            {
                return null;
            }
            return tf.NewTreeNode(new StringLabel(nodeLabel), keptChildren);
        }
Пример #2
0
        /// <summary>
        /// Reads (segmented, whitespace delimited) words from a file and prints
        /// them with their English translation(s).
        /// </summary>
        /// <remarks>
        /// The path and filename of the CEDict Lexicon can be supplied via the
        /// "-dictPath" flag; otherwise the default filename "cedict_ts.u8" in the
        /// current directory is checked.
        /// By default, only the first translation is printed.  If the "-all" flag
        /// is given, all translations are printed, separated by "|".
        /// The input and output encoding can be specified using the "-encoding" flag;
        /// otherwise the default charset is used.
        /// Punctuation and empty tokens are not included in the reported word count.
        /// </remarks>
        /// <param name="args">
        /// Command line: [-all] [-dictPath path] [-encoding enc_string] inputFile
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-dictPath"] = 1;
            flagsToNumArgs["-encoding"] = 1;
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);

            // Non-flag arguments are looked up under the null key.
            string[] otherArgs = argMap[null];
            if (otherArgs == null || otherArgs.Length < 1)
            {
                log.Info("usage: ChineseEnglishWordMap [-all] [-dictPath path] [-encoding enc_string] inputFile");
                System.Environment.Exit(1);
            }
            string filename        = otherArgs[0];
            bool   allTranslations = argMap.Contains("-all");
            string charset         = defaultCharset;

            if (argMap.Contains("-encoding"))
            {
                charset = argMap["-encoding"][0];
            }

            int totalWords   = 0;
            int coveredWords = 0;

            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
            try
            {
                ITreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

                // Check for the flag before indexing: a dictionary indexer may throw
                // on a missing key instead of returning null.
                string[] dpString          = argMap.Contains("-dictPath") ? argMap["-dictPath"] : null;
                ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
                PrintWriter pw             = new PrintWriter(new OutputStreamWriter(System.Console.Out, charset), true);

                for (string line = r.ReadLine(); line != null; line = r.ReadLine())
                {
                    string[] words = line.Split("\\s", 1000);
                    foreach (string word in words)
                    {
                        // Consecutive whitespace yields empty tokens; they are not words.
                        if (word.Length == 0)
                        {
                            continue;
                        }
                        pw.Print(StringUtils.Pad(word + ':', 8));
                        if (tlp.IsPunctuationWord(word))
                        {
                            // Punctuation is echoed but not counted.
                            pw.Print(word);
                        }
                        else
                        {
                            totalWords++;
                            if (IsDigits(word))
                            {
                                pw.Print(word + " [NUMBER]");
                            }
                            else if (cewm.ContainsKey(word))
                            {
                                coveredWords++;
                                if (allTranslations)
                                {
                                    IList <string> trans = new List <string>(cewm.GetAllTranslations(word));
                                    pw.Print(string.Join("|", trans));
                                }
                                else
                                {
                                    pw.Print(cewm.GetFirstTranslation(word));
                                }
                            }
                            else
                            {
                                pw.Print("[UNK]");
                            }
                        }
                        pw.Println();
                    }
                    pw.Println();
                }
            }
            finally
            {
                // Release the input file even if translation fails part-way.
                r.Close();
            }
            log.Info("Finished translating " + totalWords + " words (" + coveredWords + " were in dictionary).");
        }