Beispiel #1
0
        /// <summary>
        /// Reads (segmented, whitespace delimited) words from a file
        /// and prints them with their English translation(s).
        /// </summary>
        /// <remarks>
        /// The path and filename of the CEDict Lexicon can be supplied via the
        /// "-dictPath" flag; otherwise the default constructor of
        /// ChineseEnglishWordMap decides where the lexicon is loaded from.
        /// By default, only the first translation is printed.  If the "-all" flag
        /// is given, all translations are printed, separated by "|".
        /// The input and output encoding can be specified using the "-encoding" flag;
        /// otherwise <c>defaultCharset</c> is used.
        /// Punctuation tokens and empty tokens are not counted as words in the
        /// final summary; number tokens are counted but never as covered.
        /// </remarks>
        /// <param name="args">
        /// Command line: [-all] [-dictPath path] [-encoding enc_string] inputFile
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        public static void Main(string[] args)
        {
            IDictionary <string, int> flagsToNumArgs = Generics.NewHashMap();

            flagsToNumArgs["-dictPath"] = 1;
            flagsToNumArgs["-encoding"] = 1;
            IDictionary <string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);

            // Arguments that do not belong to any flag are stored under the null key.
            string[] otherArgs = argMap[null];
            if (otherArgs == null || otherArgs.Length < 1)
            {
                log.Info("usage: ChineseEnglishWordMap [-all] [-dictPath path] [-encoding enc_string] inputFile");
                System.Environment.Exit(1);
            }
            string filename        = otherArgs[0];
            bool   allTranslations = argMap.Contains("-all");
            string charset         = defaultCharset;

            if (argMap.Contains("-encoding"))
            {
                charset = argMap["-encoding"][0];
            }

            int totalWords   = 0;
            int coveredWords = 0;

            BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
            try
            {
                ITreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

                string[] dpString          = argMap["-dictPath"];
                ChineseEnglishWordMap cewm = (dpString == null) ? new ChineseEnglishWordMap() : new ChineseEnglishWordMap(dpString[0]);
                PrintWriter pw             = new PrintWriter(new OutputStreamWriter(System.Console.Out, charset), true);

                for (string line = r.ReadLine(); line != null; line = r.ReadLine())
                {
                    string[] words = line.Split("\\s", 1000);
                    foreach (string word in words)
                    {
                        // Blank lines and runs of whitespace yield empty tokens; they are
                        // not words and must not inflate the total.
                        if (word.Length == 0)
                        {
                            continue;
                        }
                        pw.Print(StringUtils.Pad(word + ':', 8));
                        if (tlp.IsPunctuationWord(word))
                        {
                            // Punctuation is echoed but excluded from the word count.
                            pw.Print(word);
                        }
                        else
                        {
                            totalWords++;
                            if (IsDigits(word))
                            {
                                pw.Print(word + " [NUMBER]");
                            }
                            else if (cewm.ContainsKey(word))
                            {
                                coveredWords++;
                                if (allTranslations)
                                {
                                    // Join once instead of looking up each element's index to
                                    // decide whether a separator is needed.
                                    pw.Print(string.Join("|", cewm.GetAllTranslations(word)));
                                }
                                else
                                {
                                    pw.Print(cewm.GetFirstTranslation(word));
                                }
                            }
                            else
                            {
                                pw.Print("[UNK]");
                            }
                        }
                        pw.Println();
                    }
                    // Blank line between input lines.
                    pw.Println();
                }
            }
            finally
            {
                // Release the input file even when setup or translation fails.
                r.Close();
            }
            // One log record so the summary is not split across two entries.
            log.Info("Finished translating " + totalWords + " words (" + coveredWords + " were in dictionary).");
        }
Beispiel #2
0
 /// <summary>
 /// Creates a collinizer for the given language pack, delegating to the
 /// two-argument constructor with <c>deletePunct</c> set to <c>true</c>.
 /// </summary>
 /// <param name="ctlp">The Chinese treebank language pack to use.</param>
 public ChineseCollinizer(ChineseTreebankLanguagePack ctlp)
     : this(ctlp, deletePunct: true)
 {
 }
Beispiel #3
0
 /// <summary>
 /// Creates a collinizer that stores the given language pack and
 /// punctuation-deletion setting.
 /// </summary>
 /// <param name="ctlp">The Chinese treebank language pack to use.</param>
 /// <param name="deletePunct">Value stored in the <c>deletePunct</c> field.</param>
 public ChineseCollinizer(ChineseTreebankLanguagePack ctlp, bool deletePunct)
 {
     this.ctlp        = ctlp;
     this.deletePunct = deletePunct;
 }
        // We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
        // expression above. Maybe more should be deleted really. I don't understand this very well, and there is no documentation.
        // New phrasal categories in CTB 7 and later:
        // DFL = Disfluency. Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
        // EMO = Emoticon. For emoticons. Fine to keep.
        // FLR = Filler.  Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)).
        // IMG = ?Image?. Appear to all be of form (IMG (PU [) (NN 图片) (PU ])). Delete all those.
        // INC = Incomplete (more incomplete than a FRAG which is only syntactically incomplete). Just keep.
        // INTJ = Interjection. Fine to keep.
        // META = Just one of these in chtb_5200.df. Delete whole tree. Should have been turned into XML metadata
        // OTH = ??. Weird but just leave.
        // SKIP = ??. Always has NOI under it. Omit or keep?
        // TYPO = seems like should mainly go, but sometimes a branching node??
        // WHPP = ??. Just one of these. Over a -NONE- so will go if empties are deleted. But should just be PP.
        //
        // There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
        // which just seems an error - should all be under FLR.
        //
        // POS tags are now 38. Original 33 plus these:
        // EM = Emoticon. Often but not always under EMO.
        // IC = Incomplete word rendered in pinyin, usually under DFL.
        // NOI =
        // URL = URL.
        // X = In practice currently used only for "x" in constructions like "30 x 25 cm". Shouldn't exist!
        /// <summary>
        /// Normalizes a complete treebank tree: prunes it with <c>chineseEmptyFilter</c>,
        /// splices out nodes matched by <c>aOverAFilter</c>, repairs a number of known
        /// annotation errors, applies every <c>fixupTregex</c>/<c>fixupTsurgeon</c> pair,
        /// and finally runs <c>tagExtender</c> when one is set.
        /// </summary>
        /// <param name="tree">The tree to normalize.</param>
        /// <param name="tf">Tree factory used for pruning and for building the FRAG wrapper node.</param>
        /// <returns>
        /// The normalized tree, or <c>null</c> when the single child of the root is a
        /// META constituent or when nothing is left under the root after the fix-ups.
        /// </returns>
        public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
        {
            Tree result = tree.Prune(chineseEmptyFilter, tf).SpliceOut(aOverAFilter);

            // Check the rewrite directly below the root and fix the 'obvious' cases.
            Tree[] rootKids = result.Children();
            if (rootKids.Length > 1)
            {
                // Only reported. An earlier fix that tucked trailing punctuation inside the
                // first constituent was disabled: it probably should not be applied to the
                // test set and did not help anyway.
                EncodingPrintWriter.Err.Println("Possible error: non-unary initial rewrite: " + result.LocalTree(), ChineseTreebankLanguagePack.Encoding);
            }
            else if (rootKids.Length == 0)
            {
                EncodingPrintWriter.Err.Println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.Encoding);
            }
            else
            {
                // The root has exactly one child - the normal case.
                Tree onlyKid = rootKids[0];
                if (onlyKid.IsPhrasal())
                {
                    if (onlyKid.Label().Value().Equals("META"))
                    {
                        // Delete the one bogus META tree in CTB 9
                        EncodingPrintWriter.Err.Println("Deleting META tree that should be XML metadata in chtb_5200.df: " + onlyKid, ChineseTreebankLanguagePack.Encoding);
                        return(null);
                    }
                }
                else
                {
                    if (Debug)
                    {
                        EncodingPrintWriter.Err.Println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + onlyKid, ChineseTreebankLanguagePack.Encoding);
                    }
                    Tree frag = tf.NewTreeNode("FRAG", Arrays.AsList(rootKids));
                    result.SetChild(0, frag);
                }
            }

            // Note that there is also at least one tree that is an IP with no surrounding
            // ROOT node, and several places where "NP" is used as a preterminal tag where
            // it presumably should be "NN". A couple of other random errors are corrected
            // here as well.
            foreach (Tree node in result)
            {
                if (node.Value().Equals("CP") && node.NumChildren() == 1)
                {
                    Tree inner = node.FirstChild();
                    if (inner.Value().Equals("ROOT") && inner.FirstChild().IsLeaf() && "CP".Equals(inner.FirstChild().Value()))
                    {
                        EncodingPrintWriter.Err.Println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + result, ChineseTreebankLanguagePack.Encoding);
                        // Drop the stray leading "CP" leaf and hoist the remaining children.
                        IList <Tree> hoisted = inner.GetChildrenAsList();
                        hoisted = hoisted.SubList(1, hoisted.Count);
                        node.SetChildren(hoisted);
                        // The message below is padded so that it lines up with the one above.
                        EncodingPrintWriter.Err.Println("  Corrected as:                                                    " + result, ChineseTreebankLanguagePack.Encoding);
                    }
                }

                // All the stuff below here seems to have been fixed in CTB 9. Maybe reporting errors sometimes does help.
                if (node.IsPreTerminal())
                {
                    if (node.Value().Matches("NP"))
                    {
                        if (ChineseTreebankLanguagePack.ChineseDouHaoAcceptFilter().Test(node.FirstChild().Value()))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: NP preterminal over douhao; preterminal changed to PU: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("PU");
                        }
                        else
                        {
                            // Both cases retag to NN; only the diagnostic differs.
                            Tree parent          = node.Parent(result);
                            bool parentIsNounPhr = parent.Value().Matches("NP");
                            if (Debug)
                            {
                                if (parentIsNounPhr)
                                {
                                    EncodingPrintWriter.Err.Println("Correcting error: NP preterminal w/ NP parent; preterminal changed to NN: " + parent, ChineseTreebankLanguagePack.Encoding);
                                }
                                else
                                {
                                    EncodingPrintWriter.Err.Println("Correcting error: NP preterminal w/o NP parent, changing preterminal to NN: " + parent, ChineseTreebankLanguagePack.Encoding);
                                }
                            }
                            node.SetValue("NN");
                        }
                    }
                    else if (node.Value().Matches("PU"))
                    {
                        // Words that were wrongly tagged as punctuation get their proper tag.
                        string leaf = node.FirstChild().Value();
                        if (leaf.Matches("他"))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: \"他\" under PU tag; tag changed to PN: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("PN");
                        }
                        else if (leaf.Equals("里"))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: \"" + leaf + "\" under PU tag; tag changed to LC: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("LC");
                        }
                        else if (leaf.Equals("是"))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: \"" + leaf + "\" under PU tag; tag changed to VC: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("VC");
                        }
                        else if (leaf.Matches("tw|半穴式"))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: \"" + leaf + "\" under PU tag; tag changed to NN: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("NN");
                        }
                        else if (leaf.Matches("33"))
                        {
                            if (Debug)
                            {
                                EncodingPrintWriter.Err.Println("Correcting error: \"33\" under PU tag; tag changed to CD: " + node, ChineseTreebankLanguagePack.Encoding);
                            }
                            node.SetValue("CD");
                        }
                    }
                }
                else if (node.Value().Matches("NN"))
                {
                    if (Debug)
                    {
                        EncodingPrintWriter.Err.Println("Correcting error: NN phrasal tag changed to NP: " + node, ChineseTreebankLanguagePack.Encoding);
                    }
                    node.SetValue("NP");
                }
                else if (node.Value().Matches("MSP"))
                {
                    if (Debug)
                    {
                        EncodingPrintWriter.Err.Println("Correcting error: MSP phrasal tag changed to VP: " + node, ChineseTreebankLanguagePack.Encoding);
                    }
                    node.SetValue("VP");
                }
            }

            // Apply each tregex/tsurgeon fix-up pair in order.
            for (int idx = 0; idx < fixupTregex.Length; ++idx)
            {
                // A snapshot is only taken in debug mode, where changes are reported.
                Tree snapshot = null;
                if (Debug)
                {
                    snapshot = result.DeepCopy();
                }
                result = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(fixupTregex[idx], fixupTsurgeon[idx], result);
                if (snapshot != null && !snapshot.Equals(result))
                {
                    EncodingPrintWriter.Err.Println("Correcting error: Updated tree using tregex " + fixupTregex[idx] + " and tsurgeon " + fixupTsurgeon[idx], ChineseTreebankLanguagePack.Encoding);
                    EncodingPrintWriter.Err.Println("  from: " + snapshot, ChineseTreebankLanguagePack.Encoding);
                    EncodingPrintWriter.Err.Println("    to: " + result, ChineseTreebankLanguagePack.Encoding);
                }
            }

            // At least once we end up deleting everything under ROOT; in that case the
            // tree itself is dropped.
            if (result.NumChildren() == 0)
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Deleting tree that now has no contents: " + result, ChineseTreebankLanguagePack.Encoding);
                }
                return(null);
            }

            return(tagExtender != null ? tagExtender.TransformTree(result) : result);
        }