/// <summary>
/// Reads (segmented, whitespace delimited) words from a file and prints each
/// one followed by its English translation(s).
/// </summary>
/// <remarks>
/// The path and filename of the CEDict Lexicon can be supplied via the
/// "-dictPath" flag; when the flag is absent the parameterless
/// <c>ChineseEnglishWordMap</c> constructor is used (per the original notes,
/// that checks "cedict_ts.u8" in the current directory).
/// By default, only the first translation is printed. If the "-all" flag
/// is given, all translations are printed, separated by "|".
/// The input and output encoding can be specified using the "-encoding" flag;
/// otherwise <c>defaultCharset</c> is used (UTF-8 per the original notes).
/// Punctuation words and empty tokens are not included in the reported word total.
/// </remarks>
/// <param name="args">
/// Command-line arguments: <c>[-all] [-dictPath path] [-encoding enc_string] inputFile</c>.
/// </param>
/// <exception cref="System.IO.IOException"/>
public static void Main(string[] args)
{
    IDictionary<string, int> flagsToNumArgs = Generics.NewHashMap();
    flagsToNumArgs["-dictPath"] = 1;
    flagsToNumArgs["-encoding"] = 1;
    IDictionary<string, string[]> argMap = StringUtils.ArgsToMap(args, flagsToNumArgs);
    string[] otherArgs = argMap[null];
    if (otherArgs.Length < 1)
    {
        log.Info("usage: ChineseEnglishWordMap [-all] [-dictPath path] [-encoding enc_string] inputFile");
        System.Environment.Exit(1);
    }
    string filename = otherArgs[0];
    bool allTranslations = argMap.Contains("-all");
    string charset = defaultCharset;
    if (argMap.Contains("-encoding"))
    {
        charset = argMap["-encoding"][0];
    }

    int totalWords = 0;
    int coveredWords = 0;
    BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(filename), charset));
    try
    {
        ITreebankLanguagePack tlp = new ChineseTreebankLanguagePack();

        // Look the optional flag up without the indexer: on a .NET dictionary the
        // indexer throws for a missing key, which would make the default-dictionary
        // path unreachable.
        string[] dpString;
        bool hasDictPath = argMap.TryGetValue("-dictPath", out dpString) && dpString != null;
        ChineseEnglishWordMap cewm = hasDictPath
            ? new ChineseEnglishWordMap(dpString[0])
            : new ChineseEnglishWordMap();

        PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.Console.Out, charset), true);
        for (string line = r.ReadLine(); line != null; line = r.ReadLine())
        {
            string[] words = line.Split("\\s", 1000);
            foreach (string word in words)
            {
                // Consecutive whitespace yields empty tokens; they are neither
                // printed nor counted as words.
                if (word.Length == 0)
                {
                    continue;
                }
                pw.Print(StringUtils.Pad(word + ':', 8));
                if (tlp.IsPunctuationWord(word))
                {
                    // Punctuation is echoed but excluded from the word total.
                    pw.Print(word);
                }
                else
                {
                    totalWords++;
                    if (IsDigits(word))
                    {
                        pw.Print(word + " [NUMBER]");
                    }
                    else if (cewm.ContainsKey(word))
                    {
                        coveredWords++;
                        if (allTranslations)
                        {
                            // Track position with a flag rather than IndexOf, which is
                            // quadratic and would omit the separator before any entry
                            // equal to the first one.
                            bool first = true;
                            foreach (string s in cewm.GetAllTranslations(word))
                            {
                                pw.Print((first ? string.Empty : "|") + s);
                                first = false;
                            }
                        }
                        else
                        {
                            pw.Print(cewm.GetFirstTranslation(word));
                        }
                    }
                    else
                    {
                        pw.Print("[UNK]");
                    }
                }
                pw.Println();
            }
            pw.Println();
        }
    }
    finally
    {
        // Release the input file even when dictionary loading or translation fails.
        r.Close();
    }
    log.Info("Finished translating " + totalWords + " words (");
    log.Info(coveredWords + " were in dictionary).");
}
/// <summary>
/// Creates a collinizer for the given language pack, delegating to the
/// two-argument constructor with <c>deletePunct</c> set to <c>true</c>.
/// </summary>
/// <param name="ctlp">The Chinese treebank language pack to use.</param>
public ChineseCollinizer(ChineseTreebankLanguagePack ctlp)
    : this(ctlp, true)
{
}
/// <summary>
/// Creates a collinizer that stores the given language pack and
/// punctuation-deletion setting.
/// </summary>
/// <param name="ctlp">The Chinese treebank language pack to use.</param>
/// <param name="deletePunct">Value stored in the <c>deletePunct</c> field.</param>
public ChineseCollinizer(ChineseTreebankLanguagePack ctlp, bool deletePunct)
{
    this.ctlp = ctlp;
    this.deletePunct = deletePunct;
}
// We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
// expression above. Maybe more should really be deleted. I don't understand this very well, and there is
// no documentation.
//
// New phrasal categories in CTB 7 and later:
//   DFL  = Disfluency. Generally keep, but delete ones that are things like (FLR (PU <) (VV turn) (PU >)).
//   EMO  = Emoticon. For emoticons. Fine to keep.
//   FLR  = Filler. Generally keep, but delete ones that are things like (FLR (PU <) (VV turn) (PU >)).
//   IMG  = ?Image?. Appear to all be of the form (IMG (PU [) (NN 图片) (PU ])). Delete all those.
//   INC  = Incomplete (more incomplete than a FRAG, which is only syntactically incomplete). Just keep.
//   INTJ = Interjection. Fine to keep.
//   META = Just one of these, in chtb_5200.df. Delete the whole tree. Should have been turned into XML metadata.
//   OTH  = ??. Weird, but just leave.
//   SKIP = ??. Always has NOI under it. Omit or keep?
//   TYPO = Seems like it should mainly go, but is sometimes a branching node??
//   WHPP = ??. Just one of these. Over a -NONE-, so it will go if empties are deleted. But it should just be PP.
//
// There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
// which just seems to be an error - it should all be under FLR.
//
// POS tags are now 38: the original 33 plus these:
//   EM  = Emoticon. Often but not always under EMO.
//   IC  = Incomplete word rendered in pinyin, usually under DFL.
//   NOI = ?? (no description recorded).
//   URL = URL.
//   X   = In practice currently used only for "x" in constructions like "30 x 25 cm". Shouldn't exist!
/// <summary>
/// Normalizes a whole treebank tree and patches a set of known annotation errors.
/// </summary>
/// <remarks>
/// Steps, in order:
/// the tree is pruned with <c>chineseEmptyFilter</c> and spliced with <c>aOverAFilter</c>;
/// the rewrite directly under the root is checked (non-unary rewrites are reported,
/// a non-phrasal only child is wrapped in FRAG, a META only child discards the tree);
/// every node is visited to repair specific mislabelled nodes;
/// each <c>fixupTregex</c>/<c>fixupTsurgeon</c> pair is applied;
/// finally <c>tagExtender</c>, when set, transforms the result.
/// Diagnostics are written to the error stream; most are emitted only when <c>Debug</c> is set.
/// </remarks>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Tree factory used for pruning and for building the FRAG wrapper node.</param>
/// <returns>
/// The normalized tree, or <c>null</c> when the tree is the META tree or has no
/// children left after the fixups.
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    Tree newTree = tree.Prune(chineseEmptyFilter, tf).SpliceOut(aOverAFilter);

    // Report non-unary initial rewrites & fix 'obvious ones'.
    Tree[] rootChildren = newTree.Children();
    if (rootChildren.Length > 1)
    {
        // A fix that tucked trailing punctuation inside the first constituent used to live
        // here; it was deliberately disabled because it probably should not be applied to
        // the test set and did not help anyway. The rewrite is only reported.
        EncodingPrintWriter.Err.Println("Possible error: non-unary initial rewrite: " + newTree.LocalTree(), ChineseTreebankLanguagePack.Encoding);
    }
    else if (rootChildren.Length > 0)
    {
        // ROOT has 1 child - the normal case.
        Tree onlyChild = rootChildren[0];
        if (!onlyChild.IsPhrasal())
        {
            if (Debug)
            {
                EncodingPrintWriter.Err.Println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + onlyChild, ChineseTreebankLanguagePack.Encoding);
            }
            Tree fragNode = tf.NewTreeNode("FRAG", Arrays.AsList(rootChildren));
            newTree.SetChild(0, fragNode);
        }
        else if (onlyChild.Label().Value().Equals("META"))
        {
            // Delete the one bogus META tree in CTB 9.
            EncodingPrintWriter.Err.Println("Deleting META tree that should be XML metadata in chtb_5200.df: " + onlyChild, ChineseTreebankLanguagePack.Encoding);
            return null;
        }
    }
    else
    {
        EncodingPrintWriter.Err.Println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.Encoding);
    }

    // Note that there's also at least 1 tree that is an IP with no surrounding ROOT node.
    // There are also several places where "NP" is used as a preterminal tag and presumably
    // should be "NN". A couple of other random errors are corrected here.
    foreach (Tree node in newTree)
    {
        // The node's label is only changed by the SetValue calls below, each of which is the
        // last use of the label for this node, so it is read once.
        string label = node.Value();

        if (label.Equals("CP") && node.NumChildren() == 1)
        {
            Tree inner = node.FirstChild();
            if (inner.Value().Equals("ROOT")
                && inner.FirstChild().IsLeaf()
                && "CP".Equals(inner.FirstChild().Value()))
            {
                EncodingPrintWriter.Err.Println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.Encoding);
                // Drop the stray leading leaf and hoist the remaining children up to the CP.
                IList<Tree> remaining = inner.GetChildrenAsList();
                remaining = remaining.SubList(1, remaining.Count);
                node.SetChildren(remaining);
                EncodingPrintWriter.Err.Println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.Encoding);
            }
        }

        // All the stuff below here seems to have been fixed in CTB 9.
        // Maybe reporting errors sometimes does help.
        if (!node.IsPreTerminal())
        {
            // Phrasal nodes carrying a POS-tag label.
            if (label.Matches("NN"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: NN phrasal tag changed to NP: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NP");
            }
            else if (label.Matches("MSP"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: MSP phrasal tag changed to VP: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("VP");
            }
            continue;
        }

        if (label.Matches("NP"))
        {
            if (ChineseTreebankLanguagePack.ChineseDouHaoAcceptFilter().Test(node.FirstChild().Value()))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: NP preterminal over douhao; preterminal changed to PU: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("PU");
            }
            else
            {
                // Either way the preterminal becomes NN; only the diagnostic differs.
                // (An alternative that inserted a new NN node beneath the NP was left disabled.)
                Tree parent = node.Parent(newTree);
                bool parentIsNp = parent.Value().Matches("NP");
                if (Debug)
                {
                    string prefix = parentIsNp
                        ? "Correcting error: NP preterminal w/ NP parent; preterminal changed to NN: "
                        : "Correcting error: NP preterminal w/o NP parent, changing preterminal to NN: ";
                    EncodingPrintWriter.Err.Println(prefix + parent, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NN");
            }
        }
        else if (label.Matches("PU"))
        {
            // Specific words that were wrongly tagged as punctuation.
            string leafValue = node.FirstChild().Value();
            if (leafValue.Matches("他"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"他\" under PU tag; tag changed to PN: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("PN");
            }
            else if (leafValue.Equals("里"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + leafValue + "\" under PU tag; tag changed to LC: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("LC");
            }
            else if (leafValue.Equals("是"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + leafValue + "\" under PU tag; tag changed to VC: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("VC");
            }
            else if (leafValue.Matches("tw|半穴式"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"" + leafValue + "\" under PU tag; tag changed to NN: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("NN");
            }
            else if (leafValue.Matches("33"))
            {
                if (Debug)
                {
                    EncodingPrintWriter.Err.Println("Correcting error: \"33\" under PU tag; tag changed to CD: " + node, ChineseTreebankLanguagePack.Encoding);
                }
                node.SetValue("CD");
            }
        }
    }

    // Apply each tregex/tsurgeon fixup pair in order.
    for (int idx = 0; idx < fixupTregex.Length; ++idx)
    {
        if (!Debug)
        {
            newTree = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(fixupTregex[idx], fixupTsurgeon[idx], newTree);
            continue;
        }

        // In debug mode keep a copy so that any change can be reported.
        Tree before = newTree.DeepCopy();
        newTree = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(fixupTregex[idx], fixupTsurgeon[idx], newTree);
        if (!before.Equals(newTree))
        {
            EncodingPrintWriter.Err.Println("Correcting error: Updated tree using tregex " + fixupTregex[idx] + " and tsurgeon " + fixupTsurgeon[idx], ChineseTreebankLanguagePack.Encoding);
            EncodingPrintWriter.Err.Println(" from: " + before, ChineseTreebankLanguagePack.Encoding);
            EncodingPrintWriter.Err.Println(" to: " + newTree, ChineseTreebankLanguagePack.Encoding);
        }
    }

    // At least once we just end up deleting everything under ROOT.
    // In which case, we should just get rid of the tree.
    if (newTree.NumChildren() == 0)
    {
        if (Debug)
        {
            EncodingPrintWriter.Err.Println("Deleting tree that now has no contents: " + newTree, ChineseTreebankLanguagePack.Encoding);
        }
        return null;
    }

    return tagExtender != null ? tagExtender.TransformTree(newTree) : newTree;
}