// Lazily builds the shared "seen lexicon" (outDict) from the file named by flags.outDict2.
// Safe to call repeatedly; only the first call loads the dictionary.
// NOTE(review): locks on `this` — a private readonly gate object would be safer; confirm
// no external code locks this instance before changing it.
// NOTE(review): FeaturesCpC initializes outDict inline with an extra `true` constructor
// argument and WITHOUT this lock — presumably one of the two init paths should be
// canonical; verify which constructor call is intended.
private void CreateOutDict()
{
    lock (this)
    {
        // Re-check inside the lock so only the first caller pays the load cost.
        if (outDict == null)
        {
            logger.Info("reading " + flags.outDict2 + " as a seen lexicon");
            outDict = new CorpusDictionary(flags.outDict2);
        }
    }
}
// Builds (once) the static non-word dictionary used by the "nondict" feature.
// Only CTB and PKU non-word lists are supported; AS/HK/MSR settings throw.
// NOTE(review): `cd` appears to be a static cache initialized without a lock —
// confirm feature extraction is single-threaded at construction time.
public NonDict2(SeqClassifierFlags flags)
{
    //public String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
    if (cd == null)
    {
        if (flags.sighanCorporaDict != null)
        {
            corporaDict = flags.sighanCorporaDict;
        }
        // use the same flag for Sighan 2005, but our list is extracted from ctb
        if (flags.useAs || flags.useHk || flags.useMsr)
        {
            throw new Exception("only support settings for CTB and PKU now.");
        }
        string path;
        if (flags.usePk)
        {
            path = corporaDict + "/dict/pku.non";
        }
        else
        {
            // CTB
            path = corporaDict + "/dict/ctb.non";
        }
        cd = new CorpusDictionary(path);
        // just output the msg...
        // (The AS/HK/MSR case already threw above, so the duplicated corpus check
        // that guarded this logging in the original was unreachable and is removed.)
        if (flags.usePk)
        {
            logger.Info("INFO: flags.usePk=true | building NonDict2 from " + path);
        }
        else
        {
            // CTB
            logger.Info("INFO: flags.usePk=false | building NonDict2 from " + path);
        }
    }
}
/// <summary>
/// Computes the character-level feature set for the boundary decision at position
/// <paramref name="loc"/> in the padded character sequence (the "CpC" clique of the
/// Chinese segmenter's sequence model).
/// </summary>
/// <param name="cInfo">Padded list of per-character labels; out-of-range positions
/// yield labels whose CharAnnotation is null (normalized to "" below).</param>
/// <param name="loc">Index of the current character.</param>
/// <returns>Collection of feature strings; which groups fire is controlled by flags.</returns>
public virtual ICollection<string> FeaturesCpC(PaddedList<IN> cInfo, int loc)
{
    ICollection<string> features = new List<string>();
    // Window around loc: c = current, c1..c3 = following, p..p3 = preceding characters.
    CoreLabel c = cInfo[loc];
    CoreLabel c1 = cInfo[loc + 1];
    CoreLabel c2 = cInfo[loc + 2];
    CoreLabel c3 = cInfo[loc + 3];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    CoreLabel p3 = cInfo[loc - 3];
    // Character strings; padding positions give null, normalized to "".
    string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charc == null)
    {
        charc = string.Empty;
    }
    string charc1 = c1.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charc1 == null)
    {
        charc1 = string.Empty;
    }
    string charc2 = c2.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charc2 == null)
    {
        charc2 = string.Empty;
    }
    string charc3 = c3.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charc3 == null)
    {
        charc3 = string.Empty;
    }
    string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charp == null)
    {
        charp = string.Empty;
    }
    string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charp2 == null)
    {
        charp2 = string.Empty;
    }
    string charp3 = p3.Get(typeof(CoreAnnotations.CharAnnotation));
    if (charp3 == null)
    {
        charp3 = string.Empty;
    }
    /*
     * N-gram features. N is upto 2.
     */
    if (flags.useWord2)
    {
        // features.add(charc +"c");
        // features.add(charc1+"c1");
        // features.add(charp +"p");
        // features.add(charp +charc +"pc");
        // if( flags.useMsr ){
        //   features.add(charc +charc1 +"cc1");
        //   features.add(charp + charc1 +"pc1");
        // }
        features.Add(charc + "::c");
        features.Add(charc1 + "::c1");
        features.Add(charp + "::p");
        features.Add(charp2 + "::p2");
        // trying to restore the features that Huishin described in SIGHAN 2005 paper
        features.Add(charc + charc1 + "::cn");
        features.Add(charp + charc + "::pc");
        features.Add(charp + charc1 + "::pn");
        features.Add(charp2 + charp + "::p2p");
        features.Add(charp2 + charc + "::p2c");
        features.Add(charc2 + charc + "::n2c");
        features.Add("|word2");
    }
    /*
     * Radical N-gram features. N is upto 4.
     * Smoothing method of N-gram, because there are too many characters in Chinese.
     * (It works better than N-gram when they are used individually. less sparse)
     */
    // 'n' is the sentinel radical for padding (empty) positions.
    // (The original also declared `rcharp1`, which was never assigned or read; removed.)
    char rcharc;
    char rcharc1;
    char rcharc2;
    char rcharc3;
    char rcharp;
    char rcharp2;
    char rcharp3;
    if (charc.Length == 0)
    {
        rcharc = 'n';
    }
    else
    {
        rcharc = RadicalMap.GetRadical(charc[0]);
    }
    if (charc1.Length == 0)
    {
        rcharc1 = 'n';
    }
    else
    {
        rcharc1 = RadicalMap.GetRadical(charc1[0]);
    }
    if (charc2.Length == 0)
    {
        rcharc2 = 'n';
    }
    else
    {
        rcharc2 = RadicalMap.GetRadical(charc2[0]);
    }
    if (charc3.Length == 0)
    {
        rcharc3 = 'n';
    }
    else
    {
        rcharc3 = RadicalMap.GetRadical(charc3[0]);
    }
    if (charp.Length == 0)
    {
        rcharp = 'n';
    }
    else
    {
        rcharp = RadicalMap.GetRadical(charp[0]);
    }
    if (charp2.Length == 0)
    {
        rcharp2 = 'n';
    }
    else
    {
        rcharp2 = RadicalMap.GetRadical(charp2[0]);
    }
    if (charp3.Length == 0)
    {
        rcharp3 = 'n';
    }
    else
    {
        rcharp3 = RadicalMap.GetRadical(charp3[0]);
    }
    if (flags.useRad2)
    {
        features.Add(rcharc + "rc");
        features.Add(rcharc1 + "rc1");
        features.Add(rcharp + "rp");
        features.Add(rcharp + rcharc + "rpc");
        features.Add(rcharc + rcharc1 + "rcc1");
        features.Add(rcharp + rcharc + rcharc1 + "rpcc1");
        features.Add("|rad2");
    }
    /* non-word dictionary:SEEM bi-gram marked as non-word */
    if (flags.useDict2)
    {
        // NOTE(review): allocates a NonDict2 per token; its dictionary is cached
        // statically, but hoisting this construction would avoid per-call overhead.
        NonDict2 nd = new NonDict2(flags);
        features.Add(nd.CheckDic(charp + charc, flags) + "nondict");
        features.Add("|useDict2");
    }
    if (flags.useOutDict2)
    {
        if (outDict == null)
        {
            // NOTE(review): duplicates CreateOutDict() but without its lock and with an
            // extra `true` constructor argument — confirm which init path is canonical.
            logger.Info("reading " + flags.outDict2 + " as a seen lexicon");
            outDict = new CorpusDictionary(flags.outDict2, true);
        }
        // Seen-lexicon membership features over n-grams around loc (offsets at right).
        features.Add(outDict.GetW(charp + charc) + "outdict");
        // -1 0
        features.Add(outDict.GetW(charc + charc1) + "outdict");
        // 0 1
        features.Add(outDict.GetW(charp2 + charp) + "outdict");
        // -2 -1
        features.Add(outDict.GetW(charp2 + charp + charc) + "outdict");
        // -2 -1 0
        features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict");
        // -3 -2 -1
        features.Add(outDict.GetW(charp + charc + charc1) + "outdict");
        // -1 0 1
        features.Add(outDict.GetW(charc + charc1 + charc2) + "outdict");
        // 0 1 2
        features.Add(outDict.GetW(charp + charc + charc1 + charc2) + "outdict");
        // -1 0 1 2
    }
    /*
     * (CTB/ASBC/HK/PK/MSR) POS information of each characters.
     * If a character falls into some function categories,
     * it is very likely there is a boundary.
     * A lot of Chinese function words belong to single characters.
     * This feature is also good for numbers and punctuations.
     * DE* are grouped into DE.
     */
    if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2)
    {
        string[] tagsets;
        // the "useChPos" now only works for CTB and PK
        if (flags.useChPos)
        {
            if (flags.useCTBChar2)
            {
                tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
            }
            else
            {
                if (flags.usePKChar2)
                {
                    //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
                    tagsets = new string[] { "2", "3", "4" };
                }
                else
                {
                    throw new Exception("only support settings for CTB and PK now.");
                }
            }
        }
        else
        {
            //logger.info("Using Derived features");
            tagsets = new string[] { "2", "3", "4" };
        }
        if (taDetector == null)
        {
            // NOTE(review): lazy init without a lock — confirm single-threaded use.
            taDetector = new TagAffixDetector(flags);
        }
        foreach (string tagset in tagsets)
        {
            features.Add(taDetector.CheckDic(tagset + "p", charp) + taDetector.CheckDic(tagset + "i", charp) + taDetector.CheckDic(tagset + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tagset + "prep-sufc");
        }
    }
    // features.add("|ctbchar2"); // Added a constant feature several times!!
    /*
     * In error analysis, we found English words and numbers are often separated.
     * Rule 1: isNumber feature: check if the current and previous char is a number.
     * Rule 2: Disambiguation of time point and time duration.
     * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
     * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
     * Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it.
     */
    if (flags.useRule2)
    {
        /* Reduplication features */
        // previous character == current character
        if (charp.Equals(charc))
        {
            features.Add("11");
        }
        // previous character == next character
        if (charp.Equals(charc1))
        {
            features.Add("22");
        }
        // current character == next next character
        // fire only when usePk and useHk are both false.
        // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
        if (!flags.usePk && !flags.useHk)
        {
            if (charc.Equals(charc2))
            {
                features.Add("33");
            }
        }
        char cur1 = ' ';
        char cur2 = ' ';
        char cur = ' ';
        char pre = ' ';
        // actually their length must be either 0 or 1
        if (charc1.Length > 0)
        {
            cur1 = charc1[0];
        }
        if (charc2.Length > 0)
        {
            cur2 = charc2[0];
        }
        if (charc.Length > 0)
        {
            cur = charc[0];
        }
        if (charp.Length > 0)
        {
            pre = charp[0];
        }
        string prer = rcharp.ToString();
        // the radical of previous character
        // NOTE(review): patterns are recompiled on every call; they could be cached
        // in static fields if profiling shows this matters.
        Pattern E = Pattern.Compile("[a-zA-Z]");
        Pattern N = Pattern.Compile("[0-9]");
        Matcher m = E.Matcher(charp);
        Matcher ce = E.Matcher(charc);
        Matcher pe = E.Matcher(charp2);
        Matcher cn = N.Matcher(charc);
        Matcher pn = N.Matcher(charp2);
        // if current and previous characters are numbers...
        if (cur >= '0' && cur <= '9' && pre >= '0' && pre <= '9')
        {
            if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9')
            {
                //199x
                features.Add("YR");
            }
            else
            {
                features.Add("2N");
            }
        }
        else
        {
            // if current and previous characters are not both numbers
            // but previous char is a number
            // i.e. patterns like "1N" , "2A", etc
            if (pre >= '0' && pre <= '9')
            {
                features.Add("1N");
            }
            else
            {
                // if previous character is an English character
                if (m.Matches())
                {
                    features.Add("E");
                }
                else
                {
                    // if the previous character contains no radical (and it exist)
                    if (prer.Equals(".") && charp.Length == 1)
                    {
                        // fire only when usePk and useHk are both false. Not sure why. -pichuan
                        if (!flags.useHk && !flags.usePk)
                        {
                            if (ce.Matches())
                            {
                                features.Add("PU+E");
                            }
                            if (pe.Matches())
                            {
                                features.Add("E+PU");
                            }
                            if (cn.Matches())
                            {
                                features.Add("PU+N");
                            }
                            if (pn.Matches())
                            {
                                features.Add("N+PU");
                            }
                        }
                        features.Add("PU");
                    }
                }
            }
        }
        string engType = IsEnglish(charp, charc);
        string engPU = IsEngPU(charp);
        if (!engType.Equals(string.Empty))
        {
            features.Add(engType);
        }
        if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty))
        {
            features.Add(engPU + engType);
        }
    }
    //end of use rule
    // features using "Character.getType" information!
    string origS = c.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
    char origC = ' ';
    if (origS.Length > 0)
    {
        origC = origS[0];
    }
    int type = char.GetType(origC);
    switch (type)
    {
        case char.UppercaseLetter:
        case char.LowercaseLetter:
        {
            // A-Z and full-width A-Z
            // a-z and full-width a-z
            features.Add("CHARTYPE-LETTER");
            break;
        }

        case char.DecimalDigitNumber:
        {
            features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
            break;
        }

        case char.OtherLetter:
        {
            // mostly chinese chars
            features.Add("CHARTYPE-OTHER_LETTER");
            break;
        }

        default:
        {
            // other types
            features.Add("CHARTYPE-MISC");
            break;
        }
    }
    return features;
}