/// <summary>
/// Builds the character n-gram features of the "CnC" clique around position
/// <paramref name="loc"/>.
/// </summary>
/// <remarks>
/// The current, next and previous labels are always read from
/// <paramref name="cInfo"/>; features are only emitted when
/// <c>flags.useWordn</c> is set, otherwise the returned collection is empty.
/// </remarks>
/// <param name="cInfo">The padded sequence of labels being featurized.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were added.</returns>
public virtual ICollection<string> FeaturesCnC(PaddedList<IN> cInfo, int loc)
{
    ICollection<string> result = new List<string>();
    CoreLabel current = cInfo[loc];
    CoreLabel next = cInfo[loc + 1];
    CoreLabel previous = cInfo[loc - 1];
    string curChar = current.Get(typeof(CoreAnnotations.CharAnnotation));
    string nextChar = next.Get(typeof(CoreAnnotations.CharAnnotation));
    string prevChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

    if (!flags.useWordn)
    {
        return result;
    }

    // Unigrams and the previous/current bigram are always part of this feature set.
    result.Add(curChar + "c");
    result.Add(nextChar + "c1");
    result.Add(prevChar + "p");
    result.Add(prevChar + curChar + "pc");

    // Two extra bigrams are only added when one of these four flags is on.
    bool addExtraBigrams = flags.useAs || flags.useMsr || flags.usePk || flags.useHk;
    if (addExtraBigrams)
    {
        result.Add(curChar + nextChar + "cc1");
        result.Add(prevChar + nextChar + "pc1");
    }

    result.Add("|wordn");
    return result;
}
/// <summary>Extracts all the features from the input data at a certain index.</summary>
/// <param name="cInfo">The complete data set as a List of WordInfo</param>
/// <param name="loc">The index at which to extract features.</param>
/// <param name="clique">
/// The clique to extract features for. Only <c>cliqueC</c> and <c>cliqueCpC</c>
/// produce features; any other clique yields an empty collection.
/// </param>
/// <returns>The collected features for <paramref name="clique"/>.</returns>
public override ICollection<string> GetCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique)
{
    ICollection<string> collected = Generics.NewHashSet();

    if (clique == cliqueC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
        return collected;
    }

    if (clique == cliqueCpC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
        // The CnC features are computed one position to the left of loc.
        AddAllInterningAndSuffixing(collected, FeaturesCnC(cInfo, loc - 1), "CnC");
        return collected;
    }

    // Branches for larger cliques, currently commented out:
    // else if (clique == cliqueCpCp2C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2C(cInfo, loc), "CpCp2C");
    // } else if (clique == cliqueCpCp2Cp3C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
    // } else if (clique == cliqueCpCp2Cp3Cp4C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4C(cInfo, loc), "CpCp2Cp3Cp4C");
    // } else if (clique == cliqueCpCp2Cp3Cp4Cp5C) {
    //   addAllInterningAndSuffixing(features, featuresCpCp2Cp3Cp4Cp5C(cInfo, loc), "CpCp2Cp3Cp4Cp5C");
    // }
    return collected;
}
// end featuresCpC

/// <summary>
/// Builds the "CnC" clique features. For a CRF, this shouldn't be necessary,
/// since the features duplicate those from CpC, but Huihsin found some
/// valuable, presumably because it modified the regularization a bit.
/// </summary>
/// <remarks>
/// Nothing is read from <paramref name="cInfo"/> and an empty collection is
/// returned unless <c>flags.useWordn</c> is set.
/// </remarks>
/// <param name="cInfo">The list of characters</param>
/// <param name="loc">Position of c in list</param>
/// <returns>Collection of String features (sparse set of boolean features)</returns>
protected internal virtual ICollection<string> FeaturesCnC<_T0>(PaddedList<_T0> cInfo, int loc)
    where _T0 : CoreLabel
{
    ICollection<string> result = new List<string>();
    if (!flags.useWordn)
    {
        return result;
    }

    CoreLabel current = cInfo[loc];
    CoreLabel next = cInfo[loc + 1];
    CoreLabel previous = cInfo[loc - 1];
    CoreLabel beforePrevious = cInfo[loc - 2];

    string curChar = current.GetString<CoreAnnotations.CharAnnotation>();
    string nextChar = next.GetString<CoreAnnotations.CharAnnotation>();
    string prevChar = previous.GetString<CoreAnnotations.CharAnnotation>();
    string prev2Char = beforePrevious.GetString<CoreAnnotations.CharAnnotation>();

    // Unigrams over the window [loc - 2, loc + 1].
    result.Add(curChar + "c");
    result.Add(nextChar + "c2");
    result.Add(prevChar + "p");
    result.Add(prev2Char + "p2");

    // Bigrams; the skip-bigram uses a hyphen so the two halves stay distinguishable.
    result.Add(prev2Char + prevChar + "p2p");
    result.Add(prevChar + curChar + "pc");
    result.Add(curChar + nextChar + "cc2");
    result.Add(prevChar + "-" + nextChar + "pc2");

    result.Add("cliqueCnC");
    return result;
}
/// <summary>
/// Builds the single-position ("C" clique) features for the character at
/// <paramref name="loc"/>: a five-character window, per-character class
/// features, and token-level position/length features.
/// </summary>
/// <param name="cInfo">The padded sequence of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were added.</returns>
protected internal virtual ICollection<string> FeaturesC(PaddedList<In> cInfo, int loc)
{
    ICollection<string> features = new List<string>();
    CoreLabel c = cInfo[loc];
    CoreLabel n = cInfo[loc + 1];
    CoreLabel n2 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn = n.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

    // Default feature set...a 5 character window
    // plus a few other language-independent features
    features.Add(charc + "-c");
    features.Add(charn + "-n1");
    features.Add(charn2 + "-n2");
    features.Add(charp + "-p");
    features.Add(charp2 + "-p2");

    // Length feature
    if (charc.Length > 1)
    {
        features.Add("length");
    }

    // Character-level class features.
    // The loop is bounded by charc.Length: the previous bound, `limit`, was
    // never declared in this method.
    bool seenPunc = false;
    bool seenDigit = false;
    for (int i = 0; i < charc.Length; ++i)
    {
        char charcC = charc[i];
        seenPunc = seenPunc || Characters.IsPunctuation(charcC);
        seenDigit = seenDigit || char.IsDigit(charcC);
        string cuBlock = Characters.UnicodeBlockStringOf(charcC);
        features.Add(cuBlock + "-uBlock");
        // NOTE(review): char.GetType(charcC) looks like a mechanical port of a
        // "Unicode general category" lookup - confirm it resolves in this code base.
        string cuType = char.GetType(charcC).ToString();
        features.Add(cuType + "-uType");
    }
    if (seenPunc)
    {
        features.Add("haspunc");
    }
    if (seenDigit)
    {
        features.Add("hasdigit");
    }

    // Token-level features: capped distance from the start of the word, capped
    // number of characters remaining after this one, and capped word length.
    string word = c.Word();
    int index = c.Index();
    features.Add(Math.Min(MaxBefore, index) + "-before");
    features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
    features.Add(Math.Min(MaxLength, word.Length) + "-length");

    // Indicator transition feature
    features.Add("cliqueC");
    return features;
}
/// <summary>
/// Extends the base "C" clique features from a 5 character window to a
/// 7 character window by adding the characters three positions to either side.
/// </summary>
/// <param name="cInfo">The padded sequence of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The base features followed by the two additional window features.</returns>
protected internal override ICollection<string> FeaturesC(PaddedList<In> cInfo, int loc)
{
    ICollection<string> result = base.FeaturesC(cInfo, loc);

    CoreLabel thirdNext = cInfo[loc + 3];
    CoreLabel thirdPrevious = cInfo[loc - 3];
    string thirdNextChar = thirdNext.Get(typeof(CoreAnnotations.CharAnnotation));
    string thirdPreviousChar = thirdPrevious.Get(typeof(CoreAnnotations.CharAnnotation));

    result.Add(thirdNextChar + "-n3");
    result.Add(thirdPreviousChar + "-p3");
    return result;
}
/// <summary>
/// Builds the "CpC" clique features for position <paramref name="loc"/>: one
/// character bigram (current character followed by the previous one) plus the
/// clique indicator.
/// </summary>
/// <param name="cInfo">The padded sequence of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were added.</returns>
protected internal virtual ICollection<string> FeaturesCpC(PaddedList<In> cInfo, int loc)
{
    ICollection<string> result = new List<string>();

    CoreLabel current = cInfo[loc];
    CoreLabel previous = cInfo[loc - 1];
    string curChar = current.Get(typeof(CoreAnnotations.CharAnnotation));
    string prevChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

    // Note the order: the current character comes first in the feature string.
    string bigramFeature = curChar + prevChar + "-cngram";
    result.Add(bigramFeature);

    // Indicator transition feature
    result.Add("cliqueCpC");
    return result;
}
/// <summary>
/// Produces ten features ("feat0:" through "feat9:") for the word of every
/// label in <paramref name="info"/>.
/// </summary>
/// <remarks>
/// <paramref name="position"/> and <paramref name="clique"/> are not consulted:
/// the same set is returned for every position and clique.
/// </remarks>
/// <param name="info">The padded sequence of labels.</param>
/// <param name="position">Unused.</param>
/// <param name="clique">Unused.</param>
/// <returns>A set containing "feat" + i + ":" + word for i in 0..9 and each label.</returns>
public override ICollection<string> GetCliqueFeatures(PaddedList<CoreLabel> info, int position, Clique clique)
{
    ICollection<string> result = new HashSet<string>();
    foreach (CoreLabel label in info)
    {
        int n = 0;
        while (n < 10)
        {
            result.Add("feat" + n + ":" + label.Word());
            n++;
        }
    }
    return result;
}
//is EnglishPU

/// <summary>
/// Builds the "C" clique character n-gram features for position
/// <paramref name="loc"/>.
/// </summary>
/// <remarks>
/// Features are only emitted when <c>flags.useWord1</c> is set; otherwise the
/// returned collection is empty. The window covers two characters to the left
/// and two to the right of <paramref name="loc"/>.
/// </remarks>
/// <param name="cInfo">The padded sequence of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were added.</returns>
public virtual ICollection<string> FeaturesC(PaddedList<IN> cInfo, int loc)
{
    ICollection<string> features = new List<string>();
    CoreLabel c = cInfo[loc];
    CoreLabel c1 = cInfo[loc + 1];
    CoreLabel c2 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));
    string charc1 = c1.Get(typeof(CoreAnnotations.CharAnnotation));
    string charc2 = c2.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

    if (flags.useWord1)
    {
        // Unigrams.
        features.Add(charc + "::c");
        features.Add(charc1 + "::c1");
        features.Add(charp + "::p");
        features.Add(charp2 + "::p2");

        // trying to restore the features that Huihsin described in SIGHAN 2005 paper
        features.Add(charc + charc1 + "::cn");
        features.Add(charp + charc + "::pc");
        features.Add(charp + charc1 + "::pn");
        features.Add(charp2 + charp + "::p2p");
        features.Add(charp2 + charc + "::p2c");
        features.Add(charc2 + charc + "::n2c");

        features.Add("|word1");
    }
    return features;
}
/// <summary>Extracts all the features from the input data at a certain index.</summary>
/// <remarks>
/// When the label at <paramref name="loc"/> carries a non-null
/// <c>DomainAnnotation</c>, every feature is additionally emitted in a
/// domain-specific copy (feature + <c>DomainMarker</c> + domain).
/// </remarks>
/// <param name="cInfo">The complete data set as a List of WordInfo</param>
/// <param name="loc">The index at which to extract features.</param>
/// <param name="clique">
/// The clique to extract features for; one of <c>cliqueC</c>, <c>cliqueCpC</c>,
/// <c>cliqueCp2C</c> or <c>cliqueCp3C</c>. Other cliques produce no features.
/// </param>
/// <returns>The collected features, including any domain-specific copies.</returns>
public override ICollection<string> GetCliqueFeatures(PaddedList<In> cInfo, int loc, Clique clique)
{
    ICollection<string> collected = Generics.NewHashSet();

    if (clique == cliqueC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
    }
    else if (clique == cliqueCpC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
    }
    else if (clique == cliqueCp2C)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCp2C(cInfo, loc), "Cp2C");
    }
    else if (clique == cliqueCp3C)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCp3C(cInfo, loc), "Cp3C");
    }

    string domain = cInfo[loc].Get(typeof(CoreAnnotations.DomainAnnotation));
    if (domain == null)
    {
        return collected;
    }

    // Build the domain-specific copies in a separate set first: the main set
    // cannot be extended while it is being enumerated.
    ICollection<string> domainSpecific = Generics.NewHashSet();
    foreach (string plainFeature in collected)
    {
        domainSpecific.Add(plainFeature + DomainMarker + domain);
    }
    Sharpen.Collections.AddAll(collected, domainSpecific);
    return collected;
}
// end featuresCpCp2C

/// <summary>
/// Builds the features of the "CpCp2Cp3C" clique at position
/// <paramref name="loc"/>.
/// </summary>
/// <remarks>
/// Nothing is read from <paramref name="cInfo"/> and an empty collection is
/// returned unless <c>flags.use4Clique</c> is set and <c>flags.maxLeft</c> is
/// at least 3.
/// </remarks>
/// <param name="cInfo">The list of characters</param>
/// <param name="loc">Position of c in list</param>
/// <returns>Collection of String features, in the order they were added.</returns>
protected internal virtual ICollection<string> FeaturesCpCp2Cp3C<_T0>(PaddedList<_T0> cInfo, int loc)
    where _T0 : CoreLabel
{
    ICollection<string> result = new List<string>();
    if (!flags.use4Clique || flags.maxLeft < 3)
    {
        return result;
    }

    CoreLabel current = cInfo[loc];
    CoreLabel next = cInfo[loc + 1];
    CoreLabel prev = cInfo[loc - 1];
    CoreLabel prev2 = cInfo[loc - 2];
    CoreLabel prev3 = cInfo[loc - 3];

    string curChar = current.GetString<CoreAnnotations.CharAnnotation>();
    string prevChar = prev.GetString<CoreAnnotations.CharAnnotation>();
    string prev2Char = prev2.GetString<CoreAnnotations.CharAnnotation>();
    string prev3Char = prev3.GetString<CoreAnnotations.CharAnnotation>();

    // NOTE(review): an int is never null in C#, so the string.Empty fallbacks
    // below cannot trigger as written; presumably these were nullable boxed
    // integers before the port - confirm.
    int curType = current.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string curTypeText = (curType != null ? curType.ToString() : string.Empty);
    int nextType = next.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string nextTypeText = (nextType != null ? nextType.ToString() : string.Empty);
    int prevType = prev.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string prevTypeText = (prevType != null ? prevType.ToString() : string.Empty);
    int prev2Type = prev2.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string prev2TypeText = (prev2Type != null ? prev2Type.ToString() : string.Empty);
    int prev3Type = prev3.Get(typeof(CoreAnnotations.UTypeAnnotation));
    string prev3TypeText = (prev3Type != null ? prev3Type.ToString() : string.Empty);

    if (flags.useLongSequences)
    {
        result.Add(prev3Char + prev2Char + prevChar + curChar + "p3p2pc");
    }

    bool wantType5 = flags.useUnicodeType5gram;
    if (flags.useUnicodeType4gram || wantType5)
    {
        result.Add(string.Join("-", prev3TypeText, prev2TypeText, prevTypeText, curTypeText) + "-uType4");
    }
    if (wantType5)
    {
        result.Add(string.Join("-", prev3TypeText, prev2TypeText, prevTypeText, curTypeText, nextTypeText) + "-uType5");
    }

    result.Add("cliqueCpCp2Cp3C");
    return result;
}
/// <summary>
/// Adds a "wrapper" feature to the base "CpC" clique features: the identity of
/// the first two and last two characters of the current word.
/// </summary>
/// <remarks>
/// The wrapper feature is only considered for words longer than three
/// characters. It is emitted with a "-begin-wrap" suffix when the label's
/// index is 2, and with an "-end-wrap" suffix when the index is the last
/// position of the word.
/// </remarks>
/// <param name="cInfo">The padded sequence of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The base features followed by any wrapper features.</returns>
protected internal override ICollection<string> FeaturesCpC(PaddedList<IN> cInfo, int loc)
{
    ICollection<string> result = base.FeaturesCpC(cInfo, loc);
    CoreLabel current = cInfo[loc];

    // "Wrapper" feature: identity of first and last two chars of the current word.
    // This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive
    // pronouns if the word starts with al-.
    string word = current.Word();
    if (word.Length <= 3)
    {
        return result;
    }

    string head = Sharpen.Runtime.Substring(word, 0, 2);
    string tail = Sharpen.Runtime.Substring(word, word.Length - 2);
    string wrapper = head + "_" + tail;
    int lastPosition = word.Length - 1;

    if (current.Index() == 2)
    {
        result.Add(wrapper + "-begin-wrap");
    }
    if (current.Index() == lastPosition)
    {
        result.Add(wrapper + "-end-wrap");
    }
    return result;
}
/// <summary>
/// Builds the CRF datum for position <paramref name="loc"/>: one feature list
/// per window size in [0, windowSize) and one label index per window position.
/// </summary>
/// <remarks>
/// The shared <c>pad</c> label has its answer annotation set to
/// <c>flags.backgroundSymbol</c> before the input is wrapped in a padded list.
/// A clique is only featurized for the first window size that yields it.
/// </remarks>
/// <param name="info">The sequence of labels.</param>
/// <param name="loc">Position in <paramref name="info"/> the datum is built for.</param>
/// <param name="featureFactories">Factories queried, in order, for every clique.</param>
/// <returns>A datum holding the feature lists and the window's label indices.</returns>
public override CRFDatum<IList<string>, CRFLabel> MakeDatum(IList<IN> info, int loc, IList<FeatureFactory<IN>> featureFactories)
{
    pad.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);
    PaddedList<IN> padded = new PaddedList<IN>(info, pad);

    IList<IList<string>> featuresPerWindow = new List<IList<string>>();
    ICollection<Clique> seenCliques = Generics.NewHashSet();
    for (int window = 0; window < windowSize; window++)
    {
        IList<string> windowFeatures = new List<string>();

        // Keep only the cliques that no smaller window has produced yet.
        IList<Clique> newCliques = FeatureFactory.GetCliques(window, 0);
        newCliques.RemoveAll(seenCliques);
        Sharpen.Collections.AddAll(seenCliques, newCliques);

        foreach (Clique clique in newCliques)
        {
            foreach (FeatureFactory<IN> factory in featureFactories)
            {
                Sharpen.Collections.AddAll(windowFeatures, factory.GetCliqueFeatures(padded, loc, clique));
            }
        }

        // this feature is only present at test time and only appears
        // in cliques of size 1 (i.e., cliques with window=0)
        bool addBias = testTime && window == 0;
        if (addBias)
        {
            windowFeatures.Add(Bias);
        }
        featuresPerWindow.Add(windowFeatures);
    }

    // Labels of the windowSize positions ending at loc, mapped through classIndex.
    int[] labelIds = new int[windowSize];
    int windowStart = loc - windowSize + 1;
    for (int offset = 0; offset < windowSize; offset++)
    {
        string answer = padded[windowStart + offset].Get(typeof(CoreAnnotations.AnswerAnnotation));
        labelIds[offset] = classIndex.IndexOf(answer);
    }

    return new CRFDatum<IList<string>, CRFLabel>(featuresPerWindow, new CRFLabel(labelIds), null);
}
/// <summary>Extracts all the features from the input data at a certain index.</summary>
/// <param name="cInfo">The complete data set as a List of WordInfo</param>
/// <param name="loc">The index at which to extract features.</param>
/// <param name="clique">
/// The clique to extract features for; one of <c>cliqueC</c>, <c>cliqueCpC</c>,
/// <c>cliqueCpCp2C</c> or <c>cliqueCpCp2Cp3C</c>. Other cliques produce no features.
/// </param>
/// <returns>The collected features for <paramref name="clique"/>.</returns>
public override ICollection<string> GetCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique)
{
    ICollection<string> collected = Generics.NewHashSet();

    if (clique == cliqueC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
    }
    else if (clique == cliqueCpC)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
        // The CnC features are computed one position to the left of loc.
        AddAllInterningAndSuffixing(collected, FeaturesCnC(cInfo, loc - 1), "CnC");
    }
    else if (clique == cliqueCpCp2C)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCpCp2C(cInfo, loc), "CpCp2C");
    }
    else if (clique == cliqueCpCp2Cp3C)
    {
        AddAllInterningAndSuffixing(collected, FeaturesCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
    }

    // Optional trace of the extracted features, written as UTF-8 to the error stream.
    if (Debug > 0)
    {
        EncodingPrintWriter.Err.Println("For " + cInfo[loc] + ", features: " + collected, "UTF-8");
    }
    return collected;
}
// (non-Javadoc)
// @see edu.stanford.nlp.sequences.FeatureFactory#getCliqueFeatures(edu.stanford.nlp.util.PaddedList, int, edu.stanford.nlp.sequences.Clique)

/// <summary>
/// Unimplemented placeholder: ignores all of its arguments and returns
/// <c>null</c>.
/// </summary>
/// <param name="info">Unused.</param>
/// <param name="position">Unused.</param>
/// <param name="clique">Unused.</param>
/// <returns>Always <c>null</c>.</returns>
public override ICollection GetCliqueFeatures(PaddedList info, int position, Clique clique)
{
    // TODO Auto-generated method stub
    return null;
}
//end of CnC

/// <summary>Second order clique features</summary>
/// <remarks>
/// Emits, depending on the enabled flags, character n-grams
/// (<c>flags.useWord3</c>), shape-string features (<c>flags.useShapeStrings</c>
/// and its sub-flags) and radical features (<c>flags.useRad2</c>,
/// <c>flags.useRad2b</c>), followed by the clique indicator, which is always added.
/// </remarks>
/// <param name="cInfo">The list of characters</param>
/// <param name="loc">Position of c in list</param>
/// <returns>Collection of String features (sparse set of boolean features)</returns>
protected internal virtual ICollection<string> FeaturesCpCp2C<_T0>(PaddedList<_T0> cInfo, int loc)
    where _T0 : CoreLabel
{
    ICollection<string> result = new List<string>();

    CoreLabel cur = cInfo[loc];
    CoreLabel next = cInfo[loc + 1];
    CoreLabel next2 = cInfo[loc + 2];
    CoreLabel prev = cInfo[loc - 1];
    CoreLabel prev2 = cInfo[loc - 2];
    CoreLabel prev3 = cInfo[loc - 3];

    string curChar = cur.GetString<CoreAnnotations.CharAnnotation>();
    string nextChar = next.GetString<CoreAnnotations.CharAnnotation>();
    string next2Char = next2.GetString<CoreAnnotations.CharAnnotation>();
    string prevChar = prev.GetString<CoreAnnotations.CharAnnotation>();
    string prev2Char = prev2.GetString<CoreAnnotations.CharAnnotation>();
    string prev3Char = prev3.GetString<CoreAnnotations.CharAnnotation>();

    // N-gram features. N is up to 3
    if (flags.useWord3)
    {
        result.Add(curChar + "::c");
        result.Add(nextChar + "::n");
        result.Add(prevChar + "::p");
        result.Add(prev2Char + "::p2");
        // trying to restore the features that Huihsin described in SIGHAN 2005 paper
        result.Add(curChar + nextChar + "::cn");
        result.Add(curChar + nextChar + next2Char + "::cnn2");
        result.Add(prevChar + curChar + "::pc");
        result.Add(prevChar + nextChar + "::pn");
        result.Add(prev2Char + prevChar + "::p2p");
        result.Add(prev3Char + prev2Char + prevChar + "::p3p2p");
        result.Add(prev2Char + curChar + "::p2c");
        result.Add(curChar + next2Char + "::cn2");
    }

    // Shape features: every sub-flag is only honored when useShapeStrings is on.
    if (flags.useShapeStrings)
    {
        if (flags.useShapeStrings1)
        {
            result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>() + "ps");
            result.Add(cur.GetString<CoreAnnotations.ShapeAnnotation>() + "cs");
            result.Add(next.GetString<CoreAnnotations.ShapeAnnotation>() + "c2s");
        }
        if (flags.useShapeStrings3)
        {
            result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>()
                + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                + next.GetString<CoreAnnotations.ShapeAnnotation>()
                + "pscsc2s");
        }
        if (flags.useShapeStrings4)
        {
            result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                + next.GetString<CoreAnnotations.ShapeAnnotation>()
                + "p2spscsc2s");
        }
        if (flags.useShapeStrings5)
        {
            result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                + next.GetString<CoreAnnotations.ShapeAnnotation>()
                + next2.GetString<CoreAnnotations.ShapeAnnotation>()
                + "p2spscsc2sc3s");
        }
        if (flags.useWordShapeConjunctions2)
        {
            result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>() + curChar + "pscc");
            result.Add(prevChar + cur.GetString<CoreAnnotations.ShapeAnnotation>() + "pccs");
        }
        if (flags.useWordShapeConjunctions3)
        {
            result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                + curChar
                + "p2spscc");
            result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>()
                + curChar
                + next.GetString<CoreAnnotations.ShapeAnnotation>()
                + "psccc2s");
            result.Add(curChar
                + next.GetString<CoreAnnotations.ShapeAnnotation>()
                + next2.GetString<CoreAnnotations.ShapeAnnotation>()
                + "ccc2sc3s");
        }
    }

    /*
     * Radical N-gram features. N is upto 4.
     * Smoothing method of N-gram, because there are too many characters in Chinese.
     * (It works better than N-gram when they are used individually. less sparse)
     */
    // An empty character string is represented by the placeholder radical 'n'.
    char curRad = curChar.Length == 0 ? 'n' : RadicalMap.GetRadical(curChar[0]);
    char nextRad = nextChar.Length == 0 ? 'n' : RadicalMap.GetRadical(nextChar[0]);
    char prevRad = prevChar.Length == 0 ? 'n' : RadicalMap.GetRadical(prevChar[0]);
    char prev2Rad = prev2Char.Length == 0 ? 'n' : RadicalMap.GetRadical(prev2Char[0]);

    // Note: where two or three radicals are combined below, char + char is
    // integer addition, so those features start with the numeric sum of the
    // radicals' code points rather than with the radicals themselves.
    if (flags.useRad2)
    {
        result.Add(curRad + "rc");
        result.Add(nextRad + "rc2");
        result.Add(prevRad + "rp");
        result.Add(prevRad + curRad + "rprc");
        result.Add(curRad + nextRad + "rcrc2");
        result.Add(prevRad + curRad + nextRad + "rprcrc2");
    }
    if (flags.useRad2b)
    {
        result.Add(curRad + "rc");
        result.Add(nextRad + "rc2");
        result.Add(prevRad + "rp");
        result.Add(prevRad + curRad + "rprc");
        result.Add(curRad + nextRad + "rcrc2");
        result.Add(prev2Rad + prevRad + "rp2rp");
    }

    result.Add("cliqueCpCp2C");
    return result;
}
// static methods

/// <summary>
/// This can be used to map from any IOB-style (i.e., "I-PERS" style labels)
/// or just categories representation to any other.
/// </summary>
/// <remarks>
/// It can read and change any representation to other representations:
/// a 4 way representation of all entities, like S-PERS, B-PERS,
/// I-PERS, E-PERS for single word, beginning, internal, and end of entity
/// (IOBES or SBIEO); always marking the first word of an entity (IOB2 or BIO);
/// only marking specially the beginning of non-first
/// items of an entity sequences with B-PERS (IOB1);
/// the reverse IOE1 and IOE2; IO where everything is I-tagged; and
/// NOPREFIX, where no prefixes are written on category labels.
/// The last two representations are deficient in not allowing adjacent
/// entities of the same class to be represented, but nevertheless
/// convenient. Note that the background label is never given a prefix.
/// This code is very specific to the particular CoNLL way of labeling
/// classes for IOB-style encoding, but this notation is quite widespread.
/// It will work on any of these styles of input.
/// This will also recognize BILOU format (B=B, I=I, L=E, O=O, U=S).
/// It also works with lowercased names like i-org.
/// If the labels are not of the form "C-Y+", where C is a single character,
/// then they will be regarded as NOPREFIX labels.
/// This method updates the List tokens in place.
/// NOTE(review): labels are read through <paramref name="key"/> but the new
/// labels are always written to <c>CoreAnnotations.AnswerAnnotation</c> -
/// confirm that this asymmetry is intended.
/// </remarks>
/// <param name="tokens">List of tokens (each a CoreLabel) in some style</param>
/// <param name="key">The key in the CoreLabel to change, commonly CoreAnnotations.AnswerAnnotation.class</param>
/// <param name="backgroundLabel">The background label, which gets special treatment</param>
/// <param name="style">Output style; one of iob[12], bio, ioe[12], io, sbieo/iobes, noprefix, bilou</param>
/// <param name="intern">Whether to String-intern the new labels (may as well, small number!)</param>
/// <exception cref="ArgumentException">If <paramref name="style"/> is not one of the known styles.</exception>
public static void EntitySubclassify <Tok>(IList <TOK> tokens, Type key, string backgroundLabel, string style, bool intern) where Tok : ICoreMap
{
    // Translate the (case-insensitive) style name into an internal code.
    int how;
    string lowerStyle = style.ToLower(Locale.English);
    switch (lowerStyle)
    {
        case "iob1":
        {
            how = 0;
            break;
        }

        case "iob2":
        case "bio":
        {
            how = 1;
            break;
        }

        case "ioe1":
        {
            how = 2;
            break;
        }

        case "ioe2":
        {
            how = 3;
            break;
        }

        case "io":
        {
            how = 4;
            break;
        }

        case "sbieo":
        case "iobes":
        {
            how = 5;
            break;
        }

        case "noprefix":
        {
            how = 6;
            break;
        }

        case "bilou":
        {
            how = 7;
            break;
        }

        default:
        {
            throw new ArgumentException("entitySubclassify: unknown style: " + style);
        }
    }

    // The padded view lets the loop look at i - 1 and i + 1 at both ends of the list.
    IList <TOK> paddedTokens = new PaddedList <TOK>(tokens, (TOK) new CoreLabel());
    int size = paddedTokens.Count;
    string[] newAnswers = new string[size];
    for (int i = 0; i < size; i++)
    {
        TOK c = paddedTokens[i];
        TOK p = paddedTokens[i - 1];
        TOK n = paddedTokens[i + 1];
        string cAns = c.Get(key);
        // Neighbors without a label are treated as background.
        string pAns = p.Get(key);
        if (pAns == null)
        {
            pAns = backgroundLabel;
        }
        string nAns = n.Get(key);
        if (nAns == null)
        {
            nAns = backgroundLabel;
        }

        // Split each label of the form "X-BASE" into its upper-cased prefix
        // character and its base; anything else is a prefix-less base.
        string @base;
        char prefix;
        if (cAns.Length > 2 && cAns[1] == '-')
        {
            @base = Sharpen.Runtime.Substring(cAns, 2, cAns.Length);
            prefix = char.ToUpperCase(cAns[0]);
        }
        else
        {
            @base = cAns;
            prefix = ' ';
        }
        string pBase;
        char pPrefix;
        if (pAns.Length > 2 && pAns[1] == '-')
        {
            pBase = Sharpen.Runtime.Substring(pAns, 2, pAns.Length);
            pPrefix = char.ToUpperCase(pAns[0]);
        }
        else
        {
            pBase = pAns;
            pPrefix = ' ';
        }
        string nBase;
        char nPrefix;
        if (nAns.Length > 2 && nAns[1] == '-')
        {
            nBase = Sharpen.Runtime.Substring(nAns, 2, nAns.Length);
            nPrefix = char.ToUpperCase(nAns[0]);
        }
        else
        {
            nBase = nAns;
            nPrefix = ' ';
        }

        bool isStartAdjacentSame = IsSameEntityBoundary(pBase, pPrefix, @base, prefix);
        bool isEndAdjacentSame = IsSameEntityBoundary(@base, prefix, nBase, nPrefix);
        bool isFirst = IsDifferentEntityBoundary(pBase, @base) || isStartAdjacentSame;
        bool isLast = IsDifferentEntityBoundary(@base, nBase) || isEndAdjacentSame;

        // The background label is never given a prefix.
        string newAnswer = @base;
        if (!@base.Equals(backgroundLabel))
        {
            switch (how)
            {
                case 0:
                {
                    // iob1, only B if adjacent
                    newAnswer = (isStartAdjacentSame ? "B-" : "I-") + @base;
                    break;
                }

                case 1:
                {
                    // iob2 always B at start
                    newAnswer = (isFirst ? "B-" : "I-") + @base;
                    break;
                }

                case 2:
                {
                    // ioe1
                    newAnswer = (isEndAdjacentSame ? "E-" : "I-") + @base;
                    break;
                }

                case 3:
                {
                    // ioe2
                    newAnswer = (isLast ? "E-" : "I-") + @base;
                    break;
                }

                case 4:
                {
                    // io
                    newAnswer = "I-" + @base;
                    break;
                }

                case 5:
                {
                    // sbieo / iobes
                    if (isFirst && isLast)
                    {
                        newAnswer = "S-" + @base;
                    }
                    else if (isLast)
                    {
                        newAnswer = "E-" + @base;
                    }
                    else if (isFirst)
                    {
                        newAnswer = "B-" + @base;
                    }
                    else
                    {
                        newAnswer = "I-" + @base;
                    }
                    break;
                }

                // nothing to do on case 6 as it's just base

                case 7:
                {
                    // bilou
                    if (isFirst && isLast)
                    {
                        newAnswer = "U-" + @base;
                    }
                    else if (isLast)
                    {
                        newAnswer = "L-" + @base;
                    }
                    else if (isFirst)
                    {
                        newAnswer = "B-" + @base;
                    }
                    else
                    {
                        newAnswer = "I-" + @base;
                    }
                    break;
                }
            }
        }
        if (intern)
        {
            newAnswer = string.Intern(newAnswer);
        }
        newAnswers[i] = newAnswer;
    }

    // Write back only after every label has been computed, so the neighbor
    // lookups above always see the original labels.
    for (int i_1 = 0; i_1 < size; i_1++)
    {
        TOK c = tokens[i_1];
        c.Set(typeof(CoreAnnotations.AnswerAnnotation), newAnswers[i_1]);
    }
}
protected internal virtual ICollection <string> FeaturesCpC <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel { ICollection <string> features = new List <string>(); CoreLabel c = cInfo[loc]; CoreLabel c2 = cInfo[loc + 1]; CoreLabel c3 = cInfo[loc + 2]; CoreLabel p = cInfo[loc - 1]; CoreLabel p2 = cInfo[loc - 2]; CoreLabel p3 = cInfo[loc - 3]; string charc = c.GetString <CoreAnnotations.CharAnnotation>(); string charc2 = c2.GetString <CoreAnnotations.CharAnnotation>(); string charc3 = c3.GetString <CoreAnnotations.CharAnnotation>(); string charp = p.GetString <CoreAnnotations.CharAnnotation>(); string charp2 = p2.GetString <CoreAnnotations.CharAnnotation>(); string charp3 = p3.GetString <CoreAnnotations.CharAnnotation>(); int cI = c.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec = (cI != null ? cI.ToString() : string.Empty); int c2I = c2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty); int c3I = c3.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypec3 = (c3I != null ? c3I.ToString() : string.Empty); int pI = p.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep = (pI != null ? pI.ToString() : string.Empty); int p2I = p2.Get(typeof(CoreAnnotations.UTypeAnnotation)); string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty); if (flags.dictionary != null || flags.serializedDictionary != null) { DictionaryFeaturesCpC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, features, p2, p, c, c2); } if (flags.dictionary2 != null) { DictionaryFeaturesCpC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", features, p2, p, c, c2); } /* * N-gram features. N is upto 2. 
*/ if (flags.useWord2) { // features.add(charc +"c"); // features.add(charc2+"c2"); // features.add(charp +"p"); // features.add(charp + charc +"pc"); // features.add(charc + charc2 +"cc2"); // // cdm: need hyphen so you can see which of charp or charc2 is null.... // features.add(charp + "-" + charc2 + "pc2"); features.Add(charc + "::c"); features.Add(charc2 + "::c1"); features.Add(charp + "::p"); features.Add(charp2 + "::p2"); // trying to restore the features that Huihsin described in SIGHAN 2005 paper features.Add(charc + charc2 + "::cn"); // (*) features.Add(charp + charc + "::pc"); features.Add(charp + charc2 + "::pn"); features.Add(charp2 + charp + "::p2p"); features.Add(charp2 + charc + "::p2c"); features.Add(charc2 + charc + "::n2c"); } // todo: this is messed up: Same as one above at (*); should be cn2 = charc + charc3 + "::cn2" if (flags.useFeaturesCpC4gram || flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) { // todo: Both these features duplicate ones already in useWord2 features.Add(charp2 + charp + "p2p"); features.Add(charp2 + "p2"); } if (flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram) { features.Add(charc3 + "c3"); features.Add(charc2 + charc3 + "c2c3"); } if (flags.useFeaturesCpC6gram) { features.Add(charp3 + "p3"); features.Add(charp3 + charp2 + "p3p2"); } if (flags.useGoodForNamesCpC) { // these 2 features should be distinctively good at biasing from // picking up a Chinese family name in the p2 or p3 positions: // familyName X X startWord AND familyName X startWord // But actually they seem to have negative value. 
features.Add(charp2 + "p2"); features.Add(charp3 + "p3"); } if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3"); } if (flags.useUnicodeType4gram || flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4"); } if (flags.useUnicodeType5gram) { features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5"); } if (flags.useWordUTypeConjunctions2) { features.Add(uTypep + charc + "putcc"); features.Add(charp + uTypec + "pccut"); } if (flags.useWordUTypeConjunctions3) { features.Add(uTypep2 + uTypep + charc + "p2utputcc"); features.Add(uTypep + charc + uTypec2 + "putccc2ut"); features.Add(charc + uTypec2 + uTypec3 + "ccc2utc3ut"); } if (flags.useUnicodeBlock) { features.Add(p.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c2.GetString <CoreAnnotations.UBlockAnnotation>() + "-uBlock"); } if (flags.useShapeStrings) { if (flags.useShapeStrings1) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + "ps"); features.Add(c.GetString <CoreAnnotations.ShapeAnnotation>() + "cs"); features.Add(c2.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s"); } if (flags.useShapeStrings3) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "pscsc2s"); } if (flags.useShapeStrings4) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s"); } if (flags.useShapeStrings5) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString 
<CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation >() + "p2spscsc2sc3s"); } if (flags.useWordShapeConjunctions2) { features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "pscc"); features.Add(charp + c.GetString <CoreAnnotations.ShapeAnnotation>() + "pccs"); } if (flags.useWordShapeConjunctions3) { features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "p2spscc"); features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "psccc2s"); features.Add(charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation>() + "ccc2sc3s"); } } /* * Radical N-gram features. N is upto 4. * Smoothing method of N-gram, because there are too many characters in Chinese. * (It works better than N-gram when they are used individually. less sparse) */ char rcharc; char rcharc2; char rcharp; char rcharp2; if (charc.Length == 0) { rcharc = 'n'; } else { rcharc = RadicalMap.GetRadical(charc[0]); } if (charc2.Length == 0) { rcharc2 = 'n'; } else { rcharc2 = RadicalMap.GetRadical(charc2[0]); } if (charp.Length == 0) { rcharp = 'n'; } else { rcharp = RadicalMap.GetRadical(charp[0]); } if (charp2.Length == 0) { rcharp2 = 'n'; } else { rcharp2 = RadicalMap.GetRadical(charp2[0]); } if (flags.useRad2) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp + rcharc + rcharc2 + "rprcrc2"); } if (flags.useRad2b) { features.Add(rcharc + "rc"); features.Add(rcharc2 + "rc2"); features.Add(rcharp + "rp"); features.Add(rcharp + rcharc + "rprc"); features.Add(rcharc + rcharc2 + "rcrc2"); features.Add(rcharp2 + rcharp + "rp2rp"); } /* Non-word dictionary: SEEN bi-gram marked as non-word. * This is frickin' useful. I hadn't realized. CDM Oct 2007. 
*/ if (flags.useDict2) { NonDict2 nd = new NonDict2(flags); features.Add(nd.CheckDic(charp + charc, flags) + "nondict"); } if (flags.useOutDict2) { if (outDict == null) { CreateOutDict(); } features.Add(outDict.GetW(charp + charc) + "outdict"); // -1 0 features.Add(outDict.GetW(charc + charc2) + "outdict"); // 0 1 features.Add(outDict.GetW(charp2 + charp) + "outdict"); // -2 -1 features.Add(outDict.GetW(charp2 + charp + charc) + "outdict"); // -2 -1 0 features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict"); // -3 -2 -1 features.Add(outDict.GetW(charp + charc + charc2) + "outdict"); // -1 0 1 features.Add(outDict.GetW(charc + charc2 + charc3) + "outdict"); // 0 1 2 features.Add(outDict.GetW(charp + charc + charc2 + charc3) + "outdict"); } // -1 0 1 2 /* * (CTB/ASBC/HK/PK/MSR) POS information of each characters. * If a character falls into some function categories, * it is very likely there is a boundary. * A lot of Chinese function words belong to single characters. * This feature is also good for numbers and punctuations. * DE* are grouped into DE. 
*/ if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2) { string[] tagsets; // the "useChPos" now only works for CTB and PK if (flags.useChPos) { if (flags.useCTBChar2) { tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" }; } else { if (flags.usePKChar2) { //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"}; tagsets = new string[] { "2", "3", "4" }; } else { throw new Exception("only support settings for CTB and PK now."); } } } else { //logger.info("Using Derived features"); tagsets = new string[] { "2", "3", "4" }; } if (taDetector == null) { CreateTADetector(); } foreach (string tag in tagsets) { features.Add(taDetector.CheckDic(tag + "p", charp) + taDetector.CheckDic(tag + "i", charp) + taDetector.CheckDic(tag + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tag + "prep-sufc"); } } //features.add("|ctbchar2"); /* * In error analysis, we found English words and numbers are often separated. * Rule 1: isNumber feature: check if the current and previous char is a number. * Rule 2: Disambiguation of time point and time duration. * Rule 3: isEnglish feature: check if the current and previous character is an english letter. * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names. * Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it. 
*/ if (flags.useRule2) { /* Reduplication features */ // previous character == current character if (charp.Equals(charc)) { features.Add("11-R2"); } // previous character == next character if (charp.Equals(charc2)) { features.Add("22-R2"); } // current character == next next character // fire only when usePk and useHk are both false. // Notice: this should be (almost) the same as the "22" feature, but we keep it for now. if (!flags.usePk && !flags.useHk) { if (charc.Equals(charc2)) { features.Add("33-R2"); } } char cur1 = ' '; char cur2 = ' '; char cur = ' '; char pre = ' '; // actually their length must be either 0 or 1 if (charc2.Length > 0) { cur1 = charc2[0]; } if (charc3.Length > 0) { cur2 = charc3[0]; } if (charc.Length > 0) { cur = charc[0]; } if (charp.Length > 0) { pre = charp[0]; } string prer = rcharp.ToString(); // the radical of previous character Pattern E = Pattern.Compile("[a-zA-Z]"); Pattern N = Pattern.Compile("[0-9]"); Matcher m = E.Matcher(charp); Matcher ce = E.Matcher(charc); Matcher pe = E.Matcher(charp2); Matcher cn = N.Matcher(charc); Matcher pn = N.Matcher(charp2); // if current and previous characters are numbers... if (cur >= '0' && cur <= '9' && pre >= '0' && pre <= '9') { if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9') { //199x features.Add("YR-R2"); } else { features.Add("2N-R2"); } } else { // if current and previous characters are not both numbers // but previous char is a number // i.e. 
patterns like "1N" , "2A", etc if (pre >= '0' && pre <= '9') { features.Add("1N-R2"); } else { // if previous character is an English character if (m.Matches()) { features.Add("E-R2"); } else { // if the previous character contains no radical (and it exist) if (prer.Equals(".") && charp.Length == 1) { if (ce.Matches()) { features.Add("PU+E-R2"); } if (pe.Matches()) { features.Add("E+PU-R2"); } if (cn.Matches()) { features.Add("PU+N-R2"); } if (pn.Matches()) { features.Add("N+PU-R2"); } features.Add("PU-R2"); } } } } string engType = IsEnglish(charp, charc); string engPU = IsEngPU(charp); if (!engType.Equals(string.Empty)) { features.Add(engType); } if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty)) { StringBuilder sb = new StringBuilder(); sb.Append(engPU).Append(engType).Append("R2"); features.Add(sb.ToString()); } } //end of use rule // features using "Character.getType" information! string origS = c.GetString <CoreAnnotations.OriginalCharAnnotation>(); char origC = ' '; if (origS.Length > 0) { origC = origS[0]; } int type = char.GetType(origC); switch (type) { case char.UppercaseLetter: case char.LowercaseLetter: { // A-Z and full-width A-Z // a-z and full-width a-z features.Add("CHARTYPE-LETTER"); break; } case char.DecimalDigitNumber: { features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER"); break; } case char.OtherLetter: { // mostly chinese chars features.Add("CHARTYPE-OTHER_LETTER"); break; } default: { // other types features.Add("CHARTYPE-MISC"); break; } } features.Add("cliqueCpC"); return(features); }
/// <summary>
/// Builds the string features for the clique made of the current character only.
/// Each group of features is emitted only when its corresponding option in
/// <c>flags</c> is enabled; the constant feature <c>"cliqueC"</c> is always added.
/// </summary>
/// <param name="cInfo">The padded list of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were generated.</returns>
protected internal virtual ICollection <string> FeaturesC <_T0>(PaddedList <_T0> cInfo, int loc) where _T0 : CoreLabel
{
    ICollection<string> features = new List<string>();

    // Window of labels around the current position: three back, two forward.
    CoreLabel c = cInfo[loc];
    CoreLabel c2 = cInfo[loc + 1];
    CoreLabel c3 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    CoreLabel p3 = cInfo[loc - 3];

    string charc = c.GetString<CoreAnnotations.CharAnnotation>();
    string charc2 = c2.GetString<CoreAnnotations.CharAnnotation>();
    string charc3 = c3.GetString<CoreAnnotations.CharAnnotation>();
    string charp = p.GetString<CoreAnnotations.CharAnnotation>();
    string charp2 = p2.GetString<CoreAnnotations.CharAnnotation>();
    string charp3 = p3.GetString<CoreAnnotations.CharAnnotation>();

    // A missing UType annotation maps to the empty string (see UTypeOrEmpty).
    string uTypec = UTypeOrEmpty(c);
    string uTypec2 = UTypeOrEmpty(c2);
    string uTypec3 = UTypeOrEmpty(c3);
    string uTypep = UTypeOrEmpty(p);
    string uTypep2 = UTypeOrEmpty(p2);

    /* N-gram features. N is upto 2. */
    if (flags.useWord1)
    {
        features.Add(charc + "::c");
        features.Add(charc2 + "::c2");
        features.Add(charp + "::p");
        features.Add(charp2 + "::p2");
        // trying to restore the features that Huishin described in SIGHAN 2005 paper
        features.Add(charc + charc2 + "::cn");
        features.Add(charc + charc3 + "::cn2");
        features.Add(charp + charc + "::pc");
        features.Add(charp + charc2 + "::pn");
        features.Add(charp2 + charp + "::p2p");
        features.Add(charp2 + charc + "::p2c");
        features.Add(charc2 + charc + "::n2c");
    }

    // Dictionary lookups are delegated to DictionaryFeaturesC, once per configured dictionary.
    if (flags.dictionary != null || flags.serializedDictionary != null)
    {
        DictionaryFeaturesC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, features, p, c, c2);
    }
    if (flags.dictionary2 != null)
    {
        DictionaryFeaturesC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", features, p, c, c2);
    }

    // Wider character context: each larger n-gram option also enables the smaller ones.
    if (flags.useFeaturesC4gram || flags.useFeaturesC5gram || flags.useFeaturesC6gram)
    {
        features.Add(charp2 + charp + "p2p");
        features.Add(charp2 + "p2");
    }
    if (flags.useFeaturesC5gram || flags.useFeaturesC6gram)
    {
        features.Add(charc3 + "c3");
        features.Add(charc2 + charc3 + "c2c3");
    }
    if (flags.useFeaturesC6gram)
    {
        features.Add(charp3 + "p3");
        features.Add(charp3 + charp2 + "p3p2");
    }

    // Unicode type n-grams; again the larger options imply the smaller ones.
    if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram)
    {
        features.Add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3");
    }
    if (flags.useUnicodeType4gram || flags.useUnicodeType5gram)
    {
        features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4");
    }
    if (flags.useUnicodeType5gram)
    {
        features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5");
    }

    if (flags.useUnicodeBlock)
    {
        features.Add(p.GetString<CoreAnnotations.UBlockAnnotation>() + "-" + c.GetString<CoreAnnotations.UBlockAnnotation>() + "-" + c2.GetString<CoreAnnotations.UBlockAnnotation>() + "-uBlock");
    }

    // Shape-string features over windows of 1, 3, 4 and 5 characters.
    if (flags.useShapeStrings)
    {
        if (flags.useShapeStrings1)
        {
            features.Add(p.GetString<CoreAnnotations.ShapeAnnotation>() + "ps");
            features.Add(c.GetString<CoreAnnotations.ShapeAnnotation>() + "cs");
            features.Add(c2.GetString<CoreAnnotations.ShapeAnnotation>() + "c2s");
        }
        if (flags.useShapeStrings3)
        {
            features.Add(p.GetString<CoreAnnotations.ShapeAnnotation>() + c.GetString<CoreAnnotations.ShapeAnnotation>() + c2.GetString<CoreAnnotations.ShapeAnnotation>() + "pscsc2s");
        }
        if (flags.useShapeStrings4)
        {
            features.Add(p2.GetString<CoreAnnotations.ShapeAnnotation>() + p.GetString<CoreAnnotations.ShapeAnnotation>() + c.GetString<CoreAnnotations.ShapeAnnotation>() + c2.GetString<CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s");
        }
        if (flags.useShapeStrings5)
        {
            features.Add(p2.GetString<CoreAnnotations.ShapeAnnotation>() + p.GetString<CoreAnnotations.ShapeAnnotation>() + c.GetString<CoreAnnotations.ShapeAnnotation>() + c2.GetString<CoreAnnotations.ShapeAnnotation>() + c3.GetString<CoreAnnotations.ShapeAnnotation>() + "p2spscsc2sc3s");
        }
    }

    features.Add("cliqueC");
    return features;
}

/// <summary>
/// Returns the label's <c>UTypeAnnotation</c> value as a string, or the empty
/// string when the annotation is not set.
/// </summary>
/// <remarks>
/// The value is held in a nullable <c>int?</c>: with a plain <c>int</c> the
/// null test is always true and the empty-string fallback can never be taken.
/// </remarks>
private static string UTypeOrEmpty(CoreLabel label)
{
    int? uType = label.Get(typeof(CoreAnnotations.UTypeAnnotation));
    return uType.HasValue ? uType.Value.ToString() : string.Empty;
}
// All the tags we need
// Patterns we need
// In theory 块 钱 should be separated by segmenter, but just in case segmenter fails
// TODO(yuhao): Need to add support for 块 钱, 毛 钱, 角 钱, 角, 五 块 二
// This only works when POS = NT
// This is used to capture a special case of date in Chinese: 70 后 or 七零 后
// order it by number of characters DESC for handy one-by-one matching of string suffix
/// <summary>Use a set of heuristic rules to assign NER tags to tokens.</summary>
/// <remarks>
/// Every token first receives the background symbol from <c>flags</c>; the rules
/// below then overwrite it, based on the token's part-of-speech tag, its word
/// form, and its immediate neighbours.
/// </remarks>
/// <param name="document">The tokens to label. Each token's answer annotation is overwritten.</param>
/// <returns>The same <paramref name="document"/> instance that was passed in.</returns>
public override IList<CoreLabel> Classify(IList<CoreLabel> document)
{
    // The actual implementation of the classifier
    PaddedList<CoreLabel> pl = new PaddedList<CoreLabel>(document, pad);
    // The loop bound was not declared anywhere in this method; take it from the input list.
    int sz = document.Count;
    for (int i = 0; i < sz; i++)
    {
        CoreLabel me = pl[i];
        CoreLabel prev = pl[i - 1];
        CoreLabel next = pl[i + 1];

        // by default set to be "O"
        me.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);

        string pos = me.GetString<CoreAnnotations.PartOfSpeechAnnotation>();
        if (pos.Equals("OD"))
        {
            // If current word is OD, label it as ORDINAL
            me.Set(typeof(CoreAnnotations.AnswerAnnotation), OrdinalTag);
        }
        else if (CurrencyWordPattern.Matcher(me.Word()).Matches() && prev.GetString<CoreAnnotations.PartOfSpeechAnnotation>().Equals("CD"))
        {
            // If current word is currency word and prev word is a CD
            me.Set(typeof(CoreAnnotations.AnswerAnnotation), MoneyTag);
        }
        else if (pos.Equals("CD"))
        {
            // TODO(yuhao): Need to support Chinese captial numbers like 叁拾 (This won't be POS-tagged as CD).
            // If current word is a CD
            if (PercentWordPattern1.Matcher(me.Word()).Matches() || PercentWordPattern2.Matcher(me.Word()).Matches())
            {
                // If current word is a percent
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), PercentTag);
            }
            else if (RightScanFindsMoneyWord(pl, i))
            {
                // If one the right finds a currency word
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), MoneyTag);
            }
            else if (me.Word().Length == 2 && ChineseAndArabicNumeralsPattern.Matcher(me.Word()).Matches() && DateAgeLocalizer.Equals(next.Word()))
            {
                // This is to extract a special case of DATE: 70 后 or 七零 后
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), DateTag);
            }
            else
            {
                // Otherwise we should safely label it as NUMBER
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), NumberTag);
            }
        }
        else if (pos.Equals("NT"))
        {
            // If current word is a NT (temporal noun)
            if (DatePattern1.Matcher(me.Word()).Matches() || DatePattern2.Matcher(me.Word()).Matches() || DatePattern3.Matcher(me.Word()).Matches() || DatePattern4.Matcher(me.Word()).Matches() || DatePattern5.Matcher(me.Word()).Matches() || DateWords.Contains(me.Word()))
            {
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), DateTag);
            }
            else
            {
                // TIME may have more variants (really?) so always add as TIME by default.
                // The earlier explicit TimePattern1 / TimeWords test assigned the same tag
                // on both of its branches, so it has been folded into this default.
                me.Set(typeof(CoreAnnotations.AnswerAnnotation), TimeTag);
            }
        }
        else if (DateAgeLocalizer.Equals(me.Word()) && prev.Word() != null && prev.Word().Length == 2 && ChineseAndArabicNumeralsPattern.Matcher(prev.Word()).Matches())
        {
            // Label 后 as DATE if the sequence is 70 后 or 七零 后
            me.Set(typeof(CoreAnnotations.AnswerAnnotation), DateTag);
        }
    }
    return document;
}
/// <summary>
/// Uses the word at <paramref name="position"/> as a pre-computed feature list:
/// the word is split on single spaces and each piece becomes one feature.
/// The <paramref name="clique"/> argument is not consulted.
/// </summary>
public override ICollection <string> GetCliqueFeatures(PaddedList <In> info, int position, Clique clique)
{
    string word = info[position].Word();
    string[] pieces = word.Split(" ");
    IList<string> result = new List<string>(Arrays.AsList(pieces));
    return result;
}
/// <summary>
/// Returns the collection of features calculated for the word at the specified
/// position in <paramref name="info"/> (the list of words) for the specified
/// <see cref="Clique"/>.
/// </summary>
/// <remarks>
/// Implementations should return the actual string features, <b>not</b> wrapped
/// in any other object, as the wrapping will be done automatically.
/// Because the input is a <see cref="Edu.Stanford.Nlp.Util.PaddedList{E}"/>,
/// implementations do not need to worry about indices that fall outside of the list.
/// </remarks>
/// <param name="info">A PaddedList of the feature-value pairs.</param>
/// <param name="position">The current position to extract features at.</param>
/// <param name="clique">
/// The particular clique for which to extract features. It should be a member
/// of the knownCliques list.
/// </param>
/// <returns>
/// A <see cref="System.Collections.Generic.ICollection{T}"/> of the features
/// calculated for the word at the specified position in <paramref name="info"/>.
/// </returns>
public abstract ICollection <string> GetCliqueFeatures(PaddedList <In> info, int position, Clique clique);
/// <summary>
/// Builds the string features for the clique made of the previous and the
/// current character. Each group of features is emitted only when its
/// corresponding option in <c>flags</c> is enabled; the character-type feature
/// at the end is always added.
/// </summary>
/// <param name="cInfo">The padded list of character labels.</param>
/// <param name="loc">Index of the current character in <paramref name="cInfo"/>.</param>
/// <returns>The feature strings, in the order they were generated.</returns>
/// <exception cref="Exception">
/// Thrown when <c>useChPos</c> is set together with a character-tag option other
/// than the CTB or PK one.
/// </exception>
public virtual ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
{
    ICollection<string> features = new List<string>();

    // Window of labels around the current position: three back, two forward.
    CoreLabel c = cInfo[loc];
    CoreLabel c1 = cInfo[loc + 1];
    CoreLabel c2 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    CoreLabel p3 = cInfo[loc - 3];

    // Missing character annotations are treated as empty strings.
    string charc = CharOrEmpty(c);
    string charc1 = CharOrEmpty(c1);
    string charc2 = CharOrEmpty(c2);
    string charp = CharOrEmpty(p);
    string charp2 = CharOrEmpty(p2);
    string charp3 = CharOrEmpty(p3);

    /*
     * N-gram features. N is upto 2.
     */
    if (flags.useWord2)
    {
        features.Add(charc + "::c");
        features.Add(charc1 + "::c1");
        features.Add(charp + "::p");
        features.Add(charp2 + "::p2");
        // trying to restore the features that Huishin described in SIGHAN 2005 paper
        features.Add(charc + charc1 + "::cn");
        features.Add(charp + charc + "::pc");
        features.Add(charp + charc1 + "::pn");
        features.Add(charp2 + charp + "::p2p");
        features.Add(charp2 + charc + "::p2c");
        features.Add(charc2 + charc + "::n2c");
        features.Add("|word2");
    }

    /*
     * Radical N-gram features.
     * Smoothing method of N-gram, because there are too many characters in Chinese.
     * (It works better than N-gram when they are used individually. less sparse)
     */
    char rcharc = RadicalOrNone(charc);
    char rcharc1 = RadicalOrNone(charc1);
    char rcharp = RadicalOrNone(charp);
    if (flags.useRad2)
    {
        features.Add(rcharc + "rc");
        features.Add(rcharc1 + "rc1");
        features.Add(rcharp + "rp");
        // NOTE(review): char + char is integer addition, so the three features below
        // are keyed on the numeric sum of the radicals rather than on their
        // concatenation. Left untouched on purpose: changing the generated strings
        // would presumably invalidate already-trained models -- confirm before fixing.
        features.Add(rcharp + rcharc + "rpc");
        features.Add(rcharc + rcharc1 + "rcc1");
        features.Add(rcharp + rcharc + rcharc1 + "rpcc1");
        features.Add("|rad2");
    }

    /* non-word dictionary:SEEM bi-gram marked as non-word */
    if (flags.useDict2)
    {
        NonDict2 nd = new NonDict2(flags);
        features.Add(nd.CheckDic(charp + charc, flags) + "nondict");
        features.Add("|useDict2");
    }

    if (flags.useOutDict2)
    {
        // The lexicon is loaded on first use and kept in the outDict field.
        if (outDict == null)
        {
            logger.Info("reading " + flags.outDict2 + " as a seen lexicon");
            outDict = new CorpusDictionary(flags.outDict2, true);
        }
        // Trailing numbers are the character offsets (relative to loc) that are looked up.
        features.Add(outDict.GetW(charp + charc) + "outdict");                      // -1 0
        features.Add(outDict.GetW(charc + charc1) + "outdict");                     // 0 1
        features.Add(outDict.GetW(charp2 + charp) + "outdict");                     // -2 -1
        features.Add(outDict.GetW(charp2 + charp + charc) + "outdict");             // -2 -1 0
        features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict");            // -3 -2 -1
        features.Add(outDict.GetW(charp + charc + charc1) + "outdict");             // -1 0 1
        features.Add(outDict.GetW(charc + charc1 + charc2) + "outdict");            // 0 1 2
        features.Add(outDict.GetW(charp + charc + charc1 + charc2) + "outdict");    // -1 0 1 2
    }

    /*
     * (CTB/ASBC/HK/PK/MSR) POS information of each characters.
     * If a character falls into some function categories,
     * it is very likely there is a boundary.
     * A lot of Chinese function words belong to single characters.
     * This feature is also good for numbers and punctuations.
     * DE* are grouped into DE.
     */
    if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2)
    {
        string[] tagsets;
        // the "useChPos" now only works for CTB and PK
        if (flags.useChPos)
        {
            if (flags.useCTBChar2)
            {
                tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
            }
            else if (flags.usePKChar2)
            {
                //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
                tagsets = new string[] { "2", "3", "4" };
            }
            else
            {
                throw new Exception("only support settings for CTB and PK now.");
            }
        }
        else
        {
            //logger.info("Using Derived features");
            tagsets = new string[] { "2", "3", "4" };
        }

        // The detector is created on first use and kept in the taDetector field.
        if (taDetector == null)
        {
            taDetector = new TagAffixDetector(flags);
        }
        foreach (string tagset in tagsets)
        {
            features.Add(taDetector.CheckDic(tagset + "p", charp) + taDetector.CheckDic(tagset + "i", charp) + taDetector.CheckDic(tagset + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tagset + "prep-sufc");
        }
    }

    /*
     * In error analysis, we found English words and numbers are often separated.
     * Rule 1: isNumber feature: check if the current and previous char is a number.
     * Rule 2: Disambiguation of time point and time duration.
     * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
     * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
     * Most of PUs are a good indicator for word boundary, but - and . is a strong indicator that there is no boundry within a previous , a follow char and it.
     */
    if (flags.useRule2)
    {
        /* Reduplication features */
        // previous character == current character
        if (charp.Equals(charc))
        {
            features.Add("11");
        }
        // previous character == next character
        if (charp.Equals(charc1))
        {
            features.Add("22");
        }
        // current character == next next character
        // fire only when usePk and useHk are both false.
        // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
        if (!flags.usePk && !flags.useHk)
        {
            if (charc.Equals(charc2))
            {
                features.Add("33");
            }
        }

        // First code unit of each neighbouring character; a blank stands in for "no character".
        // actually their length must be either 0 or 1
        char cur1 = charc1.Length > 0 ? charc1[0] : ' ';
        char cur2 = charc2.Length > 0 ? charc2[0] : ' ';
        char cur = charc.Length > 0 ? charc[0] : ' ';
        char pre = charp.Length > 0 ? charp[0] : ' ';
        string prer = rcharp.ToString();  // the radical of previous character

        Pattern E = Pattern.Compile("[a-zA-Z]");
        Pattern N = Pattern.Compile("[0-9]");
        Matcher m = E.Matcher(charp);
        Matcher ce = E.Matcher(charc);
        Matcher pe = E.Matcher(charp2);
        Matcher cn = N.Matcher(charc);
        Matcher pn = N.Matcher(charp2);

        bool curIsDigit = cur >= '0' && cur <= '9';
        bool preIsDigit = pre >= '0' && pre <= '9';
        if (curIsDigit && preIsDigit)
        {
            // current and previous characters are both numbers
            if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9')
            {
                // 199x
                features.Add("YR");
            }
            else
            {
                features.Add("2N");
            }
        }
        else if (preIsDigit)
        {
            // only the previous char is a number, i.e. patterns like "1N" , "2A", etc
            features.Add("1N");
        }
        else if (m.Matches())
        {
            // the previous character is an English character
            features.Add("E");
        }
        else if (prer.Equals(".") && charp.Length == 1)
        {
            // the previous character contains no radical (and it exist)
            // fire only when usePk and useHk are both false. Not sure why. -pichuan
            if (!flags.useHk && !flags.usePk)
            {
                if (ce.Matches())
                {
                    features.Add("PU+E");
                }
                if (pe.Matches())
                {
                    features.Add("E+PU");
                }
                if (cn.Matches())
                {
                    features.Add("PU+N");
                }
                if (pn.Matches())
                {
                    features.Add("N+PU");
                }
            }
            features.Add("PU");
        }

        string engType = IsEnglish(charp, charc);
        string engPU = IsEngPU(charp);
        if (!engType.Equals(string.Empty))
        {
            features.Add(engType);
        }
        if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty))
        {
            features.Add(engPU + engType);
        }
    }  // end of use rule

    // features using "Character.getType" information!
    // A missing original-character annotation is handled like an empty one (blank
    // character), matching how the other annotations in this method are treated.
    string origS = c.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
    char origC = ' ';
    if (origS != null && origS.Length > 0)
    {
        origC = origS[0];
    }
    int type = char.GetType(origC);
    switch (type)
    {
        case char.UppercaseLetter:
        case char.LowercaseLetter:
        {
            // A-Z and full-width A-Z
            // a-z and full-width a-z
            features.Add("CHARTYPE-LETTER");
            break;
        }

        case char.DecimalDigitNumber:
        {
            features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
            break;
        }

        case char.OtherLetter:
        {
            // mostly chinese chars
            features.Add("CHARTYPE-OTHER_LETTER");
            break;
        }

        default:
        {
            // other types
            features.Add("CHARTYPE-MISC");
            break;
        }
    }
    return features;
}

/// <summary>Returns the label's <c>CharAnnotation</c> value, or the empty string when it is null.</summary>
private static string CharOrEmpty(CoreLabel label)
{
    string ch = label.Get(typeof(CoreAnnotations.CharAnnotation));
    return ch ?? string.Empty;
}

/// <summary>Returns the radical of the first character of <paramref name="ch"/>, or <c>'n'</c> when it is empty.</summary>
private static char RadicalOrNone(string ch)
{
    return ch.Length == 0 ? 'n' : RadicalMap.GetRadical(ch[0]);
}