示例#1
0
        /// <summary>
        /// Builds character unigram and bigram features from the entries of
        /// <paramref name="cInfo"/> at <paramref name="loc"/> - 1, <paramref name="loc"/>
        /// and <paramref name="loc"/> + 1.
        /// </summary>
        /// <remarks>
        /// Features are produced only when <c>flags.useWordn</c> is set; the "cc1" and "pc1"
        /// bigrams additionally require one of <c>useAs</c>, <c>useMsr</c>, <c>usePk</c> or
        /// <c>useHk</c>. The three entries and their CharAnnotation values are read
        /// regardless of the flags.
        /// </remarks>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>A list of feature strings; empty when <c>flags.useWordn</c> is off.</returns>
        public virtual ICollection <string> FeaturesCnC(PaddedList <IN> cInfo, int loc)
        {
            ICollection<string> result = new List<string>();
            CoreLabel current  = cInfo[loc];
            CoreLabel next     = cInfo[loc + 1];
            CoreLabel previous = cInfo[loc - 1];
            string curChar  = current.Get(typeof(CoreAnnotations.CharAnnotation));
            string nextChar = next.Get(typeof(CoreAnnotations.CharAnnotation));
            string prevChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

            if (!flags.useWordn)
            {
                return result;
            }

            // Unigrams around the current position plus the previous/current bigram.
            result.Add(curChar + "c");
            result.Add(nextChar + "c1");
            result.Add(prevChar + "p");
            result.Add(prevChar + curChar + "pc");

            // Extra bigrams, enabled by any of the four corpus-named flags.
            bool addExtraBigrams = flags.useAs || flags.useMsr || flags.usePk || flags.useHk;
            if (addExtraBigrams)
            {
                result.Add(curChar + nextChar + "cc1");
                result.Add(prevChar + nextChar + "pc1");
            }

            // Marker feature identifying this feature group.
            result.Add("|wordn");
            return result;
        }
示例#2
0
        /// <summary>Extracts all the features from the input data at a certain index.</summary>
        /// <param name="cInfo">The complete data set, as a padded list of input items.</param>
        /// <param name="loc">The index at which to extract features.</param>
        /// <param name="clique">
        /// The clique to extract features for. Only <c>cliqueC</c> and <c>cliqueCpC</c>
        /// produce features here; any other clique yields an empty set.
        /// </param>
        /// <returns>The set of features collected for <paramref name="clique"/>.</returns>
        public override ICollection <string> GetCliqueFeatures(PaddedList <IN> cInfo, int loc, Clique clique)
        {
            ICollection<string> collected = Generics.NewHashSet();

            if (clique == cliqueC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
            }
            else if (clique == cliqueCpC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
                // The CnC features are computed one position to the left of loc.
                AddAllInterningAndSuffixing(collected, FeaturesCnC(cInfo, loc - 1), "CnC");
            }
            // Higher-order cliques (CpCp2C, CpCp2Cp3C, CpCp2Cp3Cp4C, CpCp2Cp3Cp4Cp5C)
            // are not handled here; their branches were commented out in this version.
            return collected;
        }
示例#3
0
        // end featuresCpC
        /// <summary>
        /// For a CRF, this shouldn't be necessary, since the features duplicate
        /// those from CpC, but Huihsin found some valuable, presumably because
        /// it modified the regularization a bit.
        /// </summary>
        /// <remarks>
        /// Nothing is read from <paramref name="cInfo"/> and nothing is emitted unless
        /// <c>flags.useWordn</c> is set. When it is, the entries at loc - 2, loc - 1,
        /// loc and loc + 1 are used.
        /// </remarks>
        /// <param name="cInfo">The list of characters</param>
        /// <param name="loc">Position of c in list</param>
        /// <returns>Collection of String features (sparse set of boolean features)</returns>
        protected internal virtual ICollection <string> FeaturesCnC <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection<string> result = new List<string>();

            if (!flags.useWordn)
            {
                return result;
            }

            CoreLabel current   = cInfo[loc];
            CoreLabel next      = cInfo[loc + 1];
            CoreLabel previous  = cInfo[loc - 1];
            CoreLabel previous2 = cInfo[loc - 2];
            string curChar   = current.GetString<CoreAnnotations.CharAnnotation>();
            string nextChar  = next.GetString<CoreAnnotations.CharAnnotation>();
            string prevChar  = previous.GetString<CoreAnnotations.CharAnnotation>();
            string prev2Char = previous2.GetString<CoreAnnotations.CharAnnotation>();

            // Unigrams.
            result.Add(curChar + "c");
            result.Add(nextChar + "c2");
            result.Add(prevChar + "p");
            result.Add(prev2Char + "p2");

            // Bigrams; the last one skips the current character and uses a "-" separator.
            result.Add(prev2Char + prevChar + "p2p");
            result.Add(prevChar + curChar + "pc");
            result.Add(curChar + nextChar + "cc2");
            result.Add(prevChar + "-" + nextChar + "pc2");

            // Indicator feature for this clique.
            result.Add("cliqueCnC");
            return result;
        }
示例#4
0
        /// <summary>
        /// Builds the single-position feature set for the entry at <paramref name="loc"/>:
        /// a five-entry character window, per-character class features and
        /// token-level position/length features.
        /// </summary>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>A list of feature strings, always ending with the "cliqueC" indicator.</returns>
        protected internal virtual ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();
            CoreLabel            c        = cInfo[loc];
            CoreLabel            n        = cInfo[loc + 1];
            CoreLabel            n2       = cInfo[loc + 2];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            string charc  = c.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn  = n.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp  = p.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

            // Default feature set...a 5 character window
            // plus a few other language-independent features
            features.Add(charc + "-c");
            features.Add(charn + "-n1");
            features.Add(charn2 + "-n2");
            features.Add(charp + "-p");
            features.Add(charp2 + "-p2");
            // Length feature
            if (charc.Length > 1)
            {
                features.Add("length");
            }
            // Character-level class features
            bool seenPunc  = false;
            bool seenDigit = false;

            // The loop bound was previously an undeclared identifier; it is the length
            // of the current character string, captured once in the loop initializer.
            for (int i = 0, limit = charc.Length; i < limit; ++i)
            {
                char charcC = charc[i];
                seenPunc  = seenPunc || Characters.IsPunctuation(charcC);
                seenDigit = seenDigit || char.IsDigit(charcC);
                string cuBlock = Characters.UnicodeBlockStringOf(charcC);
                features.Add(cuBlock + "-uBlock");
                // NOTE(review): System.Char has no static GetType(char) overload; the nearest
                // .NET API is char.GetUnicodeCategory. Left unchanged because switching would
                // alter the emitted feature strings - confirm what this call resolves to.
                string cuType = char.GetType(charcC).ToString();
                features.Add(cuType + "-uType");
            }
            if (seenPunc)
            {
                features.Add("haspunc");
            }
            if (seenDigit)
            {
                features.Add("hasdigit");
            }
            // Token-level features
            string word  = c.Word();
            int    index = c.Index();

            // Each count is capped by its Max* constant before being turned into a feature.
            features.Add(Math.Min(MaxBefore, index) + "-before");
            features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
            features.Add(Math.Min(MaxLength, word.Length) + "-length");
            // Indicator transition feature
            features.Add("cliqueC");
            return(features);
        }
示例#5
0
        /// <summary>
        /// Extends the base single-position features with the entries three positions
        /// to either side of <paramref name="loc"/>.
        /// </summary>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>The collection returned by the base implementation, with two features appended.</returns>
        protected internal override ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
        {
            ICollection<string> result = base.FeaturesC(cInfo, loc);

            CoreLabel thirdNext = cInfo[loc + 3];
            CoreLabel thirdPrev = cInfo[loc - 3];
            string thirdNextChar = thirdNext.Get(typeof(CoreAnnotations.CharAnnotation));
            string thirdPrevChar = thirdPrev.Get(typeof(CoreAnnotations.CharAnnotation));

            // Widens the character window from 5 to 7 entries.
            result.Add(thirdNextChar + "-n3");
            result.Add(thirdPrevChar + "-p3");
            return result;
        }
示例#6
0
        /// <summary>
        /// Builds the pairwise features for the entries at <paramref name="loc"/> and
        /// <paramref name="loc"/> - 1.
        /// </summary>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>
        /// A list holding one character bigram feature (current character first, then the
        /// previous one) followed by the "cliqueCpC" indicator.
        /// </returns>
        protected internal virtual ICollection <string> FeaturesCpC(PaddedList <In> cInfo, int loc)
        {
            ICollection<string> result = new List<string>();
            CoreLabel current  = cInfo[loc];
            CoreLabel previous = cInfo[loc - 1];
            string curChar  = current.Get(typeof(CoreAnnotations.CharAnnotation));
            string prevChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

            result.Add(curChar + prevChar + "-cngram");
            // Indicator transition feature
            result.Add("cliqueCpC");
            return result;
        }
        /// <summary>
        /// Produces ten features ("feat0:" through "feat9:", each followed by the word)
        /// for every label in <paramref name="info"/>.
        /// </summary>
        /// <param name="info">The labels to enumerate; every element contributes features.</param>
        /// <param name="position">Not used by this implementation.</param>
        /// <param name="clique">Not used by this implementation.</param>
        /// <returns>The set of generated feature strings.</returns>
        public override ICollection <string> GetCliqueFeatures(PaddedList <CoreLabel> info, int position, Clique clique)
        {
            ICollection<string> generated = new HashSet<string>();

            foreach (CoreLabel label in info)
            {
                int n = 0;
                while (n < 10)
                {
                    generated.Add("feat" + n + ":" + label.Word());
                    n++;
                }
            }
            return generated;
        }
示例#8
0
        //is EnglishPU
        /// <summary>
        /// Builds character unigram and bigram features for the entry at
        /// <paramref name="loc"/> from a window of three entries on either side.
        /// </summary>
        /// <remarks>
        /// All seven entries and their CharAnnotation values are read unconditionally,
        /// but features are only emitted when <c>flags.useWord1</c> is set.
        /// </remarks>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>A list of feature strings; empty when <c>flags.useWord1</c> is off.</returns>
        public virtual ICollection <string> FeaturesC(PaddedList <IN> cInfo, int loc)
        {
            ICollection<string> result = new List<string>();
            CoreLabel current = cInfo[loc];
            CoreLabel next1   = cInfo[loc + 1];
            CoreLabel next2   = cInfo[loc + 2];
            CoreLabel next3   = cInfo[loc + 3];
            CoreLabel prev1   = cInfo[loc - 1];
            CoreLabel prev2   = cInfo[loc - 2];
            CoreLabel prev3   = cInfo[loc - 3];
            string curChar   = current.Get(typeof(CoreAnnotations.CharAnnotation));
            string next1Char = next1.Get(typeof(CoreAnnotations.CharAnnotation));
            string next2Char = next2.Get(typeof(CoreAnnotations.CharAnnotation));
            // The +3 / -3 characters are fetched but not used by any feature below.
            string next3Char = next3.Get(typeof(CoreAnnotations.CharAnnotation));
            string prev1Char = prev1.Get(typeof(CoreAnnotations.CharAnnotation));
            string prev2Char = prev2.Get(typeof(CoreAnnotations.CharAnnotation));
            string prev3Char = prev3.Get(typeof(CoreAnnotations.CharAnnotation));

            if (!flags.useWord1)
            {
                return result;
            }

            // An earlier feature set ("c", "c1", "p", "pc", and "cc1"/"pc1" behind the
            // useAs/useMsr/usePk/useHk flags) was commented out here in favor of the
            // "::"-suffixed features below.
            result.Add(curChar + "::c");
            result.Add(next1Char + "::c1");
            result.Add(prev1Char + "::p");
            result.Add(prev2Char + "::p2");
            // trying to restore the features that Huihsin described in SIGHAN 2005 paper
            result.Add(curChar + next1Char + "::cn");
            result.Add(prev1Char + curChar + "::pc");
            result.Add(prev1Char + next1Char + "::pn");
            result.Add(prev2Char + prev1Char + "::p2p");
            result.Add(prev2Char + curChar + "::p2c");
            result.Add(next2Char + curChar + "::n2c");
            // Marker feature identifying this feature group.
            result.Add("|word1");
            return result;
        }
示例#9
0
        /// <summary>Extracts all the features from the input data at a certain index.</summary>
        /// <remarks>
        /// When the entry at <paramref name="loc"/> carries a non-null DomainAnnotation,
        /// every collected feature is additionally added with
        /// <c>DomainMarker</c> and the domain appended.
        /// </remarks>
        /// <param name="cInfo">The complete data set, as a padded list of input items.</param>
        /// <param name="loc">The index at which to extract features.</param>
        /// <param name="clique">
        /// The clique to extract features for; <c>cliqueC</c>, <c>cliqueCpC</c>,
        /// <c>cliqueCp2C</c> and <c>cliqueCp3C</c> are recognized.
        /// </param>
        /// <returns>The set of features, including any domain-suffixed copies.</returns>
        public override ICollection <string> GetCliqueFeatures(PaddedList <In> cInfo, int loc, Clique clique)
        {
            ICollection<string> collected = Generics.NewHashSet();

            if (clique == cliqueC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
            }
            else if (clique == cliqueCpC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
            }
            else if (clique == cliqueCp2C)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCp2C(cInfo, loc), "Cp2C");
            }
            else if (clique == cliqueCp3C)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCp3C(cInfo, loc), "Cp3C");
            }

            string domain = cInfo[loc].Get(typeof(CoreAnnotations.DomainAnnotation));
            if (domain == null)
            {
                return collected;
            }

            // Build the domain-specific copies in a separate set first, so the set being
            // enumerated is not modified during the loop.
            ICollection<string> withDomain = Generics.NewHashSet();
            foreach (string plain in collected)
            {
                withDomain.Add(plain + DomainMarker + domain);
            }
            Sharpen.Collections.AddAll(collected, withDomain);
            return collected;
        }
示例#10
0
        // end featuresCpCp2C
        /// <summary>
        /// Builds the features for the four-entry clique ending at <paramref name="loc"/>
        /// (entries loc - 3 through loc, plus loc + 1 for the 5-gram type feature).
        /// </summary>
        /// <remarks>
        /// Nothing is read or emitted unless <c>flags.use4Clique</c> is set and
        /// <c>flags.maxLeft</c> is at least 3. A missing UTypeAnnotation contributes
        /// an empty string to the Unicode-type n-grams.
        /// </remarks>
        /// <param name="cInfo">The list of characters.</param>
        /// <param name="loc">Position of the current character in the list.</param>
        /// <returns>A list of feature strings; empty when the clique is disabled.</returns>
        protected internal virtual ICollection <string> FeaturesCpCp2Cp3C <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection <string> features = new List <string>();

            if (flags.use4Clique && flags.maxLeft >= 3)
            {
                CoreLabel c       = cInfo[loc];
                CoreLabel c2      = cInfo[loc + 1];
                CoreLabel p       = cInfo[loc - 1];
                CoreLabel p2      = cInfo[loc - 2];
                CoreLabel p3      = cInfo[loc - 3];
                string    charc   = c.GetString <CoreAnnotations.CharAnnotation>();
                string    charp   = p.GetString <CoreAnnotations.CharAnnotation>();
                string    charp2  = p2.GetString <CoreAnnotations.CharAnnotation>();
                string    charp3  = p3.GetString <CoreAnnotations.CharAnnotation>();
                // The type codes are held as int? so that an absent annotation can be
                // detected; with a plain int the null checks below could never fail.
                int?      cI      = c.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypec  = (cI != null ? cI.Value.ToString() : string.Empty);
                int?      c2I     = c2.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypec2 = (c2I != null ? c2I.Value.ToString() : string.Empty);
                int?      pI      = p.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep  = (pI != null ? pI.Value.ToString() : string.Empty);
                int?      p2I     = p2.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep2 = (p2I != null ? p2I.Value.ToString() : string.Empty);
                int?      p3I     = p3.Get(typeof(CoreAnnotations.UTypeAnnotation));
                string    uTypep3 = (p3I != null ? p3I.Value.ToString() : string.Empty);
                if (flags.useLongSequences)
                {
                    // Character 4-gram over p3, p2, p and c.
                    features.Add(charp3 + charp2 + charp + charc + "p3p2pc");
                }
                if (flags.useUnicodeType4gram || flags.useUnicodeType5gram)
                {
                    // The 4-gram of type codes is also emitted when only the 5-gram flag is on.
                    features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-uType4");
                }
                if (flags.useUnicodeType5gram)
                {
                    features.Add(uTypep3 + "-" + uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType5");
                }
                // Indicator feature for this clique.
                features.Add("cliqueCpCp2Cp3C");
            }
            return(features);
        }
        /// <summary>
        /// Extends the base pairwise features with a "wrapper" feature built from the
        /// first two and last two characters of the current entry's word.
        /// </summary>
        /// <remarks>
        /// The wrapper is considered only for words longer than three characters. It is
        /// added with "-begin-wrap" when the entry's index is 2, and with "-end-wrap"
        /// when the index equals the word length minus one.
        /// </remarks>
        /// <param name="cInfo">The padded list of labels to read from.</param>
        /// <param name="loc">Index of the current entry.</param>
        /// <returns>The collection returned by the base implementation, possibly extended.</returns>
        protected internal override ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
        {
            ICollection<string> result = base.FeaturesCpC(cInfo, loc);
            CoreLabel current = cInfo[loc];
            string word = current.Word();

            if (word.Length <= 3)
            {
                return result;
            }

            // "Wrapper" feature: identity of first and last two chars of the current word.
            // This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive
            // pronouns if the word starts with al-.
            string head    = Sharpen.Runtime.Substring(word, 0, 2);
            string tail    = Sharpen.Runtime.Substring(word, word.Length - 2);
            string wrapper = head + "_" + tail;
            int position = current.Index();

            if (position == 2)
            {
                result.Add(wrapper + "-begin-wrap");
            }
            if (position == word.Length - 1)
            {
                result.Add(wrapper + "-end-wrap");
            }
            return result;
        }
        /// <summary>
        /// Builds the CRF datum for position <paramref name="loc"/>: one feature list per
        /// window index plus the label indices of the window ending at that position.
        /// </summary>
        /// <remarks>
        /// The shared <c>pad</c> element has its AnswerAnnotation set to
        /// <c>flags.backgroundSymbol</c> before being used as padding. A clique is
        /// processed only for the first window index that yields it.
        /// </remarks>
        /// <param name="info">The input sequence.</param>
        /// <param name="loc">Position in <paramref name="info"/> to build the datum for.</param>
        /// <param name="featureFactories">Factories queried for every newly seen clique.</param>
        /// <returns>A datum holding the per-window features and a <c>CRFLabel</c> of label indices.</returns>
        public override CRFDatum <IList <string>, CRFLabel> MakeDatum(IList <IN> info, int loc, IList <FeatureFactory <IN> > featureFactories)
        {
            pad.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);
            PaddedList<IN> padded = new PaddedList<IN>(info, pad);
            IList<IList<string>> perWindowFeatures = new List<IList<string>>();
            ICollection<Clique> seenCliques = Generics.NewHashSet();

            for (int window = 0; window < windowSize; window++)
            {
                IList<string> windowFeatures = new List<string>();

                // Keep only the cliques not already handled by a smaller window index.
                IList<Clique> freshCliques = FeatureFactory.GetCliques(window, 0);
                freshCliques.RemoveAll(seenCliques);
                Sharpen.Collections.AddAll(seenCliques, freshCliques);

                foreach (Clique clique in freshCliques)
                {
                    foreach (FeatureFactory<IN> factory in featureFactories)
                    {
                        Sharpen.Collections.AddAll(windowFeatures, factory.GetCliqueFeatures(padded, loc, clique));
                    }
                }

                // this feature is only present at test time and only appears
                // in cliques of size 1 (i.e., cliques with window=0)
                if (testTime && window == 0)
                {
                    windowFeatures.Add(Bias);
                }
                perWindowFeatures.Add(windowFeatures);
            }

            // Labels for the window that ends at loc, oldest position first.
            int[] labelIndices = new int[windowSize];
            for (int k = 0; k < windowSize; k++)
            {
                string answer = padded[loc + k - windowSize + 1].Get(typeof(CoreAnnotations.AnswerAnnotation));
                labelIndices[k] = classIndex.IndexOf(answer);
            }
            return new CRFDatum<IList<string>, CRFLabel>(perWindowFeatures, new CRFLabel(labelIndices), null);
        }
示例#13
0
        /// <summary>Extracts all the features from the input data at a certain index.</summary>
        /// <param name="cInfo">The complete data set, as a padded list of input items.</param>
        /// <param name="loc">The index at which to extract features.</param>
        /// <param name="clique">
        /// The clique to extract features for; <c>cliqueC</c>, <c>cliqueCpC</c>,
        /// <c>cliqueCpCp2C</c> and <c>cliqueCpCp2Cp3C</c> are recognized.
        /// </param>
        /// <returns>The set of features collected for <paramref name="clique"/>.</returns>
        public override ICollection <string> GetCliqueFeatures(PaddedList <IN> cInfo, int loc, Clique clique)
        {
            ICollection<string> collected = Generics.NewHashSet();

            if (clique == cliqueC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesC(cInfo, loc), "C");
            }
            else if (clique == cliqueCpC)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCpC(cInfo, loc), "CpC");
                // The CnC features are computed one position to the left of loc.
                AddAllInterningAndSuffixing(collected, FeaturesCnC(cInfo, loc - 1), "CnC");
            }
            else if (clique == cliqueCpCp2C)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCpCp2C(cInfo, loc), "CpCp2C");
            }
            else if (clique == cliqueCpCp2Cp3C)
            {
                AddAllInterningAndSuffixing(collected, FeaturesCpCp2Cp3C(cInfo, loc), "CpCp2Cp3C");
            }

            // Optional trace of the extracted features, written to the error stream as UTF-8.
            if (Debug > 0)
            {
                EncodingPrintWriter.Err.Println("For " + cInfo[loc] + ", features: " + collected, "UTF-8");
            }
            return collected;
        }
 /// <summary>
 /// Placeholder override that produces no features.
 /// </summary>
 /// <remarks>
 /// Java counterpart:
 /// edu.stanford.nlp.sequences.FeatureFactory#getCliqueFeatures(edu.stanford.nlp.util.PaddedList, int, edu.stanford.nlp.sequences.Clique)
 /// </remarks>
 /// <param name="info">Not read.</param>
 /// <param name="position">Not read.</param>
 /// <param name="clique">Not read.</param>
 /// <returns>Always <c>null</c>.</returns>
 public override ICollection GetCliqueFeatures(PaddedList info, int position, Clique clique)
 {
     // TODO: auto-generated method stub; no implementation yet.
     return null;
 }
示例#15
0
        //end of CnC
        /// <summary>Second order clique features</summary>
        /// <remarks>
        /// Three feature groups are produced, each behind its own flags: character
        /// n-grams (<c>useWord3</c>), shape-string features (<c>useShapeStrings</c> plus
        /// its sub-flags) and radical n-grams (<c>useRad2</c>, <c>useRad2b</c>). The six
        /// entries from loc - 3 to loc + 2 and the four radicals are read regardless of
        /// the flags. The "cliqueCpCp2C" indicator is always added last.
        /// </remarks>
        /// <param name="cInfo">The list of characters</param>
        /// <param name="loc">Position of c in list</param>
        /// <returns>Collection of String features (sparse set of boolean features)</returns>
        protected internal virtual ICollection <string> FeaturesCpCp2C <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection<string> result = new List<string>();
            CoreLabel cur   = cInfo[loc];
            CoreLabel next  = cInfo[loc + 1];
            CoreLabel next2 = cInfo[loc + 2];
            CoreLabel prev  = cInfo[loc - 1];
            CoreLabel prev2 = cInfo[loc - 2];
            CoreLabel prev3 = cInfo[loc - 3];
            string curChar   = cur.GetString<CoreAnnotations.CharAnnotation>();
            string nextChar  = next.GetString<CoreAnnotations.CharAnnotation>();
            string next2Char = next2.GetString<CoreAnnotations.CharAnnotation>();
            string prevChar  = prev.GetString<CoreAnnotations.CharAnnotation>();
            string prev2Char = prev2.GetString<CoreAnnotations.CharAnnotation>();
            string prev3Char = prev3.GetString<CoreAnnotations.CharAnnotation>();

            // N-gram features. N is up to 3
            if (flags.useWord3)
            {
                result.Add(curChar + "::c");
                result.Add(nextChar + "::n");
                result.Add(prevChar + "::p");
                result.Add(prev2Char + "::p2");
                // trying to restore the features that Huihsin described in SIGHAN 2005 paper
                result.Add(curChar + nextChar + "::cn");
                result.Add(curChar + nextChar + next2Char + "::cnn2");
                result.Add(prevChar + curChar + "::pc");
                result.Add(prevChar + nextChar + "::pn");
                result.Add(prev2Char + prevChar + "::p2p");
                result.Add(prev3Char + prev2Char + prevChar + "::p3p2p");
                result.Add(prev2Char + curChar + "::p2c");
                result.Add(curChar + next2Char + "::cn2");
            }

            // Shape-string features; the sub-flags are only consulted when the master flag is on.
            if (flags.useShapeStrings)
            {
                if (flags.useShapeStrings1)
                {
                    result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>() + "ps");
                    result.Add(cur.GetString<CoreAnnotations.ShapeAnnotation>() + "cs");
                    result.Add(next.GetString<CoreAnnotations.ShapeAnnotation>() + "c2s");
                }
                if (flags.useShapeStrings3)
                {
                    result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>()
                               + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                               + next.GetString<CoreAnnotations.ShapeAnnotation>()
                               + "pscsc2s");
                }
                if (flags.useShapeStrings4)
                {
                    result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                               + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                               + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                               + next.GetString<CoreAnnotations.ShapeAnnotation>()
                               + "p2spscsc2s");
                }
                if (flags.useShapeStrings5)
                {
                    result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                               + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                               + cur.GetString<CoreAnnotations.ShapeAnnotation>()
                               + next.GetString<CoreAnnotations.ShapeAnnotation>()
                               + next2.GetString<CoreAnnotations.ShapeAnnotation>()
                               + "p2spscsc2sc3s");
                }
                if (flags.useWordShapeConjunctions2)
                {
                    result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>() + curChar + "pscc");
                    result.Add(prevChar + cur.GetString<CoreAnnotations.ShapeAnnotation>() + "pccs");
                }
                if (flags.useWordShapeConjunctions3)
                {
                    result.Add(prev2.GetString<CoreAnnotations.ShapeAnnotation>()
                               + prev.GetString<CoreAnnotations.ShapeAnnotation>()
                               + curChar
                               + "p2spscc");
                    result.Add(prev.GetString<CoreAnnotations.ShapeAnnotation>()
                               + curChar
                               + next.GetString<CoreAnnotations.ShapeAnnotation>()
                               + "psccc2s");
                    result.Add(curChar
                               + next.GetString<CoreAnnotations.ShapeAnnotation>()
                               + next2.GetString<CoreAnnotations.ShapeAnnotation>()
                               + "ccc2sc3s");
                }
            }

            // Radical N-gram features. N is up to 4.
            // This is a smoothing method for N-grams, because there are too many characters
            // in Chinese. (It works better than N-grams when they are used individually,
            // being less sparse.)
            // An empty character string maps to the placeholder radical 'n'.
            char curRad   = curChar.Length == 0 ? 'n' : RadicalMap.GetRadical(curChar[0]);
            char nextRad  = nextChar.Length == 0 ? 'n' : RadicalMap.GetRadical(nextChar[0]);
            char prevRad  = prevChar.Length == 0 ? 'n' : RadicalMap.GetRadical(prevChar[0]);
            char prev2Rad = prev2Char.Length == 0 ? 'n' : RadicalMap.GetRadical(prev2Char[0]);

            // NOTE: where an expression starts with two or three char operands, C# adds
            // them numerically (char + char is an int) before the string suffix is
            // appended, so those features carry a number rather than the radicals
            // themselves. The operand order is kept exactly as it was for that reason.
            if (flags.useRad2)
            {
                result.Add(curRad + "rc");
                result.Add(nextRad + "rc2");
                result.Add(prevRad + "rp");
                result.Add(prevRad + curRad + "rprc");
                result.Add(curRad + nextRad + "rcrc2");
                result.Add(prevRad + curRad + nextRad + "rprcrc2");
            }
            if (flags.useRad2b)
            {
                result.Add(curRad + "rc");
                result.Add(nextRad + "rc2");
                result.Add(prevRad + "rp");
                result.Add(prevRad + curRad + "rprc");
                result.Add(curRad + nextRad + "rcrc2");
                result.Add(prev2Rad + prevRad + "rp2rp");
            }

            // Indicator feature for this clique.
            result.Add("cliqueCpCp2C");
            return result;
        }
示例#16
0
        // static methods
        /// <summary>
        /// This can be used to map from any IOB-style (i.e., "I-PERS" style labels)
        /// or just categories representation to any other.
        /// </summary>
        /// <remarks>
        /// This can be used to map from any IOB-style (i.e., "I-PERS" style labels)
        /// or just categories representation to any other.
        /// It can read and change any representation to other representations:
        /// a 4 way representation of all entities, like S-PERS, B-PERS,
        /// I-PERS, E-PERS for single word, beginning, internal, and end of entity
        /// (IOBES or SBIEO); always marking the first word of an entity (IOB2 or BIO);
        /// only marking specially the beginning of non-first
        /// items of an entity sequences with B-PERS (IOB1);
        /// the reverse IOE1 and IOE2; IO where everything is I-tagged; and
        /// NOPREFIX, where no prefixes are written on category labels.
        /// The last two representations are deficient in not allowing adjacent
        /// entities of the same class to be represented, but nevertheless
        /// convenient.  Note that the background label is never given a prefix.
        /// This code is very specific to the particular CoNLL way of labeling
        /// classes for IOB-style encoding, but this notation is quite widespread.
        /// It will work on any of these styles of input.
        /// This will also recognize BILOU format (B=B, I=I, L=E, O=O, U=S).
        /// It also works with lowercased names like i-org.
        /// If the labels are not of the form "C-Y+", where C is a single character,
        /// then they will be regarded as NOPREFIX labels.
        /// This method updates the List tokens in place.
        /// </remarks>
        /// <param name="tokens">List of tokens (each a CoreLabel) in some style</param>
        /// <param name="key">The key in the CoreLabel to change, commonly CoreAnnotations.AnswerAnnotation.class</param>
        /// <param name="backgroundLabel">The background label, which gets special treatment</param>
        /// <param name="style">Output style; one of iob[12], ioe[12], io, sbieo/iobes, noprefix</param>
        /// <param name="intern">Whether to String-intern the new labels (may as well, small number!)</param>
        public static void EntitySubclassify <Tok>(IList <TOK> tokens, Type key, string backgroundLabel, string style, bool intern)
            where Tok : ICoreMap
        {
            int    how;
            string lowerStyle = style.ToLower(Locale.English);

            switch (lowerStyle)
            {
            case "iob1":
            {
                how = 0;
                break;
            }

            case "iob2":
            case "bio":
            {
                how = 1;
                break;
            }

            case "ioe1":
            {
                how = 2;
                break;
            }

            case "ioe2":
            {
                how = 3;
                break;
            }

            case "io":
            {
                how = 4;
                break;
            }

            case "sbieo":
            case "iobes":
            {
                how = 5;
                break;
            }

            case "noprefix":
            {
                how = 6;
                break;
            }

            case "bilou":
            {
                how = 7;
                break;
            }

            default:
            {
                throw new ArgumentException("entitySubclassify: unknown style: " + style);
            }
            }
            IList <TOK> paddedTokens = new PaddedList <TOK>(tokens, (TOK) new CoreLabel());
            int         size         = paddedTokens.Count;

            string[] newAnswers = new string[size];
            for (int i = 0; i < size; i++)
            {
                TOK    c    = paddedTokens[i];
                TOK    p    = paddedTokens[i - 1];
                TOK    n    = paddedTokens[i + 1];
                string cAns = c.Get(key);
                string pAns = p.Get(key);
                if (pAns == null)
                {
                    pAns = backgroundLabel;
                }
                string nAns = n.Get(key);
                if (nAns == null)
                {
                    nAns = backgroundLabel;
                }
                string @base;
                char   prefix;
                if (cAns.Length > 2 && cAns[1] == '-')
                {
                    @base  = Sharpen.Runtime.Substring(cAns, 2, cAns.Length);
                    prefix = char.ToUpperCase(cAns[0]);
                }
                else
                {
                    @base  = cAns;
                    prefix = ' ';
                }
                string pBase;
                char   pPrefix;
                if (pAns.Length > 2 && pAns[1] == '-')
                {
                    pBase   = Sharpen.Runtime.Substring(pAns, 2, pAns.Length);
                    pPrefix = char.ToUpperCase(pAns[0]);
                }
                else
                {
                    pBase   = pAns;
                    pPrefix = ' ';
                }
                string nBase;
                char   nPrefix;
                if (nAns.Length > 2 && nAns[1] == '-')
                {
                    nBase   = Sharpen.Runtime.Substring(nAns, 2, nAns.Length);
                    nPrefix = char.ToUpperCase(nAns[0]);
                }
                else
                {
                    nBase   = nAns;
                    nPrefix = ' ';
                }
                bool   isStartAdjacentSame = IsSameEntityBoundary(pBase, pPrefix, @base, prefix);
                bool   isEndAdjacentSame   = IsSameEntityBoundary(@base, prefix, nBase, nPrefix);
                bool   isFirst             = IsDifferentEntityBoundary(pBase, @base) || isStartAdjacentSame;
                bool   isLast    = IsDifferentEntityBoundary(@base, nBase) || isEndAdjacentSame;
                string newAnswer = @base;
                if ([email protected](backgroundLabel))
                {
                    switch (how)
                    {
                    case 0:
                    {
                        // iob1, only B if adjacent
                        if (isStartAdjacentSame)
                        {
                            newAnswer = "B-" + @base;
                        }
                        else
                        {
                            newAnswer = "I-" + @base;
                        }
                        break;
                    }

                    case 1:
                    {
                        // iob2 always B at start
                        if (isFirst)
                        {
                            newAnswer = "B-" + @base;
                        }
                        else
                        {
                            newAnswer = "I-" + @base;
                        }
                        break;
                    }

                    case 2:
                    {
                        // ioe1
                        if (isEndAdjacentSame)
                        {
                            newAnswer = "E-" + @base;
                        }
                        else
                        {
                            newAnswer = "I-" + @base;
                        }
                        break;
                    }

                    case 3:
                    {
                        // ioe2
                        if (isLast)
                        {
                            newAnswer = "E-" + @base;
                        }
                        else
                        {
                            newAnswer = "I-" + @base;
                        }
                        break;
                    }

                    case 4:
                    {
                        newAnswer = "I-" + @base;
                        break;
                    }

                    case 5:
                    {
                        if (isFirst && isLast)
                        {
                            newAnswer = "S-" + @base;
                        }
                        else
                        {
                            if ((!isFirst) && isLast)
                            {
                                newAnswer = "E-" + @base;
                            }
                            else
                            {
                                if (isFirst && (!isLast))
                                {
                                    newAnswer = "B-" + @base;
                                }
                                else
                                {
                                    newAnswer = "I-" + @base;
                                }
                            }
                        }
                        break;
                    }

                    case 7:
                    {
                        // nothing to do on case 6 as it's just base
                        if (isFirst && isLast)
                        {
                            newAnswer = "U-" + @base;
                        }
                        else
                        {
                            if ((!isFirst) && isLast)
                            {
                                newAnswer = "L-" + @base;
                            }
                            else
                            {
                                if (isFirst && (!isLast))
                                {
                                    newAnswer = "B-" + @base;
                                }
                                else
                                {
                                    newAnswer = "I-" + @base;
                                }
                            }
                        }
                        break;
                    }
                    }
                }
                if (intern)
                {
                    newAnswer = string.Intern(newAnswer);
                }
                newAnswers[i] = newAnswer;
            }
            for (int i_1 = 0; i_1 < size; i_1++)
            {
                TOK c = tokens[i_1];
                c.Set(typeof(CoreAnnotations.AnswerAnnotation), newAnswers[i_1]);
            }
        }
示例#17
0
        /// <summary>
        /// Builds the string features for the "CpC" clique at position <paramref name="loc"/>:
        /// features over the current character and its neighbours (three to the left,
        /// two to the right).
        /// </summary>
        /// <remarks>
        /// Every feature group is gated by a member of <c>flags</c>, except the character-type
        /// feature ("CHARTYPE-...") and the trailing marker "cliqueCpC", which are always added.
        /// Features are appended to a list in statement order, so duplicates are kept.
        /// </remarks>
        /// <param name="cInfo">
        /// Padded label sequence. Indices loc - 3 through loc + 2 are read with no bounds check
        /// here; presumably the padded list supplies a pad element outside the real range
        /// (TODO confirm against PaddedList).
        /// </param>
        /// <param name="loc">Index of the current character within <paramref name="cInfo"/>.</param>
        /// <returns>The list of feature strings generated for this position.</returns>
        /// <exception cref="System.Exception">
        /// Thrown when one of the *Char2 flags and <c>flags.useChPos</c> are set but neither
        /// <c>flags.useCTBChar2</c> nor <c>flags.usePKChar2</c> is.
        /// </exception>
        protected internal virtual ICollection <string> FeaturesCpC <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection <string> features = new List <string>();
            // Window of labels around loc: c = current, c2/c3 = next one/two, p/p2/p3 = previous one/two/three.
            CoreLabel            c        = cInfo[loc];
            CoreLabel            c2       = cInfo[loc + 1];
            CoreLabel            c3       = cInfo[loc + 2];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            CoreLabel            p3       = cInfo[loc - 3];
            // Character annotation of each label in the window. The code below reads .Length on
            // these without a null check, so GetString is assumed never to return null - TODO confirm.
            string charc   = c.GetString <CoreAnnotations.CharAnnotation>();
            string charc2  = c2.GetString <CoreAnnotations.CharAnnotation>();
            string charc3  = c3.GetString <CoreAnnotations.CharAnnotation>();
            string charp   = p.GetString <CoreAnnotations.CharAnnotation>();
            string charp2  = p2.GetString <CoreAnnotations.CharAnnotation>();
            string charp3  = p3.GetString <CoreAnnotations.CharAnnotation>();
            // UType annotation of each label rendered as a string.
            // NOTE(review): these locals are declared `int`, for which `!= null` is always true in C#;
            // this looks like a leftover from a nullable boxed integer in the translated source - confirm
            // what Get() returns for padding/unannotated labels.
            int    cI      = c.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string uTypec  = (cI != null ? cI.ToString() : string.Empty);
            int    c2I     = c2.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string uTypec2 = (c2I != null ? c2I.ToString() : string.Empty);
            int    c3I     = c3.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string uTypec3 = (c3I != null ? c3I.ToString() : string.Empty);
            int    pI      = p.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string uTypep  = (pI != null ? pI.ToString() : string.Empty);
            int    p2I     = p2.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string uTypep2 = (p2I != null ? p2I.ToString() : string.Empty);

            // Dictionary-based features, delegated to DictionaryFeaturesCpC. The first call uses the
            // L* annotations with an empty tag; the second uses the D2_L* annotations with tag "-D2-".
            if (flags.dictionary != null || flags.serializedDictionary != null)
            {
                DictionaryFeaturesCpC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, features, p2, p, c, c2);
            }
            if (flags.dictionary2 != null)
            {
                DictionaryFeaturesCpC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", features, p2, p, c, c2);
            }

            /*
             * N-gram features. N is up to 2.
             */
            if (flags.useWord2)
            {
                // Older feature naming, kept for reference:
                // features.add(charc +"c");
                // features.add(charc2+"c2");
                // features.add(charp +"p");
                // features.add(charp + charc  +"pc");
                // features.add(charc + charc2  +"cc2");
                // // cdm: need hyphen so you can see which of charp or charc2 is null....
                // features.add(charp + "-" + charc2 + "pc2");
                features.Add(charc + "::c");
                features.Add(charc2 + "::c1");
                features.Add(charp + "::p");
                features.Add(charp2 + "::p2");
                // trying to restore the features that Huihsin described in SIGHAN 2005 paper
                features.Add(charc + charc2 + "::cn"); // (*)
                features.Add(charp + charc + "::pc");
                features.Add(charp + charc2 + "::pn");
                features.Add(charp2 + charp + "::p2p");
                features.Add(charp2 + charc + "::p2c");
                // todo: this is messed up: Same as one above at (*); should be cn2 = charc + charc3 + "::cn2"
                // (the line below uses the same two characters as (*), in reverse order)
                features.Add(charc2 + charc + "::n2c");
            }
            // Wider character n-grams: each larger flag also enables the smaller groups.
            if (flags.useFeaturesCpC4gram || flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram)
            {
                // todo: Both these features duplicate ones already in useWord2
                // (same characters; the suffix here has no "::" separator)
                features.Add(charp2 + charp + "p2p");
                features.Add(charp2 + "p2");
            }
            if (flags.useFeaturesCpC5gram || flags.useFeaturesCpC6gram)
            {
                features.Add(charc3 + "c3");
                features.Add(charc2 + charc3 + "c2c3");
            }
            if (flags.useFeaturesCpC6gram)
            {
                features.Add(charp3 + "p3");
                features.Add(charp3 + charp2 + "p3p2");
            }
            if (flags.useGoodForNamesCpC)
            {
                // these 2 features should be distinctively good at biasing from
                // picking up a Chinese family name in the p2 or p3 positions:
                // familyName X X startWord AND familyName X startWord
                // But actually they seem to have negative value.
                features.Add(charp2 + "p2");
                features.Add(charp3 + "p3");
            }
            // Unicode-type n-grams over the window; the 5gram flag implies the 4- and 3-gram
            // features, the 4gram flag implies the 3-gram feature.
            if (flags.useUnicodeType || flags.useUnicodeType4gram || flags.useUnicodeType5gram)
            {
                features.Add(uTypep + "-" + uTypec + "-" + uTypec2 + "-uType3");
            }
            if (flags.useUnicodeType4gram || flags.useUnicodeType5gram)
            {
                features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-uType4");
            }
            if (flags.useUnicodeType5gram)
            {
                features.Add(uTypep2 + "-" + uTypep + "-" + uTypec + "-" + uTypec2 + "-" + uTypec3 + "-uType5");
            }
            // Conjunctions of a character with the Unicode type of its neighbours.
            if (flags.useWordUTypeConjunctions2)
            {
                features.Add(uTypep + charc + "putcc");
                features.Add(charp + uTypec + "pccut");
            }
            if (flags.useWordUTypeConjunctions3)
            {
                features.Add(uTypep2 + uTypep + charc + "p2utputcc");
                features.Add(uTypep + charc + uTypec2 + "putccc2ut");
                features.Add(charc + uTypec2 + uTypec3 + "ccc2utc3ut");
            }
            // UBlock annotation trigram (previous, current, next).
            if (flags.useUnicodeBlock)
            {
                features.Add(p.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c.GetString <CoreAnnotations.UBlockAnnotation>() + "-" + c2.GetString <CoreAnnotations.UBlockAnnotation>() + "-uBlock");
            }
            // Shape-annotation features. All sub-flags, including the word/shape conjunctions,
            // only take effect when flags.useShapeStrings is set.
            if (flags.useShapeStrings)
            {
                if (flags.useShapeStrings1)
                {
                    features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + "ps");
                    features.Add(c.GetString <CoreAnnotations.ShapeAnnotation>() + "cs");
                    features.Add(c2.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s");
                }
                if (flags.useShapeStrings3)
                {
                    features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "pscsc2s");
                }
                if (flags.useShapeStrings4)
                {
                    features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "p2spscsc2s");
                }
                if (flags.useShapeStrings5)
                {
                    features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + c.GetString <CoreAnnotations.ShapeAnnotation>() + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation
                                                                                                                                                                                                                                                         >() + "p2spscsc2sc3s");
                }
                if (flags.useWordShapeConjunctions2)
                {
                    features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "pscc");
                    features.Add(charp + c.GetString <CoreAnnotations.ShapeAnnotation>() + "pccs");
                }
                if (flags.useWordShapeConjunctions3)
                {
                    features.Add(p2.GetString <CoreAnnotations.ShapeAnnotation>() + p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + "p2spscc");
                    features.Add(p.GetString <CoreAnnotations.ShapeAnnotation>() + charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + "psccc2s");
                    features.Add(charc + c2.GetString <CoreAnnotations.ShapeAnnotation>() + c3.GetString <CoreAnnotations.ShapeAnnotation>() + "ccc2sc3s");
                }
            }

            /*
             * Radical N-gram features. N is up to 4.
             * Smoothing method of N-gram, because there are too many characters in Chinese.
             * (It works better than N-gram when they are used individually. less sparse)
             */
            char rcharc;
            char rcharc2;
            char rcharp;
            char rcharp2;

            // Radical of the first char of each string; 'n' stands in for an empty string.
            // These are computed unconditionally because rcharp is also used by the useRule2 block.
            if (charc.Length == 0)
            {
                rcharc = 'n';
            }
            else
            {
                rcharc = RadicalMap.GetRadical(charc[0]);
            }
            if (charc2.Length == 0)
            {
                rcharc2 = 'n';
            }
            else
            {
                rcharc2 = RadicalMap.GetRadical(charc2[0]);
            }
            if (charp.Length == 0)
            {
                rcharp = 'n';
            }
            else
            {
                rcharp = RadicalMap.GetRadical(charp[0]);
            }
            if (charp2.Length == 0)
            {
                rcharp2 = 'n';
            }
            else
            {
                rcharp2 = RadicalMap.GetRadical(charp2[0]);
            }
            // NOTE(review): the r* variables are `char`. `char + string` concatenates, but
            // `char + char` is integer addition, so the multi-radical features below
            // ("rprc", "rcrc2", "rprcrc2", "rp2rp") are prefixed with the numeric sum of the
            // code units rather than the radicals side by side. Presumably inherited from the
            // source this was translated from; changing it would change the feature strings
            // that existing models were trained on - confirm before touching.
            if (flags.useRad2)
            {
                features.Add(rcharc + "rc");
                features.Add(rcharc2 + "rc2");
                features.Add(rcharp + "rp");
                features.Add(rcharp + rcharc + "rprc");
                features.Add(rcharc + rcharc2 + "rcrc2");
                features.Add(rcharp + rcharc + rcharc2 + "rprcrc2");
            }
            // Same as useRad2 except for the last feature (p2/p radical pair instead of the trigram).
            // If both flags are set the first five features are added twice.
            if (flags.useRad2b)
            {
                features.Add(rcharc + "rc");
                features.Add(rcharc2 + "rc2");
                features.Add(rcharp + "rp");
                features.Add(rcharp + rcharc + "rprc");
                features.Add(rcharc + rcharc2 + "rcrc2");
                features.Add(rcharp2 + rcharp + "rp2rp");
            }

            /* Non-word dictionary: SEEN bi-gram marked as non-word.
             * This is frickin' useful.  I hadn't realized.  CDM Oct 2007.
             */
            if (flags.useDict2)
            {
                // NOTE(review): a NonDict2 is constructed on every call; whether that is cheap
                // (e.g. backed by shared static state) cannot be told from here - confirm.
                NonDict2 nd = new NonDict2(flags);
                features.Add(nd.CheckDic(charp + charc, flags) + "nondict");
            }
            if (flags.useOutDict2)
            {
                // outDict is created on first use.
                if (outDict == null)
                {
                    CreateOutDict();
                }
                // Every lookup uses the same "outdict" suffix; the trailing comments give the
                // character offsets (relative to loc) that make up the looked-up string.
                features.Add(outDict.GetW(charp + charc) + "outdict"); // -1 0
                features.Add(outDict.GetW(charc + charc2) + "outdict"); // 0 1
                features.Add(outDict.GetW(charp2 + charp) + "outdict"); // -2 -1
                features.Add(outDict.GetW(charp2 + charp + charc) + "outdict"); // -2 -1 0
                features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict"); // -3 -2 -1
                features.Add(outDict.GetW(charp + charc + charc2) + "outdict"); // -1 0 1
                features.Add(outDict.GetW(charc + charc2 + charc3) + "outdict"); // 0 1 2
                features.Add(outDict.GetW(charp + charc + charc2 + charc3) + "outdict"); // -1 0 1 2
            }

            /*
             * (CTB/ASBC/HK/PK/MSR) POS information of each characters.
             * If a character falls into some function categories,
             * it is very likely there is a boundary.
             * A lot of Chinese function words belong to single characters.
             * This feature is also good for numbers and punctuations.
             * DE* are grouped into DE.
             */
            if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2)
            {
                string[] tagsets;
                // the "useChPos" now only works for CTB and PK
                if (flags.useChPos)
                {
                    if (flags.useCTBChar2)
                    {
                        tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
                    }
                    else
                    {
                        if (flags.usePKChar2)
                        {
                            // The PK tag list is disabled; the same derived tags as the non-useChPos case are used.
                            //tagsets = new String[]{"r", "j", "t", "a", "nz", "l", "vn", "i", "m", "ns", "nr", "v", "n", "q", "Ng", "b", "d", "nt"};
                            tagsets = new string[] { "2", "3", "4" };
                        }
                        else
                        {
                            // useChPos combined with ASBC/HK/MSR only.
                            throw new Exception("only support settings for CTB and PK now.");
                        }
                    }
                }
                else
                {
                    //logger.info("Using Derived features");
                    tagsets = new string[] { "2", "3", "4" };
                }
                // taDetector is created on first use.
                if (taDetector == null)
                {
                    CreateTADetector();
                }
                // One feature per tag: lookups of tag+"p" and tag+"i" on the previous character,
                // tag+"s" on the current character, plus in-dictionary checks for both characters.
                foreach (string tag in tagsets)
                {
                    features.Add(taDetector.CheckDic(tag + "p", charp) + taDetector.CheckDic(tag + "i", charp) + taDetector.CheckDic(tag + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tag + "prep-sufc");
                }
            }
            //features.add("|ctbchar2");

            /*
             * In error analysis, we found English words and numbers are often separated.
             * Rule 1: isNumber feature: check if the current and previous char is a number.
             * Rule 2: Disambiguation of time point and time duration.
             * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
             * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
             * Most of PUs are a good indicator for word boundary, but - and . are strong indicators that there is no boundary between them and the previous or following char.
             */
            if (flags.useRule2)
            {
                /* Reduplication features */
                // previous character == current character
                if (charp.Equals(charc))
                {
                    features.Add("11-R2");
                }
                // previous character == next character
                if (charp.Equals(charc2))
                {
                    features.Add("22-R2");
                }
                // current character == next character (charc2; the code does not look at charc3)
                // fire only when usePk and useHk are both false.
                // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
                if (!flags.usePk && !flags.useHk)
                {
                    if (charc.Equals(charc2))
                    {
                        features.Add("33-R2");
                    }
                }
                // First char of next (cur1), next-next (cur2), current (cur) and previous (pre);
                // a space stands in for an empty string.
                char cur1 = ' ';
                char cur2 = ' ';
                char cur  = ' ';
                char pre  = ' ';
                // actually their length must be either 0 or 1
                if (charc2.Length > 0)
                {
                    cur1 = charc2[0];
                }
                if (charc3.Length > 0)
                {
                    cur2 = charc3[0];
                }
                if (charc.Length > 0)
                {
                    cur = charc[0];
                }
                if (charp.Length > 0)
                {
                    pre = charp[0];
                }
                // the radical of previous character
                string prer = rcharp.ToString();
                // Letter/digit matchers: m = previous char, ce/cn = current char, pe/pn = charp2.
                // Matches() is presumably a whole-string match as in java.util.regex - TODO confirm.
                // NOTE(review): both patterns are recompiled on every call.
                Pattern E  = Pattern.Compile("[a-zA-Z]");
                Pattern N  = Pattern.Compile("[0-9]");
                Matcher m  = E.Matcher(charp);
                Matcher ce = E.Matcher(charc);
                Matcher pe = E.Matcher(charp2);
                Matcher cn = N.Matcher(charc);
                Matcher pn = N.Matcher(charp2);
                // if current and previous characters are numbers...
                if (cur >= '0' && cur <= '9' && pre >= '0' && pre <= '9')
                {
                    // previous='1', current='9', next='9', next-next=any digit
                    if (cur == '9' && pre == '1' && cur1 == '9' && cur2 >= '0' && cur2 <= '9')
                    {
                        //199x
                        features.Add("YR-R2");
                    }
                    else
                    {
                        features.Add("2N-R2");
                    }
                }
                else
                {
                    // if current and previous characters are not both numbers
                    // but previous char is a number
                    // i.e. patterns like "1N" , "2A", etc
                    if (pre >= '0' && pre <= '9')
                    {
                        features.Add("1N-R2");
                    }
                    else
                    {
                        // if previous character is an English character
                        if (m.Matches())
                        {
                            features.Add("E-R2");
                        }
                        else
                        {
                            // if the previous character contains no radical (and it exists);
                            // presumably RadicalMap.GetRadical yields '.' for such characters - TODO confirm
                            if (prer.Equals(".") && charp.Length == 1)
                            {
                                // These four are independent checks, so several can fire together.
                                if (ce.Matches())
                                {
                                    features.Add("PU+E-R2");
                                }
                                if (pe.Matches())
                                {
                                    features.Add("E+PU-R2");
                                }
                                if (cn.Matches())
                                {
                                    features.Add("PU+N-R2");
                                }
                                if (pn.Matches())
                                {
                                    features.Add("N+PU-R2");
                                }
                                features.Add("PU-R2");
                            }
                        }
                    }
                }
                // Helper-derived English features; an empty string from a helper means "no feature".
                string engType = IsEnglish(charp, charc);
                string engPU   = IsEngPU(charp);
                if (!engType.Equals(string.Empty))
                {
                    features.Add(engType);
                }
                // Combined feature only when both helpers returned something.
                if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty))
                {
                    StringBuilder sb = new StringBuilder();
                    sb.Append(engPU).Append(engType).Append("R2");
                    features.Add(sb.ToString());
                }
            }
            //end of use rule
            // features using "Character.getType" information!
            // This block is not gated by any flag: exactly one CHARTYPE-* feature is always added,
            // based on the first char of the OriginalCharAnnotation (a space if it is empty).
            string origS = c.GetString <CoreAnnotations.OriginalCharAnnotation>();
            char   origC = ' ';

            if (origS.Length > 0)
            {
                origC = origS[0];
            }
            int type = char.GetType(origC);

            switch (type)
            {
            case char.UppercaseLetter:
            case char.LowercaseLetter:
            {
                // A-Z and full-width A-Z
                // a-z and full-width a-z
                features.Add("CHARTYPE-LETTER");
                break;
            }

            case char.DecimalDigitNumber:
            {
                features.Add("CHARTYPE-DECIMAL_DIGIT_NUMBER");
                break;
            }

            case char.OtherLetter:
            {
                // mostly chinese chars
                features.Add("CHARTYPE-OTHER_LETTER");
                break;
            }

            default:
            {
                // other types
                features.Add("CHARTYPE-MISC");
                break;
            }
            }
            // Clique marker, always the last feature in the list.
            features.Add("cliqueCpC");
            return(features);
        }
示例#18
0
        /// <summary>
        /// Collects the string features for the single-position "C" clique at index
        /// <paramref name="loc"/>.
        /// </summary>
        /// <remarks>
        /// Each feature group is switched on by a member of <c>flags</c>. Groups are appended in a
        /// fixed order and the marker "cliqueC" is always the final entry.
        /// </remarks>
        /// <param name="cInfo">Padded sequence of labels; positions loc - 3 through loc + 2 are read.</param>
        /// <param name="loc">Index of the current character.</param>
        /// <returns>The list of feature strings generated for this position.</returns>
        protected internal virtual ICollection <string> FeaturesC <_T0>(PaddedList <_T0> cInfo, int loc)
            where _T0 : CoreLabel
        {
            ICollection <string> collected = new List <string>();

            // Labels in the window around loc.
            CoreLabel cur   = cInfo[loc];
            CoreLabel next1 = cInfo[loc + 1];
            CoreLabel next2 = cInfo[loc + 2];
            CoreLabel prev1 = cInfo[loc - 1];
            CoreLabel prev2 = cInfo[loc - 2];
            CoreLabel prev3 = cInfo[loc - 3];

            // Character annotation of every label in the window.
            string curChar   = cur.GetString <CoreAnnotations.CharAnnotation>();
            string next1Char = next1.GetString <CoreAnnotations.CharAnnotation>();
            string next2Char = next2.GetString <CoreAnnotations.CharAnnotation>();
            string prev1Char = prev1.GetString <CoreAnnotations.CharAnnotation>();
            string prev2Char = prev2.GetString <CoreAnnotations.CharAnnotation>();
            string prev3Char = prev3.GetString <CoreAnnotations.CharAnnotation>();

            // Unicode-type annotation of each label, rendered as text (empty when absent).
            int    curTypeCode = cur.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string curType     = string.Empty;
            if (curTypeCode != null)
            {
                curType = curTypeCode.ToString();
            }
            int    next1TypeCode = next1.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string next1Type     = string.Empty;
            if (next1TypeCode != null)
            {
                next1Type = next1TypeCode.ToString();
            }
            int    next2TypeCode = next2.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string next2Type     = string.Empty;
            if (next2TypeCode != null)
            {
                next2Type = next2TypeCode.ToString();
            }
            int    prev1TypeCode = prev1.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string prev1Type     = string.Empty;
            if (prev1TypeCode != null)
            {
                prev1Type = prev1TypeCode.ToString();
            }
            int    prev2TypeCode = prev2.Get(typeof(CoreAnnotations.UTypeAnnotation));
            string prev2Type     = string.Empty;
            if (prev2TypeCode != null)
            {
                prev2Type = prev2TypeCode.ToString();
            }

            // Character uni-/bi-gram features, emitted in this exact order.
            if (flags.useWord1)
            {
                string[] ngrams = new string[]
                {
                    curChar + "::c",
                    next1Char + "::c2",
                    prev1Char + "::p",
                    prev2Char + "::p2",
                    // features described in the SIGHAN 2005 paper
                    curChar + next1Char + "::cn",
                    curChar + next2Char + "::cn2",
                    prev1Char + curChar + "::pc",
                    prev1Char + next1Char + "::pn",
                    prev2Char + prev1Char + "::p2p",
                    prev2Char + curChar + "::p2c",
                    next1Char + curChar + "::n2c"
                };
                foreach (string ngram in ngrams)
                {
                    collected.Add(ngram);
                }
            }

            // Dictionary-derived features (primary dictionary, then the second one tagged "-D2-").
            if (flags.dictionary != null || flags.serializedDictionary != null)
            {
                DictionaryFeaturesC(typeof(CoreAnnotations.LBeginAnnotation), typeof(CoreAnnotations.LMiddleAnnotation), typeof(CoreAnnotations.LEndAnnotation), string.Empty, collected, prev1, cur, next1);
            }
            if (flags.dictionary2 != null)
            {
                DictionaryFeaturesC(typeof(CoreAnnotations.D2_LBeginAnnotation), typeof(CoreAnnotations.D2_LMiddleAnnotation), typeof(CoreAnnotations.D2_LEndAnnotation), "-D2-", collected, prev1, cur, next1);
            }

            // Wider character n-grams: a larger-window flag also turns on every smaller group.
            bool want6gram = flags.useFeaturesC6gram;
            bool want5gram = flags.useFeaturesC5gram || want6gram;
            bool want4gram = flags.useFeaturesC4gram || want5gram;
            if (want4gram)
            {
                collected.Add(prev2Char + prev1Char + "p2p");
                collected.Add(prev2Char + "p2");
            }
            if (want5gram)
            {
                collected.Add(next2Char + "c3");
                collected.Add(next1Char + next2Char + "c2c3");
            }
            if (want6gram)
            {
                collected.Add(prev3Char + "p3");
                collected.Add(prev3Char + prev2Char + "p3p2");
            }

            // Unicode-type n-grams, cascading the same way as the character n-grams.
            bool wantType5 = flags.useUnicodeType5gram;
            bool wantType4 = flags.useUnicodeType4gram || wantType5;
            bool wantType3 = flags.useUnicodeType || wantType4;
            string typeTrigram = prev1Type + "-" + curType + "-" + next1Type;
            if (wantType3)
            {
                collected.Add(typeTrigram + "-uType3");
            }
            if (wantType4)
            {
                collected.Add(prev2Type + "-" + typeTrigram + "-uType4");
            }
            if (wantType5)
            {
                collected.Add(prev2Type + "-" + typeTrigram + "-" + next2Type + "-uType5");
            }

            // Unicode-block trigram.
            if (flags.useUnicodeBlock)
            {
                string prevBlock = prev1.GetString <CoreAnnotations.UBlockAnnotation>();
                string curBlock  = cur.GetString <CoreAnnotations.UBlockAnnotation>();
                string nextBlock = next1.GetString <CoreAnnotations.UBlockAnnotation>();
                collected.Add(prevBlock + "-" + curBlock + "-" + nextBlock + "-uBlock");
            }

            // Shape features; the numbered sub-flags only matter when useShapeStrings is set.
            if (flags.useShapeStrings)
            {
                if (flags.useShapeStrings1)
                {
                    collected.Add(prev1.GetString <CoreAnnotations.ShapeAnnotation>() + "ps");
                    collected.Add(cur.GetString <CoreAnnotations.ShapeAnnotation>() + "cs");
                    collected.Add(next1.GetString <CoreAnnotations.ShapeAnnotation>() + "c2s");
                }
                if (flags.useShapeStrings3)
                {
                    string s1 = prev1.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s2 = cur.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s3 = next1.GetString <CoreAnnotations.ShapeAnnotation>();
                    collected.Add(s1 + s2 + s3 + "pscsc2s");
                }
                if (flags.useShapeStrings4)
                {
                    string s0 = prev2.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s1 = prev1.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s2 = cur.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s3 = next1.GetString <CoreAnnotations.ShapeAnnotation>();
                    collected.Add(s0 + s1 + s2 + s3 + "p2spscsc2s");
                }
                if (flags.useShapeStrings5)
                {
                    string s0 = prev2.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s1 = prev1.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s2 = cur.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s3 = next1.GetString <CoreAnnotations.ShapeAnnotation>();
                    string s4 = next2.GetString <CoreAnnotations.ShapeAnnotation>();
                    collected.Add(s0 + s1 + s2 + s3 + s4 + "p2spscsc2sc3s");
                }
            }

            // Clique marker always closes the list.
            collected.Add("cliqueC");
            return collected;
        }
示例#19
0
		// All the tags we need
		// Patterns we need
		// In theory 块 钱 should be separated by segmenter, but just in case segmenter fails
		// TODO(yuhao): Need to add support for 块 钱, 毛 钱, 角 钱, 角, 五 块 二
		// This only works when POS = NT
		// This is used to capture a special case of date in Chinese: 70 后 or 七零 后
		// order it by number of characters DESC for handy one-by-one matching of string suffix
		/// <summary>Use a set of heuristic rules to assign NER tags to tokens.</summary>
		/// <remarks>
		/// Every token is first reset to <c>flags.backgroundSymbol</c> and is then relabelled by the
		/// first rule that applies, tested in this order: ordinal (POS "OD"), currency word preceded by
		/// a "CD" token, cardinal number (POS "CD": percent, money, the two-character "70 后" date form,
		/// otherwise plain number), temporal noun (POS "NT": date, otherwise time), and finally the
		/// "后" token that follows a two-character numeral.
		/// </remarks>
		/// <param name="document">
		/// The tokens to label. The AnswerAnnotation of each token is set by this method.
		/// </param>
		/// <returns>The <paramref name="document"/> list that was passed in.</returns>
		public override IList<CoreLabel> Classify(IList<CoreLabel> document)
		{
			// The padded view lets the rules look at i - 1 and i + 1 without bounds checks.
			PaddedList<CoreLabel> pl = new PaddedList<CoreLabel>(document, pad);
			// The loop bound "sz" was used without ever being declared in this method; bind it to the
			// number of real (unpadded) tokens.
			int sz = document.Count;
			for (int i = 0; i < sz; i++)
			{
				CoreLabel me = pl[i];
				CoreLabel prev = pl[i - 1];
				CoreLabel next = pl[i + 1];
				// by default set to be "O"
				me.Set(typeof(CoreAnnotations.AnswerAnnotation), flags.backgroundSymbol);
				if (me.GetString<CoreAnnotations.PartOfSpeechAnnotation>().Equals("OD"))
				{
					// If current word is OD, label it as ORDINAL
					me.Set(typeof(CoreAnnotations.AnswerAnnotation), OrdinalTag);
				}
				else if (CurrencyWordPattern.Matcher(me.Word()).Matches() && prev.GetString<CoreAnnotations.PartOfSpeechAnnotation>().Equals("CD"))
				{
					// If current word is currency word and prev word is a CD
					me.Set(typeof(CoreAnnotations.AnswerAnnotation), MoneyTag);
				}
				else if (me.GetString<CoreAnnotations.PartOfSpeechAnnotation>().Equals("CD"))
				{
					// TODO(yuhao): Need to support Chinese capital numbers like 叁拾 (This won't be POS-tagged as CD).
					if (PercentWordPattern1.Matcher(me.Word()).Matches() || PercentWordPattern2.Matcher(me.Word()).Matches())
					{
						// If current word is a percent
						me.Set(typeof(CoreAnnotations.AnswerAnnotation), PercentTag);
					}
					else if (RightScanFindsMoneyWord(pl, i))
					{
						// If a scan to the right finds a currency word
						me.Set(typeof(CoreAnnotations.AnswerAnnotation), MoneyTag);
					}
					else if (me.Word().Length == 2 && ChineseAndArabicNumeralsPattern.Matcher(me.Word()).Matches() && DateAgeLocalizer.Equals(next.Word()))
					{
						// This is to extract a special case of DATE: 70 后 or 七零 后
						me.Set(typeof(CoreAnnotations.AnswerAnnotation), DateTag);
					}
					else
					{
						// Otherwise we should safely label it as NUMBER
						me.Set(typeof(CoreAnnotations.AnswerAnnotation), NumberTag);
					}
				}
				else if (me.GetString<CoreAnnotations.PartOfSpeechAnnotation>().Equals("NT"))
				{
					// If current word is a NT (temporal noun)
					bool isDate = DatePattern1.Matcher(me.Word()).Matches()
						|| DatePattern2.Matcher(me.Word()).Matches()
						|| DatePattern3.Matcher(me.Word()).Matches()
						|| DatePattern4.Matcher(me.Word()).Matches()
						|| DatePattern5.Matcher(me.Word()).Matches()
						|| DateWords.Contains(me.Word());
					// TIME may have more variants, so every temporal noun that is not a recognised date is
					// labelled TIME. (The former TimePattern1 / TimeWords test and its fallback both
					// assigned TimeTag, so the test had no effect on the result.)
					me.Set(typeof(CoreAnnotations.AnswerAnnotation), isDate ? DateTag : TimeTag);
				}
				else if (DateAgeLocalizer.Equals(me.Word()) && prev.Word() != null && prev.Word().Length == 2 && ChineseAndArabicNumeralsPattern.Matcher(prev.Word()).Matches())
				{
					// Label 后 as DATE if the sequence is 70 后 or 七零 后
					me.Set(typeof(CoreAnnotations.AnswerAnnotation), DateTag);
				}
			}
			return document;
		}
        /// <summary>
        /// Returns the pieces obtained by splitting the word at <paramref name="position"/> on the
        /// <c>" "</c> separator.
        /// </summary>
        /// <param name="info">The padded token sequence.</param>
        /// <param name="position">Index of the token whose word is split.</param>
        /// <param name="clique">Not consulted by this implementation.</param>
        /// <returns>A new list holding the pieces of the word, in order.</returns>
        public override ICollection <string> GetCliqueFeatures(PaddedList <In> info, int position, Clique clique)
        {
            var pieces = info[position].Word().Split(" ");
            ICollection <string> result = new List <string>(Arrays.AsList(pieces));

            return(result);
        }
 /// <summary>
 /// Returns the features calculated for the word at the specified position in
 /// <paramref name="info"/> (the list of words) for the specified <see cref="Clique"/>.
 /// </summary>
 /// <remarks>
 /// Implementations should return the actual string features, <b>NOT</b> wrapped in any other
 /// object, as the wrapping will be done automatically. Because <paramref name="info"/> is a
 /// <see cref="Edu.Stanford.Nlp.Util.PaddedList{E}"/>, implementations do not need to worry
 /// about indices which are outside of the list.
 /// </remarks>
 /// <param name="info">A PaddedList of the feature-value pairs</param>
 /// <param name="position">The current position to extract features at</param>
 /// <param name="clique">
 /// The particular clique for which to extract features. It should be a member of the
 /// knownCliques list.
 /// </param>
 /// <returns>
 /// A <see cref="System.Collections.ICollection{E}"/> of the features calculated for the word
 /// at the specified position in <paramref name="info"/>.
 /// </returns>
 public abstract ICollection <string> GetCliqueFeatures(PaddedList <In> info, int position, Clique clique);
示例#22
0
        /// <summary>
        /// Extracts the string features for position <paramref name="loc"/>, looking at a window that
        /// runs from three characters before it to two characters after it.
        /// </summary>
        /// <remarks>
        /// Which feature groups are produced is controlled by <c>flags</c>: character n-grams
        /// (<c>useWord2</c>), radical n-grams (<c>useRad2</c>), dictionary lookups (<c>useDict2</c>,
        /// <c>useOutDict2</c>), per-character tag-affix features (<c>use*Char2</c>) and the
        /// number / English / reduplication rules (<c>useRule2</c>). A single <c>CHARTYPE-*</c>
        /// feature describing the Unicode category of the original current character is always added.
        /// </remarks>
        /// <param name="cInfo">The padded character sequence.</param>
        /// <param name="loc">Index of the current character.</param>
        /// <returns>A new list with the extracted feature strings.</returns>
        /// <exception cref="System.NotSupportedException">
        /// <c>flags.useChPos</c> is set together with a <c>use*Char2</c> flag other than the CTB or PK one.
        /// </exception>
        public virtual ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();
            CoreLabel            c        = cInfo[loc];
            CoreLabel            c1       = cInfo[loc + 1];
            CoreLabel            c2       = cInfo[loc + 2];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            CoreLabel            p3       = cInfo[loc - 3];
            // A missing character annotation is treated as the empty string.
            string charc  = CharAnnotationOrEmpty(c);
            string charc1 = CharAnnotationOrEmpty(c1);
            string charc2 = CharAnnotationOrEmpty(c2);
            string charp  = CharAnnotationOrEmpty(p);
            string charp2 = CharAnnotationOrEmpty(p2);
            string charp3 = CharAnnotationOrEmpty(p3);

            /*
             * N-gram features. N is upto 2.
             */
            if (flags.useWord2)
            {
                // Earlier un-tagged variants ("c", "c1", "p", "pc", "cc1", "pc1") were commented out
                // in favour of the "::"-tagged names below.
                features.Add(charc + "::c");
                features.Add(charc1 + "::c1");
                features.Add(charp + "::p");
                features.Add(charp2 + "::p2");
                // trying to restore the features that Huishin described in SIGHAN 2005 paper
                features.Add(charc + charc1 + "::cn");
                features.Add(charp + charc + "::pc");
                features.Add(charp + charc1 + "::pn");
                features.Add(charp2 + charp + "::p2p");
                features.Add(charp2 + charc + "::p2c");
                features.Add(charc2 + charc + "::n2c");
                features.Add("|word2");
            }

            /*
             * Radical N-gram features. N is upto 4.
             * Smoothing method of N-gram, because there are too many characters in Chinese.
             * (It works better than N-gram when they are used individually. less sparse)
             */
            char rcharc  = RadicalOrPlaceholder(charc);
            char rcharc1 = RadicalOrPlaceholder(charc1);
            char rcharp  = RadicalOrPlaceholder(charp);

            if (flags.useRad2)
            {
                features.Add(rcharc + "rc");
                features.Add(rcharc1 + "rc1");
                features.Add(rcharp + "rp");
                // NOTE(review): in the next three lines the leading operands are chars, so "+" adds
                // their code points and the feature starts with a decimal number instead of the
                // radical characters. Left unchanged because altering feature strings would
                // presumably invalidate existing trained models -- confirm before fixing.
                features.Add(rcharp + rcharc + "rpc");
                features.Add(rcharc + rcharc1 + "rcc1");
                features.Add(rcharp + rcharc + rcharc1 + "rpcc1");
                features.Add("|rad2");
            }
            /* non-word dictionary:SEEM bi-gram marked as non-word */
            if (flags.useDict2)
            {
                // NOTE(review): a NonDict2 is constructed on every call, unlike outDict and
                // taDetector below which are created once and kept -- check whether this is costly.
                NonDict2 nd = new NonDict2(flags);
                features.Add(nd.CheckDic(charp + charc, flags) + "nondict");
                features.Add("|useDict2");
            }
            if (flags.useOutDict2)
            {
                if (outDict == null)
                {
                    logger.Info("reading " + flags.outDict2 + " as a seen lexicon");
                    outDict = new CorpusDictionary(flags.outDict2, true);
                }
                // The trailing comments give the character offsets, relative to loc, that are looked up.
                features.Add(outDict.GetW(charp + charc) + "outdict");                    // -1 0
                features.Add(outDict.GetW(charc + charc1) + "outdict");                   // 0 1
                features.Add(outDict.GetW(charp2 + charp) + "outdict");                   // -2 -1
                features.Add(outDict.GetW(charp2 + charp + charc) + "outdict");           // -2 -1 0
                features.Add(outDict.GetW(charp3 + charp2 + charp) + "outdict");          // -3 -2 -1
                features.Add(outDict.GetW(charp + charc + charc1) + "outdict");           // -1 0 1
                features.Add(outDict.GetW(charc + charc1 + charc2) + "outdict");          // 0 1 2
                features.Add(outDict.GetW(charp + charc + charc1 + charc2) + "outdict");  // -1 0 1 2
            }

            /*
             * (CTB/ASBC/HK/PK/MSR) POS information of each characters.
             * If a character falls into some function categories,
             * it is very likely there is a boundary.
             * A lot of Chinese function words belong to single characters.
             * This feature is also good for numbers and punctuations.
             * DE* are grouped into DE.
             */
            if (flags.useCTBChar2 || flags.useASBCChar2 || flags.useHKChar2 || flags.usePKChar2 || flags.useMSRChar2)
            {
                string[] tagsets;
                // the "useChPos" now only works for CTB and PK
                if (flags.useChPos && flags.useCTBChar2)
                {
                    tagsets = new string[] { "AD", "AS", "BA", "CC", "CD", "CS", "DE", "DT", "ETC", "IJ", "JJ", "LB", "LC", "M", "NN", "NR", "NT", "OD", "P", "PN", "PU", "SB", "SP", "VA", "VC", "VE", "VV" };
                }
                else
                {
                    if (flags.useChPos && !flags.usePKChar2)
                    {
                        throw new System.NotSupportedException("only support settings for CTB and PK now.");
                    }
                    // Derived features. PK with useChPos shares these tag sets (a dedicated PK tag
                    // list was previously commented out here).
                    tagsets = new string[] { "2", "3", "4" };
                }
                if (taDetector == null)
                {
                    taDetector = new TagAffixDetector(flags);
                }
                foreach (string tagset in tagsets)
                {
                    features.Add(taDetector.CheckDic(tagset + "p", charp) + taDetector.CheckDic(tagset + "i", charp) + taDetector.CheckDic(tagset + "s", charc) + taDetector.CheckInDic(charp) + taDetector.CheckInDic(charc) + tagset + "prep-sufc");
                }
            }
            // No constant "|ctbchar2" marker is added: it was disabled because it ended up being
            // added several times.

            /*
             * In error analysis, we found English words and numbers are often separated.
             * Rule 1: isNumber feature: check if the current and previous char is a number.
             * Rule 2: Disambiguation of time point and time duration.
             * Rule 3: isEnglish feature: check if the current and previous character is an english letter.
             * Rule 4: English name feature: check if the current char is a conjunct pu for English first and last name, since there is no space between two names.
             * Most of PUs are a good indicator for word boundary, but - and . are a strong indicator that there is no boundary between the previous char, the following char and it.
             */
            if (flags.useRule2)
            {
                AddRule2Features(features, charp2, charp, charc, charc1, charc2, rcharp);
            }

            // Feature based on the Unicode category of the original (un-normalised) current character.
            // The annotation is null-checked like the other annotations read above; a missing or empty
            // value falls back to ' '.
            string origS = c.Get(typeof(CoreAnnotations.OriginalCharAnnotation));
            features.Add(CharTypeFeature(FirstCharOrSpace(origS)));
            return(features);
        }

        /// <summary>Reads the CharAnnotation of <paramref name="label"/>, mapping a missing value to the empty string.</summary>
        private static string CharAnnotationOrEmpty(CoreLabel label)
        {
            string ch = label.Get(typeof(CoreAnnotations.CharAnnotation));

            return(ch ?? string.Empty);
        }

        /// <summary>Returns the first character of <paramref name="s"/>, or ' ' when it is null or empty.</summary>
        private static char FirstCharOrSpace(string s)
        {
            return(string.IsNullOrEmpty(s) ? ' ' : s[0]);
        }

        /// <summary>Tells whether <paramref name="ch"/> is one of the ASCII digits '0'..'9'.</summary>
        private static bool IsAsciiDigit(char ch)
        {
            return(ch >= '0' && ch <= '9');
        }

        /// <summary>
        /// Returns the radical of the first character of <paramref name="ch"/> as reported by
        /// <c>RadicalMap.GetRadical</c>, or 'n' when <paramref name="ch"/> is empty.
        /// </summary>
        private char RadicalOrPlaceholder(string ch)
        {
            return(ch.Length == 0 ? 'n' : RadicalMap.GetRadical(ch[0]));
        }

        /// <summary>
        /// Adds the reduplication, number, English-letter and punctuation features that are enabled by
        /// <c>flags.useRule2</c>. All string arguments must be non-null.
        /// </summary>
        private void AddRule2Features(ICollection <string> features, string charp2, string charp, string charc, string charc1, string charc2, char rcharp)
        {
            /* Reduplication features */
            // previous character == current character
            if (charp.Equals(charc))
            {
                features.Add("11");
            }
            // previous character == next character
            if (charp.Equals(charc1))
            {
                features.Add("22");
            }
            // current character == next next character
            // fire only when usePk and useHk are both false.
            // Notice: this should be (almost) the same as the "22" feature, but we keep it for now.
            if (!flags.usePk && !flags.useHk && charc.Equals(charc2))
            {
                features.Add("33");
            }

            // actually their length must be either 0 or 1; ' ' stands in for an absent character
            char cur1 = FirstCharOrSpace(charc1);
            char cur2 = FirstCharOrSpace(charc2);
            char cur  = FirstCharOrSpace(charc);
            char pre  = FirstCharOrSpace(charp);

            Pattern englishLetter = Pattern.Compile("[a-zA-Z]");
            Pattern asciiDigit    = Pattern.Compile("[0-9]");

            if (IsAsciiDigit(cur) && IsAsciiDigit(pre))
            {
                // current and previous characters are both numbers; "YR" marks the 199x pattern
                bool is199x = cur == '9' && pre == '1' && cur1 == '9' && IsAsciiDigit(cur2);
                features.Add(is199x ? "YR" : "2N");
            }
            else if (IsAsciiDigit(pre))
            {
                // only the previous char is a number, i.e. patterns like "1N", "2A", etc
                features.Add("1N");
            }
            else if (englishLetter.Matcher(charp).Matches())
            {
                // previous character is an English character
                features.Add("E");
            }
            else if (rcharp == '.' && charp.Length == 1)
            {
                // the previous character contains no radical (and it exists)
                // fire only when usePk and useHk are both false. Not sure why. -pichuan
                if (!flags.useHk && !flags.usePk)
                {
                    if (englishLetter.Matcher(charc).Matches())
                    {
                        features.Add("PU+E");
                    }
                    if (englishLetter.Matcher(charp2).Matches())
                    {
                        features.Add("E+PU");
                    }
                    if (asciiDigit.Matcher(charc).Matches())
                    {
                        features.Add("PU+N");
                    }
                    if (asciiDigit.Matcher(charp2).Matches())
                    {
                        features.Add("N+PU");
                    }
                }
                features.Add("PU");
            }

            string engType = IsEnglish(charp, charc);
            string engPU   = IsEngPU(charp);
            if (!engType.Equals(string.Empty))
            {
                features.Add(engType);
            }
            if (!engPU.Equals(string.Empty) && !engType.Equals(string.Empty))
            {
                features.Add(engPU + engType);
            }
        }

        /// <summary>Maps the Unicode general category of <paramref name="ch"/> to its CHARTYPE feature string.</summary>
        private static string CharTypeFeature(char ch)
        {
            switch (char.GetUnicodeCategory(ch))
            {
            case System.Globalization.UnicodeCategory.UppercaseLetter:
            case System.Globalization.UnicodeCategory.LowercaseLetter:
            {
                // A-Z and full-width A-Z
                // a-z and full-width a-z
                return("CHARTYPE-LETTER");
            }

            case System.Globalization.UnicodeCategory.DecimalDigitNumber:
            {
                return("CHARTYPE-DECIMAL_DIGIT_NUMBER");
            }

            case System.Globalization.UnicodeCategory.OtherLetter:
            {
                // mostly chinese chars
                return("CHARTYPE-OTHER_LETTER");
            }

            default:
            {
                // other types
                return("CHARTYPE-MISC");
            }
            }
        }