Пример #1
0
        /// <summary>Applies feature count thresholds to the Dataset.</summary>
        /// <remarks>
        /// Every feature in the current feature index is tested against the patterns in
        /// <paramref name="thresholds"/>, in list order, using the feature's <c>ToString()</c> form.
        /// The first pattern that matches decides the outcome: the feature is kept only if its
        /// entry in the array returned by <c>GetFeatureCounts()</c> is at least that pair's threshold.
        /// A feature that matches none of the patterns is always kept.
        /// Afterwards each datum's feature array is rewritten against the new index, dropping
        /// the features that were removed, and the feature index is replaced.
        /// </remarks>
        /// <param name="thresholds">a list of pattern, threshold pairs</param>
        public virtual void ApplyFeatureCountThreshold(IList <Pair <Pattern, int> > thresholds)
        {
            // get feature counts (indexed by position in the current featureIndex)
            float[] counts = GetFeatureCounts();
            // build a new featureIndex
            IIndex <F> newFeatureIndex = new HashIndex <F>();

            foreach (F f in featureIndex)
            {
                // Features that match no pattern are kept by default.
                bool keep = true;
                foreach (Pair <Pattern, int> threshold in thresholds)
                {
                    Pattern p = threshold.First();
                    Matcher m = p.Matcher(f.ToString());
                    if (m.Matches())
                    {
                        // Only the first matching pattern is consulted. Leaving the inner loop
                        // with `break` moves on to the next feature; the previous `goto` to a
                        // label placed before the outer loop restarted the whole enumeration
                        // and never terminated once any feature matched.
                        keep = counts[featureIndex.IndexOf(f)] >= threshold.second;
                        break;
                    }
                }
                if (keep)
                {
                    newFeatureIndex.Add(f);
                }
            }

            // Map every old feature position to its position in the new index
            // (negative when the feature was dropped).
            int[] featMap = new int[featureIndex.Size()];
            for (int i = 0; i < featMap.Length; i++)
            {
                featMap[i] = newFeatureIndex.IndexOf(featureIndex.Get(i));
            }

            // Rewrite each datum, keeping only the features that survived.
            for (int d = 0; d < size; d++)
            {
                int[] oldFeatures = data[d];
                List <int> kept = new List <int>(oldFeatures.Length);
                for (int j = 0; j < oldFeatures.Length; j++)
                {
                    int mapped = featMap[oldFeatures[j]];
                    if (mapped >= 0)
                    {
                        kept.Add(mapped);
                    }
                }
                data[d] = kept.ToArray();
            }

            // Swap the index in only after the data has been rewritten, so the field is
            // never left null if something above throws.
            featureIndex = newFeatureIndex;
        }
        /// <summary>Generic method to select features based on the feature scores vector provided as an argument.</summary>
        /// <remarks>
        /// Pairs the feature at position <c>i</c> of the current feature index with <c>scores[i]</c>,
        /// sorts the pairs with <c>ScoredComparator.DescendingComparator</c> (presumably highest score
        /// first - confirm against that comparator), and keeps the first <paramref name="numFeatures"/>
        /// of them as the new feature index. Each datum's feature array is then rewritten against the
        /// new index, dropping features that were not selected.
        /// </remarks>
        /// <param name="numFeatures">number of features to be selected.</param>
        /// <param name="scores">a vector of size total number of features in the data.</param>
        public virtual void SelectFeatures(int numFeatures, double[] scores)
        {
            IList <ScoredObject <F> > scoredFeatures = new List <ScoredObject <F> >();

            for (int i = 0; i < scores.Length; i++)
            {
                scoredFeatures.Add(new ScoredObject <F>(featureIndex.Get(i), scores[i]));
            }
            scoredFeatures.Sort(ScoredComparator.DescendingComparator);

            // The leading numFeatures entries of the sorted list form the new index.
            IIndex <F> newFeatureIndex = new HashIndex <F>();
            for (int rank = 0; rank < scoredFeatures.Count && rank < numFeatures; rank++)
            {
                newFeatureIndex.Add(scoredFeatures[rank].Object());
            }

            // Resolve old position -> new position once, instead of repeating the lookup
            // for every feature occurrence in every datum.
            int[] featMap = new int[featureIndex.Size()];
            for (int k = 0; k < featMap.Length; k++)
            {
                featMap[k] = newFeatureIndex.IndexOf(featureIndex.Get(k));
            }

            for (int d = 0; d < size; d++)
            {
                int[] oldFeatures = data[d];
                int[] remapped    = new int[oldFeatures.Length];
                int   keptCount   = 0;
                for (int j = 0; j < oldFeatures.Length; j++)
                {
                    int mapped = featMap[oldFeatures[j]];
                    if (mapped != -1)
                    {
                        remapped[keptCount++] = mapped;
                    }
                }
                // Both arrays are local to this method, so the copy needs no lock; the previous
                // lock on a Type object guarded nothing and could contend with unrelated code.
                int[] trimmed = new int[keptCount];
                System.Array.Copy(remapped, 0, trimmed, 0, keptCount);
                data[d] = trimmed;
            }
            featureIndex = newFeatureIndex;
        }