/// <summary>Applies feature count thresholds to the Dataset.</summary>
/// <remarks>
/// Each feature is tested against the patterns in list order, using the feature's
/// <c>ToString()</c> form. The first pattern whose matcher reports a match decides the
/// outcome: the feature is kept only if its count is at least that pair's threshold.
/// A feature that matches no pattern is always kept.
/// Afterwards the feature index is replaced by the filtered index and every row of
/// <c>data</c> is rewritten with the new feature ids, dropping ids of removed features.
/// </remarks>
/// <param name="thresholds">a list of pattern, threshold pairs</param>
public virtual void ApplyFeatureCountThreshold(IList<Pair<Pattern, int>> thresholds)
{
    // get feature counts
    float[] counts = GetFeatureCounts();

    // build a new featureIndex
    IIndex<F> newFeatureIndex = new HashIndex<F>();
    foreach (F f in featureIndex)
    {
        // Kept by default: a feature that matches nothing on the list survives.
        bool keep = true;
        foreach (Pair<Pattern, int> threshold in thresholds)
        {
            Pattern p = threshold.First();
            Matcher m = p.Matcher(f.ToString());
            if (m.Matches())
            {
                // First matching pattern wins. A flag plus break is used instead of
                // "goto <label before the foreach>", which would restart the whole
                // enumeration rather than advance to the next feature.
                keep = counts[featureIndex.IndexOf(f)] >= threshold.second;
                break;
            }
        }
        if (keep)
        {
            newFeatureIndex.Add(f);
        }
    }

    // Old feature id -> new feature id; the >= 0 test below treats a negative
    // IndexOf result as "feature was removed".
    int[] featMap = new int[featureIndex.Size()];
    for (int i = 0; i < featMap.Length; i++)
    {
        featMap[i] = newFeatureIndex.IndexOf(featureIndex.Get(i));
    }

    // Rewrite each datum in terms of the new ids, skipping removed features.
    for (int i = 0; i < size; i++)
    {
        int[] oldRow = data[i];
        List<int> featList = new List<int>(oldRow.Length);
        for (int j = 0; j < oldRow.Length; j++)
        {
            int mapped = featMap[oldRow[j]];
            if (mapped >= 0)
            {
                featList.Add(mapped);
            }
        }
        data[i] = featList.ToArray();
    }

    featureIndex = newFeatureIndex;
}
/// <summary>Generic method to select features based on the feature scores vector provided as an argument.</summary>
/// <remarks>
/// Pairs <c>scores[i]</c> with the feature at index <c>i</c>, sorts the pairs with
/// <c>ScoredComparator.DescendingComparator</c>, and keeps at most
/// <paramref name="numFeatures"/> features from the front of the sorted list.
/// The feature index is then replaced by the reduced index and every row of
/// <c>data</c> is rewritten with the new feature ids, dropping features that were not selected.
/// </remarks>
/// <param name="numFeatures">number of features to be selected.</param>
/// <param name="scores">a vector of size total number of features in the data.</param>
public virtual void SelectFeatures(int numFeatures, double[] scores)
{
    IList<ScoredObject<F>> scoredFeatures = new List<ScoredObject<F>>();
    for (int i = 0; i < scores.Length; i++)
    {
        scoredFeatures.Add(new ScoredObject<F>(featureIndex.Get(i), scores[i]));
    }
    scoredFeatures.Sort(ScoredComparator.DescendingComparator);

    // The new index holds only the leading numFeatures entries of the sorted list.
    IIndex<F> newFeatureIndex = new HashIndex<F>();
    for (int i = 0; i < scoredFeatures.Count && i < numFeatures; i++)
    {
        newFeatureIndex.Add(scoredFeatures[i].Object());
    }

    // Re-express each datum with the new ids. The row is collected in a local list
    // and converted with ToArray(); the buffers are method-local, so no lock is
    // needed (and locking on a Type object would be a process-wide lock).
    for (int i = 0; i < size; i++)
    {
        int[] oldRow = data[i];
        List<int> kept = new List<int>(oldRow.Length);
        for (int j = 0; j < oldRow.Length; j++)
        {
            int index = newFeatureIndex.IndexOf(featureIndex.Get(oldRow[j]));
            if (index != -1)
            {
                kept.Add(index);
            }
        }
        data[i] = kept.ToArray();
    }

    featureIndex = newFeatureIndex;
}