/// <summary>Applies feature count thresholds to the Dataset.</summary>
/// <remarks>
/// Every feature in <c>featureIndex</c> is rendered with <c>ToString()</c> and tested
/// against the patterns in list order. The first pattern whose matcher reports a match
/// decides the feature's fate: the feature is kept only if its count (from
/// <c>GetFeatureCounts()</c>) is at least that pair's threshold. Later patterns are not
/// consulted. Features that match no pattern at all are kept unconditionally.
/// Afterwards every row of <c>data</c> is rewritten against the new feature index,
/// dropping the entries of removed features.
/// </remarks>
/// <param name="thresholds">a list of pattern, threshold pairs</param>
public virtual void ApplyFeatureCountThreshold(IList<Pair<Pattern, int>> thresholds)
{
    // get feature counts
    float[] counts = GetFeatureCounts();

    // build a new featureIndex
    IIndex<F> newFeatureIndex = new HashIndex<F>();
    foreach (F f in featureIndex)
    {
        // Stays true when no pattern matches, i.e. unmatched features are kept.
        bool keep = true;
        foreach (Pair<Pattern, int> threshold in thresholds)
        {
            Pattern p = threshold.First();
            Matcher m = p.Matcher(f.ToString());
            if (m.Matches())
            {
                keep = counts[featureIndex.IndexOf(f)] >= threshold.second;
                // Only the first matching pattern counts. Leave the inner loop and move
                // on to the next feature without restarting the outer enumeration.
                break;
            }
        }
        if (keep)
        {
            newFeatureIndex.Add(f);
        }
    }

    // Map every old feature id to its id in the new index. This relies on IndexOf
    // returning a negative value for features that were dropped.
    int[] featMap = new int[featureIndex.Size()];
    for (int i = 0; i < featMap.Length; i++)
    {
        featMap[i] = newFeatureIndex.IndexOf(featureIndex.Get(i));
    }

    // Rewrite each datum, keeping only the features that survived.
    for (int i = 0; i < size; i++)
    {
        IList<int> featList = new List<int>(data[i].Length);
        for (int j = 0; j < data[i].Length; j++)
        {
            if (featMap[data[i][j]] >= 0)
            {
                featList.Add(featMap[data[i][j]]);
            }
        }
        data[i] = new int[featList.Count];
        for (int j = 0; j < data[i].Length; j++)
        {
            data[i][j] = featList[j];
        }
    }

    // Swap the index in only after all rows have been remapped.
    featureIndex = newFeatureIndex;
}
/// <summary>
/// Builds the fixture: <c>underlying</c> holds "foo0" through "foo4", and <c>spillover</c>
/// is a DeltaIndex created over it that is then given one entry already present in
/// <c>underlying</c> ("foo1") and two new ones ("foo5", "foo6").
/// </summary>
protected virtual void SetUp()
{
    string[] baseEntries = { "foo0", "foo1", "foo2", "foo3", "foo4" };
    string[] deltaEntries = { "foo1", "foo5", "foo6" };

    underlying = new HashIndex<string>();
    foreach (string entry in baseEntries)
    {
        underlying.Add(entry);
    }
    // Sanity check: all five base entries must be present before layering the delta.
    NUnit.Framework.Assert.AreEqual(5, underlying.Count);

    spillover = new DeltaIndex<string>(underlying);
    foreach (string entry in deltaEntries)
    {
        spillover.Add(entry);
    }
}
/// <summary>
/// Creates the parameter index: for each class, one entry for the prior probability
/// followed by one entry per (class, feature, value) combination.
/// </summary>
/// <returns>
/// an index containing <c>IntUni(c)</c> and <c>IntTriple(c, f, val)</c> for every class
/// <c>c</c>, feature <c>f</c> and value <c>val</c> below <c>numValues[f]</c>
/// </returns>
protected internal virtual IIndex<IntTuple> CreateIndex()
{
    IIndex<IntTuple> parameterIndex = new HashIndex<IntTuple>();
    for (int cls = 0; cls < numClasses; cls++)
    {
        // The class-only tuple comes first, then that class's feature/value tuples.
        parameterIndex.Add(new IntUni(cls));
        for (int feat = 0; feat < numFeatures; feat++)
        {
            int valueCount = numValues[feat];
            for (int value = 0; value < valueCount; value++)
            {
                parameterIndex.Add(new IntTriple(cls, feat, value));
            }
        }
    }
    return parameterIndex;
}
/// <summary>
/// Creates an index of (x, y) pairs, where x ranges over the positions of <c>px</c>
/// and, for each x, y ranges from 0 up to (but excluding) <c>NumY(x)</c>.
/// </summary>
/// <returns>the index of all such pairs, added in x-major order</returns>
public virtual IIndex<IntPair> CreateIndex()
{
    IIndex<IntPair> pairIndex = new HashIndex<IntPair>();
    int xCount = px.Length;
    for (int xi = 0; xi < xCount; xi++)
    {
        // NumY is queried once per x, before any of that x's pairs are added.
        int yCount = NumY(xi);
        int yi = 0;
        while (yi < yCount)
        {
            pairIndex.Add(new IntPair(xi, yi));
            yi++;
        }
    }
    return pairIndex;
}
/// <summary>Applies a feature max count threshold to the RVFDataset.</summary>
/// <remarks>
/// A feature survives only when its count from <c>GetFeatureCounts()</c> is less than or
/// equal to <paramref name="k"/>. Everything else is removed from the feature index, and
/// the matching entries are removed from each datum's feature and value arrays.
/// </remarks>
/// <param name="k">the largest count a feature may have and still be kept</param>
public override void ApplyFeatureMaxCountThreshold(int k)
{
    float[] featureCounts = GetFeatureCounts();
    HashIndex<F> compacted = new HashIndex<F>();

    // remap[old id] = new id, or -1 when the feature is expunged.
    int[] remap = new int[featureIndex.Size()];
    for (int oldId = 0; oldId < remap.Length; oldId++)
    {
        F feature = featureIndex.Get(oldId);
        bool withinLimit = featureCounts[oldId] <= k;
        if (!withinLimit)
        {
            remap[oldId] = -1;
            continue;
        }
        // The new id is the index size before the feature is appended.
        remap[oldId] = compacted.Count;
        compacted.Add(feature);
    }
    featureIndex = compacted;

    for (int row = 0; row < size; row++)
    {
        int[] oldFeatures = data[row];
        double[] oldValues = values[row];
        IList<int> keptFeatures = new List<int>(oldFeatures.Length);
        IList<double> keptValues = new List<double>(oldValues.Length);

        // Features and values are filtered in lockstep so they stay aligned.
        for (int pos = 0; pos < oldFeatures.Length; pos++)
        {
            int newId = remap[oldFeatures[pos]];
            if (newId < 0)
            {
                continue;
            }
            keptFeatures.Add(newId);
            keptValues.Add(oldValues[pos]);
        }

        int[] newFeatures = new int[keptFeatures.Count];
        double[] newValues = new double[keptValues.Count];
        for (int pos = 0; pos < newFeatures.Length; pos++)
        {
            newFeatures[pos] = keptFeatures[pos];
            newValues[pos] = keptValues[pos];
        }
        data[row] = newFeatures;
        values[row] = newValues;
    }
}
/// <summary>Retains the given features in the Dataset.</summary>
/// <remarks>
/// Any feature of the current index that is not contained in
/// <paramref name="features"/> is removed from the feature index, and its occurrences
/// are removed from every datum.
/// </remarks>
/// <param name="features">the features to keep</param>
public virtual void RetainFeatures(ICollection<F> features)
{
    IIndex<F> compacted = new HashIndex<F>();

    // remap[old id] = new id, or -1 when the feature is not retained.
    int[] remap = new int[featureIndex.Size()];
    for (int oldId = 0; oldId < remap.Length; oldId++)
    {
        F feature = featureIndex.Get(oldId);
        if (!features.Contains(feature))
        {
            remap[oldId] = -1;
            continue;
        }
        // The new id is the index size before the feature is appended.
        remap[oldId] = compacted.Size();
        compacted.Add(feature);
    }
    featureIndex = compacted;

    for (int row = 0; row < size; row++)
    {
        int[] oldFeatures = data[row];
        IList<int> kept = new List<int>(oldFeatures.Length);
        for (int pos = 0; pos < oldFeatures.Length; pos++)
        {
            int newId = remap[oldFeatures[pos]];
            if (newId >= 0)
            {
                kept.Add(newId);
            }
        }

        int[] newFeatures = new int[kept.Count];
        for (int pos = 0; pos < newFeatures.Length; pos++)
        {
            newFeatures[pos] = kept[pos];
        }
        data[row] = newFeatures;
    }
}
/// <summary>Generic method to select features based on the feature scores vector provided as an argument.</summary>
/// <remarks>
/// <c>scores[i]</c> is paired with the feature at position <c>i</c> of the current
/// feature index. The pairs are sorted with <c>ScoredComparator.DescendingComparator</c>
/// and the first <paramref name="numFeatures"/> of them form the new feature index.
/// Each datum is then rewritten to contain only the selected features, expressed as
/// ids in the new index.
/// </remarks>
/// <param name="numFeatures">number of features to be selected.</param>
/// <param name="scores">a vector of size total number of features in the data.</param>
public virtual void SelectFeatures(int numFeatures, double[] scores)
{
    IList<ScoredObject<F>> scoredFeatures = new List<ScoredObject<F>>();
    for (int i = 0; i < scores.Length; i++)
    {
        scoredFeatures.Add(new ScoredObject<F>(featureIndex.Get(i), scores[i]));
    }
    scoredFeatures.Sort(ScoredComparator.DescendingComparator);

    IIndex<F> newFeatureIndex = new HashIndex<F>();
    for (int i = 0; i < scoredFeatures.Count && i < numFeatures; i++)
    {
        newFeatureIndex.Add(scoredFeatures[i].Object());
    }

    for (int i = 0; i < size; i++)
    {
        // Worst case every feature of the datum survives, so size the scratch array for that.
        int[] newData = new int[data[i].Length];
        int curIndex = 0;
        for (int j = 0; j < data[i].Length; j++)
        {
            int index = newFeatureIndex.IndexOf(featureIndex.Get(data[i][j]));
            if (index != -1)
            {
                newData[curIndex++] = index;
            }
        }

        // Both arrays are local to this iteration, so the copy needs no synchronization.
        // In particular it must not take a lock on a shared Type object.
        int[] newDataTrimmed = new int[curIndex];
        System.Array.Copy(newData, 0, newDataTrimmed, 0, curIndex);
        data[i] = newDataTrimmed;
    }

    // Swap the index in last: the loop above still needs the old index for lookups.
    featureIndex = newFeatureIndex;
}
/// <summary>
/// Loads the text-serialized model sections that follow the base classifier data.
/// </summary>
/// <remarks>
/// After delegating to the base implementation, the sections are read in this order:
/// <c>nodeFeatureIndicesMap</c>, <c>edgeFeatureIndicesMap</c>, then either
/// <c>inputLayerWeights4Edge</c> and <c>outputLayerWeights4Edge</c> (when
/// <c>flags.secondOrderNonLinear</c> is set) or <c>linearWeights</c>, and finally
/// <c>inputLayerWeights</c> and <c>outputLayerWeights</c>.
/// Each section starts with a header line "name, separator, count" followed by
/// <c>count</c> rows. A field is assigned only after its section has been read
/// completely.
/// </remarks>
/// <param name="br">reader positioned at the start of the serialized classifier</param>
/// <exception cref="System.Exception">
/// if a header does not match, a row is out of sequence, a row's declared length
/// disagrees with its values, or the input ends early
/// </exception>
protected internal override void LoadTextClassifier(BufferedReader br)
{
    base.LoadTextClassifier(br);

    nodeFeatureIndicesMap = ReadTextIndexMap(br, "nodeFeatureIndicesMap.size()=", "format error in nodeFeatureIndicesMap");
    edgeFeatureIndicesMap = ReadTextIndexMap(br, "edgeFeatureIndicesMap.size()=", "format error");

    if (flags.secondOrderNonLinear)
    {
        inputLayerWeights4Edge = ReadTextWeightMatrix(br, "inputLayerWeights4Edge.length=");
        outputLayerWeights4Edge = ReadTextWeightMatrix(br, "outputLayerWeights4Edge.length=");
    }
    else
    {
        linearWeights = ReadTextWeightMatrix(br, "linearWeights.length=");
    }

    inputLayerWeights = ReadTextWeightMatrix(br, "inputLayerWeights.length=");
    outputLayerWeights = ReadTextWeightMatrix(br, "outputLayerWeights.length=");
}

/// <summary>Reads one line and splits it into at least two fields, or throws with <paramref name="errorMessage"/>.</summary>
private static string[] ReadTextFields(BufferedReader br, string errorMessage)
{
    string line = br.ReadLine();
    if (line == null)
    {
        // Input ended early: report it as a format problem rather than a null dereference.
        throw new Exception(errorMessage);
    }
    // NOTE(review): the separator argument is carried over unchanged from the original code.
    // Confirm that Split("\\t") resolves to the port's regex-style split (i.e. a tab) and
    // not to a literal backslash-t separator.
    string[] toks = line.Split("\\t");
    if (toks.Length < 2)
    {
        throw new Exception(errorMessage);
    }
    return toks;
}

/// <summary>Reads a section header line, checks its name and returns the count that follows it.</summary>
private static int ReadTextHeaderCount(BufferedReader br, string expectedHeader, string errorMessage)
{
    string[] toks = ReadTextFields(br, errorMessage);
    if (!toks[0].Equals(expectedHeader))
    {
        throw new Exception(errorMessage);
    }
    return System.Convert.ToInt32(toks[1]);
}

/// <summary>Reads an index-map section: a header, then rows of "position, separator, value" in ascending position order.</summary>
private static HashIndex<int> ReadTextIndexMap(BufferedReader br, string expectedHeader, string headerErrorMessage)
{
    int expectedSize = ReadTextHeaderCount(br, expectedHeader, headerErrorMessage);
    HashIndex<int> map = new HashIndex<int>();
    for (int count = 0; count < expectedSize; count++)
    {
        string[] toks = ReadTextFields(br, "format error");
        int idx = System.Convert.ToInt32(toks[0]);
        // Rows must be numbered consecutively from zero.
        if (count != idx)
        {
            throw new Exception("format error");
        }
        map.Add(System.Convert.ToInt32(toks[1]));
    }
    return map;
}

/// <summary>Reads a weight-matrix section: a header, then rows of "rowLength, separator, space-separated values".</summary>
private static double[][] ReadTextWeightMatrix(BufferedReader br, string expectedHeader)
{
    int rowCount = ReadTextHeaderCount(br, expectedHeader, "format error");
    double[][] weights = new double[rowCount][];
    for (int count = 0; count < rowCount; count++)
    {
        string[] toks = ReadTextFields(br, "weights format error");
        int weights2Length = System.Convert.ToInt32(toks[0]);
        weights[count] = new double[weights2Length];
        string[] weightsValue = toks[1].Split(" ");
        // The declared row length must agree with the number of values actually present.
        if (weights2Length != weightsValue.Length)
        {
            throw new Exception("weights format error");
        }
        for (int i2 = 0; i2 < weights2Length; i2++)
        {
            // NOTE(review): double.ParseDouble is kept exactly as in the original code.
            // Confirm it resolves to the port's Java-compatibility helper.
            weights[count][i2] = double.ParseDouble(weightsValue[i2]);
        }
    }
    return weights;
}