예제 #1
0
        /// <summary>Applies per-pattern feature count thresholds to the Dataset.</summary>
        /// <remarks>
        /// For each feature, the thresholds are tried in list order and the first
        /// pattern whose matcher accepts the feature's string form decides its fate:
        /// the feature is kept only if its count is at least that pair's threshold.
        /// Features that match none of the patterns are always kept.
        /// The feature index and every row of <c>data</c> are rewritten to use the
        /// surviving features only.
        /// </remarks>
        /// <param name="thresholds">a list of pattern, threshold pairs</param>
        public virtual void ApplyFeatureCountThreshold(IList <Pair <Pattern, int> > thresholds)
        {
            // get feature counts
            float[] counts = GetFeatureCounts();
            // build a new featureIndex
            IIndex <F> newFeatureIndex = new HashIndex <F>();

            foreach (F f in featureIndex)
            {
                // Tracks whether some pattern already decided this feature. The previous
                // translation used "goto" to a label placed before the foreach, which
                // restarted the whole enumeration and never terminated once any feature
                // matched a pattern.
                bool decidedByPattern = false;
                foreach (Pair <Pattern, int> threshold in thresholds)
                {
                    Pattern p = threshold.First();
                    Matcher m = p.Matcher(f.ToString());
                    if (m.Matches())
                    {
                        if (counts[featureIndex.IndexOf(f)] >= threshold.second)
                        {
                            newFeatureIndex.Add(f);
                        }
                        // Only the first matching pattern is consulted.
                        decidedByPattern = true;
                        break;
                    }
                }
                if (!decidedByPattern)
                {
                    // we only get here if it didn't match anything on the list
                    newFeatureIndex.Add(f);
                }
            }

            // Drop references that are no longer needed before the rows are rebuilt
            // (carried over from the original; presumably a memory hint).
            counts = null;
            // Map every old feature id to its id in the new index. Negative entries are
            // treated below as "feature removed" (assumes IndexOf reports absent
            // features with a negative value - TODO confirm against IIndex).
            int[] featMap = new int[featureIndex.Size()];
            for (int i = 0; i < featMap.Length; i++)
            {
                featMap[i] = newFeatureIndex.IndexOf(featureIndex.Get(i));
            }
            featureIndex = null;
            for (int i_1 = 0; i_1 < size; i_1++)
            {
                // Collect the remapped ids of the features that survive in this row.
                IList <int> featList = new List <int>(data[i_1].Length);
                for (int j = 0; j < data[i_1].Length; j++)
                {
                    if (featMap[data[i_1][j]] >= 0)
                    {
                        featList.Add(featMap[data[i_1][j]]);
                    }
                }
                data[i_1] = new int[featList.Count];
                for (int j_1 = 0; j_1 < data[i_1].Length; j_1++)
                {
                    data[i_1][j_1] = featList[j_1];
                }
            }
            featureIndex = newFeatureIndex;
        }
예제 #2
0
 /// <summary>
 /// Builds the test fixture: a base index holding "foo0".."foo4" and a
 /// DeltaIndex layered over it that receives "foo1", "foo5" and "foo6".
 /// </summary>
 protected virtual void SetUp()
 {
     string[] baseEntries = new string[] { "foo0", "foo1", "foo2", "foo3", "foo4" };
     // "foo1" is already present in the base index; the other two are new.
     string[] deltaEntries = new string[] { "foo1", "foo5", "foo6" };

     underlying = new HashIndex <string>();
     foreach (string entry in baseEntries)
     {
         underlying.Add(entry);
     }
     NUnit.Framework.Assert.AreEqual(5, underlying.Count);

     spillover = new DeltaIndex <string>(underlying);
     foreach (string entry in deltaEntries)
     {
         spillover.Add(entry);
     }
 }
        /// <summary>
        /// Creates an index with one entry per parameter: for every class an
        /// <c>IntUni(class)</c> entry, followed by an <c>IntTriple(class, feature, value)</c>
        /// entry for every feature and each of that feature's values.
        /// </summary>
        /// <returns>the newly built index</returns>
        protected internal virtual IIndex <IntTuple> CreateIndex()
        {
            IIndex <IntTuple> parameterIndex = new HashIndex <IntTuple>();

            for (int classId = 0; classId < numClasses; classId++)
            {
                // Per-class entry first, then all (class, feature, value) triples.
                parameterIndex.Add(new IntUni(classId));
                for (int featureId = 0; featureId < numFeatures; featureId++)
                {
                    int valueCount = numValues[featureId];
                    for (int valueId = 0; valueId < valueCount; valueId++)
                    {
                        parameterIndex.Add(new IntTriple(classId, featureId, valueId));
                    }
                }
            }

            return parameterIndex;
        }
        /// <summary>
        /// Creates an index containing an <c>IntPair(x, y)</c> for every x below
        /// <c>px.Length</c> and every y below <c>NumY(x)</c>, added in that nesting order.
        /// </summary>
        /// <returns>the newly built index</returns>
        public virtual IIndex <IntPair> CreateIndex()
        {
            IIndex <IntPair> pairIndex = new HashIndex <IntPair>();

            for (int xId = 0; xId < px.Length; xId++)
            {
                // NumY is queried once per x, before any pair for that x is added.
                int yCount = NumY(xId);
                for (int yId = 0; yId < yCount; yId++)
                {
                    pairIndex.Add(new IntPair(xId, yId));
                }
            }

            return pairIndex;
        }
        /// <summary>Applies a feature max count threshold to the RVFDataset.</summary>
        /// <remarks>
        /// Features whose count (as reported by <c>GetFeatureCounts</c>) is not
        /// less than or equal to <i>k</i> are expunged. The feature index is replaced
        /// by one holding only the surviving features, and each row of <c>data</c>
        /// and <c>values</c> is rebuilt with the surviving entries, renumbered.
        /// </remarks>
        /// <param name="k">largest count a feature may have and still be kept</param>
        public override void ApplyFeatureMaxCountThreshold(int k)
        {
            float[]       featureCounts = GetFeatureCounts();
            HashIndex <F> keptIndex     = new HashIndex <F>();

            // oldToNew[old id] is the id in keptIndex, or -1 when the feature is dropped.
            int[] oldToNew = new int[featureIndex.Size()];
            for (int oldId = 0; oldId < oldToNew.Length; oldId++)
            {
                F    feature = featureIndex.Get(oldId);
                bool keep    = featureCounts[oldId] <= k;
                if (!keep)
                {
                    oldToNew[oldId] = -1;
                    continue;
                }
                // The next free slot of the new index becomes this feature's id.
                oldToNew[oldId] = keptIndex.Count;
                keptIndex.Add(feature);
            }
            featureIndex = keptIndex;

            for (int row = 0; row < size; row++)
            {
                int[]          oldFeatures  = data[row];
                double[]       oldValues    = values[row];
                IList <int>    keptFeatures = new List <int>(oldFeatures.Length);
                IList <double> keptValues   = new List <double>(oldValues.Length);
                for (int col = 0; col < oldFeatures.Length; col++)
                {
                    int mapped = oldToNew[oldFeatures[col]];
                    if (mapped < 0)
                    {
                        continue;
                    }
                    keptFeatures.Add(mapped);
                    keptValues.Add(oldValues[col]);
                }

                // Pack the surviving entries into freshly sized arrays.
                int[]    packedFeatures = new int[keptFeatures.Count];
                double[] packedValues   = new double[keptValues.Count];
                keptFeatures.CopyTo(packedFeatures, 0);
                keptValues.CopyTo(packedValues, 0);
                data[row]   = packedFeatures;
                values[row] = packedValues;
            }
        }
예제 #6
0
        /// <summary>Retains the given features in the Dataset.</summary>
        /// <remarks>
        /// Every feature of the current index that is not contained in
        /// <paramref name="features"/> is expunged. The feature index is replaced by
        /// one holding only the retained features, and each row of <c>data</c> is
        /// rebuilt with the retained entries, renumbered.
        /// </remarks>
        /// <param name="features">the features to keep</param>
        public virtual void RetainFeatures(ICollection <F> features)
        {
            IIndex <F> retainedIndex = new HashIndex <F>();

            // oldToNew[old id] is the id in retainedIndex, or -1 when the feature is dropped.
            int[] oldToNew = new int[featureIndex.Size()];
            for (int oldId = 0; oldId < oldToNew.Length; oldId++)
            {
                F feature = featureIndex.Get(oldId);
                if (!features.Contains(feature))
                {
                    oldToNew[oldId] = -1;
                    continue;
                }
                // The size before insertion becomes this feature's new id.
                oldToNew[oldId] = retainedIndex.Size();
                retainedIndex.Add(feature);
            }
            featureIndex = retainedIndex;

            for (int row = 0; row < size; row++)
            {
                int[]       oldFeatures  = data[row];
                IList <int> keptFeatures = new List <int>(oldFeatures.Length);
                for (int col = 0; col < oldFeatures.Length; col++)
                {
                    int mapped = oldToNew[oldFeatures[col]];
                    if (mapped < 0)
                    {
                        continue;
                    }
                    keptFeatures.Add(mapped);
                }

                // Pack the surviving ids into a freshly sized array.
                int[] packedFeatures = new int[keptFeatures.Count];
                keptFeatures.CopyTo(packedFeatures, 0);
                data[row] = packedFeatures;
            }
        }
        /// <summary>Generic method to select features based on the feature scores vector provided as an argument.</summary>
        /// <remarks>
        /// The features are sorted with <c>ScoredComparator.DescendingComparator</c> and
        /// the first <paramref name="numFeatures"/> of them form the new feature index.
        /// Every row of <c>data</c> is rewritten to contain only the selected features,
        /// renumbered against the new index.
        /// </remarks>
        /// <param name="numFeatures">number of features to be selected.</param>
        /// <param name="scores">
        /// a vector of size total number of features in the data; <c>scores[i]</c> is
        /// paired with <c>featureIndex.Get(i)</c>.
        /// </param>
        public virtual void SelectFeatures(int numFeatures, double[] scores)
        {
            IList <ScoredObject <F> > scoredFeatures = new List <ScoredObject <F> >();

            for (int i = 0; i < scores.Length; i++)
            {
                scoredFeatures.Add(new ScoredObject <F>(featureIndex.Get(i), scores[i]));
            }
            scoredFeatures.Sort(ScoredComparator.DescendingComparator);
            IIndex <F> newFeatureIndex = new HashIndex <F>();

            for (int i_1 = 0; i_1 < scoredFeatures.Count && i_1 < numFeatures; i_1++)
            {
                newFeatureIndex.Add(scoredFeatures[i_1].Object());
            }
            for (int i_2 = 0; i_2 < size; i_2++)
            {
                // Worst case every feature of the row survives, so size the scratch
                // buffer like the original row and trim afterwards.
                int[] newData  = new int[data[i_2].Length];
                int   curIndex = 0;
                for (int j = 0; j < data[i_2].Length; j++)
                {
                    // -1 marks a feature that was not selected.
                    int index = newFeatureIndex.IndexOf(featureIndex.Get(data[i_2][j]));
                    if (index != -1)
                    {
                        newData[curIndex++] = index;
                    }
                }
                // Both arrays are local to this method, so the copy needs no
                // synchronization. The previous lock on a Type object guarded nothing
                // and could contend with unrelated code locking the same Type.
                int[] newDataTrimmed = new int[curIndex];
                System.Array.Copy(newData, 0, newDataTrimmed, 0, curIndex);
                data[i_2] = newDataTrimmed;
            }
            featureIndex = newFeatureIndex;
        }
예제 #8
0
        /// <summary>Loads the feature index maps and weight matrices of this classifier from its text form.</summary>
        /// <remarks>
        /// After delegating to the base implementation, the following sections are read
        /// in order: <c>nodeFeatureIndicesMap</c>, <c>edgeFeatureIndicesMap</c>, then
        /// either <c>inputLayerWeights4Edge</c> and <c>outputLayerWeights4Edge</c> (when
        /// <c>flags.secondOrderNonLinear</c> is set) or <c>linearWeights</c>, and finally
        /// <c>inputLayerWeights</c> and <c>outputLayerWeights</c>.
        /// Each section starts with a header line "name&lt;TAB&gt;count" followed by
        /// <c>count</c> data lines. Each field is assigned only after its section has
        /// been read completely.
        /// </remarks>
        /// <param name="br">reader supplying the serialized classifier, one record per line</param>
        /// <exception cref="System.Exception">
        /// when a header, a running index, or a weight row does not have the expected form,
        /// or the input ends early
        /// </exception>
        protected internal override void LoadTextClassifier(BufferedReader br)
        {
            base.LoadTextClassifier(br);

            nodeFeatureIndicesMap = ReadFeatureIndexSection(br, "nodeFeatureIndicesMap.size()=", "format error in nodeFeatureIndicesMap");
            edgeFeatureIndicesMap = ReadFeatureIndexSection(br, "edgeFeatureIndicesMap.size()=", "format error");

            if (flags.secondOrderNonLinear)
            {
                inputLayerWeights4Edge  = ReadWeightMatrixSection(br, "inputLayerWeights4Edge.length=");
                outputLayerWeights4Edge = ReadWeightMatrixSection(br, "outputLayerWeights4Edge.length=");
            }
            else
            {
                linearWeights = ReadWeightMatrixSection(br, "linearWeights.length=");
            }

            inputLayerWeights  = ReadWeightMatrixSection(br, "inputLayerWeights.length=");
            outputLayerWeights = ReadWeightMatrixSection(br, "outputLayerWeights.length=");
        }

        /// <summary>Reads the next line and splits it on tab characters, requiring at least two fields.</summary>
        private static string[] ReadTabSeparatedFields(BufferedReader br)
        {
            string line = br.ReadLine();
            if (line == null)
            {
                throw new Exception("format error: unexpected end of input");
            }
            // The Java original split on the regex "\\t", i.e. a tab. As a C# literal
            // "\\t" is a backslash followed by 't', so split on the tab character itself.
            string[] toks = line.Split('\t');
            if (toks.Length < 2)
            {
                throw new Exception("format error: expected two tab-separated fields");
            }
            return toks;
        }

        /// <summary>Reads a section header line, checks its name, and returns the count it announces.</summary>
        private static int ReadSectionSize(BufferedReader br, string expectedHeader, string headerErrorMessage)
        {
            string[] toks = ReadTabSeparatedFields(br);
            if (!toks[0].Equals(expectedHeader))
            {
                throw new Exception(headerErrorMessage);
            }
            return System.Convert.ToInt32(toks[1]);
        }

        /// <summary>Reads a feature index section: a header followed by "position&lt;TAB&gt;value" lines in ascending position order.</summary>
        private static HashIndex <int> ReadFeatureIndexSection(BufferedReader br, string expectedHeader, string headerErrorMessage)
        {
            int             expectedSize = ReadSectionSize(br, expectedHeader, headerErrorMessage);
            HashIndex <int> indexMap     = new HashIndex <int>();

            for (int count = 0; count < expectedSize; count++)
            {
                string[] toks = ReadTabSeparatedFields(br);
                int      idx  = System.Convert.ToInt32(toks[0]);
                // Entries must appear in order with no gaps.
                if (count != idx)
                {
                    throw new Exception("format error");
                }
                indexMap.Add(System.Convert.ToInt32(toks[1]));
            }
            return indexMap;
        }

        /// <summary>Reads a weight matrix section: a header followed by "rowLength&lt;TAB&gt;space-separated weights" lines.</summary>
        private static double[][] ReadWeightMatrixSection(BufferedReader br, string expectedHeader)
        {
            int        rowCount = ReadSectionSize(br, expectedHeader, "format error");
            double[][] weights  = new double[rowCount][];

            for (int row = 0; row < rowCount; row++)
            {
                string[] toks           = ReadTabSeparatedFields(br);
                int      weights2Length = System.Convert.ToInt32(toks[0]);
                string[] weightsValue   = toks[1].Split(' ');
                if (weights2Length != weightsValue.Length)
                {
                    throw new Exception("weights format error");
                }

                double[] rowWeights = new double[weights2Length];
                for (int i2 = 0; i2 < weights2Length; i2++)
                {
                    // Parse culture-independently so that a ',' decimal separator in the
                    // current culture cannot corrupt the serialized values.
                    rowWeights[i2] = double.Parse(
                        weightsValue[i2],
                        System.Globalization.NumberStyles.Float,
                        System.Globalization.CultureInfo.InvariantCulture);
                }
                weights[row] = rowWeights;
            }
            return weights;
        }