Пример #1
0
        /// <summary>
        /// Preprocessing phase: the first of the two passes needed to estimate the classifier parameters.
        /// The variance of a category/feature combination can only be computed once its average is known,
        /// so this pass gathers the raw sums and counts that the second pass turns into averages and variances.
        ///
        /// For every training item it:
        /// - increments <c>CategoryItemsCount</c> for the item's category (used later for the a priori probability);
        /// - for continuous features, adds the value to <c>CategoryFeatureAvg</c> (still a running sum at this point),
        ///   stores the value in <c>CategoryFeatureValues</c> and increments <c>CategoryFeatureItemsCount</c>;
        /// - for string and boolean features, increments <c>CategoryFeatureItemsCount</c> for every column whose
        ///   value equals 1 (used later for the a posteriori probability).
        /// </summary>
        private void PreprocessData()
        {
            // The feature list is not modified while iterating, so its length is evaluated once.
            var featureCount = Description.Features.Count();

            for (int i = 0; i < _itemsCount; i++)
            {
                int category = ConvertToCategoryIdentifier(Description.Label.Type, Y[i]);

                CategoryItemsCount[category]++;

                var values = X[i];

                // j is the column inside the item's value vector. It differs from the feature index
                // because a string feature occupies one column per dictionary word.
                int j = 0;
                for (int featureIndex = 0; featureIndex < featureCount; featureIndex++)
                {
                    var feature = Description.Features[featureIndex];

                    if (ContinuesTypes.Contains(feature.Type))
                    {
                        var value = values[j];

                        // Accumulate the sum; it is divided by the count later to obtain the average.
                        CategoryFeatureAvg[category][j] += value;

                        // Keep the individual values per category/feature; the list is created on first use.
                        if (CategoryFeatureValues[category][j] == null)
                        {
                            CategoryFeatureValues[category][j] = new List<double>();
                        }
                        CategoryFeatureItemsCount[category][j]++;
                        CategoryFeatureValues[category][j].Add(value);
                    }

                    // Single type test: 'as' yields null when the feature is not a StringProperty.
                    var stringProperty = feature as StringProperty;
                    if (stringProperty != null)
                    {
                        var wordCount = stringProperty.Dictionary.Count();

                        // Each string is encoded as a binary vector with one column per dictionary word;
                        // count the columns that are set for this item.
                        for (int k = 0; k < wordCount; k++)
                        {
                            if (values[j + k] == 1)
                            {
                                CategoryFeatureItemsCount[category][j + k]++;
                            }
                        }

                        // Move to the last column of this feature; the j++ below steps past it.
                        j += wordCount - 1;
                    }

                    if (feature.Type == typeof(bool))
                    {
                        if (values[j] == 1)
                        {
                            CategoryFeatureItemsCount[category][j]++;
                        }
                    }

                    j++;
                } // features
            }     // items
        }
Пример #2
0
        /// <summary>
        /// Builds the model: fills <c>Apriory</c>, <c>Posteriory</c>, <c>CategoryFeatureAvg</c> and
        /// <c>CategoryFeatureVariance</c> with the appropriate values.
        ///
        /// The item, feature and category counts are read from <c>X</c> and <c>Y</c>, the working arrays are
        /// initialized, <c>PreprocessData</c> gathers the per-category sums and counts, and then for every category
        /// the a priori probability is computed together with, per feature, either the average and variance
        /// (continuous features) or the a posteriori probability (string and boolean features).
        /// </summary>
        private void BuildTheModel()
        {
            _itemsCount    = Y.Count();
            _featuresCount = X.Cols;
            _categoryCount = Y.Distinct().Count();

            // Initialize the two-dimensional arrays.
            InitializeArrays();

            // Prepare the data: sums and item counts for each category/feature.
            PreprocessData();

            // The feature list is not modified while iterating, so its length is evaluated once.
            var featureCount = Description.Features.Count();

            for (int i = 0; i < _categoryCount; i++)
            {
                // Force floating-point division: with integral counters the quotient would be truncated
                // and the prior would collapse to 0 (or 1 when only one category exists).
                Apriory[i] = (double)CategoryItemsCount[i] / _itemsCount;

                // j is the column inside the value vector. It differs from the feature index
                // because a string feature occupies one column per dictionary word.
                int j = 0;
                for (int featureIndex = 0; featureIndex < featureCount; featureIndex++)
                {
                    var feature = Description.Features[featureIndex];

                    if (ContinuesTypes.Contains(feature.Type))
                    {
                        // The field holds the sum of the values so far; divide it by the count to get the average.
                        // NOTE(review): a category with no samples would leave the value list null and the
                        // count at zero here - presumably cannot happen because categories come from Y; confirm.
                        CategoryFeatureAvg[i][j] /= CategoryFeatureItemsCount[i][j];
                        var variance = Helper.Variance(CategoryFeatureValues[i][j].ToArray(), CategoryFeatureAvg[i][j]);
                        CategoryFeatureVariance[i][j] = variance;
                    }

                    // Single type test: 'as' yields null when the feature is not a StringProperty.
                    var stringProperty = feature as StringProperty;
                    if (stringProperty != null)
                    {
                        var wordCount = stringProperty.Dictionary.Count();

                        // One probability per dictionary word column of the string feature.
                        for (int k = 0; k < wordCount; k++)
                        {
                            Posteriory[i][j + k] = GetProbability(i, j + k);
                        }

                        // Move to the last column of this feature; the j++ below steps past it.
                        j += wordCount - 1;
                    }

                    if (feature.Type == typeof(bool))
                    {
                        Posteriory[i][j] = GetProbability(i, j);
                    }

                    j++;
                }
            }
        }