/// <summary>
/// Preprocessing phase of the two-pass estimation of the classifier parameters.
/// The variance of a category/feature combination can only be computed once its average
/// is known, so this pass gathers the raw material first:
/// <list type="bullet">
/// <item><description>the number of items per category (used later for the a priori probability),</description></item>
/// <item><description>for continuous features: the running sum, the item count and the list of raw values
/// per category/feature column,</description></item>
/// <item><description>for string and boolean features: how many items of the category have the column set to 1
/// (used later for the a posteriori probability).</description></item>
/// </list>
/// </summary>
private void PreprocessData()
{
    // The number of described features does not change while iterating; evaluate it once.
    int featureCount = Description.Features.Count();

    for (int i = 0; i < _itemsCount; i++)
    {
        int category = ConvertToCategoryIdentifier(Description.Label.Type, Y[i]);
        CategoryItemsCount[category]++;

        var values = X[i];

        // j is the column index into the row of values. It can run ahead of the feature index
        // because a single string feature occupies one column per dictionary word.
        int j = 0;
        for (int featureIndex = 0; featureIndex < featureCount; featureIndex++)
        {
            var feature = Description.Features[featureIndex];

            if (ContinuesTypes.Contains(feature.Type))
            {
                var value = values[j];

                // Accumulate the sum; it is divided by the count later to obtain the average.
                CategoryFeatureAvg[category][j] += value;

                // Keep the raw values per category/feature; the list is created lazily.
                if (CategoryFeatureValues[category][j] == null)
                {
                    CategoryFeatureValues[category][j] = new List<double>();
                }

                CategoryFeatureItemsCount[category][j]++;
                CategoryFeatureValues[category][j].Add(value);
            }

            // A single 'as' cast plus null check replaces the former 'is' followed by 'as'.
            var stringProperty = feature as StringProperty;
            if (stringProperty != null)
            {
                var wordCount = stringProperty.Dictionary.Count();

                // Each string is encoded as a binary vector, one column per dictionary word.
                for (int k = 0; k < wordCount; k++)
                {
                    if (values[j + k] == 1)
                    {
                        CategoryFeatureItemsCount[category][j + k]++;
                    }
                }

                // Skip the columns of this string feature; the trailing j++ consumes the last one.
                j += wordCount - 1;
            }

            if (feature.Type == typeof(bool))
            {
                if (values[j] == 1)
                {
                    CategoryFeatureItemsCount[category][j]++;
                }
            }

            j++;
        } // features
    } // items
}
/// <summary>
/// Builds the model: fills <c>Apriory</c>, <c>Posteriory</c>, <c>CategoryFeatureAvg</c> and
/// <c>CategoryFeatureVariance</c> with the appropriate values.
/// The item, feature and category counts are taken from <c>X</c> and <c>Y</c>, the working arrays
/// are initialized, the data is preprocessed, and then the parameters of every category/feature
/// combination are derived from the preprocessed sums and counts.
/// </summary>
private void BuildTheModel()
{
    _itemsCount = Y.Count();
    _featuresCount = X.Cols;
    _categoryCount = Y.Distinct().Count();

    // Initialize the two-dimensional arrays.
    InitializeArrays();

    // Prepare the data: sums and item counts for each category/feature combination.
    PreprocessData();

    // The number of described features does not change while iterating; evaluate it once.
    int featureCount = Description.Features.Count();

    for (int i = 0; i < _categoryCount; i++)
    {
        // Force floating-point division: with integral counters the quotient would be
        // truncated (to 0 for every category that does not contain all items).
        // The cast is a no-op if the counters are already stored as double.
        Apriory[i] = (double)CategoryItemsCount[i] / _itemsCount;

        // j is the column index; it can run ahead of the feature index because a single
        // string feature occupies one column per dictionary word.
        int j = 0;
        for (int featureIndex = 0; featureIndex < featureCount; featureIndex++)
        {
            var feature = Description.Features[featureIndex];

            if (ContinuesTypes.Contains(feature.Type))
            {
                // So far this field holds the sum of the values;
                // divide it by the count to obtain the average.
                CategoryFeatureAvg[i][j] /= CategoryFeatureItemsCount[i][j];

                var variance = Helper.Variance(CategoryFeatureValues[i][j].ToArray(), CategoryFeatureAvg[i][j]);
                CategoryFeatureVariance[i][j] = variance;
            }

            // A single 'as' cast plus null check replaces the former 'is' followed by 'as'.
            var stringProperty = feature as StringProperty;
            if (stringProperty != null)
            {
                var wordCount = stringProperty.Dictionary.Count();

                // One probability per dictionary word column, as computed by GetProbability.
                for (int k = 0; k < wordCount; k++)
                {
                    Posteriory[i][j + k] = GetProbability(i, j + k);
                }

                // Skip the columns of this string feature; the trailing j++ consumes the last one.
                j += wordCount - 1;
            }

            if (feature.Type == typeof(bool))
            {
                Posteriory[i][j] = GetProbability(i, j);
            }

            j++;
        }
    }
}