/// <summary>
/// Builds one perceptron from every supplied sample, turning the multi-class
/// labels into a binary target: +1 when the sample's class index is contained
/// in <paramref name="positiveClasses"/>, -1 otherwise.
/// </summary>
/// <param name="trainingData">Samples as (feature vector, class index) pairs.</param>
/// <param name="positiveClasses">Class indices that map to the +1 target.</param>
/// <param name="dimension">Value handed to the Perceptron constructor as its first argument.</param>
/// <returns>The perceptron after its Train call has completed.</returns>
private Perceptron buildPerceptronForAllData(IEnumerable<TupleStruct<double[], int>> trainingData, HashSet<int> positiveClasses, int dimension)
{
    Perceptron model = new Perceptron(dimension, normalizePerceptrons);

    // Re-label each sample with its one-vs-rest target before training.
    TupleStruct<double[], int>[] binarySamples =
        (from sample in trainingData
         select new TupleStruct<double[], int>(
             sample.Item1,
             positiveClasses.Contains(sample.Item2) ? 1 : -1)).ToArray();

    model.Train(binarySamples);
    return model;
}
/// <summary>
/// Builds one perceptron from every instance of every group. Each training
/// tuple carries the instance values, a binary target (+1 when the group's key
/// is in <paramref name="positiveClasses"/>, otherwise -1) and the value
/// 1.0 / groupSizes[key] (presumably a per-sample weight that evens out class
/// sizes; confirm against Perceptron.Train).
/// </summary>
/// <remarks>
/// The groups are accepted as IGrouping because the caller probably already
/// has one; a plain list per label would work just as well.
/// </remarks>
/// <param name="groupsByLabel">Training instances grouped by class index.</param>
/// <param name="positiveClasses">Class indices that map to the +1 target.</param>
/// <param name="groupSizes">Array indexed by class index; read as groupSizes[group key].</param>
/// <param name="dimension">Value handed to the Perceptron constructor as its first argument.</param>
/// <returns>The perceptron after its Train call has completed.</returns>
private Perceptron buildPerceptronEvenWeights(IEnumerable<IGrouping<int, LabeledInstance>> groupsByLabel, HashSet<int> positiveClasses, int[] groupSizes, int dimension)
{
    List<TupleStruct<double[], int, double>> weightedSamples = new List<TupleStruct<double[], int, double>>();

    foreach (IGrouping<int, LabeledInstance> labelGroup in groupsByLabel)
    {
        foreach (LabeledInstance member in labelGroup)
        {
            weightedSamples.Add(new TupleStruct<double[], int, double>(
                member.values,
                positiveClasses.Contains(labelGroup.Key) ? 1 : -1,
                1.0 / groupSizes[labelGroup.Key]));
        }
    }

    Perceptron model = new Perceptron(dimension, normalizePerceptrons);
    model.Train(weightedSamples.ToArray());
    return model;
}
/// <summary>
/// Builds one perceptron using at most <paramref name="minClassSize"/>
/// instances from each group, with a binary target of +1 when the group's key
/// is in <paramref name="positiveClasses"/> and -1 otherwise.
/// </summary>
/// <param name="groupsByLabel">Training instances grouped by class index.</param>
/// <param name="positiveClasses">Class indices that map to the +1 target.</param>
/// <param name="minClassSize">Upper bound on the number of instances taken from each group.</param>
/// <param name="dimension">Value handed to the Perceptron constructor as its first argument.</param>
/// <returns>The perceptron after its Train call has completed.</returns>
private Perceptron buildPerceptronEvenClassSizes(IEnumerable<IGrouping<int, LabeledInstance>> groupsByLabel, HashSet<int> positiveClasses, int minClassSize, int dimension)
{
    // TODO: Solve the numbers dilemma.
    // Approach 1 (in use): take a constant number from every class. If a negative
    // class has fewer instances, the missing ones are simply not taken, which may
    // inflate the positive class.
    TupleStruct<double[], int>[] cappedSamples =
        (from labelGroup in groupsByLabel
         from member in labelGroup.Take(minClassSize)
         select new TupleStruct<double[], int>(
             member.values,
             positiveClasses.Contains(labelGroup.Key) ? 1 : -1)).ToArray();

    // Approach 2 (not in use): take a constant number of negative examples.
    // Favors big negative classes.
    //
    //   TupleStruct<double[], int>[] positiveData = groupsByLabel.Where(grp => positiveClasses.Contains(grp.Key)).SelectMany(
    //       grp => grp.Take(minClassSize).Select(instance => new TupleStruct<double[], int>(instance.values, 1))
    //   ).ToArray();
    //
    //   // TODO: This approach has its own problems: the negative data is biased.
    //   TupleStruct<double[], int>[] negativeData = groupsByLabel.Where(grp => !positiveClasses.Contains(grp.Key)).Flatten1().Shuffle().Take(minClassSize * positiveClasses.Count)

    Perceptron model = new Perceptron(dimension, normalizePerceptrons);
    model.Train(cappedSamples);
    return model;
}
/// <summary>
/// Trains the ensemble: derives the sorted set of distinct labels, then builds
/// <c>ceil(log2(classCount) * perceptronCountFactor)</c> perceptrons, each of
/// which separates a randomly chosen subset of classes (positive) from the
/// remaining classes (negative). How the samples for each perceptron are
/// selected or weighted depends on <c>trainingMode</c>.
/// </summary>
/// <param name="trainingData">Labeled instances to train on. Enumerated once and buffered.</param>
/// <exception cref="ArgumentNullException"><paramref name="trainingData"/> is null.</exception>
/// <exception cref="InvalidOperationException"><paramref name="trainingData"/> contains no instances.</exception>
public void Train(IEnumerable<LabeledInstance> trainingData)
{
    if (trainingData == null)
    {
        throw new ArgumentNullException("trainingData");
    }

    // Buffer once; the data is walked several times below. TODO performance.
    LabeledInstance[] instances = trainingData.ToArray();
    if (instances.Length == 0)
    {
        // Fail before any field is overwritten; there is no first instance to read the dimension from.
        throw new InvalidOperationException("Cannot train on an empty training set.");
    }

    // Keep the IEnumerable-typed view for the extension-method calls below.
    trainingData = instances;

    classes = trainingData.Select(item => item.label).Distinct().Order().ToArray();
    Dictionary<string, int> classLookup = classes.IndexLookupDictionary();

    int dimension = instances[0].values.Length;

    // Need at least log_2(classes) perceptrons to be able to represent any item with a
    // TRUE combination; perceptronCountFactor scales that up to improve predictive power.
    // Note: with a single class log_2 is 0, so (for a finite factor) no perceptrons are built.
    int perceptronCount = (int)Math.Ceiling(Math.Log(classes.Length, 2) * perceptronCountFactor);

    // classesToTake receives null when cloudSizeStDev is exactly 0.
    Random rand = cloudSizeStDev == 0 ? null : new Random();

    switch (trainingMode)
    {
        case PerceptronTrainingMode.TRAIN_ALL_DATA:
        {
            // TODO: Don't shuffle every time. Highly inefficient.
            perceptrons = buildPerceptronEnsemble(perceptronCount, rand, positiveClasses =>
                buildPerceptronForAllData(
                    trainingData.Shuffle().Select(instance => new TupleStruct<double[], int>(instance.values, classLookup[instance.label])),
                    positiveClasses,
                    dimension));
            break;
        }

        case PerceptronTrainingMode.TRAIN_EVEN_SIZE:
        {
            // GroupBy is deferred and regroups the whole data set on every enumeration,
            // so materialize it once instead of once per perceptron.
            IGrouping<int, LabeledInstance>[] byLabel = trainingData.GroupBy(item => classLookup[item.label]).ToArray();
            int[] classCounts = countInstancesPerClass(byLabel, classes.Length);

            // Loop-invariant: the smallest class size is the same for every perceptron.
            int minClassSize = classCounts.Min();

            perceptrons = buildPerceptronEnsemble(perceptronCount, rand, positiveClasses =>
                buildPerceptronEvenClassSizes(byLabel, positiveClasses, minClassSize, dimension));
            break;
        }

        case PerceptronTrainingMode.TRAIN_EVEN_WEIGHTS:
        {
            // Materialized for the same reason as in TRAIN_EVEN_SIZE.
            IGrouping<int, LabeledInstance>[] byLabel = trainingData.GroupBy(item => classLookup[item.label]).ToArray();
            int[] classCounts = countInstancesPerClass(byLabel, classes.Length);

            perceptrons = buildPerceptronEnsemble(perceptronCount, rand, positiveClasses =>
                buildPerceptronEvenWeights(byLabel, positiveClasses, classCounts, dimension));
            break;
        }
    }
}

/// <summary>
/// Builds <paramref name="perceptronCount"/> perceptrons in parallel. For each one a
/// random subset of class indices is chosen as the positive set and
/// <paramref name="buildPerceptron"/> is invoked with it; the result is paired with
/// the chosen indices. The output order is not tied to the build order (unordered PLINQ).
/// </summary>
private TupleStruct<Perceptron, int[]>[] buildPerceptronEnsemble(int perceptronCount, Random rand, Func<HashSet<int>, Perceptron> buildPerceptron)
{
    // System.Random is not thread-safe, so every call that is handed the shared
    // instance is made here, sequentially, before the parallel section starts.
    int[] takeCounts = Enumerable.Range(0, perceptronCount).Select(index => classesToTake(rand)).ToArray();

    return takeCounts.AsParallel().Select(takeCount =>
    {
        // TODO: Don't shuffle every time.
        // NOTE(review): Shuffle() is defined elsewhere; confirm it is safe to call from several threads.
        int[] classIndices = Enumerable.Range(0, classes.Length).Shuffle().Take(takeCount).ToArray();
        Perceptron perceptron = buildPerceptron(new HashSet<int>(classIndices));
        return new TupleStruct<Perceptron, int[]>(perceptron, classIndices);
    }).ToArray();
}

/// <summary>
/// Returns an array of length <paramref name="classCount"/> in which the entry at each
/// group's key holds that group's instance count. Entries with no group stay 0.
/// </summary>
private static int[] countInstancesPerClass(IEnumerable<IGrouping<int, LabeledInstance>> groupsByLabel, int classCount)
{
    // TODO: Might be nice to have a higher-order helper for this, like ToDictionary but keyed by int index.
    int[] counts = new int[classCount];
    foreach (IGrouping<int, LabeledInstance> grp in groupsByLabel)
    {
        counts[grp.Key] = grp.Count();
    }
    return counts;
}