/// <summary>
/// Builds a <c>TwoByOne</c> from <paramref name="rowList"/> by turning each row's
/// integer target into a boolean: true when the target is 1, false when it is 0.
/// </summary>
/// <typeparam name="T">Type of a row.</typeparam>
/// <param name="rowList">The rows to tabulate.</param>
/// <param name="targetValFunc">Returns the target of a row; expected to be 0 or 1.</param>
/// <returns>The instance produced by <c>TwoByOne.GetInstance</c>.</returns>
private static TwoByOne CreateTwoByOne<T>(IEnumerable<T> rowList, Func<T, int> targetValFunc)
{
    return TwoByOne.GetInstance(rowList, item =>
    {
        int label = targetValFunc(item);
        bool isOne = label == 1;

        // Any value other than 0 or 1 is rejected by the condition check
        // (presumably Helper.CheckCondition throws - confirm against Helper).
        Helper.CheckCondition(isOne || label == 0, "Expect the target to be 0 or 1");
        return isOne;
    });
}
/// <summary>
/// See nicer wrapper: MannWhitneyUTestOneSided.
/// This is a one-sided test looking for the case where the group labelled with 1 is larger
/// than the group labelled with 0.
/// <para>
/// When both groups have more than 10 rows (or <paramref name="forceAssymptoticApprox"/> is set)
/// the p-value is derived from the z score via <c>SpecialFunctions.ZScoreToOneTailedPValue</c>.
/// Otherwise a permutation test is used: exact permutations when their (log) count does not exceed
/// log(<paramref name="maxNumPermutations"/>) and exact permutations are allowed, else
/// <paramref name="maxNumPermutations"/> random shuffles of the tied ranks with a fixed seed.
/// </para>
/// </summary>
/// <typeparam name="T">Type of a row.</typeparam>
/// <param name="rowList">The rows to test. Each row supplies a score and a 0/1 label.</param>
/// <param name="scoreAccessor">Returns the numeric score of a row.</param>
/// <param name="label01Accessor">Returns the group label of a row. Rows are counted into the two
/// groups by comparing the label to 0 and to 1.</param>
/// <param name="maxNumPermutations">Number of random shuffles in the subsampled path, and the limit
/// against which the number of exact permutations is compared.</param>
/// <param name="forceAssymptoticApprox">When true, always use the z-score approximation, whatever the group sizes.</param>
/// <param name="neverDoExactPermutations">When true, never enumerate exact permutations; use random shuffles instead.</param>
/// <param name="parallelOptionsOrNullFor1">Parallelism settings; null means a degree of parallelism of 1.</param>
/// <returns>
/// The z score (Key) and the p-value (Value). In the subsampled permutation path the Key is the
/// U statistic of the label-1 group rather than a z score.
/// </returns>
public static KeyValuePair<double, double> ComputeZ0AndPValue<T>(IList<T> rowList, Func<T, double> scoreAccessor, Func<T, int> label01Accessor, int maxNumPermutations = 10000, bool forceAssymptoticApprox = false, bool neverDoExactPermutations = false, ParallelOptions parallelOptionsOrNullFor1 = null)
{
    ParallelOptions parallelOptions = parallelOptionsOrNullFor1 ?? new ParallelOptions() { MaxDegreeOfParallelism = 1 };

    // Group sizes; n1 is the class we think has larger values for the one-tailed test.
    int n0 = rowList.Count(elt => label01Accessor(elt) == 0);
    int n1 = rowList.Count(elt => label01Accessor(elt) == 1);

    double z0;
    double p;
    if ((n0 > 10 && n1 > 10) || forceAssymptoticApprox)
    {
        z0 = ComputeZ0<T>(rowList, parallelOptions, n0, n1, scoreAccessor, label01Accessor);
        p = 1.0 - SpecialFunctions.ZScoreToOneTailedPValue(z0, 1e-10);
        SanityCheckP(z0, p);
    }
    else
    {
        // Decide whether enumerating every permutation fits within the permutation budget.
        double logExactPermutationCount = SpecialFunctions.LogFactorialNMOverFactorialNFactorialMApprox(n0, n1);
        bool useExactPermutations = (logExactPermutationCount <= Math.Log(maxNumPermutations)) && !neverDoExactPermutations;

        List<double> zList;
        if (useExactPermutations)
        {
            // Each permutation is scored serially; the parallelism is across permutations.
            ParallelOptions serialOptions = new ParallelOptions { MaxDegreeOfParallelism = 1 };

            z0 = ComputeZ0<T>(rowList, parallelOptions, n0, n1, scoreAccessor, label01Accessor);

            // Faster than this is to simply permute the ranks of the real data (including ties),
            // rather than the real data itself, but leaving this in for when exact permutations are needed.
            var permutations = SpecialFunctions.Permute01Targets(rowList, scoreAccessor, label01Accessor, maxNumPermutations).AsParallel();

            // ParallelOptions uses -1 for "no limit", which WithDegreeOfParallelism rejects
            // (it requires a value >= 1). Only apply an explicit limit when one was given;
            // otherwise let PLINQ pick its default.
            if (parallelOptions.MaxDegreeOfParallelism > 0)
            {
                permutations = permutations.WithDegreeOfParallelism(parallelOptions.MaxDegreeOfParallelism);
            }

            zList = (from permutation in permutations
                     let z = ComputeZ0(permutation, serialOptions, n0, n1, pair => pair.Key, pair => pair.Value)
                     orderby z
                     select z).ToList();
        }
        else
        {
            // NB there is now a dead branch in SpecialFunctions.Permute01Targets(), which formerly did both
            // 'exact'/'complete' and 'inexact'/'subsampled' permutations. Now it only does the former, and
            // the 'inexact' one is here, because shuffling ranks directly is much faster.
            // Don't bother converting to z, just use u instead.
            List<double> listOfAllValues = rowList.Select(elt => scoreAccessor(elt)).ToList();
            List<double> ranksWithTies = SpecialFunctions.RanksWithTies(listOfAllValues);

            // Scan every row (not just the first n0 + n1) so no label-1 row is missed.
            // NOTE(review): ranks are still computed over all rows, so rows labelled neither 0 nor 1
            // are not really supported by this path - confirm callers only pass 0/1 labels.
            List<int> indsOfClass1 = Enumerable.Range(0, rowList.Count).Where(i => label01Accessor(rowList[i]) == 1).ToList();
            List<double> ranksWithTiesClass1 = ranksWithTies.SubList(indsOfClass1);
            double u1 = ComputeUFromRanks(ranksWithTiesClass1);

            // !!!not parallelized
            List<double> uList = new List<double>();
            Random myRand = new MachineInvariantRandom("123456");
            for (int perm = 0; perm < maxNumPermutations; perm++)
            {
                ranksWithTies.ShuffleInPlace(myRand);

                // The test is one-sided, so only the U score of the slots standing in for label "1"
                // (the n1 ranks after the first n0) is needed. A two-sided test would presumably use
                // the minimum of the two groups' U scores.
                List<double> permutedClass1Ranks = ranksWithTies.SubSequence(n0, n1).ToList();
                uList.Add(ComputeUFromRanks(permutedClass1Ranks));
            }

            // To let the rest of the code do what it should.
            zList = uList;
            z0 = u1;
        }

        // Tabulate how often a permuted statistic is at least as large as the observed one.
        TwoByOne twoByOne = TwoByOne.GetInstance(zList, z => z0 <= z);
        p = twoByOne.Freq;
        // Can't SanityCheckP(z0, p) because ties mean it won't always get the right answer.
    }

    // To get a two-sided p-value, which says "are they different", use: 2 * ((p < .5) ? p : (1 - p)).
    return new KeyValuePair<double, double>(z0, p);
}