/// <summary>
/// Builds a single-row matrix filled with ones, keyed "bias", over the same column keys as
/// <paramref name="x"/>, and merges it with <paramref name="x"/> through MergeRowsView.
/// A constant row like this is handy as the intercept term in regression.
/// </summary>
/// <param name="x">Matrix whose column keys, column count and missing value are reused for the bias row.</param>
/// <returns>The result of merging <paramref name="x"/> with the one-row "bias" matrix.</returns>
public static Matrix<string, string, double> AddBiasRow(this Matrix<string, string, double> x)
{
    // One row, one entry per column of x.
    DoubleArray onesRow = ShoUtils.DoubleArrayOnes(1, x.ColCount);
    List<string> biasRowKey = new List<string> { "bias" };
    ShoMatrix biasRowMatrix = new ShoMatrix(onesRow, biasRowKey, x.ColKeys, x.MissingValue);

    // NOTE(review): the meaning of the leading 'true' flag is defined by MergeRowsView; confirm there.
    Matrix<string, string, double> withBias = x.MergeRowsView<string, string, double>(true, biasRowMatrix);
    return withBias;
}
//private static Random _myRand = new Random(123456);

/// <summary>
/// Returns the positions of the elements of a vector that satisfy a predicate,
/// in the spirit of Matlab's ind = find(condition on x).
/// </summary>
/// <param name="x">Array to scan; it must report IsVector() as true.</param>
/// <param name="testPredicate">Test applied to each element (each int element is handed over as a double).</param>
/// <returns>The zero-based positions, in increasing order, whose element passed the test, converted with ShoUtils.ToIntArray.</returns>
/// <exception cref="NotImplementedException">Thrown when <paramref name="x"/> is not a vector.</exception>
public static IntArray Find(this IntArray x, Predicate<double> testPredicate)
{
    // Only the vector case is supported.
    if (!x.IsVector())
    {
        throw new NotImplementedException();
    }

    List<int> hits = new List<int>();
    int position = 0;
    while (position < x.Length)
    {
        if (testPredicate(x[position]))
        {
            hits.Add(position);
        }
        position++;
    }

    return ShoUtils.ToIntArray(hits);
}
/// <summary>
/// Collapses the columns of <paramref name="M"/> group by group: every distinct group id found in
/// <paramref name="groupIds"/> becomes one output column holding the mean of the selected member columns.
/// </summary>
/// <param name="M">Matrix whose columns are combined; its column keys must match those of <paramref name="groupIds"/>.</param>
/// <param name="groupIds">Single-row matrix giving the group id of each column.</param>
/// <param name="numInstancesPerGroup">Receives a one-row matrix (row key "count") with the number of columns used for each group.</param>
/// <param name="parallelOptions">Not used by this method.</param>
/// <param name="repeatNum">
/// When negative, all columns of a group are averaged. When zero or positive, only the first column
/// found for each group is used; the actual value is not otherwise consulted.
/// NOTE(review): the original comment ("or else it is the right repeat") hints that a specific repeat
/// was meant to be picked; confirm the intent with callers.
/// </param>
/// <returns>A matrix with the row keys of <paramref name="M"/> and one column per distinct group id.</returns>
public static ShoMatrix AverageColSubsetsInSameGroup(this ShoMatrix M, Matrix<string, string, string> groupIds, out ShoMatrix numInstancesPerGroup, ParallelOptions parallelOptions, int repeatNum)
{
    M.CheckEqualColKeys(groupIds.ColKeys.ToList());
    Helper.CheckCondition(groupIds.RowCount == 1, "group ids should have only one row");

    List<string> groupNames = groupIds.Unique().ToList();
    int groupCount = groupNames.Count;

    DoubleArray groupMeans = new DoubleArray(M.RowCount, groupCount);
    DoubleArray groupSizes = ShoUtils.DoubleArrayZeros(1, groupCount);
    // Tracks whether a column has already been taken for a group (presumably starts out all false).
    BoolArray groupAlreadyHit = new BoolArray(1, groupCount);

    for (int groupIndex = 0; groupIndex < groupCount; groupIndex++)
    {
        string groupName = groupNames[groupIndex];
        List<int> memberCols = new List<int>();

        for (int col = 0; col < groupIds.ColCount; col++)
        {
            if (groupIds[0, col] != groupName)
            {
                continue;
            }

            // In non-averaging mode, stop collecting once the group has one column.
            if (repeatNum >= 0 && groupAlreadyHit[groupIndex])
            {
                continue;
            }

            memberCols.Add(col);
            groupAlreadyHit[groupIndex] = true;
        }

        DoubleArray selectedCols = M.DoubleArray.GetCols(memberCols);
        DoubleArray meanOfSelected = selectedCols.Mean(DimOp.OverCol);
        groupMeans.SetCol(groupIndex, meanOfSelected);
        groupSizes[0, groupIndex] = memberCols.Count;
    }

    List<string> countRowKey = new List<string>() { "count" };
    numInstancesPerGroup = new ShoMatrix(groupSizes, countRowKey, groupNames, double.NaN);
    return new ShoMatrix(groupMeans, M.RowKeys, groupNames, double.NaN);
}
/// <summary>
/// Permutation test for the difference in AUC between two ROC curves.
/// The returned p-value is the probability of observing an AUC difference at least as large as
/// the real one if the ROC curves were drawn from the null hypothesis (which is that one model
/// does not perform better than the other).
///
/// Think of it this way: we want a null distribution which says that there is no difference
/// between the ROCs. Since an AUC difference cannot arise from any entries in the ordered lists
/// that match up, we can ignore these (we could include them as well, but it would come out in
/// the wash). So instead, we assume that all the information in the difference in AUCs is
/// contained in the mismatched pairs, and we want to destroy this information for the null, so
/// we swap the values between the two models. However, we want to keep the number of
/// positive/negative samples the same, so when we swap one pair, we must also swap another in
/// the other direction.
///
/// Note that when <paramref name="maxFPR"/> is 1 this method uses a fast AUC computation which
/// ignores ties. This is fine for random experiments since the ties will come out in the wash.
/// </summary>
/// <param name="roc1">First ROC; its ordered class labels are read from _classLabels.</param>
/// <param name="roc2">Second ROC; must agree with <paramref name="roc1"/> on _lowerScoreIsMoreLikelyClass1.</param>
/// <param name="numTrial">Number of permutation trials.</param>
/// <param name="maxFPR">
/// When exactly 1, each permuted label list is scored with ComputeAUCfromOrderedList; otherwise a
/// new ROC is built per trial and its _AucAtMaxFpr is used.
/// NOTE(review): the observed difference always comes from _AUC, even when maxFPR is not 1; confirm
/// that this is the intended statistic for truncated tests.
/// </param>
/// <param name="parallelOptions">Options applied to the parallel loop over trials; null means default options.</param>
/// <returns>
/// (1 + number of trials whose absolute permuted AUC difference is at least the observed one) / numTrial,
/// capped at 1. Returns 1 immediately when there are no positive or no negative label mismatches.
/// </returns>
/// <exception cref="Exception">Thrown when <paramref name="roc1"/> has no class labels.</exception>
public static double ROCswapPermTest(ROC roc1, ROC roc2, int numTrial, double maxFPR, ParallelOptions parallelOptions)
{
    string randomStringSeed = "78923";//"123456";
    Helper.CheckCondition(roc1._lowerScoreIsMoreLikelyClass1 == roc2._lowerScoreIsMoreLikelyClass1);

    double realAUCdiff = Math.Abs(roc1._AUC - roc2._AUC);
    DoubleArray permDiffs = new DoubleArray(1, numTrial);

    //to make it the same as the matlab code, rename these:
    DoubleArray orderedTargets1 = DoubleArray.From(roc1._classLabels);
    DoubleArray orderedTargets2 = DoubleArray.From(roc2._classLabels);

    if (orderedTargets1.Length == 0)
    {
        throw new Exception("empty ROCs given as input");
    }

    // Positions where the two ordered label lists disagree, split by the sign of the disagreement.
    DoubleArray targetDiff = orderedTargets1 - orderedTargets2;
    IntArray posDiffInd = targetDiff.Find(v => v > 0);
    IntArray negDiffInd = targetDiff.Find(v => v < 0);
    int numPos = posDiffInd.Length;
    int numNeg = negDiffInd.Length;

    if (numPos == 0 || numNeg == 0)
    {
        // No positive/negative mismatch pair can be formed (e.g. the ROCs are identical).
        return 1;
    }

    // Equal counts are NOT guaranteed when a truncated ROC test is used, so pair up only as many as both sides have.
    int numPairs = Math.Min(numPos, numNeg);

    // Honor the caller's parallel settings (previously the parameter was ignored); tolerate null as before.
    ParallelOptions loopOptions = parallelOptions ?? new ParallelOptions();

    Parallel.For(0, numTrial, loopOptions, t =>
    {
        // Each trial gets its own generator derived from the fixed seed and the trial number,
        // so results do not depend on scheduling.
        Random myRand = SpecialFunctions.GetRandFromSeedAndNullIndex(randomStringSeed, t + 1);

        //randomly pair up each positive mismatch with each negative mismatch
        IntArray posIndRand = posDiffInd.RandomPermutation(myRand);
        IntArray negIndRand = negDiffInd.RandomPermutation(myRand);

        // Keep the first numPairs entries of each shuffled list
        // (arguments are presumably start, step, inclusive end; confirm against GetColSlice).
        IntArray possiblePosPairs = posIndRand.GetColSlice(0, 1, numPairs - 1);
        IntArray possibleNegPairs = negIndRand.GetColSlice(0, 1, numPairs - 1);
        // After the transpose each row is presumably one (positive, negative) pair.
        IntArray possiblePairs = ShoUtils.CatArrayCol(possiblePosPairs, possibleNegPairs).T;

        //randomly pick each pair with prob=0.5 to include in the swap:
        DoubleArray randVec = (new DoubleArray(1, numPairs)).FillRandUseSeed(myRand);
        IntArray pairsOfPairsToBothSwapInd = randVec.Find(v => v >= 0.5);
        List<int> listInd = pairsOfPairsToBothSwapInd.T.ToListOrEmpty();

        DoubleArray newTarg1 = DoubleArray.From(orderedTargets1);
        DoubleArray newTarg2 = DoubleArray.From(orderedTargets2);

        if (listInd.Count > 0)
        {
            List<int> swapThesePairs = possiblePairs.GetRows(pairsOfPairsToBothSwapInd.T.ToList()).ToVector().ToList();

            //swap the chosen pairs with a 1-x: at a mismatch the two lists hold opposite labels,
            //so flipping both exchanges them between the models
            newTarg1.SetCols(swapThesePairs, 1 - newTarg1.GetCols(swapThesePairs));
            newTarg2.SetCols(swapThesePairs, 1 - newTarg2.GetCols(swapThesePairs));
        }

        double AUC1, AUC2;
        if (maxFPR == 1)
        {
            //do it the cheap way
            AUC1 = ComputeAUCfromOrderedList(newTarg1);
            AUC2 = ComputeAUCfromOrderedList(newTarg2);
        }
        else
        {
            //do it with manual integration, the more expensive way
            AUC1 = new ROC(newTarg1, roc1._classProbs, roc1._lowerScoreIsMoreLikelyClass1, maxFPR, true)._AucAtMaxFpr;
            AUC2 = new ROC(newTarg2, roc2._classProbs, roc2._lowerScoreIsMoreLikelyClass1, maxFPR, true)._AucAtMaxFpr;
        }

        // Each trial writes only its own slot.
        permDiffs[t] = (AUC1 - AUC2);
    });

    permDiffs = permDiffs.Map(v => Math.Abs(v));

    // The pseudo-count keeps the estimate from being exactly zero; the cap keeps it a valid probability.
    double pseudoCount = 1;
    double pVal = (pseudoCount + (double)(permDiffs >= realAUCdiff).Sum()) / (double)numTrial;
    pVal = Math.Min(pVal, 1);
    return pVal;
}
/// <summary>
/// Command-line entry point. Echoes the machine name and arguments, parses the options, reads a
/// dense input matrix, optionally restricts it to a random sample of rows, standardizes it,
/// computes the covariance (or correlation, with -correl) matrix and writes it to the output file.
/// </summary>
/// <param name="args">
/// Optional flags/options (help, correl, MaxDegreeOfParallelism, randomSeed, randomRowCount)
/// followed by three required values: maxValue, inputDenseFile, outputCovFile.
/// </param>
/// <remarks>
/// On any failure the message(s), the usage text and the stack trace are written to the console
/// and the original exception is rethrown unchanged.
/// </remarks>
public static void Main(string[] args)
{
    Console.WriteLine(Environment.MachineName);
    Console.WriteLine(Helper.CreateDelimitedString(" ", args));

    try
    {
        ShoUtils.SetShoDirEnvironmentVariable(1);

        ArgumentCollection argumentCollection = new CommandArguments(args);
        if (argumentCollection.ExtractOptionalFlag("help"))
        {
            Console.WriteLine("");
            Console.WriteLine(UsageMessage);
            return;
        }

        // Options must be extracted before the positional arguments; the collection is consumed as it is read.
        bool useCorrel = argumentCollection.ExtractOptionalFlag("correl");
        ParallelOptions parallelOptions = new ParallelOptions
        {
            MaxDegreeOfParallelism = argumentCollection.ExtractOptional("MaxDegreeOfParallelism", Environment.ProcessorCount)
        };
        int randomSeed = argumentCollection.ExtractOptional<int>("randomSeed", (int)MachineInvariantRandom.GetSeedUInt("Eigenstrat"));
        int? randomRowCountOrNull = argumentCollection.ExtractOptional<int?>("randomRowCount", null);
        argumentCollection.CheckNoMoreOptions(3);

        int maxValue = argumentCollection.ExtractNext<int>("maxValue");
        string inputDenseFileName = argumentCollection.ExtractNext<string>("inputDenseFile");
        string outputCovFileName = argumentCollection.ExtractNext<string>("outputCovFile");
        argumentCollection.CheckThatEmpty();

        Console.WriteLine("Reading input file " + inputDenseFileName);
        Console.WriteLine("Using 'GetInstanceFromDenseAnsi' How about 'GetInstanceFromRowKeysAnsi', too?");

        using (var originalMatrix = RowKeysAnsi.GetInstanceFromDenseAnsi(inputDenseFileName, parallelOptions))
        {
            // Either the full matrix or a view over a random sample of its rows.
            Matrix<string, string, char> matrixOptionallyCutDown;
            if (randomRowCountOrNull.HasValue)
            {
                Random random = new Random(randomSeed);
                var sampleRowKeys = SpecialFunctions.SelectRandom(originalMatrix.RowKeys, randomRowCountOrNull.Value, ref random);
                matrixOptionallyCutDown = originalMatrix.SelectRowsView(sampleRowKeys);
            }
            else
            {
                matrixOptionallyCutDown = originalMatrix;
            }

            var gMatrix = matrixOptionallyCutDown.ConvertValueView(new CharToDoubleWithLimitsConverter(maxValue), double.NaN);
            var xMatrix = StandardizeGToCreateX<ShoMatrix>(maxValue, gMatrix, ShoMatrix.CreateDefaultInstance, parallelOptions);
            var psiMatrix = CreatePsiTheMatrixOfCovarianceValues(useCorrel, xMatrix, /*isOKToDestroyXMatrix*/ true, parallelOptions);

            Console.WriteLine("Writing output file " + outputCovFileName);
            psiMatrix.WriteDense(outputCovFileName);
        }
    }
    catch (Exception exception)
    {
        Console.WriteLine("");
        Console.WriteLine(exception.Message);
        if (exception.InnerException != null)
        {
            Console.WriteLine(exception.InnerException.Message);
        }

        Console.WriteLine("");
        Console.WriteLine(UsageMessage);
        Console.WriteLine(exception.StackTrace);

        // Rethrow the original exception (type and stack trace intact) instead of wrapping it
        // in a bare System.Exception with an empty message.
        throw;
    }
}