/// <summary>
/// Distribution of the classifier (the last column) values over the whole dataset.
/// </summary>
public static IEnumerable<double> _GetDistOfClassifier(DataTable data)
{
	// count the occurrences of each distinct value in the classifier column (the last column)
	var bag = new nilnul.txt.Bag1(
		nilnul.data.tbl.col.cels._ValsX._Txts_assumeIndexInRange(data, data.Columns.Count - 1)
	);

	// convert the occurrence counts into a probability distribution
	var dist = nilnul.stat.dist_.finite._FroOccursX._Dbls_assumeTotalPositive(bag, nilnul.txt.Comp.Singleton);

	return dist;
}
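// Illustrative sketch (not part of the original code): how a distribution like the one returned
// above could be turned into the dataset entropy that GetGainRatioForCol expects as
// `entropyOfDataset`. It uses only plain Math/LINQ rather than the nilnul entropy helpers, and the
// method name EntropyOfDistribution_sketch is hypothetical.
public static double EntropyOfDistribution_sketch(IEnumerable<double> probabilities)
{
	// H = -sum( p * log2(p) ), skipping zero-probability terms to avoid Log(0)
	return probabilities
		.Where(p => p > 0)
		.Sum(p => -p * Math.Log(p, 2));
}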
/// <summary>
/// Gain ratio of a candidate column with respect to the classifier.
/// </summary>
/// <param name="data">assume: the class entropy and the attribute entropy are not nil</param>
/// <param name="candidateColIndex"></param>
/// <param name="entropyOfDataset">the entropy of the whole dataset with respect to the classifier</param>
/// <returns></returns>
public static double GetGainRatioForCol(DataTable data, int candidateColIndex, double entropyOfDataset)
{
	var totalRows = data.Rows.Count;

	var amountForDifferentValue = _GetDistByVal(data, candidateColIndex);

	var candidate_andClassStat = _GetClassStatByCandidateVal(data, candidateColIndex);

	// occurrence count of each candidate value
	var candidateStat = new nilnul.txt.Bag1();
	candidate_andClassStat.Select(
		kv => new KeyValuePair<string, Num1>(kv.Key, kv.Value.cardinality)
	).ForEach(
		x => candidateStat.Add(x.Key, x.Value)
	);

	// distribution of the candidate values, and its entropy (the split info)
	var candidateDistribution = nilnul.stat.dist_.finite._FroBagX._ProbInVowedDbl_assumeTotalPositive(
		candidateStat
	);
	var candidateEntropy = nilnul.stat.dist_.finite._EntropyX.Dbl_ofAssumeDistribution(
		candidateDistribution.Values.Cast<ProbDbl>()
	);

	// entropy of the classifier within each candidate value
	var eachCandidate_with_ClassEntropy = candidate_andClassStat.Select(
		d => new KeyValuePair<string, double>(
			d.Key
			,
			nilnul.stat.dist_.finite._EntropyX.Entropy_ofAssumeDistribution(
				nilnul.stat.dist_.finite._FroBagX._ProbInDbl_assumeTotalPositive(
					d.Value
				).Values.Cast<double>()
			)
		)
	);

	// H(class | candidate) = sum over candidate values of P(value) * H(class within that value)
	var conditionalEntropyOfClassOnCandidate = eachCandidate_with_ClassEntropy.Select(
		candidate =>
			candidateDistribution[candidate.Key]	// prob of candidate value
			*
			candidate.Value	// entropy of the classifier given that value
	).Sum();

	// legacy computation of the information gain, kept for reference; it assumes a binary classifier
	var stepsForCalculation = new List<double>();
	foreach (var item in amountForDifferentValue)
	{
		// helpers for the calculation
		var firstDivision = item[0, 1] / (double)item[0, 0];
		var secondDivision = (item[0, 0] - item[0, 1]) / (double)item[0, 0];

		// prevent Log(0): a pure partition contributes zero entropy
		if (firstDivision == 0 || secondDivision == 0)
		{
			stepsForCalculation.Add(0.0);
		}
		else
		{
			stepsForCalculation.Add(
				-firstDivision * Math.Log(firstDivision, 2) - secondDivision * Math.Log(secondDivision, 2)
			);
		}
	}

	// information gain from the legacy path; to be changed to a ratio below
	var gain = stepsForCalculation.Select(
		(t, i) => amountForDifferentValue[i][0, 0] / (double)totalRows * t
	).Sum();
	gain = entropyOfDataset - gain;

	// gain ratio = (H(class) - H(class | candidate)) / H(candidate)
	var gain1 = entropyOfDataset - conditionalEntropyOfClassOnCandidate;
	var gainRatio = gain1 / candidateEntropy;

	return gainRatio;
	//return gain;	// unreachable legacy return: plain information gain instead of the gain ratio
}
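// Illustrative sketch (not part of the original code): the gain-ratio arithmetic above, worked on a
// tiny hypothetical weather dataset (14 rows, a yes/no classifier, and a "wind" candidate column
// with the values weak and strong). It uses only plain Math/LINQ rather than the nilnul helpers,
// and the method name GainRatio_workedExample is hypothetical.
public static double GainRatio_workedExample()
{
	Func<double[], double> entropy = ps => ps.Where(p => p > 0).Sum(p => -p * Math.Log(p, 2));

	// classifier: 9 yes / 5 no over 14 rows
	var classEntropy = entropy(new[] { 9 / 14.0, 5 / 14.0 });	// roughly 0.940

	// wind = weak: 8 rows (6 yes / 2 no); wind = strong: 6 rows (3 yes / 3 no)
	var conditionalEntropy =
		8 / 14.0 * entropy(new[] { 6 / 8.0, 2 / 8.0 })	// roughly 0.811
		+ 6 / 14.0 * entropy(new[] { 3 / 6.0, 3 / 6.0 });	// exactly 1.0

	var gain = classEntropy - conditionalEntropy;	// roughly 0.048
	var splitInfo = entropy(new[] { 8 / 14.0, 6 / 14.0 });	// roughly 0.985; plays the role of candidateEntropy above

	return gain / splitInfo;	// roughly 0.049
}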
public IEnumerable<(nilnul.data.mining._associater.Association<string>, double)> getRules(
	IEnumerable<Observation> observations
)
{
	// an itemset is frequent if it occurs in at least this many observations
	var minSupport = observations.Count() * _support;

	// count how many observations each single item appears in, and keep the frequent ones
	var itemCountS = new nilnul.txt.Bag1(
		observations.SelectMany(s => s)
	);
	var supportedItems = new nilnul.txt.Bag1(
		itemCountS.Where(x => (double)x.Value.en >= minSupport)
	);

	// seed the frequent itemsets with the frequent single items
	var frequentItemSetS = new nilnul.obj.Bag1<IEnumerable<string>>(
		new NotNull2<IEqualityComparer<IEnumerable<string>>>(
			new nilnul.obj.str_.seq.Eq<string>()
		)
	);
	supportedItems.Each(
		component =>
		{
			frequentItemSetS.add(
				new[] { component.Key }
			);
		}
	);

	var itemSetCardinality = 1;
	while (true)
	{
		// only items that still occur in some frequent itemset can extend an itemset
		var itemsInConsideration = new nilnul.txt.Set(frequentItemSetS.Keys.SelectMany(x => x));

		var newFreqItemSets = new nilnul.obj.Bag1<IEnumerable<string>>(
			new NotNull2<IEqualityComparer<IEnumerable<string>>>(
				new nilnul.obj.str_.seq.Eq<string>()
			)
		);

		itemSetCardinality++;

		// count, per observation, every candidate itemset of the new cardinality
		observations.Each(
			observation =>
			{
				var intersected = nilnul.set.op_.binary_._IntersectX.Intersect(
					itemsInConsideration
					,
					observation
				);
				var combinated = nilnul.set.family.op_.of_.set_.combinate_._ByIndexsX._Cord_assumeDistinct(
					intersected, itemSetCardinality
				);
				combinated.Each(
					combinatedInstance => newFreqItemSets.add(combinatedInstance)
				);
			}
		);

		// prune candidates that fall below the support threshold
		newFreqItemSets.removeKeys_ofFinite(
			newFreqItemSets.Where(x => (double)x.Value.en < minSupport).Select(y => y.Key).ToArray()
		);

		if (newFreqItemSets.None())
		{
			// the algorithm terminates when the frequent itemsets cannot be extended further
			break;
		}
		else
		{
			frequentItemSetS = newFreqItemSets;
		}
	}

	// now that we have the frequent itemsets, extract rules from each set
	var rules = new List<(nilnul.data.mining._associater.Association<string>, double)>();
	foreach (var frequentSet in frequentItemSetS)
	{
		for (int i = 1; i < frequentSet.Key.Count(); i++)
		{
			foreach (
				var combinated in nilnul.set.family.op_.of_.set_.combinate_._ByIndexsX._Cord_assumeDistinct(
					frequentSet.Key
					,
					i
				)
			)
			{
				var complement = frequentSet.Key.Except(combinated);

				rules.Add(
					(
						new mining._associater.Association<string>(
							combinated
							,
							complement
						)
						,
						nilnul.stat.dist_.finite_.multivar_.binary.observation.str._ConfidenceX.Confidence(
							observations.Select(s => new HashSet<string>(s))
							,
							combinated, complement
						)
					)
				);
			}
		}
	}

	// now we have the rule groups; keep only the rules whose confidence meets the threshold
	return rules.Where(x => x.Item2 >= this._confidence);
}
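// Illustrative sketch (not part of the original code): the support/confidence arithmetic that
// getRules relies on, written with plain LINQ over HashSet<string> transactions instead of the
// nilnul helpers. The names Support_sketch and Confidence_sketch are hypothetical.
public static double Support_sketch(IReadOnlyCollection<HashSet<string>> transactions, IEnumerable<string> itemSet)
{
	// fraction of transactions that contain every item of the itemset
	return transactions.Count(t => itemSet.All(t.Contains)) / (double)transactions.Count;
}

public static double Confidence_sketch(IReadOnlyCollection<HashSet<string>> transactions, IEnumerable<string> antecedent, IEnumerable<string> consequent)
{
	// confidence(A => B) = support(A union B) / support(A); compared against _confidence above
	var both = antecedent.Concat(consequent).ToArray();
	return Support_sketch(transactions, both) / Support_sketch(transactions, antecedent);
}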