/// <summary>
/// Computes the variation of information, H(subtoken|type) + H(type|subtoken), for a
/// grouping of lattice nodes, using the pre-computed base subtoken distribution
/// (which can be cached across multiple evaluations).
/// </summary>
/// <param name="nodeSubGrouping">The groups ("types") of lattice nodes to evaluate.</param>
/// <param name="numTotalNodes">
/// Denominator used to turn each group's count of nodes with non-empty data into P(type).
/// Presumably the total number of such nodes across all groups; confirm against callers.
/// </param>
/// <returns>The sum of the two conditional entropies, in nats (natural logarithm).</returns>
double IVariationOfInformationComputer<NodeName>.ComputeVariationOfInformation(List<HashSet<LatticeNode<NodeName>>> nodeSubGrouping, int numTotalNodes)
{
    Debug.Assert(_baseSubtokenProbDist != null);

    // Sum of -p*ln(p) over the strictly positive probabilities. Terms with p == 0 are
    // skipped (0*ln(0) is taken as 0); evaluating them would give 0 * -Infinity = NaN.
    // The p > 0 test also drops NaN ratios (e.g. 0/0) so they cannot poison the sum.
    double EntropyOf(IEnumerable<double> probabilities) =>
        probabilities.Where(p => p > 0).Sum(p => -p * Math.Log(p));

    // P(subtoken|type): one subtoken distribution per group, built from nodes that carry data.
    var subtokenDistrPerGroup = nodeSubGrouping
        .Select(g => SubtokenProbDist(g.Where(n => n.Data.Length > 0).SelectMany(n => n.Data)))
        .ToList();

    // Compute H(subtoken|type) = sum over types of P(type) * H(subtoken | that type).
    double subtokenGivenTypeConditionalEntropy = 0;
    for (int groupIdx = 0; groupIdx < nodeSubGrouping.Count; groupIdx++)
    {
        var typeGroup = nodeSubGrouping[groupIdx];
        int numNodesWithData = typeGroup.Count(n => n.Data.Length > 0);
        if (numNodesWithData == 0)
        {
            // The group's weight is 0, so it contributes nothing; skip the ancestor walk.
            continue;
        }

        var subtokenDistrForGroup = subtokenDistrPerGroup[groupIdx];

        // The ancestor-based prior is only needed when smoothing is enabled.
        MultinomialDistribution<string> typeGroupParentProb = null;
        if (_dirichletAlpha > 0)
        {
            typeGroupParentProb = GetParentNameDistribution(typeGroup, _baseSubtokenProbDist);
        }

        double probType = (double)numNodesWithData / numTotalNodes;
        double subtokenEntropy = EntropyOf(
            subtokenDistrForGroup.Elements
                .Select(e => subtokenDistrForGroup.ProbabilityOf(e, typeGroupParentProb, _dirichletAlpha)));

        subtokenGivenTypeConditionalEntropy += probType * subtokenEntropy;
    }

    // Compute H(type|subtoken) = sum over subtokens of P(subtoken) * H(type | that subtoken).
    double typeGivenSubtokenConditionalEntropy = 0;
    foreach (var subtoken in _baseSubtokenProbDist.Elements)
    {
        double baseProbSubtoken = _baseSubtokenProbDist.ProbabilityOf(subtoken);

        // Loop-invariant for the inner projection: the base count of this subtoken.
        double baseCountSubtoken = (double)_baseSubtokenProbDist[subtoken];

        // P(type|subtoken) is the share of the subtoken's base count that falls in each group.
        double entropyOfTypeGivenSubtoken = EntropyOf(
            subtokenDistrPerGroup.Select(g => ((double)g[subtoken]) / baseCountSubtoken));

        typeGivenSubtokenConditionalEntropy += baseProbSubtoken * entropyOfTypeGivenSubtoken;
    }

    Debug.Assert(subtokenGivenTypeConditionalEntropy >= 0);
    Debug.Assert(typeGivenSubtokenConditionalEntropy >= 0);

    return subtokenGivenTypeConditionalEntropy + typeGivenSubtokenConditionalEntropy;
}
/// <summary>
/// Builds a subtoken distribution from the ancestors of <paramref name="group"/>.
/// Each ancestor outside the group contributes its subtokens once, weighted by
/// <paramref name="distanceDiscount"/> raised to its shortest distance (in parent links)
/// from the group. The result is seeded with a scaled-down copy of
/// <paramref name="baseDistribution"/> so every base subtoken has non-zero mass.
/// </summary>
/// <param name="group">The nodes whose ancestors are collected; group members themselves are not counted.</param>
/// <param name="baseDistribution">Distribution whose elements seed the result.</param>
/// <param name="distanceDiscount">Per-level discount; an ancestor at depth d is added with weight distanceDiscount^d.</param>
/// <param name="tolerance">Scale applied to the base probabilities used as the seed (the default 10e-4 equals 0.001).</param>
/// <returns>A new distribution combining the seed mass and the discounted ancestor subtokens.</returns>
public MultinomialDistribution<string> GetParentNameDistribution(HashSet<LatticeNode<NodeName>> group, MultinomialDistribution<string> baseDistribution, double distanceDiscount = .9, double tolerance = 10e-4)
{
    var distribution = new MultinomialDistribution<string>();

    // Add minimally the base distribution to avoid NaNs.
    foreach (var subtoken in baseDistribution.Elements)
    {
        distribution.Add(subtoken, (decimal)(tolerance * baseDistribution.ProbabilityOf(subtoken)));
    }

    // Breadth-first walk up the parent links. A node is marked as visited when it is
    // enqueued (not when it is dequeued), so an ancestor reachable through several paths,
    // e.g. a parent shared by two group members, is processed exactly once. FIFO order
    // guarantees that the recorded depth is the shortest distance from the group.
    var visited = new HashSet<LatticeNode<NodeName>>(group);
    var toVisit = new Queue<(LatticeNode<NodeName> Node, int Depth)>();

    foreach (var parentNode in group.SelectMany(n => n.Parents))
    {
        if (visited.Add(parentNode))
        {
            toVisit.Enqueue((parentNode, 1));
        }
    }

    while (toVisit.Count > 0)
    {
        var (nextNode, depth) = toVisit.Dequeue();

        decimal countAs = (decimal)Math.Pow(distanceDiscount, depth);
        foreach (var subtoken in nextNode.Data)
        {
            distribution.Add(subtoken, countAs);
        }

        foreach (var parentNode in nextNode.Parents)
        {
            if (visited.Add(parentNode))
            {
                toVisit.Enqueue((parentNode, depth + 1));
            }
        }
    }

    return distribution;
}
/// <summary>
/// Scores a name under a subtoken distribution: looks up the probability of every
/// subtoken (passing <paramref name="prior"/> and <paramref name="dirichletAlpha"/> through
/// to the lookup) and hands the resulting array to <c>CrossEntropyNameMultinomial</c>.
/// </summary>
/// <param name="subtokens">The name, as a sequence of subtokens.</param>
/// <param name="distribution">Distribution queried for each subtoken's probability.</param>
/// <param name="prior">Optional prior forwarded to the probability lookup; may be null.</param>
/// <param name="dirichletAlpha">Value forwarded to the probability lookup (presumably a Dirichlet smoothing weight; confirm in <c>ProbabilityOf</c>).</param>
/// <returns>Whatever <c>CrossEntropyNameMultinomial</c> returns for the per-subtoken probabilities.</returns>
public static double ProbName(NodeName subtokens, MultinomialDistribution<string> distribution, MultinomialDistribution<string> prior = null, double dirichletAlpha = .1)
{
    var perSubtokenProbabilities = subtokens
        .Select(subtoken => distribution.ProbabilityOf(subtoken, prior, dirichletAlpha))
        .ToArray();

    return CrossEntropyNameMultinomial(perSubtokenProbabilities);
}