/// <summary>
/// Builds an array distribution whose elements are all uniform Dirichlets.
/// </summary>
/// <param name="length">Number of Dirichlet elements in the array</param>
/// <param name="valueLength">Dimension of each Dirichlet</param>
/// <param name="sparsity">Sparsity specification for each Dirichlet</param>
/// <returns>The uniform DirichletArray</returns>
private static DirichletArray CreateUniformDirichletArray(
    int length, int valueLength, Sparsity sparsity)
{
    var components = new Dirichlet[length];
    for (int index = 0; index < components.Length; index++)
    {
        components[index] = Dirichlet.Uniform(valueLength, sparsity);
    }

    return (DirichletArray)Distribution<Vector>.Array<Dirichlet>(components);
}
/// <summary>
/// Creates a uniform Discrete distribution over 0..dimension-1 with the specified sparsity.
/// </summary>
/// <param name="dimension">Dimension</param>
/// <param name="sparsity">Sparsity</param>
protected Discrete(int dimension, Sparsity sparsity)
{
    // Allocate the probability vector with the requested sparsity,
    // then normalise it to the uniform distribution.
    prob = Vector.Zero(dimension, sparsity);
    SetToUniform();
}
/// <summary>
/// Creates a uniform Discrete distribution over 0..numValues-1 with the specified sparsity.
/// </summary>
/// <param name="numValues">Number of values</param>
/// <param name="sparsity">Sparsity</param>
/// <returns>The uniform Discrete</returns>
public static Discrete Uniform(int numValues, Sparsity sparsity)
{
    // The (dimension, sparsity) constructor already initialises to uniform.
    return new Discrete(numValues, sparsity);
}
/// <summary>
/// Creates a uniform discrete of the given dimension and sparsity.
/// </summary>
/// <param name="dimension">The dimension of the underlying discrete</param>
/// <param name="sparsity">The sparsity of the underlying discrete</param>
protected GenericDiscreteBase(int dimension, Sparsity sparsity)
{
    // Delegate to the underlying Discrete; it starts out uniform.
    disc = Discrete.Uniform(dimension, sparsity);
}
/// <summary>
/// Constructs an LDA model that is run in batches via shared variables:
/// one sub-model defines Phi (topic-word distributions), and a second
/// sub-model (replicated over batches) connects documents to topics.
/// </summary>
/// <param name="numBatches">Number of document batches the corpus is split into</param>
/// <param name="sizeVocab">Size of vocabulary</param>
/// <param name="numTopics">Number of topics</param>
public LDAShared(int numBatches, int sizeVocab, int numTopics)
{
    SizeVocab = sizeVocab;
    NumTopics = numTopics;
    ThetaSparsity = Sparsity.Dense;
    PhiSparsity = Sparsity.ApproximateWithTolerance(0.00000000001); // Allow for round-off error
    NumDocuments = Variable.New <int>().Named("NumDocuments");
    NumBatches = numBatches;
    // Increasing iteration counts per outer pass; presumably a schedule that
    // lets early passes run cheaply — TODO confirm against the pass loop.
    IterationsPerPass = new int[] { 1, 3, 5, 7, 9 };

    //---------------------------------------------
    // The model
    //---------------------------------------------
    Range D = new Range(NumDocuments).Named("D");
    Range W = new Range(SizeVocab).Named("W");
    Range T = new Range(NumTopics).Named("T");
    NumWordsInDoc = Variable.Array <int>(D).Named("NumWordsInDoc");
    // Jagged range: words within each document.
    Range WInD = new Range(NumWordsInDoc[D]).Named("WInD");

    // Evidence variable shared across both sub-models so model evidence can be computed.
    Evidence = SharedVariable <bool> .Random(new Bernoulli(0.5)).Named("Evidence");
    Evidence.IsEvidenceVariable = true;
    // Phi is shared between the definition sub-model and every document batch.
    Phi = SharedVariable <Vector> .Random(T, CreateUniformDirichletArray(numTopics, sizeVocab, PhiSparsity)).Named("Phi");

    // Phi definition sub-model - just one copy
    PhiDefModel = new Model(1).Named("PhiDefModel");
    IfBlock evidencePhiDefBlock = null;
    EvidencePhiDef = Evidence.GetCopyFor(PhiDefModel).Named("EvidencePhiDef");
    evidencePhiDefBlock = Variable.If(EvidencePhiDef);
    PhiDef = Variable.Array <Vector>(T).Named("PhiDef");
    PhiDef.SetSparsity(PhiSparsity);
    PhiDef.SetValueRange(W);
    PhiPrior = Variable.Array <Dirichlet>(T).Named("PhiPrior");
    PhiDef[T] = Variable <Vector> .Random(PhiPrior[T]);
    Phi.SetDefinitionTo(PhiDefModel, PhiDef);
    evidencePhiDefBlock.CloseBlock();

    // Document sub-model - many copies (one per batch)
    DocModel = new Model(numBatches).Named("DocModel");
    IfBlock evidenceDocBlock = null;
    EvidenceDoc = Evidence.GetCopyFor(DocModel).Named("EvidenceDoc");
    evidenceDocBlock = Variable.If(EvidenceDoc);
    Theta = Variable.Array <Vector>(D).Named("Theta");
    Theta.SetSparsity(ThetaSparsity);
    Theta.SetValueRange(T);
    ThetaPrior = Variable.Array <Dirichlet>(D).Named("ThetaPrior");
    Theta[D] = Variable <Vector> .Random(ThetaPrior[D]);
    PhiDoc = Phi.GetCopyFor(DocModel);
    PhiDoc.AddAttribute(new MarginalPrototype(Dirichlet.Uniform(sizeVocab, PhiSparsity)));
    Words = Variable.Array(Variable.Array <int>(WInD), D).Named("Words");
    // Per-word multiplicities: Repeat scales the likelihood by the observed count.
    WordCounts = Variable.Array(Variable.Array <double>(WInD), D).Named("WordCounts");
    using (Variable.ForEach(D))
    {
        using (Variable.ForEach(WInD))
        {
            using (Variable.Repeat(WordCounts[D][WInD]))
            {
                // Standard LDA generative step: draw a topic from Theta,
                // then draw the word from that topic's Phi.
                Variable <int> topic = Variable.Discrete(Theta[D]).Named("topic");
                using (Variable.Switch(topic))
                {
                    Words[D][WInD] = Variable.Discrete(PhiDoc[topic]);
                }
            }
        }
    }
    evidenceDocBlock.CloseBlock();

    // Initialization to break symmetry
    ThetaInit = Variable.New <IDistribution <Vector[]> >().Named("ThetaInit");
    Theta.InitialiseTo(ThetaInit);

    // Separate engines: one to infer the Phi definition, one for the per-batch document model.
    EnginePhiDef = new InferenceEngine(new VariationalMessagePassing());
    EnginePhiDef.Compiler.ShowWarnings = false;
    EnginePhiDef.ModelName = "LDASharedPhiDef";
    Engine = new InferenceEngine(new VariationalMessagePassing());
    Engine.OptimiseForVariables = new IVariable[] { Theta, PhiDoc, EvidenceDoc };
    Engine.Compiler.ShowWarnings = false;
    Engine.ModelName = "LDAShared";
    // Keep memory usage down across many batches.
    Engine.Compiler.ReturnCopies = false;
    Engine.Compiler.FreeMemory = true;
}
/// <summary>
/// Builds a randomized Dirichlet array used to initialise Phi and break symmetry:
/// each topic gets a baseline pseudo-count of beta on every word, plus a random
/// boost on a random subset of words.
/// </summary>
/// <param name="initMaxPseudoCount">Maximum pseudo-count boost added to a chosen word</param>
/// <param name="initWordsPerTopic">Expected number of boosted words per topic (Poisson mean)</param>
/// <param name="sparsity">Sparsity specification for the pseudo-count vectors</param>
/// <param name="beta">Baseline pseudo-count for every word</param>
/// <returns>A distribution over an array of Vectors (one Dirichlet per topic)</returns>
private IDistribution <Vector[]> GetInitialisation(double initMaxPseudoCount, double initWordsPerTopic, Sparsity sparsity, double beta)
{
    Dirichlet[] initPhi = new Dirichlet[TotalTopics.ObservedValue];
    // Fixed seed so initialisation (and hence inference) is reproducible.
    // NOTE(review): this local Random is a separate stream from the global Rand used below.
    Random r = new Random(12347);
    for (int i = 0; i < TotalTopics.ObservedValue; i++)
    {
        Vector v = Vector.Constant(TotalWords.ObservedValue, beta, sparsity);
        int[] perm = Rand.Perm(TotalWords.ObservedValue);
        // Poisson samples are unbounded; clamp so we never index past the end of perm.
        int numWords = Math.Min(Poisson.Sample(initWordsPerTopic), TotalWords.ObservedValue);
        for (int j = 0; j < numWords; j++)
        {
            v[perm[j]] += initMaxPseudoCount * r.NextDouble();
        }
        initPhi[i] = new Dirichlet(v);
    }
    return Distribution <Vector> .Array(initPhi);
}
/// <summary>
/// Checks that running the LDA model with sparse vectors gives (to tolerance)
/// the same posteriors as running it dense on the same toy data.
/// </summary>
public void LDATest()
{
    int numTopics = 2;
    int numVocab = 10;
    int numDocs = 5;
    // Symmetric priors scaled by dimension.
    double alpha = 1.0 / numTopics;
    double beta = 1.0 / numVocab;
    Dirichlet[] truePhi, trueTheta;
    Rand.Restart(12347);
    int[][] wordsInDoc = GenerateToyLDAData(numTopics, numVocab, numDocs, 100, out trueTheta, out truePhi);

    // Dense run. Restart the global RNG so both runs see identical random state.
    Rand.Restart(12347);
    var modelDense = new LDAModel();
    modelDense.Engine.NumberOfIterations = 15;
    modelDense.Engine.ModelName = "LdaDense";
    modelDense.Infer(10.0, numVocab, numTopics, wordsInDoc, alpha, beta, Sparsity.Dense, Sparsity.Dense, Sparsity.Dense);

    // Same model, but run as sparse
    Rand.Restart(12347);
    var modelSparse = new LDAModel();
    modelSparse.Engine.NumberOfIterations = 15;
    modelSparse.Engine.ModelName = "LdaSparse";
    modelSparse.Engine.Compiler.ReturnCopies = false;
    // previous tolerances appeared to be set to alpha and beta but were in fact set to dense due to a downstream bug.
    modelSparse.Infer(10.0, numVocab, numTopics, wordsInDoc, alpha, beta, Sparsity.ApproximateWithTolerance(1e-7), Sparsity.ApproximateWithTolerance(1e-7), Sparsity.ApproximateWithTolerance(1e-6));
    Console.WriteLine(StringUtil.JoinColumns("Phi sparsity = ", SparsityFraction(modelSparse.PostPhi)));
    Console.WriteLine(StringUtil.JoinColumns("Theta sparsity = ", SparsityFraction(modelSparse.PostTheta)));

    // Dense and sparse posteriors should agree within tolerance.
    for (int i = 0; i < numDocs; i++)
    {
        Assert.Equal(Sparsity.Dense, modelDense.PostTheta[i].Sparsity);
        //Assert.True(modelSparse.PostTheta[i].Sparsity.IsApproximate);
        Assert.Equal(0.0, modelDense.PostTheta[i].MaxDiff(modelSparse.PostTheta[i]), 1e-3);
    }
    for (int i = 0; i < numTopics; i++)
    {
        Assert.Equal(Sparsity.Dense, modelDense.PostPhi[i].Sparsity);
        // Assert.True(modelSparse.PostPhi[i].Sparsity.IsApproximate);
        Assert.Equal(0.0, modelDense.PostPhi[i].MaxDiff(modelSparse.PostPhi[i]), 1e-3);
    }
}
#pragma warning disable 162
#endif

/// <summary>
/// Run inference on the LDA model
/// </summary>
/// <param name="initMaxPseudoCount">Max pseudo-count for initialisation</param>
/// <param name="numVocab">Size of vocabulary</param>
/// <param name="numTopics">Number of topics</param>
/// <param name="wordsInDoc">Word indices, one inner array per document</param>
/// <param name="alpha">Pseudo-counts for theta</param>
/// <param name="beta">Pseudo-counts for phi</param>
/// <param name="alphaSparsity">Sparsity specification for the theta prior</param>
/// <param name="betaSparsity">Sparsity specification for the phi prior</param>
/// <param name="phiSparsity">Sparsity specification for the phi initialisation</param>
public void Infer(double initMaxPseudoCount, int numVocab, int numTopics, int[][] wordsInDoc, double alpha, double beta, Sparsity alphaSparsity, Sparsity betaSparsity, Sparsity phiSparsity)
{
    // Set up the observed values
    TotalWords.ObservedValue = numVocab;
    TotalDocuments.ObservedValue = wordsInDoc.Length;
    TotalTopics.ObservedValue = numTopics;
    int[] numWordsInDoc = new int[wordsInDoc.Length];
    for (int i = 0; i < wordsInDoc.Length; i++)
    {
        numWordsInDoc[i] = wordsInDoc[i].Length;
    }
    NumWordsInDoc.ObservedValue = numWordsInDoc;
    Words.ObservedValue = wordsInDoc;
    // Symmetric Dirichlet priors built from the scalar pseudo-counts.
    Vector alphaVector = Vector.Constant(numTopics, alpha, alphaSparsity);
    Vector betaVector = Vector.Constant(numVocab, beta, betaSparsity);
    AlphaPrior.ObservedValue = new Dirichlet(alphaVector);
    BetaPrior.ObservedValue = new Dirichlet(betaVector);
    // This has to occur after the other observed values are set:
    // GetInitialisation reads TotalTopics/TotalWords ObservedValues.
    // Note numVocab / numTopics is integer division — the (truncated) expected words per topic.
    PhiInit.ObservedValue = GetInitialisation(initMaxPseudoCount, numVocab / numTopics, phiSparsity, beta);
    // Deliberately dead code (see the pragma 162 above): flip to true when
    // debugging to bake the observed values into the generated model code.
    if (false)
    {
        // for debugging, put the observed values in the code.
        TotalWords.IsReadOnly = true;
        TotalDocuments.IsReadOnly = true;
        TotalTopics.IsReadOnly = true;
        NumWordsInDoc.IsReadOnly = true;
        Words.IsReadOnly = true;
        AlphaPrior.IsReadOnly = true;
        BetaPrior.IsReadOnly = true;
        PhiInit.IsReadOnly = true;
    }
    Engine.OptimiseForVariables = new List <IVariable>() { Theta, Phi };
    PostTheta = Engine.Infer <Dirichlet[]>(Theta);
    PostPhi = Engine.Infer <Dirichlet[]>(Phi);
}