public bool LoadData(string[] inputfiles, out LdaDocument[] qaDocuments) { StreamReader sr = null; int count = 0; List<LdaDocument> documents = new List<LdaDocument>(); foreach (string inputfile in inputfiles) { Console.WriteLine("[Begin] read {0}", inputfile); try { sr = new StreamReader(inputfile, Encoding.UTF8); string queLine = null; while ((queLine = sr.ReadLine()) != null) { if (count++ < 50000) continue; string[] tmp = queLine.Split(new string[] { "\t" }, StringSplitOptions.RemoveEmptyEntries); if (tmp.Length < 2) continue; LdaDocument document = new LdaDocument(); List<int> queWordList = new List<int>(); var questionWords = tmp[0].Split(new char[] { '|' },StringSplitOptions.RemoveEmptyEntries); foreach (var word in questionWords) { try { int index = int.Parse(word); queWordList.Add(index); } catch (Exception e){ Console.WriteLine(e); } } List<int> quetags = new List<int>(); var tags = tmp[1].Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries); foreach (var tag in tags) { quetags.Add(int.Parse(tag)); } document.tag = quetags.ToArray(); document.doc = queWordList.ToArray(); documents.Add(document); } } catch (Exception e) { Console.WriteLine(e); } if (sr != null) { sr.Close(); } Console.WriteLine("[End] read {0}", inputfile); } qaDocuments = documents.ToArray(); documents = null; return true; }
public GibbsLDA(LdaDocument[] documents, int vocbulrySize) { this.docs = documents; this.docCount = documents.Length; this.vocbulrySize = vocbulrySize; }
public double[] GetRankDocHMMTrans(LdaDocument rankDoc) { if (rankDoc.doc == null || rankDoc.doc.Length == 0) { return new double[topicSum_K]; } if (topicDis == null) { InitTopicDis(); } if (phi == null) { ComputePhi(); } if (topicToTopic == null) { ComputeTopicToTopic(); } int queryLength = rankDoc.doc.Length; rankDoc.wordTopic = new int[queryLength]; double[,] dis = new double[queryLength, topicSum_K]; int[,] preIndex = new int[queryLength, topicSum_K]; for (int i = 0; i < topicSum_K; i++) { double maxDis = -1; for (int j = 0; j < topicSum_K; j++) { double curDis = topicDis[j] * topicToTopic[j][i]; if (curDis > maxDis) { maxDis = curDis; } } dis[0, i] = maxDis * phi[i, rankDoc.doc[0]]; preIndex[0, i] = -1; } for (int i = 1; i < queryLength; i++) { for (int j = 0; j < topicSum_K; j++) { double maxDis = -1; int index = -1; for (int s = 0; s < topicSum_K; s++) { double curDis = dis[i - 1, s] * topicToTopic[s][j]; if (curDis > maxDis) { maxDis = curDis; index = s; } } dis[i, j] = maxDis * phi[j, rankDoc.doc[i]]; preIndex[i, j] = index; } } int max = 0; for (int i = 0; i < topicSum_K; i++) { if (dis[queryLength - 1, i] > dis[queryLength - 1, max]) { max = i; } } int[] rankTopicSum = new int[topicSum_K]; for (int i = queryLength - 1; i >= 0; i--) { rankDoc.wordTopic[i] = max; rankTopicSum[max]++; max = preIndex[i, max]; } double[] rankTheta = new double[topicSum_K]; for (int i = 0; i < topicSum_K; i++) { rankTheta[i] = (rankTopicSum[i] + alpha) / (rankDoc.doc.Length + topicSum_K * alpha); } return rankTheta; }
public double[] GetRankDocumentTheta(LdaDocument rankDoc) { if (rankDoc.doc == null) { return null; } double[] rankDocTheta = new double[topicSum_K]; # region InitRankDocumentState int[] rankDocTopicSum = new int[topicSum_K]; int length = rankDoc.doc.Length; int[,] rankDocWordTopicSum = new int[length, topicSum_K]; int[] wordTopic = new int[length]; for (int i = 0; i < length; i++) { int topic = random.Next(topicSum_K); wordTopic[i] = topic; rankDocTopicSum[topic]++; rankDocWordTopicSum[i, topic]++; } rankDoc.wordTopic = wordTopic; int rankNumstats = 0; #endregion InitRankDocumentState for (int it = 0; it < ITERATIONS; it++) { #region RankDocSample if (rankDoc.wordTopic != null) { double[] p = new double[topicSum_K]; for (int i = 0; i < rankDoc.doc.Length; i++) { int word = rankDoc.doc[i]; int topic = rankDoc.wordTopic[i]; rankDocTopicSum[topic]--; rankDocWordTopicSum[i, topic]--; for (int j = 0; j < topicSum_K; j++) { p[j] = (wordTopicSum[word, j] + rankDocWordTopicSum[i, j] + beta) / (topicWordSum[j] + rankDocTopicSum[j] + vocbulrySize * beta) * (rankDocTopicSum[j] + alpha); } for (int k = 1; k < topicSum_K; k++) { p[k] += p[k - 1]; } double u = random.NextDouble() * p[topicSum_K - 1]; for (topic = 0; topic < topicSum_K; topic++) { if (p[topic] > u) { break; } } rankDocWordTopicSum[i, topic]++; rankDocTopicSum[topic]++; rankDoc.wordTopic[i] = topic; } } #endregion RankDocSample if ((it > BURN_IN) && (SAMPLE_LAG > 0) && (it % SAMPLE_LAG == 0)) { #region UpdateRankDocParams for (int topic = 0; topic < topicSum_K; topic++) { rankDocTheta[topic] += ((rankDocTopicSum[topic] + alpha) / (rankDoc.doc.Length + topicSum_K * alpha)); } rankNumstats++; #endregion UpdateRankDocParams } } #region ComputRankDocTheta if (SAMPLE_LAG > 0) { for (int topic = 0; topic < topicSum_K; topic++) { rankDocTheta[topic] = rankDocTheta[topic] / rankNumstats; } } else { for (int topic = 0; topic < topicSum_K; topic++) { rankDocTheta[topic] = ((rankDocTopicSum[topic] + alpha) / (rankDoc.doc.Length + topicSum_K * alpha)); } } #endregion ComputRankDocTheta Console.Write("."); return rankDocTheta; }
public double TopicLikelihood(LdaDocument question, LdaDocument candidate) { var questionTheta = GetRankDocumentTheta(question); var candidateTheta = GetRankDocumentTheta(candidate); double score = 0; for (int i = 0; i < topicSum_K; i++) { score += (questionTheta[i] * candidateTheta[i] / topicWordSum[i]); } return score * question.doc.Length; }