/// <summary>
/// Trains a model from scratch: applies the command-line options, builds the
/// sampler state from the corpus, prints the model configuration, then runs
/// Gibbs sampling for <c>niters</c> iterations.
/// </summary>
/// <param name="cor">Corpus to train on.</param>
/// <param name="opt">Parsed command-line options (hyperparameters, iteration count).</param>
public void TrainNewModel(Corpora cor, CommandLineOption opt)
{
    InitOption(opt);
    InitModel(cor);
    PrintModelInfo();
    GibbsSampling(niters);
}
/// <summary>
/// Trains a model from scratch and records where results should be written:
/// stores the output path, applies options, builds sampler state, prints the
/// configuration, then runs Gibbs sampling for <c>niters</c> iterations.
/// </summary>
/// <param name="cor">Corpus to train on.</param>
/// <param name="opt">Parsed command-line options.</param>
/// <param name="outputFile">Path the trained model/results are written to.</param>
public void TrainNewModel(Corpora cor, CommandLineOption opt, string outputFile)
{
    this.outputfile = outputFile;
    InitOption(opt);
    InitModel(cor);
    PrintModelInfo();
    GibbsSampling(niters);
}
/// <summary>
/// Trains a model from scratch, then exports the learned source/target
/// author-topic distributions for Matlab post-processing.
/// </summary>
/// <param name="cor">Corpus to train on.</param>
/// <param name="opt">Parsed command-line options.</param>
/// <param name="outputFile">Path the trained model/results are written to.</param>
public void TrainNewModel(Corpora cor, CommandLineOption opt, string outputFile)
{
    this.outputfile = outputFile;
    InitOption(opt);
    InitModel(cor);
    PrintModelInfo();
    GibbsSampling(niters);
    // 0.001 is the export threshold/step passed to the random-walk helper —
    // NOTE(review): meaning not visible here; confirm against RandomWalkHelper.
    RandomWalkHelper.ExportDataForMatlab(0.001, thetaSource, thetaTarget, K);
}
/// <summary>
/// Entry point: loads the paper/author corpus described by the command-line
/// options, splits it into source, target and collaboration partitions, trains
/// one ACT model per domain, then trains a cross-domain CTL model seeded with
/// the two trained domain models.
/// </summary>
static void Main(string[] args)
{
    CommandLineOption opt = GetDefaultOption();
    Parser parser = new Parser();
    // (fix) removed an unused Stopwatch local that was allocated but never
    // started or read.
    try
    {
        parser.ParseArguments(args, opt);

        // Info file layout (one integer per line):
        // [0] source authors, [1] target authors, [2] source papers,
        // [3] target papers, [4] collaboration papers (implied by the
        // remainder of the papers file, so it is not read here).
        string[] info = File.ReadAllLines(opt.info);
        int sourceAuthorNum = Convert.ToInt32(info[0]);
        int targetAuthorNum = Convert.ToInt32(info[1]);
        int sourcePaperNum = Convert.ToInt32(info[2]);
        int targetPaperNum = Convert.ToInt32(info[3]);

        List<string> papers = File.ReadAllLines(opt.papers).ToList();
        List<string> authors = File.ReadAllLines(opt.authors).ToList();

        // Both files are ordered [source | target | collaboration] and are
        // sliced by *paper* counts.
        // NOTE(review): slicing the authors file by paper counts assumes
        // exactly one author line per paper — confirm against the data files.
        var sourcePapers = papers.Take(sourcePaperNum).ToArray();
        var targetPapers = papers.Skip(sourcePaperNum).Take(targetPaperNum).ToArray();
        var collaborationPapers = papers.Skip(sourcePaperNum + targetPaperNum).ToArray();
        var sourceAuthors = authors.Take(sourcePaperNum).ToArray();
        var targetAuthors = authors.Skip(sourcePaperNum).Take(targetPaperNum).ToArray();
        var collaborationAuthors = authors.Skip(sourcePaperNum + targetPaperNum).ToArray();

        // TODO: hard-coded vocabulary size — consider reading it from the
        // info file alongside the other corpus dimensions.
        int vocabSize = 12;

        Corpora sourceCor = new Corpora(vocabSize, sourceAuthorNum);
        sourceCor.LoadDataFile(sourcePapers, sourceAuthors);
        ACTGibbsSampling sourceModel = new ACTGibbsSampling();
        sourceModel.TrainNewModel(sourceCor, opt, "outSource.txt");

        Corpora targetCor = new Corpora(vocabSize, targetAuthorNum);
        targetCor.LoadDataFile(targetPapers, targetAuthors);
        ACTGibbsSampling targetModel = new ACTGibbsSampling();
        targetModel.TrainNewModel(targetCor, opt, "outTarget.txt");

        // The collaboration corpus carries no standalone author table (0);
        // the CTL model reuses the two trained domain models instead.
        Corpora collaborationCor = new Corpora(vocabSize, 0);
        collaborationCor.LoadDataFile(collaborationPapers, collaborationAuthors);
        CTLGibbsSampling collaborationModel = new CTLGibbsSampling(sourceModel, targetModel);
        collaborationModel.TrainNewModel(collaborationCor, opt, "outColl.txt");
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.StackTrace);
        Console.WriteLine(ex.Message);
    }
    Console.ReadLine(); // keep the console window open
}
/// <summary>
/// Initializes the cross-collection sampler state from the collaboration
/// corpus plus the two pre-trained domain models (sourceModel / targetModel).
/// Per the CopyTo offsets below, the joint topic axis of size K is laid out as
/// [source topics (0..sourceModel.K) | target topics (sourceModel.K..K)].
/// Each token gets a random coin (branch 0 updates the joint/author-pair
/// tables, branch 1 the per-domain tables) and a random initial topic, and
/// all count tables are updated accordingly.
/// </summary>
/// <param name="cor">The collaboration corpus.</param>
private void InitModel(Corpora cor)
{
    this.cor = cor;
    M = cor.totalDocuments; // number of documents
    Voc = cor.VocabSize;    // number of words in the vocabulary

    // Sampling probability buffers.
    p = new double[K];
    ps = new double[2];
    pSource = new double[K];
    pTarget = new double[K];

    sourceAuthorNum = sourceModel.cor.TotalAuthor;
    targetAuthorNum = targetModel.cor.TotalAuthor;

    // Word-topic count tables: joint table plus per-domain tables seeded from
    // the trained models; target counts are copied at offset sourceModel.K.
    zx = new int[Voc][]; // word-topic
    zxSource = new int[Voc][];
    zxTarget = new int[Voc][];
    for (int w = 0; w < Voc; w++)
    {
        zx[w] = new int[K];
        zxSource[w] = new int[K];
        zxTarget[w] = new int[K];
        sourceModel.zx[w].CopyTo(zxSource[w], 0);
        targetModel.zx[w].CopyTo(zxTarget[w], sourceModel.K);
    }
    zxsum = new int[K];
    zxsumSource = new int[K];
    zxsumTarget = new int[K];
    sourceModel.zxsum.CopyTo(zxsumSource, 0);
    targetModel.zxsum.CopyTo(zxsumTarget, sourceModel.K);

    // Per-domain author-topic tables (each model's V is its author count).
    zvSource = new int[sourceModel.V][];
    zvTarget = new int[targetModel.V][];
    zvSourceColl = new int[sourceModel.V][];
    zvTargetColl = new int[targetModel.V][];
    zvSourceCollSum = new int[sourceModel.V];
    zvTargetCollSum = new int[targetModel.V];
    thetaSource = new double[sourceModel.V][];
    thetaTarget = new double[targetModel.V][];
    for (int v = 0; v < sourceModel.V; v++)
    {
        zvSource[v] = new int[K];
        sourceModel.zv[v].CopyTo(zvSource[v], 0);
        zvSourceColl[v] = new int[K];
        thetaSource[v] = new double[K];
    }
    for (int v = 0; v < targetModel.V; v++)
    {
        zvTarget[v] = new int[K];
        targetModel.zv[v].CopyTo(zvTarget[v], sourceModel.K);
        zvTargetColl[v] = new int[K];
        thetaTarget[v] = new double[K];
    }
    // NOTE(review): unlike the deep copies above, these two are *aliases* of
    // the trained models' arrays — the increments in the loop below mutate
    // sourceModel/targetModel state directly. Confirm this sharing is intended.
    zvsumSource = sourceModel.zvsum;
    zvsumTarget = targetModel.zvsum;

    zvv = new int[sourceAuthorNum][][];  // author pair -> topic counts
    zvvsum = new int[sourceAuthorNum][]; // topic total per author pair
    vartheta = new double[sourceAuthorNum][][]; // author pair -> topic distribution
    for (int a = 0; a < sourceAuthorNum; a++)
    {
        zvv[a] = new int[targetAuthorNum][];
        vartheta[a] = new double[targetAuthorNum][];
        zvvsum[a] = new int[targetAuthorNum];
        for (int b = 0; b < targetAuthorNum; b++)
        {
            zvv[a][b] = new int[K];
            vartheta[a][b] = new double[K];
        }
    }

    phi = new double[K][]; // phi: topic-word distribution
    for (int k = 0; k < K; k++) { phi[k] = new double[Voc]; }

    // Per-document coin counts and coin-topic counts.
    dc = new int[M][];
    cz = new int[2][];
    for (int m = 0; m < M; m++) { dc[m] = new int[2]; }
    for (int i = 0; i < 2; i++) { cz[i] = new int[K]; }

    // Flat token-level state: one slot per token in the corpus.
    words = new int[cor.totalWords]; // vocab wordID of each token
    doc = new int[cor.totalWords];   // docID of each token
    sourceAuthors = new int[cor.totalWords];
    targetAuthors = new int[cor.totalWords];
    coins = new int[cor.totalWords];
    isSourceDomain = new int[cor.totalWords];
    z = new int[cor.totalWords];     // topic of each token

    wn = 0;
    for (int i = 0; i < M; i++) // iterate documents
    {
        Document currDoc = cor.Docs[i];
        int docLength = currDoc.Length;
        for (int j = 0; j < docLength; j++) // iterate tokens in the document
        {
            words[wn] = currDoc.Words[j]; // store the token's vocab wordID
            // Fair coin per token: 0 and 1 select the two count-update
            // branches in the initialization loop below.
            int coin = (rnd.NextDouble() < 0.5) ? 0 : 1;
            coins[wn] = coin;
            dc[i][coin] += 1;
            // One source and one target author drawn per token.
            int sourceAuthor = currDoc.RandomSourceAuthorID;
            int targetAuthor = currDoc.RandomTargetAuthorID;
            sourceAuthors[wn] = sourceAuthor;
            targetAuthors[wn] = targetAuthor;
            doc[wn] = i;
            wn++;
        }
    }

    // Assign a uniformly random initial topic to every token and update the
    // counters belonging to its coin.
    for (int i = 0; i < wn; i++)
    {
        int topic = rnd.Next(K);
        z[i] = topic;
        cz[coins[i]][topic] += 1;
        if (coins[i] == 0)
        {
            // Coin 0: update the joint word-topic and author-pair tables.
            zx[words[i]][topic] += 1; // original TODO: "coba ini ntar di luar" (try moving this outside later)
            zxsum[topic] += 1;
            zvv[sourceAuthors[i]][targetAuthors[i]][topic] += 1;
            zvvsum[sourceAuthors[i]][targetAuthors[i]] += 1;
            zvSourceColl[sourceAuthors[i]][topic] += 1;
            zvTargetColl[targetAuthors[i]][topic] += 1;
            zvSourceCollSum[sourceAuthors[i]] += 1;
            zvTargetCollSum[targetAuthors[i]] += 1;
        }
        else
        {
            if (topic < sourceModel.K)
            {
                // Coin 1, topic in the source range: update source-domain tables.
                isSourceDomain[i] = 1;
                zxSource[words[i]][topic] += 1;
                zxsumSource[topic] += 1;
                zvSource[sourceAuthors[i]][topic] += 1;
                zvsumSource[sourceAuthors[i]] += 1; // mutates sourceModel.zvsum (aliased above)
            }
            else
            {
                // Coin 1, topic in the target range (offset-indexed tables).
                isSourceDomain[i] = 0;
                zxTarget[words[i]][topic] += 1;
                zxsumTarget[topic] += 1;
                zvTarget[targetAuthors[i]][topic] += 1;
                zvsumTarget[targetAuthors[i]] += 1; // mutates targetModel.zvsum (aliased above)
            }
        }
    }

    SaveModel("initial.json");
}
/// <summary>
/// Builds all count tables and token-level state for the author-topic sampler
/// from a freshly loaded corpus: word-topic and author-topic counts, flat
/// per-token word/doc/author assignments, and a uniformly random initial
/// topic for every token.
/// </summary>
/// <param name="cor">Corpus providing documents, vocabulary size and author count.</param>
private void InitModel(Corpora cor)
{
    this.cor = cor;
    M = cor.totalDocuments;  // number of documents
    Voc = cor.VocabSize;     // vocabulary size
    V = cor.TotalAuthor;     // number of authors
    Console.WriteLine(V);

    p = new double[K]; // sampling probability buffer

    // Count tables: word-topic and author-topic, plus their marginals.
    zx = new int[Voc][];
    for (int wordId = 0; wordId < Voc; wordId++)
    {
        zx[wordId] = new int[K];
    }
    zv = new int[V][];
    for (int a = 0; a < V; a++)
    {
        zv[a] = new int[K];
    }
    zxsum = new int[K]; // tokens per topic
    zvsum = new int[V]; // topic total per author

    // Flat token-level views of the corpus, one slot per token.
    int totalTokens = cor.totalWords;
    words = new int[totalTokens];   // vocab wordID per token
    docs = new int[totalTokens];    // docID per token
    authors = new int[totalTokens]; // authorID per token
    z = new int[totalTokens];       // topic per token

    wn = 0;
    for (int d = 0; d < M; d++) // walk documents
    {
        var document = cor.Docs[d];
        int len = document.Length;
        for (int pos = 0; pos < len; pos++) // walk tokens within the document
        {
            words[wn] = document.Words[pos];
            int author = document.RandomAuthorID; // one author drawn per token
            authors[wn] = author;
            zvsum[author] += 1; // author's topic total starts at its token count
            docs[wn] = d;
            wn++;
        }
    }

    // Seed every token with a uniformly random topic and update the counts.
    for (int t = 0; t < wn; t++)
    {
        int topic = rnd.Next(K);
        zx[words[t]][topic] += 1;
        zv[authors[t]][topic] += 1;
        zxsum[topic] += 1;
        z[t] = topic;
    }

    // Posterior estimate buffers, filled in after sampling.
    theta = new double[V][]; // author-topic distribution (axes reversed vs nd)
    for (int a = 0; a < V; a++)
    {
        theta[a] = new double[K];
    }
    phi = new double[K][];   // topic-word distribution
    for (int k = 0; k < K; k++)
    {
        phi[k] = new double[Voc];
    }
}
/// <summary>
/// Builds all count tables and token-level state for the plain LDA sampler
/// from a freshly loaded corpus: word-topic and document-topic counts, flat
/// per-token word/doc assignments, and a uniformly random initial topic for
/// every token.
/// </summary>
/// <param name="cor">Corpus providing documents and vocabulary size.</param>
private void InitModel(Corpora cor)
{
    this.cor = cor;
    M = cor.totalDocuments; // number of documents
    V = cor.VocabSize;      // vocabulary size (V is the vocab here, not authors)
    p = new double[K];      // sampling probability buffer
    Random rnd = new Random();

    // Count tables: word-topic and document-topic, plus their marginals.
    nw = new int[V][];
    for (int wordId = 0; wordId < V; wordId++)
    {
        nw[wordId] = new int[K];
    }
    nd = new int[M][];
    for (int d = 0; d < M; d++)
    {
        nd[d] = new int[K];
    }
    nwsum = new int[K]; // tokens per topic
    ndsum = new int[M]; // topic total per document

    // Flat token-level views of the corpus, one slot per token.
    int totalTokens = cor.totalWords;
    words = new int[totalTokens]; // vocab wordID per token
    docs = new int[totalTokens];  // docID per token
    z = new int[totalTokens];     // topic per token

    wn = 0;
    for (int d = 0; d < M; d++) // walk documents
    {
        var document = cor.Docs[d];
        int len = document.Length;
        for (int pos = 0; pos < len; pos++) // walk tokens within the document
        {
            words[wn] = document.Words[pos];
            docs[wn] = d;
            wn++;
        }
        ndsum[d] = len; // doc's topic total starts at its token count
    }

    // Seed every token with a uniformly random topic and update the counts.
    for (int t = 0; t < wn; t++)
    {
        int topic = rnd.Next(K);
        nw[words[t]][topic] += 1;
        nd[docs[t]][topic] += 1;
        nwsum[topic] += 1;
        z[t] = topic;
    }

    // Posterior estimate buffers, filled in after sampling.
    theta = new double[M][]; // document-topic distribution
    for (int d = 0; d < M; d++)
    {
        theta[d] = new double[K];
    }
    phi = new double[K][];   // topic-word distribution (axes reversed vs nw)
    for (int k = 0; k < K; k++)
    {
        phi[k] = new double[V];
    }
}