Exemplo n.º 1
0
 /// <summary>
 /// Trains a model from scratch on the given corpus by calling, in order,
 /// <c>InitOption</c>, <c>InitModel</c>, <c>PrintModelInfo</c> and <c>GibbsSampling</c>.
 /// </summary>
 /// <param name="cor">Corpus passed to <c>InitModel</c>.</param>
 /// <param name="opt">Command-line options passed to <c>InitOption</c>.</param>
 public void TrainNewModel(Corpora cor, CommandLineOption opt)
 {
     InitOption(opt);
     InitModel(cor);
     PrintModelInfo();
     // niters is not assigned in this method; presumably InitOption sets it from opt — TODO confirm.
     GibbsSampling(niters);
 }
Exemplo n.º 2
0
        /// <summary>
        /// Trains a model from scratch on the given corpus: stores the output file name, then calls,
        /// in order, <c>InitOption</c>, <c>InitModel</c>, <c>PrintModelInfo</c> and <c>GibbsSampling</c>.
        /// </summary>
        /// <param name="cor">Corpus passed to <c>InitModel</c>.</param>
        /// <param name="opt">Command-line options passed to <c>InitOption</c>.</param>
        /// <param name="outputFile">Value stored in the <c>outputfile</c> field before training starts.</param>
        public void TrainNewModel(Corpora cor, CommandLineOption opt, string outputFile)
        {
            // Stored first so it is already set when the steps below run.
            this.outputfile = outputFile;

            InitOption(opt);
            InitModel(cor);
            PrintModelInfo();
            // niters is not assigned in this method; presumably InitOption sets it from opt — TODO confirm.
            GibbsSampling(niters);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Program entry point. Parses the command line, reads the info, papers and authors files,
        /// splits the papers/authors lines into source, target and collaboration slices, and trains
        /// three models in sequence: source, target, then the collaboration model built on both.
        /// </summary>
        /// <param name="args">Command-line arguments handed to the option parser.</param>
        /// <remarks>
        /// Any exception is caught and its stack trace and message are written to the console;
        /// the program then waits for a line of input before exiting.
        /// </remarks>
        static void Main(string[] args)
        {
            CommandLineOption opt    = GetDefaultOption();
            Parser            parser = new Parser();

            try
            {
                parser.ParseArguments(args, opt);

                // The info file holds one integer per line:
                //   0: source author count, 1: target author count,
                //   2: source paper count,  3: target paper count,
                //   4: collaboration paper count.
                string[] info = File.ReadAllLines(opt.info);

                if (info.Length < 5)
                {
                    // Fail with a clear message instead of a bare IndexOutOfRangeException.
                    throw new InvalidDataException(
                        "The info file must contain at least 5 lines (source authors, target authors, " +
                        "source papers, target papers, collaboration papers) but has " + info.Length + ".");
                }

                int sourceAuthorNum = Convert.ToInt32(info[0]);
                int targetAuthorNum = Convert.ToInt32(info[1]);
                int sourcePaperNum  = Convert.ToInt32(info[2]);
                int targetPaperNum  = Convert.ToInt32(info[3]);

                // Parsed so that a malformed fifth line is still reported, but not used for slicing:
                // the collaboration slice is simply every line after the source and target slices.
                int collaborationPaperNum = Convert.ToInt32(info[4]);

                string[] papers  = File.ReadAllLines(opt.papers);
                string[] authors = File.ReadAllLines(opt.authors);

                int collaborationStart = sourcePaperNum + targetPaperNum;

                var sourcePapers        = papers.Take(sourcePaperNum).ToArray();
                var targetPapers        = papers.Skip(sourcePaperNum).Take(targetPaperNum).ToArray();
                var collaborationPapers = papers.Skip(collaborationStart).ToArray();

                // The authors file is sliced with the paper counts, i.e. it is treated as
                // line-aligned with the papers file (one authors line per paper).
                var sourceAuthors        = authors.Take(sourcePaperNum).ToArray();
                var targetAuthors        = authors.Skip(sourcePaperNum).Take(targetPaperNum).ToArray();
                var collaborationAuthors = authors.Skip(collaborationStart).ToArray();

                // NOTE(review): the vocabulary size is hard-coded; confirm it matches the data set.
                int vocabSize = 12;

                Corpora sourceCor = new Corpora(vocabSize, sourceAuthorNum);
                sourceCor.LoadDataFile(sourcePapers, sourceAuthors);
                ACTPhraseSampling sourceModel = new ACTPhraseSampling();
                sourceModel.TrainNewModel(sourceCor, opt, "outSource.txt");

                Corpora targetCor = new Corpora(vocabSize, targetAuthorNum);
                targetCor.LoadDataFile(targetPapers, targetAuthors);
                ACTPhraseSampling targetModel = new ACTPhraseSampling();
                targetModel.TrainNewModel(targetCor, opt, "outTarget.txt");

                // The collaboration corpus is created with an author count of 0; the collaboration
                // model receives the two trained models through its constructor instead.
                Corpora collaborationCor = new Corpora(vocabSize, 0);
                collaborationCor.LoadDataFile(collaborationPapers, collaborationAuthors);
                CTLPhraseSampling collaborationModel = new CTLPhraseSampling(sourceModel, targetModel);
                collaborationModel.TrainNewModel(collaborationCor, opt, "outColl.txt");
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.StackTrace);
                Console.WriteLine(ex.Message);
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds the initial sampler state for <paramref name="cor"/>: allocates the count and
        /// parameter arrays, flattens the corpus into per-token arrays, attaches one author to every
        /// token, and assigns every token a uniformly random topic while updating the counts.
        /// </summary>
        /// <param name="cor">Corpus to initialise from; also stored in the <c>cor</c> field.</param>
        private void InitModel(Corpora cor)
        {
            this.cor = cor;

            M   = cor.totalDocuments;   // number of documents
            Voc = cor.VocabSize;        // number of words in the vocabulary
            V   = cor.TotalAuthor;      // number of authors

            Console.WriteLine(V);

            p = new double[K];

            // Word-topic counts: one row of K counters per vocabulary word.
            zx = new int[Voc][];
            for (int wordId = 0; wordId < Voc; wordId++)
            {
                zx[wordId] = new int[K];
            }

            // Author-topic counts and the author-topic distribution share the same V x K shape.
            zv    = new int[V][];
            theta = new double[V][];
            for (int authorId = 0; authorId < V; authorId++)
            {
                zv[authorId]    = new int[K];
                theta[authorId] = new double[K];
            }

            // Topic-word distribution: K x Voc.
            phi = new double[K][];
            for (int topicId = 0; topicId < K; topicId++)
            {
                phi[topicId] = new double[Voc];
            }

            zxsum = new int[K];   // tokens per topic
            zvsum = new int[V];   // tokens per author

            // Flat per-token arrays, one slot for every token in the corpus.
            var tokenCount = cor.totalWords;
            words   = new int[tokenCount];   // word id of each token
            docs    = new int[tokenCount];   // document index of each token
            authors = new int[tokenCount];   // author id attached to each token
            z       = new int[tokenCount];   // topic assigned to each token
            wn      = 0;

            // First pass: flatten the documents and pick an author for every token.
            for (int docId = 0; docId < M; docId++)
            {
                var document  = cor.Docs[docId];
                int docLength = document.Length;

                for (int pos = 0; pos < docLength; pos++)
                {
                    words[wn] = document.Words[pos];

                    // RandomAuthorID is read once per token.
                    int pickedAuthor = document.RandomAuthorID;
                    authors[wn] = pickedAuthor;
                    zvsum[pickedAuthor]++;

                    docs[wn] = docId;
                    wn++;
                }
            }

            // Second pass: draw a random topic for every token and update the counts.
            for (int token = 0; token < wn; token++)
            {
                int topic = rnd.Next(K);

                zx[words[token]][topic]++;
                zv[authors[token]][topic]++;
                zxsum[topic]++;
                z[token] = topic;
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Builds the initial sampler state of the collaboration model for <paramref name="cor"/>.
        /// Allocates this model's own count arrays, copies the word-topic and author-topic counts of
        /// the already trained source and target models into K-wide arrays, flattens the corpus into
        /// per-token arrays, and gives every token a random coin (0 or 1) and a random topic.
        /// </summary>
        /// <param name="cor">Collaboration corpus; also stored in the <c>cor</c> field.</param>
        /// <remarks>
        /// Source counts are copied to offset 0 and target counts to offset <c>sourceModel.K</c>,
        /// so K must be large enough to hold the source topics followed by the target topics;
        /// otherwise <c>CopyTo</c> throws. The per-author totals of the two models are cloned, so
        /// initialising this model does not modify <c>sourceModel</c> or <c>targetModel</c>.
        /// </remarks>
        private void InitModel(Corpora cor)
        {
            this.cor = cor;

            M   = cor.totalDocuments;                                     // number of documents
            Voc = cor.VocabSize;                                          // number of words in the vocabulary

            p       = new double[K];
            ps      = new double[2];
            pSource = new double[K];
            pTarget = new double[K];

            sourceAuthorNum = sourceModel.cor.TotalAuthor;
            targetAuthorNum = targetModel.cor.TotalAuthor;

            zx       = new int[Voc][];                                    // word-topic counts of this model
            zxSource = new int[Voc][];                                    // word-topic counts seeded from the source model
            zxTarget = new int[Voc][];                                    // word-topic counts seeded from the target model

            for (int w = 0; w < Voc; w++)
            {
                zx[w]       = new int[K];
                zxSource[w] = new int[K];
                zxTarget[w] = new int[K];

                // Source topics occupy the front of the row, target topics start at sourceModel.K.
                sourceModel.zx[w].CopyTo(zxSource[w], 0);
                targetModel.zx[w].CopyTo(zxTarget[w], sourceModel.K);
            }

            zxsum       = new int[K];
            zxsumSource = new int[K];
            zxsumTarget = new int[K];

            sourceModel.zxsum.CopyTo(zxsumSource, 0);
            targetModel.zxsum.CopyTo(zxsumTarget, sourceModel.K);

            zvSource = new int[sourceModel.V][];
            zvTarget = new int[targetModel.V][];

            for (int v = 0; v < sourceModel.V; v++)
            {
                zvSource[v] = new int[K];
                sourceModel.zv[v].CopyTo(zvSource[v], 0);
            }

            for (int v = 0; v < targetModel.V; v++)
            {
                zvTarget[v] = new int[K];
                targetModel.zv[v].CopyTo(zvTarget[v], sourceModel.K);
            }

            // Clone rather than alias: these totals are incremented below, and sharing the arrays
            // would silently change the trained source/target models, unlike every other statistic,
            // which is copied.
            zvsumSource = (int[])sourceModel.zvsum.Clone();
            zvsumTarget = (int[])targetModel.zvsum.Clone();

            zvv      = new int[sourceAuthorNum][][];                     // (source author, target author)-topic counts
            zvvsum   = new int[sourceAuthorNum][];                       // tokens per (source author, target author) pair
            vartheta = new double[sourceAuthorNum][][];                  // author-pair topic distribution

            for (int a = 0; a < sourceAuthorNum; a++)
            {
                zvv[a]      = new int[targetAuthorNum][];
                vartheta[a] = new double[targetAuthorNum][];
                zvvsum[a]   = new int[targetAuthorNum];

                for (int b = 0; b < targetAuthorNum; b++)
                {
                    zvv[a][b]      = new int[K];
                    vartheta[a][b] = new double[K];
                }
            }

            phi = new double[K][];                                        // topic-word distribution

            for (int k = 0; k < K; k++)
            {
                phi[k] = new double[Voc];
            }

            dc = new int[M][];                                            // per-document count of each coin value
            cz = new int[2][];                                            // per-coin count of each topic

            for (int m = 0; m < M; m++)
            {
                dc[m] = new int[2];
            }

            for (int i = 0; i < 2; i++)
            {
                cz[i] = new int[K];
            }

            words          = new int[cor.totalWords];                     // word id of each token
            doc            = new int[cor.totalWords];                     // document index of each token
            sourceAuthors  = new int[cor.totalWords];                     // source author attached to each token
            targetAuthors  = new int[cor.totalWords];                     // target author attached to each token
            coins          = new int[cor.totalWords];                     // coin value (0 or 1) of each token
            isSourceDomain = new int[cor.totalWords];                     // 1 when a coin-1 token drew a source topic
            z              = new int[cor.totalWords];                     // topic assigned to each token
            wn = 0;

            // First pass: flatten the documents, flip a fair coin and pick an author pair per token.
            for (int i = 0; i < M; i++)
            {
                Document currDoc   = cor.Docs[i];
                int      docLength = currDoc.Length;

                for (int j = 0; j < docLength; j++)
                {
                    words[wn] = currDoc.Words[j];

                    int coin = (rnd.NextDouble() < 0.5) ? 0 : 1;
                    coins[wn]    = coin;
                    dc[i][coin] += 1;

                    int sourceAuthor = currDoc.RandomSourceAuthorID;
                    int targetAuthor = currDoc.RandomTargetAuthorID;

                    sourceAuthors[wn] = sourceAuthor;
                    targetAuthors[wn] = targetAuthor;

                    doc[wn] = i;
                    wn++;
                }
            }

            // Second pass: draw a random topic per token and update the counts selected by its coin.
            for (int i = 0; i < wn; i++)
            {
                int topic = rnd.Next(K);
                z[i] = topic;
                cz[coins[i]][topic] += 1;

                if (coins[i] == 0)
                {
                    // Coin 0: the token is counted in this model's own word-topic and
                    // author-pair statistics.
                    zx[words[i]][topic] += 1;
                    zxsum[topic]        += 1;

                    zvv[sourceAuthors[i]][targetAuthors[i]][topic] += 1;
                    zvvsum[sourceAuthors[i]][targetAuthors[i]]     += 1;
                }
                else
                {
                    // Coin 1: topics below sourceModel.K update the source-side counts,
                    // the remaining topics update the target-side counts.
                    if (topic < sourceModel.K)
                    {
                        isSourceDomain[i]          = 1;
                        zxSource[words[i]][topic] += 1;
                        zxsumSource[topic]        += 1;

                        zvSource[sourceAuthors[i]][topic] += 1;
                        zvsumSource[sourceAuthors[i]]     += 1;
                    }
                    else
                    {
                        isSourceDomain[i]          = 0;
                        zxTarget[words[i]][topic] += 1;
                        zxsumTarget[topic]        += 1;

                        zvTarget[targetAuthors[i]][topic] += 1;
                        zvsumTarget[targetAuthors[i]]     += 1;
                    }
                }
            }
        }
Exemplo n.º 6
0
        /// <summary>
        /// Builds the initial sampler state for <paramref name="cor"/>: allocates the word-topic and
        /// document-topic counts together with theta and phi, flattens the corpus into per-token
        /// arrays, and assigns every token a uniformly random topic while updating the counts.
        /// </summary>
        /// <param name="cor">Corpus to initialise from; also stored in the <c>cor</c> field.</param>
        private void InitModel(Corpora cor)
        {
            this.cor = cor;

            M = cor.totalDocuments;   // number of documents
            V = cor.VocabSize;        // number of words in the vocabulary

            p = new double[K];
            Random rnd = new Random();

            // Word-topic counts: V rows of K counters.
            nw = new int[V][];
            for (int wordId = 0; wordId < V; wordId++)
            {
                nw[wordId] = new int[K];
            }

            // Document-topic counts and theta share the same M x K shape.
            nd    = new int[M][];
            theta = new double[M][];
            for (int docId = 0; docId < M; docId++)
            {
                nd[docId]    = new int[K];
                theta[docId] = new double[K];
            }

            // Topic-word distribution: K x V, i.e. transposed relative to nw.
            phi = new double[K][];
            for (int topicId = 0; topicId < K; topicId++)
            {
                phi[topicId] = new double[V];
            }

            nwsum = new int[K];   // tokens per topic
            ndsum = new int[M];   // tokens per document

            // Flat per-token arrays, one slot for every token in the corpus.
            var tokenCount = cor.totalWords;
            words = new int[tokenCount];   // word id of each token
            docs  = new int[tokenCount];   // document index of each token
            z     = new int[tokenCount];   // topic assigned to each token
            wn    = 0;

            // Single pass over the corpus: record each token and draw its topic right away.
            // Topics are drawn in corpus order, one draw per token.
            for (int docId = 0; docId < M; docId++)
            {
                var document  = cor.Docs[docId];
                int docLength = document.Length;

                for (int pos = 0; pos < docLength; pos++)
                {
                    int wordId = document.Words[pos];
                    words[wn] = wordId;
                    docs[wn]  = docId;

                    int topic = rnd.Next(K);
                    nw[wordId][topic]++;
                    nd[docId][topic]++;
                    nwsum[topic]++;
                    z[wn] = topic;

                    wn++;
                }

                // The per-document total starts out equal to the document length.
                ndsum[docId] = docLength;
            }
        }