예제 #1
0
        /// <summary>
        /// Loads the authors and papers of two domains from the <c>aminer_big</c> MySQL database, prints
        /// summary counts to the console and writes training/testing data files into a folder named
        /// <c>sourceDomain + " - " + targetDomain</c>.
        /// </summary>
        /// <remarks>
        /// Files written: <c>info.txt</c>, <c>sourceAuthorIDs.txt</c>, <c>targetAuthorIDs.txt</c>,
        /// <c>papers.txt</c>, <c>authors.txt</c> and <c>testData.txt</c>. A paper is treated as a
        /// collaboration paper when it has at least one author known in the source dictionary and at
        /// least one known in the target dictionary.
        /// </remarks>
        /// <param name="sourceDomain">Value matched against <c>domainName</c> when loading source authors, and against <c>Paper.PaperDomain</c> when selecting source papers.</param>
        /// <param name="targetDomain">Value matched against <c>domainName</c> when loading target authors, and against <c>Paper.PaperDomain</c> when selecting target papers.</param>
        /// <param name="trainingLimit">Year threshold: collaboration papers with <c>Year</c> below it go to the training set, all others to the testing set.</param>
        private static void PrepareDataType2(string sourceDomain, string targetDomain, int trainingLimit)
        {
            // NOTE(review): credentials are hard-coded in source; move them to configuration.
            const string connectionString = "host=localhost;user=root;password=password0!;database=aminer_big;";

            // Domain names are bound as parameters instead of being concatenated into the SQL text.
            const string authorSql = "SELECT distinct(authorNamePlain) FROM aminer_big.temp_paperauthors where domainName = @domain";
            const string paperSql  = "SELECT paperID, category, temp_paperauthors.year, authorNamePlain, title, abstract FROM aminer_big.temp_paperauthors inner join m_topicpaperauthor on paperID = id where (domainName = @sourceDomain or domainName = @targetDomain)";

            AuthorDictionary sourceAD      = new AuthorDictionary();
            List<string>     sourceAuthors = new List<string>();
            AuthorDictionary targetAD      = new AuthorDictionary();
            List<string>     targetAuthors = new List<string>();
            List<Paper>      papers        = new List<Paper>();

            // One connection serves all three queries: every reader is disposed before the next
            // command runs, so the connection no longer has to be closed and reopened in between.
            using (MySqlConnection con = new MySqlConnection(connectionString))
            {
                con.Open();

                // Authors of the source domain.
                using (MySqlCommand cmd = new MySqlCommand(authorSql, con))
                {
                    cmd.Parameters.AddWithValue("@domain", sourceDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            string name = reader.GetString("authorNamePlain");
                            sourceAuthors.Add(name);
                            // Presumably registers the name in the dictionary; the returned value is not needed here.
                            sourceAD.GetAuthor(name);
                        }
                    }
                }

                Console.WriteLine(sourceDomain + " authors: " + sourceAuthors.Count);

                // Authors of the target domain.
                using (MySqlCommand cmd = new MySqlCommand(authorSql, con))
                {
                    cmd.Parameters.AddWithValue("@domain", targetDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            string name = reader.GetString("authorNamePlain");
                            targetAuthors.Add(name);
                            targetAD.GetAuthor(name);
                        }
                    }
                }

                Console.WriteLine(targetDomain + " authors: " + targetAuthors.Count);

                // Papers: the query returns one row per (paper, author).
                using (MySqlCommand cmd = new MySqlCommand(paperSql, con))
                {
                    cmd.Parameters.AddWithValue("@sourceDomain", sourceDomain);
                    cmd.Parameters.AddWithValue("@targetDomain", targetDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        // Papers are looked up by ID rather than compared with the previous row: the
                        // query has no ORDER BY, so rows of one paper are not guaranteed to be adjacent.
                        // This also removes the need for a sentinel ID (a paper with ID 0 is handled).
                        Dictionary<int, Paper> papersById = new Dictionary<int, Paper>();

                        while (reader.Read())
                        {
                            int paperID = Convert.ToInt32(reader.GetString("paperID"));

                            Paper paper;
                            if (!papersById.TryGetValue(paperID, out paper))
                            {
                                string titleAndAbstract = reader.GetString("title") + " " + reader.GetString("abstract");
                                string paperDomain      = reader.GetString("category");
                                int    year             = Convert.ToInt32(reader.GetString("year"));

                                paper = new Paper(paperID, year, paperDomain, titleAndAbstract);
                                papersById.Add(paperID, paper);
                                papers.Add(paper);
                            }

                            string author = reader.GetString("authorNamePlain");
                            paper.Authors.Add(author);

                            // -1 is treated as "author not present in this dictionary".
                            int sourceAuthorId = sourceAD.GetAuthorID(author);
                            int targetAuthorId = targetAD.GetAuthorID(author);

                            if (sourceAuthorId != -1)
                            {
                                paper.SourceAuthorIDs.Add(sourceAuthorId);
                            }

                            if (targetAuthorId != -1)
                            {
                                paper.TargetAuthorIDs.Add(targetAuthorId);
                            }

                            if (sourceAuthorId == -1 && targetAuthorId == -1)
                            {
                                Console.WriteLine("==========>>>" + author);
                            }
                        }
                    }
                }
            }

            // Materialize each selection once; they are counted and iterated several times below.
            List<Paper> sourcePapers                = papers.Where(x => x.PaperDomain == sourceDomain).ToList();
            List<Paper> targetPapers                = papers.Where(x => x.PaperDomain == targetDomain).ToList();
            List<Paper> collaborationPapers         = papers.Where(x => x.SourceAuthorIDs.Count >= 1 && x.TargetAuthorIDs.Count >= 1).ToList();
            List<Paper> collaborationPapersTraining = collaborationPapers.Where(x => x.Year < trainingLimit).ToList();
            List<Paper> collaborationPapersTesting  = collaborationPapers.Where(x => x.Year >= trainingLimit).ToList();
            int         undetectedCount             = papers.Count(x => x.SourceAuthorIDs.Count == 0 && x.TargetAuthorIDs.Count == 0);

            Console.WriteLine("all: " + papers.Count);
            Console.WriteLine(sourceDomain + ": " + sourcePapers.Count);
            Console.WriteLine(targetDomain + ": " + targetPapers.Count);
            Console.WriteLine("coll: " + collaborationPapers.Count + " - " + collaborationPapers.Count(x => x.Authors.Count > 1));
            Console.WriteLine("========= UNIDENTIFIED DOMAIN: " + undetectedCount);

            Console.WriteLine("Collaboration training: " + collaborationPapersTraining.Count);
            Console.WriteLine("Collaboration testing: " + collaborationPapersTesting.Count);

            string folderName = sourceDomain + " - " + targetDomain;
            Directory.CreateDirectory(folderName);

            #region Data Training
            // info.txt: one count per line, in this fixed order.
            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "info.txt")))
            {
                file.WriteLine(sourceAuthors.Count);
                file.WriteLine(targetAuthors.Count);
                file.WriteLine(sourcePapers.Count);
                file.WriteLine(targetPapers.Count);
                file.WriteLine(collaborationPapersTraining.Count);
                file.WriteLine(collaborationPapersTesting.Count);
            }

            // One "<value> <key>" line per entry of the dictionary's Author2Id map.
            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "sourceAuthorIDs.txt")))
            {
                foreach (var item in sourceAD.Author2Id)
                {
                    file.WriteLine(item.Value + " " + item.Key);
                }
            }

            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "targetAuthorIDs.txt")))
            {
                foreach (var item in targetAD.Author2Id)
                {
                    file.WriteLine(item.Value + " " + item.Key);
                }
            }

            // papers.txt and authors.txt list the same papers in the same order:
            // source papers, then target papers, then training collaboration papers.
            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "papers.txt")))
            {
                foreach (var item in sourcePapers)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }

                foreach (var item in targetPapers)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }

                foreach (var item in collaborationPapersTraining)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }
            }

            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "authors.txt")))
            {
                foreach (var item in sourcePapers)
                {
                    file.WriteLine(item.AllAuthors);
                }

                foreach (var item in targetPapers)
                {
                    file.WriteLine(item.AllAuthors);
                }

                // Only the collaboration lines carry the publication year.
                foreach (var item in collaborationPapersTraining)
                {
                    file.WriteLine(item.AllAuthors + " - " + item.Year);
                }
            }
            #endregion

            #region Data Testing
            // Every (source author, target author) combination on a testing collaboration paper.
            List<AuthorPair> authorPairs = new List<AuthorPair>();

            foreach (var item in collaborationPapersTesting)
            {
                foreach (var sourceAuth in item.SourceAuthorIDs)
                {
                    foreach (var targetAuth in item.TargetAuthorIDs)
                    {
                        authorPairs.Add(new AuthorPair(sourceAuth, targetAuth));
                    }
                }
            }

            // Per source author: the distinct, sorted target collaborators. Ordered by source author ID.
            var authorMapping = authorPairs
                .GroupBy(x => x.SourceAuthorID)
                .Select(g => new Collaborations(g.Key, g.Select(x => x.TargetAuthorID).Distinct().OrderBy(x => x).ToList()))
                .OrderBy(x => x.SourceAuthorID)
                .ToList();

            using (StreamWriter file = new StreamWriter(Path.Combine(folderName, "testData.txt")))
            {
                foreach (var item in authorMapping)
                {
                    file.WriteLine(item.SourceAuthorID + " " + string.Join(",", item.Collaborators));
                }
            }
            #endregion

            #region check intersection data
            // Source authors seen in training vs. source authors with at least 5 test collaborators.
            var collAuthorTraining = collaborationPapersTraining.SelectMany(x => x.SourceAuthorIDs).Distinct().ToList();
            var collAuthorTesting  = authorMapping.Where(x => x.Collaborators.Count() >= 5).Select(x => x.SourceAuthorID).ToList();
            var both               = collAuthorTesting.Intersect(collAuthorTraining);

            Console.WriteLine("coll author training: " + collAuthorTraining.Count);
            Console.WriteLine("coll author testing dengan kolaborator minimal 5: " + collAuthorTesting.Count);
            Console.WriteLine("intersection: " + both.Count());
            #endregion

            Console.WriteLine(Environment.NewLine);
        }
예제 #2
0
        /// <summary>
        /// Loads the authors and papers of two domains from the <c>aminer_big</c> MySQL database, prints
        /// summary counts to the console and writes <c>info.txt</c>, <c>papers.txt</c> and
        /// <c>authors.txt</c> to the current working directory.
        /// </summary>
        /// <remarks>
        /// Papers are classified by their authors: a source paper has only authors known in the source
        /// dictionary, a target paper only authors known in the target dictionary, and a collaboration
        /// paper has at least one of each.
        /// </remarks>
        /// <param name="sourceDomain">Value matched against <c>domainName</c> when loading source authors and papers.</param>
        /// <param name="targetDomain">Value matched against <c>domainName</c> when loading target authors and papers.</param>
        private static void PrepareData(string sourceDomain, string targetDomain)
        {
            // NOTE(review): credentials are hard-coded in source; move them to configuration.
            const string connectionString = "host=localhost;user=root;password=password0!;database=aminer_big;";

            // Domain names are bound as parameters instead of being concatenated into the SQL text.
            const string authorSql = "SELECT distinct(authorNamePlain) FROM aminer_big.temp_paperauthors where domainName = @domain";

            // The year column is selected here because the read loop below needs it; the previous
            // query omitted it, so reading "year" failed on the first paper row.
            const string paperSql = "SELECT paperID, category, temp_paperauthors.year, authorNamePlain, title, abstract FROM aminer_big.temp_paperauthors inner join m_topicpaperauthor on paperID = id where domainName = @sourceDomain or domainName = @targetDomain";

            AuthorDictionary sourceAD      = new AuthorDictionary();
            List<string>     sourceAuthors = new List<string>();
            AuthorDictionary targetAD      = new AuthorDictionary();
            List<string>     targetAuthors = new List<string>();
            List<Paper>      papers        = new List<Paper>();

            // One connection serves all three queries: every reader is disposed before the next
            // command runs, so the connection no longer has to be closed and reopened in between.
            using (MySqlConnection con = new MySqlConnection(connectionString))
            {
                con.Open();

                // Authors of the source domain.
                using (MySqlCommand cmd = new MySqlCommand(authorSql, con))
                {
                    cmd.Parameters.AddWithValue("@domain", sourceDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            string name = reader.GetString("authorNamePlain");
                            sourceAuthors.Add(name);
                            // Presumably registers the name in the dictionary; the returned value is not needed here.
                            sourceAD.GetAuthor(name);
                        }
                    }
                }

                Console.WriteLine(sourceDomain + " authors: " + sourceAuthors.Count);

                // Authors of the target domain.
                using (MySqlCommand cmd = new MySqlCommand(authorSql, con))
                {
                    cmd.Parameters.AddWithValue("@domain", targetDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            string name = reader.GetString("authorNamePlain");
                            targetAuthors.Add(name);
                            targetAD.GetAuthor(name);
                        }
                    }
                }

                Console.WriteLine(targetDomain + " authors: " + targetAuthors.Count);

                // Papers: the query returns one row per (paper, author).
                using (MySqlCommand cmd = new MySqlCommand(paperSql, con))
                {
                    cmd.Parameters.AddWithValue("@sourceDomain", sourceDomain);
                    cmd.Parameters.AddWithValue("@targetDomain", targetDomain);

                    using (MySqlDataReader reader = cmd.ExecuteReader())
                    {
                        // Papers are looked up by ID rather than compared with the previous row: the
                        // query has no ORDER BY, so rows of one paper are not guaranteed to be adjacent.
                        // This also removes the need for a sentinel ID (a paper with ID 0 is handled).
                        Dictionary<int, Paper> papersById = new Dictionary<int, Paper>();

                        while (reader.Read())
                        {
                            int paperID = Convert.ToInt32(reader.GetString("paperID"));

                            Paper paper;
                            if (!papersById.TryGetValue(paperID, out paper))
                            {
                                string titleAndAbstract = reader.GetString("title") + " " + reader.GetString("abstract");
                                string paperDomain      = reader.GetString("category");
                                int    year             = Convert.ToInt32(reader.GetString("year"));

                                paper = new Paper(paperID, year, paperDomain, titleAndAbstract);
                                papersById.Add(paperID, paper);
                                papers.Add(paper);
                            }

                            string author = reader.GetString("authorNamePlain");
                            paper.Authors.Add(author);

                            // -1 is treated as "author not present in this dictionary".
                            int sourceAuthorId = sourceAD.GetAuthorID(author);
                            int targetAuthorId = targetAD.GetAuthorID(author);

                            if (sourceAuthorId != -1)
                            {
                                paper.SourceAuthorIDs.Add(sourceAuthorId);
                            }

                            if (targetAuthorId != -1)
                            {
                                paper.TargetAuthorIDs.Add(targetAuthorId);
                            }

                            if (sourceAuthorId == -1 && targetAuthorId == -1)
                            {
                                Console.WriteLine("==========>>>" + author);
                            }
                        }
                    }
                }
            }

            // Materialize each selection once; they are counted and iterated several times below.
            List<Paper> sourcePapers        = papers.Where(x => x.SourceAuthorIDs.Count >= 1 && x.TargetAuthorIDs.Count == 0).ToList();
            List<Paper> targetPapers        = papers.Where(x => x.SourceAuthorIDs.Count == 0 && x.TargetAuthorIDs.Count >= 1).ToList();
            List<Paper> collaborationPapers = papers.Where(x => x.SourceAuthorIDs.Count >= 1 && x.TargetAuthorIDs.Count >= 1).ToList();
            int         undetectedCount     = papers.Count(x => x.SourceAuthorIDs.Count == 0 && x.TargetAuthorIDs.Count == 0);

            Console.WriteLine("all: " + papers.Count);
            Console.WriteLine(sourceDomain + ": " + sourcePapers.Count);
            Console.WriteLine(targetDomain + ": " + targetPapers.Count);
            Console.WriteLine("coll: " + collaborationPapers.Count + " - " + collaborationPapers.Count(x => x.Authors.Count > 1));
            Console.WriteLine("========= UNIDENTIFIED DOMAIN: " + undetectedCount);

            // info.txt: one count per line, in this fixed order.
            using (StreamWriter file = new StreamWriter("info.txt"))
            {
                file.WriteLine(sourceAuthors.Count);
                file.WriteLine(targetAuthors.Count);
                file.WriteLine(sourcePapers.Count);
                file.WriteLine(targetPapers.Count);
                file.WriteLine(collaborationPapers.Count);
            }

            // papers.txt and authors.txt list the same papers in the same order:
            // source papers, then target papers, then collaboration papers.
            using (StreamWriter file = new StreamWriter("papers.txt"))
            {
                foreach (var item in sourcePapers)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }

                foreach (var item in targetPapers)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }

                foreach (var item in collaborationPapers)
                {
                    file.WriteLine(item.TitleAndAbstract);
                }
            }

            using (StreamWriter file = new StreamWriter("authors.txt"))
            {
                foreach (var item in sourcePapers)
                {
                    file.WriteLine(item.AllAuthors);
                }

                foreach (var item in targetPapers)
                {
                    file.WriteLine(item.AllAuthors);
                }

                foreach (var item in collaborationPapers)
                {
                    file.WriteLine(item.AllAuthors);
                }
            }
        }