Example #1
        public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string inputStrSecond)
        {
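            // Resolve the start/end line range to crawl from the two index arguments.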
            int[] indexData = GetIndex(inputStrFirst, inputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            if (startIndex < 0)
            {
                return;
            }
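            // When no explicit start index is given, resume crawling behind the last record already stored in the DB.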
            DateTime startTime;
            bool     start         = startIndex > 0;
            string   existLastData = DBUtils.getExistLastData();

            if (existLastData == "")
            {
                Console.WriteLine("get the exist last data in DB error!!!");
                Logger.WriteLog("get the exist last data in DB error!!!");
                return;
            }
            List <string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
            List <DBCrawlerGitDetailDataModel> insertData = new List <DBCrawlerGitDetailDataModel>();

            Console.WriteLine("Lines Count: " + inputList.Count);
            endIndex = endIndex > 0 ? endIndex : inputList.Count;
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            //HashSet<string> existPKData = DBUtils.GetExistData();
            startTime = DateTime.Now;
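            // Crawl each repository line, parse its detail page, and collect rows for batched DB inserts.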
            for (int i = startIndex; i < endIndex; i++)
            {
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                if (!start)
                {
                    if (repositoryPath == existLastData)
                    {
                        start = true;
                        Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                        Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
                    }
                    continue;
                }
                //Console.WriteLine(repositoryPath);
                //if (existPKData.Contains(repositoryPath))
                //{
                //    continue;
                //}
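                // Derive the readme file prefix from the repository path and read the counters from the input line.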
                string         readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1, repositoryPath.Length - Configuration.RootURL.Length - 1).Replace("/", "_");
                int            impressionCount  = Convert.ToInt32(eles[1]);
                int            clickCount       = Convert.ToInt32(eles[2]);
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
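                // Build the absolute download URL when the page exposes a relative one.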
                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                string downloadURL         = "";
                if (downloadRelativeURL != null && downloadRelativeURL != "")
                {
                    downloadURL = Configuration.RootURL + downloadRelativeURL;
                }
                string        readmeSuffixName;
                string        repositoryContent;
                List <string> topicsList;
                string        readmeFileContent;
                try
                {
                    repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
                    topicsList        = HtmlResolve.getTopicsList(htmlContent);
                    readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("getReadmeContent error: " + e.Message);
                    Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
                    continue;
                }
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                if (readmeFileName == "")
                {
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                }
                DBCrawlerGitDetailDataModel model = new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName);

                insertData.Add(model);
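                // Flush the batch once it reaches the configured insert size and log the elapsed time.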
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                {
                    DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
                    insertData = new List <DBCrawlerGitDetailDataModel>();
                    Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
                    Console.WriteLine("Line index: " + i);
                    DateTime now      = DateTime.Now;
                    int      timeCost = (int)(now - startTime).TotalSeconds;
                    Console.WriteLine("Time cost: " + timeCost + "s");
                    startTime = now;
                }
            }
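            // Store any rows left over from the last, incomplete batch.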
            if (insertData.Count > 0)
            {
                DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
            }
        }
Example #2
        public static void UpdateDBReadmeName(string inputStrFirst, string inputStrSecond)
        {
            int[] indexData = GetIndex(inputStrFirst, inputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            if (startIndex < 0)
            {
                return;
            }
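            // Load all repositories whose readme entry in the DB is still empty.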
            DateTime      startTime           = DateTime.Now;
            List <string> emptyReadmeDataList = DBUtils.GetEmptyReadmeData();
            DateTime      now                 = DateTime.Now;
            int           timeCost            = (int)(now - startTime).TotalSeconds;

            startTime = now;
            Console.WriteLine("Get Empty Readme Data Time cost: " + timeCost + "s");
            Console.WriteLine("Empty Readme Data Line Count: " + emptyReadmeDataList.Count);
            endIndex = endIndex > 0 ? endIndex : emptyReadmeDataList.Count;
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            //HashSet<string> existPKData = DBUtils.GetExistData();
            int updateCount = 0;

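            // Re-crawl each repository and try to recover and store its readme file.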
            for (int i = startIndex; i < endIndex; i++)
            {
                string         repositoryPath   = emptyReadmeDataList[i];
                string         readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1, repositoryPath.Length - Configuration.RootURL.Length - 1).Replace("/", "_");
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                if (i % Configuration.DBInsertCountEveryTime == 0)
                {
                    Console.WriteLine("Line index: " + i + "\t" + "updateCount: " + updateCount);
                    updateCount = 0;
                }
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
                string readmeSuffixName;
                string readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                if (readmeFileContent == null || readmeFileContent.Length < 1)
                {
                    Logger.WriteLog("\"" + repositoryPath + "\" crawler readmeFileContent is empty!!!");
                    continue;
                }
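                // Save the readme to disk and point the DB row at the new file name.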
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                if (readmeFileName == "")
                {
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                }
                else
                {
                    DBUtils.UpdateEmptyReadmeData(repositoryPath, readmeFileName);
                    updateCount++;
                }
            }
        }
Example #3
        public static void CrawlAndStoreGitData(string inputStrFirst, string inputStrSecond)
        {
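            // Load the URL list and the zip data already in the DB so known repositories can be skipped.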
            List <string>         inputList  = FileUtils.ReadFileLine(Configuration.URLFile);
            List <DBGitDataModel> insertData = new List <DBGitDataModel>();
            Dictionary <string, DBGitDataModel> existDownLoadURLData;
            HashSet <string> existPKData = DBUtils.GetExistZipData(out existDownLoadURLData);

            for (int i = 0; i < inputList.Count; i++)
            {
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                Console.WriteLine(repositoryPath);
                if (existPKData.Contains(repositoryPath))
                {
                    continue;
                }
                int            impressionCount = Convert.ToInt32(eles[1]);
                int            clickCount      = Convert.ToInt32(eles[2]);
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                if (downloadRelativeURL == null || downloadRelativeURL == "")
                {
                    continue;
                }
                string downloadURL = Configuration.RootURL + downloadRelativeURL;
                string fileName;
                string dirName;
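                // Reuse an already downloaded archive when the download URL is known; otherwise download and extract it.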
                if (existDownLoadURLData.ContainsKey(downloadURL))
                {
                    DBGitDataModel modelTemp = existDownLoadURLData[downloadURL];
                    fileName = modelTemp.fileName;
                    dirName  = modelTemp.dirName;
                }
                else
                {
                    fileName = CrawlerClass.HttpDownloadFile(downloadURL, Configuration.DownloadZipDir);
                    dirName  = FileUtils.ZipExtractToDirectory(Path.Combine(Configuration.DownloadZipDir, fileName), Configuration.ZipExtractDir);
                    if (dirName == "")
                    {
                        continue;
                    }
                }
                DBGitDataModel model = new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName);
                insertData.Add(model);
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                {
                    DBUtils.StoreDataToDBGitDataPart(insertData);
                    insertData = new List <DBGitDataModel>();
                    Console.WriteLine("Store Data To DBGitData Part End!!!");
                }
            }
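            // Flush the last, incomplete batch.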
            if (insertData.Count > 0)
            {
                DBUtils.StoreDataToDBGitDataPart(insertData);
            }
        }