/// <summary>
/// Re-crawls the repositories returned by <c>DBUtils.GetEmptyReadmeData</c> and, for every
/// readme that can be extracted and saved to disk, records the saved file name through
/// <c>DBUtils.UpdateEmptyReadmeData</c>.
/// </summary>
/// <param name="inputStrFirst">First raw index argument; forwarded to <c>GetIndex</c>.</param>
/// <param name="InputStrSecond">Second raw index argument; forwarded to <c>GetIndex</c>.</param>
/// <remarks>
/// <c>GetIndex</c> yields <c>[startIndex, endIndex]</c>. A negative start index aborts the call.
/// An end index that is not positive means "up to the end of the list"; a positive end index is
/// clamped to the list size so an over-large value cannot index past the end.
/// Failures on a single repository (bad path, crawl error, parse error, empty readme, save
/// error) are logged and that repository is skipped; they do not stop the run.
/// </remarks>
public static void UpdateDBReadmeName(string inputStrFirst, string InputStrSecond)
{
    int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
    int startIndex = indexData[0], endIndex = indexData[1];
    if (startIndex < 0)
    {
        return;
    }

    // UtcNow is used only to measure elapsed time, so it is immune to local clock/DST shifts.
    DateTime startTime = DateTime.UtcNow;
    List<string> emptyReadmeDataList = DBUtils.GetEmptyReadmeData();
    int timeCost = (int)(DateTime.UtcNow - startTime).TotalSeconds;
    Console.WriteLine("Get Empty Readme Data Time cost: " + timeCost + "s");
    Console.WriteLine("Empty Readme Data Line Count: " + emptyReadmeDataList.Count);

    // Non-positive end index => process to the end; otherwise never run past the list.
    endIndex = endIndex > 0 ? Math.Min(endIndex, emptyReadmeDataList.Count) : emptyReadmeDataList.Count;

    // NOTE(review): this message mentions Configuration.URLFile although the rows come from
    // DBUtils.GetEmptyReadmeData; the text is kept unchanged in case anything depends on it.
    Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");

    int updateCount = 0;
    for (int i = startIndex; i < endIndex; i++)
    {
        string repositoryPath = emptyReadmeDataList[i];

        // Periodic progress report; updateCount is the number of updates since the last report.
        if (i % Configuration.DBInsertCountEveryTime == 0)
        {
            Console.WriteLine("Line index: " + i + "\t" + "updateCount: " + updateCount);
            updateCount = 0;
        }

        // The readme prefix is whatever follows "<RootURL>/". A path that is not longer than
        // the root URL would make Substring throw, so skip it instead of aborting the run.
        if (repositoryPath == null || repositoryPath.Length <= Configuration.RootURL.Length)
        {
            Logger.WriteLog("error: repositoryPath \"" + repositoryPath + "\" is too short to contain a repository name after the root URL.");
            continue;
        }
        string readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1).Replace("/", "_");

        HttpStatusCode statusCode;
        Dictionary<string, string> header;
        string htmlContent;
        try
        {
            htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
        }
        catch (Exception e)
        {
            Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
            continue;
        }

        // Guarded the same way as in CrawlAndStoreCrawlerGitDetailData: a parse failure on one
        // page is logged and skipped rather than terminating the whole update.
        string readmeSuffixName;
        string readmeFileContent;
        try
        {
            readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
        }
        catch (Exception e)
        {
            Logger.WriteLog("getReadmeContent error: " + e.Message);
            Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
            continue;
        }

        if (string.IsNullOrEmpty(readmeFileContent))
        {
            Logger.WriteLog("\"" + repositoryPath + "\" crawler readmeFileContent is empty!!!");
            continue;
        }

        string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
        if (string.IsNullOrEmpty(readmeFileName))
        {
            // An empty (or missing) file name is treated as a save failure; the DB is not touched.
            Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
        }
        else
        {
            DBUtils.UpdateEmptyReadmeData(repositoryPath, readmeFileName);
            updateCount++;
        }
    }
}
/// <summary>
/// Reads repository lines from <c>Configuration.URLFile</c>, crawls each repository page,
/// extracts its download URL, description, topics and readme, and stores the results in
/// batches of <c>Configuration.DBInsertCountEveryTime</c> rows through
/// <c>DBUtils.StoreDataToDBCrawlerGitDetailDataPart</c>.
/// </summary>
/// <param name="inputStrFirst">First raw index argument; forwarded to <c>GetIndex</c>.</param>
/// <param name="InputStrSecond">Second raw index argument; forwarded to <c>GetIndex</c>.</param>
/// <remarks>
/// <c>GetIndex</c> yields <c>[startIndex, endIndex]</c>. A negative start index aborts the call.
/// An end index that is not positive means "up to the end of the file"; a positive end index is
/// clamped to the number of lines read.
/// When the start index is 0 the method resumes: lines are skipped up to and including the one
/// whose path equals <c>DBUtils.getExistLastData()</c>, and crawling begins on the next line.
/// Each input line is expected to be tab-separated: repository path, impression count, click
/// count. Malformed lines and per-repository failures are logged and skipped.
/// </remarks>
public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string InputStrSecond)
{
    int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
    int startIndex = indexData[0], endIndex = indexData[1];
    if (startIndex < 0)
    {
        return;
    }

    // A positive start index means "crawl from here"; otherwise wait for the resume marker.
    bool start = startIndex > 0;
    string existLastData = DBUtils.getExistLastData();
    if (existLastData == "")
    {
        Console.WriteLine("get the exist last data in DB error!!!");
        Logger.WriteLog("get the exist last data in DB error!!!");
        return;
    }

    List<string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
    List<DBCrawlerGitDetailDataModel> insertData = new List<DBCrawlerGitDetailDataModel>();
    Console.WriteLine("Lines Count: " + inputList.Count);

    // Non-positive end index => process to the end; otherwise never run past the file.
    endIndex = endIndex > 0 ? Math.Min(endIndex, inputList.Count) : inputList.Count;
    Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");

    // UtcNow is used only to measure elapsed time per batch.
    DateTime startTime = DateTime.UtcNow;
    for (int i = startIndex; i < endIndex; i++)
    {
        string[] eles = inputList[i].Split(new char[] { '\t' });
        string repositoryPath = eles[0];

        if (!start)
        {
            // Still looking for the last row already stored; that row itself is skipped too.
            if (repositoryPath == existLastData)
            {
                start = true;
                Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
            }
            continue;
        }

        // A line with missing or non-numeric counts used to throw and abort the run, losing
        // the rows buffered in insertData. Skip just that line instead.
        int impressionCount, clickCount;
        if (eles.Length < 3
            || !int.TryParse(eles[1], out impressionCount)
            || !int.TryParse(eles[2], out clickCount))
        {
            Logger.WriteLog("error: malformed input at line index " + i + ": \"" + inputList[i] + "\".");
            continue;
        }

        // The readme prefix is whatever follows "<RootURL>/". A path that is not longer than
        // the root URL would make Substring throw, so skip it.
        if (repositoryPath.Length <= Configuration.RootURL.Length)
        {
            Logger.WriteLog("error: repositoryPath \"" + repositoryPath + "\" is too short to contain a repository name after the root URL.");
            continue;
        }
        string readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1).Replace("/", "_");

        HttpStatusCode statusCode;
        Dictionary<string, string> header;
        string htmlContent;
        try
        {
            htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
        }
        catch (Exception e)
        {
            Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
            continue;
        }

        // The page yields a relative download URL; make it absolute when present.
        string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
        string downloadURL = "";
        if (!string.IsNullOrEmpty(downloadRelativeURL))
        {
            downloadURL = Configuration.RootURL + downloadRelativeURL;
        }

        string readmeSuffixName;
        string repositoryContent;
        List<string> topicsList;
        string readmeFileContent;
        try
        {
            repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
            topicsList = HtmlResolve.getTopicsList(htmlContent);
            readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
        }
        catch (Exception e)
        {
            Logger.WriteLog("getReadmeContent error: " + e.Message);
            Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
            continue;
        }

        // A failed save is only logged: the row is still stored, with an empty readme file name.
        string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
        if (readmeFileName == "")
        {
            Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
        }

        DBCrawlerGitDetailDataModel model = new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName);
        insertData.Add(model);

        // Flush a full batch and report how long it took to collect.
        if (insertData.Count == Configuration.DBInsertCountEveryTime)
        {
            DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
            insertData = new List<DBCrawlerGitDetailDataModel>();
            Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
            Console.WriteLine("Line index: " + i);
            DateTime now = DateTime.UtcNow;
            int timeCost = (int)(now - startTime).TotalSeconds;
            Console.WriteLine("Time cost: " + timeCost + "s");
            startTime = now;
        }
    }

    // Flush the final, partially filled batch.
    if (insertData.Count > 0)
    {
        DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
    }

    // The resume marker was never seen, so every line was skipped; make that visible.
    if (!start)
    {
        Console.WriteLine("the exist last data \"" + existLastData + "\" was not found between line " + startIndex + " and line " + endIndex + "; nothing was crawled.");
        Logger.WriteLog("the exist last data \"" + existLastData + "\" was not found between line " + startIndex + " and line " + endIndex + "; nothing was crawled.");
    }
}