/// <summary>
/// Inserts the models in the half-open range [startIndex, endIndex) of <paramref name="inputList"/>
/// into the table named by <c>Configuration.DBName</c> / <c>Configuration.TableName</c> using one
/// multi-row INSERT statement.
/// </summary>
/// <remarks>
/// When <c>ExecuteByText</c> reports failure for a range holding more than one row, the range is
/// split in half and each half is retried recursively, so a single bad row does not prevent the
/// remaining rows from being stored. A range of exactly one row that fails is given up on.
/// </remarks>
/// <param name="inputList">Models to store; a null list is treated as empty.</param>
/// <param name="startIndex">Index of the first model to insert (inclusive).</param>
/// <param name="endIndex">Index one past the last model to insert (exclusive).</param>
private static void StoreDataToDBCrawlerGitDetailDataPart(ref List<DBCrawlerGitDetailDataModel> inputList, int startIndex, int endIndex)
{
    if (inputList == null || startIndex >= endIndex)
    {
        return;
    }

    string insertHeader = "INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[repositoryContent],[topicsList],[readmeFileName])VALUES";
    System.Text.StringBuilder sql = new System.Text.StringBuilder(insertHeader);

    for (int i = startIndex; i < endIndex; i++)
    {
        DBCrawlerGitDetailDataModel model = inputList[i];

        // Topics are stored as a single ';'-separated string; a missing list becomes "".
        string topicsListStr = model.topicsList == null ? "" : string.Join(";", model.topicsList);

        if (i > startIndex)
        {
            sql.Append(',');
        }

        // Every text value is escaped so an embedded apostrophe cannot terminate the literal early.
        sql.Append("('").Append(EscapeSqlStringLiteral(model.repositoryPath))
           .Append("','").Append(EscapeSqlStringLiteral(model.downloadURL))
           .Append("','").Append(model.impressionCount)
           .Append("','").Append(model.clickCount)
           .Append("','").Append(EscapeSqlStringLiteral(model.repositoryContent))
           .Append("','").Append(EscapeSqlStringLiteral(topicsListStr))
           .Append("','").Append(EscapeSqlStringLiteral(model.readmeFileName))
           .Append("')");
    }

    bool success = ExecuteByText(sql.ToString());

    // Bisect on failure to isolate the offending row(s); a failing single-row range is dropped.
    if (!success && endIndex - startIndex > 1)
    {
        int midIndex = startIndex + (endIndex - startIndex) / 2;
        StoreDataToDBCrawlerGitDetailDataPart(ref inputList, startIndex, midIndex);
        StoreDataToDBCrawlerGitDetailDataPart(ref inputList, midIndex, endIndex);
    }
}

/// <summary>
/// Makes <paramref name="value"/> safe to embed between single quotes in a SQL string literal
/// by doubling every single quote. Null is rendered as the empty string.
/// </summary>
private static string EscapeSqlStringLiteral(string value)
{
    return value == null ? "" : value.Replace("'", "''");
}
/// <summary>
/// Crawls the repository pages listed in <c>Configuration.URLFile</c> and stores the extracted
/// details in the database in batches of <c>Configuration.DBInsertCountEveryTime</c> rows.
/// </summary>
/// <remarks>
/// Each input line is expected to be tab-separated: repository URL, impression count, click count.
/// The line range comes from <c>GetIndex</c>; a negative start index aborts, and a non-positive or
/// too-large end index means "up to the end of the file". When the start index is 0, lines are
/// skipped until the one whose repository URL equals the value returned by
/// <c>DBUtils.getExistLastData()</c>; crawling resumes with the line after it.
/// Lines that are malformed, fail to crawl, or fail to parse are logged and skipped.
/// </remarks>
/// <param name="inputStrFirst">First raw range argument, forwarded to <c>GetIndex</c>.</param>
/// <param name="InputStrSecond">Second raw range argument, forwarded to <c>GetIndex</c>.</param>
public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string InputStrSecond)
{
    int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
    int startIndex = indexData[0], endIndex = indexData[1];
    if (startIndex < 0)
    {
        return;
    }

    // A positive start index begins crawling immediately; otherwise wait for the resume marker.
    bool start = startIndex > 0;
    string existLastData = DBUtils.getExistLastData();
    if (existLastData == "")
    {
        Console.WriteLine("get the exist last data in DB error!!!");
        Logger.WriteLog("get the exist last data in DB error!!!");
        return;
    }

    List<string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
    List<DBCrawlerGitDetailDataModel> insertData = new List<DBCrawlerGitDetailDataModel>();
    Console.WriteLine("Lines Count: " + inputList.Count);

    // Clamp the end so an oversized request cannot index past the file and abort the run
    // before the final batch is flushed.
    if (endIndex <= 0 || endIndex > inputList.Count)
    {
        endIndex = inputList.Count;
    }
    Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");

    // NOTE: filtering of already-stored rows via DBUtils.GetExistData() is currently disabled.
    DateTime startTime = DateTime.Now;
    for (int i = startIndex; i < endIndex; i++)
    {
        string[] eles = inputList[i].Split(new char[] { '\t' });
        string repositoryPath = eles[0];

        if (!start)
        {
            if (repositoryPath == existLastData)
            {
                start = true;
                Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
            }
            // The marker line itself is already stored, so it is skipped as well.
            continue;
        }

        // Validate the line up front: one bad line must not throw out of the loop and
        // discard the rows already buffered in insertData.
        int impressionCount;
        int clickCount;
        if (eles.Length < 3 || !int.TryParse(eles[1], out impressionCount) || !int.TryParse(eles[2], out clickCount))
        {
            Logger.WriteLog("error: malformed input at line index " + i + ": \"" + inputList[i] + "\"");
            continue;
        }

        int prefixLength = Configuration.RootURL.Length + 1;
        if (repositoryPath.Length < prefixLength)
        {
            Logger.WriteLog("error: repositoryPath \"" + repositoryPath + "\" is shorter than the root URL, line index " + i);
            continue;
        }

        // README file name prefix: the path below the root URL with '/' replaced by '_'.
        string readmePrefixName = repositoryPath.Substring(prefixLength).Replace("/", "_");

        HttpStatusCode statusCode;
        Dictionary<string, string> header;
        string htmlContent;
        try
        {
            htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
        }
        catch (Exception e)
        {
            Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
            continue;
        }

        string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
        string downloadURL = "";
        if (!string.IsNullOrEmpty(downloadRelativeURL))
        {
            downloadURL = Configuration.RootURL + downloadRelativeURL;
        }

        string readmeSuffixName;
        string repositoryContent;
        List<string> topicsList;
        string readmeFileContent;
        try
        {
            repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
            topicsList = HtmlResolve.getTopicsList(htmlContent);
            readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
        }
        catch (Exception e)
        {
            Logger.WriteLog("getReadmeContent error: " + e.Message);
            Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
            continue;
        }

        // A failed README save is logged but the row is still stored (with an empty file name).
        string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
        if (readmeFileName == "")
        {
            Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
        }

        insertData.Add(new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName));

        // Flush a full batch and report how long it took to collect.
        if (insertData.Count == Configuration.DBInsertCountEveryTime)
        {
            DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
            insertData = new List<DBCrawlerGitDetailDataModel>();
            Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
            Console.WriteLine("Line index: " + i);
            DateTime now = DateTime.Now;
            int timeCost = (int)(now - startTime).TotalSeconds;
            Console.WriteLine("Time cost: " + timeCost + "s");
            startTime = now;
        }
    }

    // Flush whatever is left in the last, partially filled batch.
    if (insertData.Count > 0)
    {
        DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
    }
}