/// <summary>
/// Inserts the rows of <paramref name="inputList"/> that lie in the half-open range
/// [<paramref name="startIndex"/>, <paramref name="endIndex"/>) using one multi-row INSERT.
/// When the statement is reported as failed and the range holds more than one row, the
/// range is halved and each half is retried separately, so a bad row only takes down
/// the smallest sub-range that contains it.
/// </summary>
/// <param name="inputList">Rows to insert.</param>
/// <param name="startIndex">Index of the first row to insert (inclusive).</param>
/// <param name="endIndex">Index one past the last row to insert (exclusive).</param>
private static void StoreDataToDBGitDataPart(List<DBGitDataModel> inputList, int startIndex, int endIndex)
{
    if (startIndex >= endIndex)
    {
        return;
    }

    // Build one VALUES tuple per row of the requested range only. Iterating the whole
    // list here would make every recursive retry resend the same failing statement.
    // NOTE(review): values are concatenated into the SQL text; this relies on the model
    // fields already being safe to embed between single quotes - confirm.
    List<string> valueTuples = new List<string>(endIndex - startIndex);
    for (int i = startIndex; i < endIndex; i++)
    {
        DBGitDataModel model = inputList[i];
        valueTuples.Add(
            "('" + model.repositoryPath
            + "','" + model.downloadURL
            + "','" + model.impressionCount
            + "','" + model.clickCount
            + "','" + model.fileName
            + "','" + model.dirName + "')");
    }

    string dbInsertSql =
        "INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName
        + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[fileName],[dirName])VALUES"
        + string.Join(",", valueTuples.ToArray());

    if (ExecuteByText(dbInsertSql))
    {
        return;
    }

    // A failed single-row range cannot be split further; that row is given up on.
    if (startIndex + 1 < endIndex)
    {
        // Written as start + half-width so the midpoint cannot overflow int.
        int midIndex = startIndex + (endIndex - startIndex) / 2;
        StoreDataToDBGitDataPart(inputList, startIndex, midIndex);
        StoreDataToDBGitDataPart(inputList, midIndex, endIndex);
    }
}
/// <summary>
/// Opens a new SQL connection (stored in <c>Connection</c>), inserts every row of
/// <paramref name="inputList"/> with multi-row INSERT statements of at most
/// <c>Configuration.DBExecuteCountEveryTime</c> rows each, and closes the connection.
/// The result of each <c>ExecuteByText</c> call is ignored, so a failed batch is not retried.
/// </summary>
/// <param name="inputList">Rows to insert.</param>
public static void StoreDataToDBAll(List<DBGitDataModel> inputList)
{
    Connection = new SqlConnection("Server=" + Configuration.DBserver + ";DataBase=" + Configuration.DBName + ";uid=" + Configuration.DBUID + ";pwd=" + Configuration.DBPWD);
    try
    {
        Connection.Open();

        string insertPrefix =
            "INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName
            + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[fileName],[dirName])VALUES";

        // Tuples of the batch currently being assembled; joined once per batch instead
        // of growing one string row by row.
        // NOTE(review): values are concatenated into the SQL text; this relies on the model
        // fields already being safe to embed between single quotes - confirm.
        List<string> valueTuples = new List<string>();
        foreach (DBGitDataModel model in inputList)
        {
            valueTuples.Add(
                "('" + model.repositoryPath
                + "','" + model.downloadURL
                + "','" + model.impressionCount
                + "','" + model.clickCount
                + "','" + model.fileName
                + "','" + model.dirName + "')");

            if (valueTuples.Count == Configuration.DBExecuteCountEveryTime)
            {
                ExecuteByText(insertPrefix + string.Join(",", valueTuples.ToArray()));
                valueTuples.Clear();
            }
        }

        // Flush the last, partially filled batch.
        if (valueTuples.Count > 0)
        {
            ExecuteByText(insertPrefix + string.Join(",", valueTuples.ToArray()));
        }
    }
    finally
    {
        // Release the connection even when a batch throws.
        Connection.Close();
    }
}
/// <summary>
/// Cross-checks the files under <c>Configuration.DownloadZipDir</c> and the folders under
/// <c>Configuration.ZipExtractDir</c> against the records returned by
/// <c>DBUtils.GetExistZipData</c>, then prints four lists to the console: local files and
/// local directories that no record refers to, and record file names and directory names
/// that were not found locally.
/// </summary>
/// <param name="inputStrFirst">Not used by this method.</param>
/// <param name="InputStrSecond">Not used by this method.</param>
public static void CheckZipFile(string inputStrFirst, string InputStrSecond)
{
    HashSet<string> unmatchedLocalFiles = FileUtils.TraverseFile(Configuration.DownloadZipDir);
    HashSet<string> unmatchedLocalDirs = FileUtils.TraverseFolder(Configuration.ZipExtractDir);

    Dictionary<string, DBGitDataModel> recordsByUrl;
    DBUtils.GetExistZipData(out recordsByUrl);

    List<string> filesOnlyInDb = new List<string>();
    List<string> dirsOnlyInDb = new List<string>();

    foreach (DBGitDataModel record in recordsByUrl.Values)
    {
        // Remove reports whether the entry was present, so a miss means the record's
        // name has no (remaining) local counterpart. Each local entry is consumed by
        // the first record that matches it.
        if (!unmatchedLocalFiles.Remove(record.fileName))
        {
            filesOnlyInDb.Add(record.fileName);
        }

        if (!unmatchedLocalDirs.Remove(record.dirName))
        {
            dirsOnlyInDb.Add(record.dirName);
        }
    }

    // Heading / content pairs, printed in this fixed order.
    string[] headings =
    {
        "Extra local file: ",
        "Extra local directory: ",
        "Extra DB file: ",
        "Extra DB directory: "
    };
    IEnumerable<string>[] sections =
    {
        unmatchedLocalFiles,
        unmatchedLocalDirs,
        filesOnlyInDb,
        dirsOnlyInDb
    };

    for (int section = 0; section < headings.Length; section++)
    {
        Console.WriteLine(headings[section]);
        foreach (string entry in sections[section])
        {
            Console.WriteLine(entry);
        }
    }
}
/// <summary>
/// Reads every row of the configured table (ordered by <c>id</c>) and returns the set of
/// <c>repositoryPath</c> values found. Also fills <paramref name="existDownLoadURLData"/>
/// with one model per distinct <c>downloadURL</c>; when several rows share a URL the first
/// row read wins.
/// </summary>
/// <param name="existDownLoadURLData">Receives the download-URL to model map; never null.</param>
/// <returns>
/// The repository paths read. If the query or a row conversion throws, the exception message
/// is written to the console and whatever was collected up to that point is returned.
/// </returns>
public static HashSet<string> GetExistZipData(out Dictionary<string, DBGitDataModel> existDownLoadURLData)
{
    existDownLoadURLData = new Dictionary<string, DBGitDataModel>();
    HashSet<string> pkNameSet = new HashSet<string>();

    Connection = new SqlConnection("Server=" + Configuration.DBserver + ";DataBase=" + Configuration.DBName + ";uid=" + Configuration.DBUID + ";pwd=" + Configuration.DBPWD);
    // Opened outside the try block on purpose: a connection failure propagates to the caller.
    Connection.Open();

    string dbQuerySql = "SELECT * FROM [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "] order by id";
    try
    {
        // Dispose the command and the reader deterministically instead of leaving them
        // to the garbage collector.
        using (SqlCommand cmd = new SqlCommand(dbQuerySql, Connection))
        {
            cmd.CommandType = CommandType.Text;
            using (SqlDataReader reader = cmd.ExecuteReader())
            {
                while (reader.Read())
                {
                    string repositoryPath = reader["repositoryPath"].ToString();
                    string downloadURL = reader["downloadURL"].ToString();
                    int impressionCount = Convert.ToInt32(reader["impressionCount"].ToString());
                    int clickCount = Convert.ToInt32(reader["clickCount"].ToString());
                    string fileName = reader["fileName"].ToString();
                    string dirName = reader["dirName"].ToString();

                    pkNameSet.Add(repositoryPath);

                    if (!existDownLoadURLData.ContainsKey(downloadURL))
                    {
                        existDownLoadURLData.Add(
                            downloadURL,
                            new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName));
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and fall through with the partial result.
        Console.WriteLine(ex.Message);
    }
    finally
    {
        Connection.Close();
    }

    return pkNameSet;
}
/// <summary>
/// Creates a detail record. The string arguments <paramref name="repositoryPath"/>,
/// <paramref name="downloadURL"/> and <paramref name="repositoryContent"/>, as well as every
/// entry of <paramref name="topicsList"/>, are passed through
/// <c>DBGitDataModel.CheckDBStringData</c> before being stored;
/// <paramref name="readmeFileName"/> and the two counters are stored as given.
/// </summary>
/// <param name="repositoryPath">Repository path.</param>
/// <param name="downloadURL">Download URL.</param>
/// <param name="impressionCount">Impression count.</param>
/// <param name="clickCount">Click count.</param>
/// <param name="repositoryContent">Repository content text.</param>
/// <param name="topicsList">
/// Topic strings; may be null. The list instance is kept as-is and its entries are
/// replaced in place, so the caller's list is modified.
/// </param>
/// <param name="readmeFileName">Readme file name.</param>
public DBCrawlerGitDetailDataModel(
    string repositoryPath,
    string downloadURL,
    int impressionCount,
    int clickCount,
    string repositoryContent,
    List<string> topicsList,
    string readmeFileName)
{
    // Plain copies first.
    this.impressionCount = impressionCount;
    this.clickCount = clickCount;
    this.readmeFileName = readmeFileName;

    // Checked string fields.
    this.repositoryPath = DBGitDataModel.CheckDBStringData(repositoryPath);
    this.downloadURL = DBGitDataModel.CheckDBStringData(downloadURL);
    this.repositoryContent = DBGitDataModel.CheckDBStringData(repositoryContent);

    this.topicsList = topicsList;
    if (this.topicsList == null)
    {
        return;
    }

    // Overwrite each topic with its checked form, in place.
    for (int topicIndex = 0; topicIndex < this.topicsList.Count; topicIndex++)
    {
        string rawTopic = this.topicsList[topicIndex];
        this.topicsList[topicIndex] = DBGitDataModel.CheckDBStringData(rawTopic);
    }
}
/// <summary>
/// Reads tab-separated lines (repository path, impression count, click count) from
/// <c>Configuration.URLFile</c>, and for every repository path not already returned by
/// <c>DBUtils.GetExistZipData</c>: crawls the page, resolves its download URL, downloads and
/// extracts the archive (or reuses the file/directory of an existing record with the same
/// download URL), and queues a row for insertion. Rows are written through
/// <c>DBUtils.StoreDataToDBGitDataPart</c> in batches of
/// <c>Configuration.DBInsertCountEveryTime</c>.
/// A line that is malformed, or whose crawl, download or extraction fails, is logged and
/// skipped; it does not abort the run.
/// </summary>
/// <param name="inputStrFirst">Not used by this method.</param>
/// <param name="InputStrSecond">Not used by this method.</param>
public static void CrawlAndStoreGitData(string inputStrFirst, string InputStrSecond)
{
    List<string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
    List<DBGitDataModel> insertData = new List<DBGitDataModel>();
    Dictionary<string, DBGitDataModel> existDownLoadURLData;
    HashSet<string> existPKData = DBUtils.GetExistZipData(out existDownLoadURLData);

    for (int i = 0; i < inputList.Count; i++)
    {
        string[] eles = inputList[i].Split(new char[] { '\t' });
        string repositoryPath = eles[0];
        Console.WriteLine(repositoryPath);
        if (existPKData.Contains(repositoryPath))
        {
            continue;
        }

        // Validate the two counter columns instead of letting a short or non-numeric
        // line throw and take the whole run (and the pending batch) down with it.
        int impressionCount;
        int clickCount;
        if (eles.Length < 3
            || !int.TryParse(eles[1], out impressionCount)
            || !int.TryParse(eles[2], out clickCount))
        {
            Logger.WriteLog("error: malformed input line \"" + inputList[i] + "\"");
            continue;
        }

        HttpStatusCode statusCode;
        Dictionary<string, string> header;
        string htmlContent;
        try
        {
            htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
        }
        catch (Exception e)
        {
            Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
            continue;
        }

        string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
        if (string.IsNullOrEmpty(downloadRelativeURL))
        {
            continue;
        }

        string downloadURL = Configuration.RootURL + downloadRelativeURL;
        string fileName;
        string dirName;
        DBGitDataModel modelTemp;
        if (existDownLoadURLData.TryGetValue(downloadURL, out modelTemp))
        {
            // Same archive already recorded: reuse its file and directory.
            fileName = modelTemp.fileName;
            dirName = modelTemp.dirName;
        }
        else
        {
            try
            {
                fileName = CrawlerClass.HttpDownloadFile(downloadURL, Configuration.DownloadZipDir);
                dirName = FileUtils.ZipExtractToDirectory(Path.Combine(Configuration.DownloadZipDir, fileName), Configuration.ZipExtractDir);
            }
            catch (Exception e)
            {
                // Treated like a crawl failure: log and move on to the next line.
                Logger.WriteLog("error: download \"" + downloadURL + "\" " + e.Message);
                continue;
            }

            if (dirName == "")
            {
                continue;
            }
        }

        insertData.Add(new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName));
        if (insertData.Count == Configuration.DBInsertCountEveryTime)
        {
            DBUtils.StoreDataToDBGitDataPart(insertData);
            insertData = new List<DBGitDataModel>();
            Console.WriteLine("Store Data To DBGitData Part End!!!");
        }
    }

    // Flush the last, partially filled batch.
    if (insertData.Count > 0)
    {
        DBUtils.StoreDataToDBGitDataPart(insertData);
    }
}