/// <summary>
/// Downloads, one after another, the zip archive of every repository returned by
/// <c>DBUtils.GetDownloadURLList</c> whose position lies in the half-open range
/// [startIndex, endIndex).
/// </summary>
/// <param name="inputStrFirst">First raw range argument; forwarded to <c>GetIndex</c>.</param>
/// <param name="InputStrSecond">Second raw range argument; forwarded to <c>GetIndex</c>.</param>
/// <remarks>
/// The method returns without downloading anything when <c>GetIndex</c> yields a negative
/// start index, or when the URL list and the repository-name list differ in length.
/// A non-positive end index means "up to the end of the list"; an end index larger than
/// the list is clamped to the list length.
/// Downloads run sequentially; <c>DownloadFileTaskMethod</c> is the work-item callback of a
/// thread-pool variant that is not wired in here.
/// </remarks>
public static void DownloadGitCodeZip(string inputStrFirst, string InputStrSecond)
{
    int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
    int startIndex = indexData[0];
    int endIndex = indexData[1];
    if (startIndex < 0)
    {
        return;
    }

    // Time only the database lookup so slow list retrieval is visible on the console.
    DateTime startTime = DateTime.Now;
    List<string> repositoryNameList;
    List<string> downloadURLList = DBUtils.GetDownloadURLList(out repositoryNameList);
    int timeCost = (int)(DateTime.Now - startTime).TotalSeconds;
    Console.WriteLine("Get Name List Time cost: " + timeCost + "s");

    // The two lists are indexed in parallel below, so they must have the same length.
    if (downloadURLList.Count != repositoryNameList.Count)
    {
        Console.WriteLine("downloadURLList.Count: \"" + downloadURLList.Count + "\" is not equal to repositoryNameList.Count: \"" + repositoryNameList.Count + "\", return!!!");
        return;
    }

    Console.WriteLine("Data Line Count: " + downloadURLList.Count);

    // Non-positive end index selects everything up to the end of the list. A requested
    // end beyond the list is clamped so the loop can never index out of range.
    if (endIndex <= 0 || endIndex > downloadURLList.Count)
    {
        endIndex = downloadURLList.Count;
    }

    for (int i = startIndex; i < endIndex; i++)
    {
        // Path separators in the repository name are replaced so the name is a single file name.
        string zipName = repositoryNameList[i].Replace("/", "_").Replace("\\", "_") + ".zip";
        string fileName = CrawlerClass.HttpDownloadFile(downloadURLList[i], Configuration.DownloadZipDir, false, false, zipName);

        // Message text is kept unchanged; note it is printed after HttpDownloadFile has returned.
        Console.WriteLine("Run " + i + " line fileName: " + fileName + " will be downloaded");
    }

    Console.WriteLine("Download Git Code Zip All End*****************************");
}
/// <summary>
/// Callback that downloads a single repository archive into
/// <c>Configuration.DownloadZipDir</c> and reports it on the console.
/// </summary>
/// <param name="gitCodeZipInfoObj">
/// A <c>GitCodeZipInfoClass</c> instance passed as <c>Object</c>; its <c>downloadURL</c>
/// and <c>repositoryName</c> members supply the source URL and the target name.
/// </param>
private static void DownloadFileTaskMethod(Object gitCodeZipInfoObj)
{
    // Cast once up front instead of repeating the cast at every member access.
    GitCodeZipInfoClass zipInfo = (GitCodeZipInfoClass)gitCodeZipInfoObj;

    CrawlerClass.HttpDownloadFile(
        zipInfo.downloadURL,
        Configuration.DownloadZipDir,
        false,
        true,
        zipInfo.repositoryName);

    Console.WriteLine(zipInfo.repositoryName + " will be downloaded");
}
/// <summary>
/// Reads the tab-separated repository list from <c>Configuration.URLFile</c>, crawls every
/// repository page that is not already stored, resolves its zip download URL, downloads and
/// extracts the archive when that URL has not been seen before, and stores the resulting
/// records through <c>DBUtils.StoreDataToDBGitDataPart</c> in batches of
/// <c>Configuration.DBInsertCountEveryTime</c>.
/// </summary>
/// <param name="inputStrFirst">Not used by this method.</param>
/// <param name="InputStrSecond">Not used by this method.</param>
/// <remarks>
/// Each input line is expected to contain, separated by tabs: repository path, impression
/// count, click count. Lines with fewer than three fields or with non-integer counts are
/// logged and skipped instead of aborting the run (which would also drop the batch that has
/// not been flushed yet). Crawl failures are logged and skipped as well.
/// </remarks>
public static void CrawlAndStoreGitData(string inputStrFirst, string InputStrSecond)
{
    List<string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
    List<DBGitDataModel> insertData = new List<DBGitDataModel>();
    Dictionary<string, DBGitDataModel> existDownLoadURLData;
    HashSet<string> existPKData = DBUtils.GetExistZipData(out existDownLoadURLData);

    for (int i = 0; i < inputList.Count; i++)
    {
        string[] eles = inputList[i].Split(new char[] { '\t' });
        string repositoryPath = eles[0];
        Console.WriteLine(repositoryPath);

        // Repositories that are already stored need no further work.
        if (existPKData.Contains(repositoryPath))
        {
            continue;
        }

        // Validate the line before using it: a blank or truncated line, or a count that is
        // not an integer, must not take down the whole run.
        int impressionCount;
        int clickCount;
        if (eles.Length < 3
            || !int.TryParse(eles[1], out impressionCount)
            || !int.TryParse(eles[2], out clickCount))
        {
            Logger.WriteLog("error: malformed input line " + i + " \"" + inputList[i] + "\"");
            continue;
        }

        // statusCode and header are required by the Crawl signature but are not used here.
        HttpStatusCode statusCode;
        Dictionary<string, string> header;
        string htmlContent;
        try
        {
            htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
        }
        catch (Exception e)
        {
            Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
            continue;
        }

        // No download link could be resolved from the page: nothing to store for this line.
        string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
        if (string.IsNullOrEmpty(downloadRelativeURL))
        {
            continue;
        }

        string downloadURL = Configuration.RootURL + downloadRelativeURL;
        string fileName;
        string dirName;
        DBGitDataModel knownModel;
        if (existDownLoadURLData.TryGetValue(downloadURL, out knownModel))
        {
            // Same archive was already fetched for another record: reuse its file and directory.
            fileName = knownModel.fileName;
            dirName = knownModel.dirName;
        }
        else
        {
            fileName = CrawlerClass.HttpDownloadFile(downloadURL, Configuration.DownloadZipDir);
            dirName = FileUtils.ZipExtractToDirectory(Path.Combine(Configuration.DownloadZipDir, fileName), Configuration.ZipExtractDir);

            // An empty directory name is treated as "extraction produced nothing"; skip the record.
            if (dirName == "")
            {
                continue;
            }
        }

        insertData.Add(new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName));

        // Flush a full batch and start a fresh list (a new instance, in case the store
        // call keeps a reference to the one it was given).
        if (insertData.Count == Configuration.DBInsertCountEveryTime)
        {
            DBUtils.StoreDataToDBGitDataPart(insertData);
            insertData = new List<DBGitDataModel>();
            Console.WriteLine("Store Data To DBGitData Part End!!!");
        }
    }

    // Flush whatever is left over from the last, partial batch.
    if (insertData.Count > 0)
    {
        DBUtils.StoreDataToDBGitDataPart(insertData);
    }
}