Example #1
        /// <summary>
        /// Walks the entries returned by <c>DBUtils.GetEmptyReadmeData()</c>, crawls each repository page,
        /// extracts its readme via <c>HtmlResolve.getReadmeContent</c>, saves it through
        /// <c>FileUtils.SaveReadmeFile</c>, and hands the resulting file name to
        /// <c>DBUtils.UpdateEmptyReadmeData</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has lost its brace lines and apparently some keywords/statements
        /// (for example the <c>try</c> that pairs with the <c>catch</c>, and the bodies of some <c>if</c>
        /// statements). The nesting implied by the comments below is inferred and must be confirmed
        /// against the original file.
        /// </remarks>
        /// <param name="inputStrFirst">Forwarded, together with <paramref name="InputStrSecond"/>, to <c>GetIndex</c>, whose result supplies the start and end indices.</param>
        /// <param name="InputStrSecond">Forwarded, together with <paramref name="inputStrFirst"/>, to <c>GetIndex</c>.</param>
        public static void UpdateDBReadmeName(string inputStrFirst, string InputStrSecond)
            // indexData[0] is used as the start index and indexData[1] as the end index.
            int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            // NOTE(review): the statement guarded by this check is missing from the listing;
            // presumably an early exit for a negative start index — confirm.
            if (startIndex < 0)
            // Measure how long fetching the empty-readme entries takes, in whole seconds.
            DateTime      startTime           = DateTime.Now;
            List <string> emptyReadmeDataList = DBUtils.GetEmptyReadmeData();
            DateTime      now                 = DateTime.Now;
            int           timeCost            = (int)(now - startTime).TotalSeconds;

            // startTime is reassigned here but not read again in the visible code.
            startTime = now;
            Console.WriteLine("Get Empty Readme Data Time cost: " + timeCost + "s");
            Console.WriteLine("Empty Readme Data Line Count: " + emptyReadmeDataList.Count);
            // A non-positive end index means "process through the end of the list".
            endIndex = endIndex > 0 ? endIndex : emptyReadmeDataList.Count;
            // NOTE(review): the message names Configuration.URLFile, but the entries iterated below
            // come from emptyReadmeDataList — the wording may have been copied from another method.
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            //HashSet<string> existPKData = DBUtils.GetExistData();
            // NOTE(review): updateCount is only ever reset to 0 in the visible code; an increment
            // (presumably after a successful DB update) appears to be missing.
            int updateCount = 0;

            for (int i = startIndex; i < endIndex; i++)
                string         repositoryPath   = emptyReadmeDataList[i];
                // File-name prefix: the part of the path that follows RootURL plus one more character,
                // with every '/' replaced by '_'.
                // assumes repositoryPath is longer than Configuration.RootURL — TODO confirm
                string         readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1, repositoryPath.Length - Configuration.RootURL.Length - 1).Replace("/", "_");
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                // Progress report whenever i is a multiple of Configuration.DBInsertCountEveryTime.
                if (i % Configuration.DBInsertCountEveryTime == 0)
                    Console.WriteLine("Line index: " + i + "\t" + "updateCount: " + updateCount);
                    updateCount = 0;
                    // NOTE(review): the `try` that pairs with the catch below is missing from the listing;
                    // this call is presumably its body rather than part of the `if` above.
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                catch (Exception e)
                    // NOTE(review): anything following this log call in the handler (e.g. skipping the
                    // entry) is not visible; htmlContent would otherwise be unassigned below.
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                string readmeSuffixName;
                // Extract the readme text from the page; the suffix comes back through the out parameter.
                string readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                // A null or empty readme is logged.
                if (readmeFileContent == null || readmeFileContent.Length < 1)
                    Logger.WriteLog("\"" + repositoryPath + "\" crawler readmeFileContent is empty!!!");
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                // An empty file name is treated as a save failure and logged.
                if (readmeFileName == "")
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                    // NOTE(review): presumably an `else` is missing before this call, so the DB row is
                    // only updated when the save succeeded — confirm.
                    DBUtils.UpdateEmptyReadmeData(repositoryPath, readmeFileName);
Example #2
        /// <summary>
        /// Reads the lines of <c>Configuration.URLFile</c>, crawls the repository page named on each line,
        /// extracts the download URL, repository content, topics and readme, and builds a
        /// <c>DBCrawlerGitDetailDataModel</c> per line; batches are written with
        /// <c>DBUtils.StoreDataToDBCrawlerGitDetailDataPart</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): this listing has lost its brace lines and apparently some keywords/statements
        /// (for example the <c>try</c> that pairs with each <c>catch</c>, and the bodies of some <c>if</c>
        /// statements). The nesting implied by the comments below is inferred and must be confirmed
        /// against the original file.
        /// </remarks>
        /// <param name="inputStrFirst">Forwarded, together with <paramref name="InputStrSecond"/>, to <c>GetIndex</c>, whose result supplies the start and end indices.</param>
        /// <param name="InputStrSecond">Forwarded, together with <paramref name="inputStrFirst"/>, to <c>GetIndex</c>.</param>
        public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string InputStrSecond)
            // indexData[0] is used as the start index and indexData[1] as the end index.
            int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            // NOTE(review): the statement guarded by this check is missing from the listing;
            // presumably an early exit for a negative start index — confirm.
            if (startIndex < 0)
            DateTime startTime;
            // `start` is true right away when a positive start index was supplied; otherwise it is
            // switched on inside the loop once a line's path equals existLastData.
            bool     start         = startIndex > 0;
            string   existLastData = DBUtils.getExistLastData();

            // An empty result is reported on the console and in the log.
            // NOTE(review): whether the method stops here is not visible in the listing.
            if (existLastData == "")
                Console.WriteLine("get the exist last data in DB error!!!");
                Logger.WriteLog("get the exist last data in DB error!!!");
            List <string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
            List <DBCrawlerGitDetailDataModel> insertData = new List <DBCrawlerGitDetailDataModel>();

            Console.WriteLine("Lines Count: " + inputList.Count);
            // A non-positive end index means "process through the end of the file".
            endIndex = endIndex > 0 ? endIndex : inputList.Count;
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            //HashSet<string> existPKData = DBUtils.GetExistData();
            // Start of the timing window reported after each batch write.
            startTime = DateTime.Now;
            for (int i = startIndex; i < endIndex; i++)
                // Each line is tab-separated: [0] repository path, [1] impression count, [2] click count.
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                // Until `start` is set, look for the line whose path equals existLastData.
                // NOTE(review): what happens to lines seen before (and at) the match — presumably they
                // are skipped — is not visible in the listing; confirm.
                if (!start)
                    if (repositoryPath == existLastData)
                        start = true;
                        Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                        Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
                //if (existPKData.Contains(repositoryPath))
                //    continue;
                // File-name prefix: the part of the path that follows RootURL plus one more character,
                // with every '/' replaced by '_'.
                // assumes repositoryPath is longer than Configuration.RootURL — TODO confirm
                string         readmePrefixName = repositoryPath.Substring(Configuration.RootURL.Length + 1, repositoryPath.Length - Configuration.RootURL.Length - 1).Replace("/", "_");
                // Convert.ToInt32 throws on non-numeric text; no handler for that is visible here.
                int            impressionCount  = Convert.ToInt32(eles[1]);
                int            clickCount       = Convert.ToInt32(eles[2]);
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                    // NOTE(review): the `try` that pairs with the catch below is missing from the listing.
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                catch (Exception e)
                    // NOTE(review): anything following this log call in the handler (e.g. skipping the
                    // line) is not visible; htmlContent would otherwise be unassigned below.
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                // The resolver returns a relative URL; it is prefixed with RootURL when non-empty,
                // otherwise downloadURL stays "".
                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                string downloadURL         = "";
                if (downloadRelativeURL != null && downloadRelativeURL != "")
                    downloadURL = Configuration.RootURL + downloadRelativeURL;
                string        readmeSuffixName;
                string        repositoryContent;
                List <string> topicsList;
                string        readmeFileContent;
                    // NOTE(review): the `try` that pairs with the catch below is missing from the listing;
                    // these three extraction calls are presumably its body.
                    repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
                    topicsList        = HtmlResolve.getTopicsList(htmlContent);
                    readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                catch (Exception e)
                    // NOTE(review): the message names getReadmeContent although any of the three calls
                    // above could have thrown; the rest of the handler is not visible.
                    Logger.WriteLog("getReadmeContent error: " + e.Message);
                    Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                // An empty file name is treated as a save failure and logged; the model below is
                // still built with that empty name in the visible code.
                if (readmeFileName == "")
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                DBCrawlerGitDetailDataModel model = new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName);

                // NOTE(review): no visible statement adds `model` to insertData; presumably an
                // `insertData.Add(model);` was lost from the listing — confirm.
                // Once the batch reaches DBInsertCountEveryTime entries, write it out, start a new
                // list, and report the elapsed whole seconds since the previous batch.
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                    DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
                    insertData = new List <DBCrawlerGitDetailDataModel>();
                    Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
                    Console.WriteLine("Line index: " + i);
                    DateTime now      = DateTime.Now;
                    int      timeCost = (int)(now - startTime).TotalSeconds;
                    Console.WriteLine("Time cost: " + timeCost + "s");
                    startTime = now;
            // Write whatever is left in the final, partially filled batch.
            if (insertData.Count > 0)
                DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);