Пример #1
0
        /// <summary>
        /// Inserts the models at indexes [<paramref name="startIndex"/>, <paramref name="endIndex"/>) of
        /// <paramref name="inputList"/> using one multi-row INSERT statement.
        /// </summary>
        /// <remarks>
        /// If the statement fails and the range holds more than one row, the range is split in half and each
        /// half is retried recursively, so one bad row does not prevent the remaining rows from being stored.
        /// A single row that still fails is dropped without any report from this method.
        /// </remarks>
        /// <param name="inputList">Models to store; only read by this method.</param>
        /// <param name="startIndex">Inclusive index of the first model to insert.</param>
        /// <param name="endIndex">Exclusive index one past the last model to insert.</param>
        private static void StoreDataToDBCrawlerGitDetailDataPart(ref List <DBCrawlerGitDetailDataModel> inputList, int startIndex, int endIndex)
        {
            if (startIndex >= endIndex)
            {
                return;
            }

            // A StringBuilder avoids re-copying the whole statement for every appended row.
            System.Text.StringBuilder sql = new System.Text.StringBuilder();
            sql.Append("INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[repositoryContent],[topicsList],[readmeFileName])VALUES");

            for (int i = startIndex; i < endIndex; i++)
            {
                DBCrawlerGitDetailDataModel model = inputList[i];

                // Topics are stored as one ';'-separated string; a missing list is treated as no topics.
                System.Text.StringBuilder topics = new System.Text.StringBuilder();
                if (model.topicsList != null)
                {
                    for (int j = 0; j < model.topicsList.Count; j++)
                    {
                        if (j > 0)
                        {
                            topics.Append(";");
                        }
                        topics.Append(model.topicsList[j]);
                    }
                }

                if (i > startIndex)
                {
                    sql.Append(",");
                }

                // Every text value comes from crawled pages, so it is escaped before being placed between
                // single quotes. NOTE(review): ExecuteByText only accepts statement text; a parameterized
                // command would be the more robust fix if that helper can be extended.
                sql.Append("('" + EscapeSqlStringLiteral(model.repositoryPath)
                           + "','" + EscapeSqlStringLiteral(model.downloadURL)
                           + "','" + model.impressionCount
                           + "','" + model.clickCount
                           + "','" + EscapeSqlStringLiteral(model.repositoryContent)
                           + "','" + EscapeSqlStringLiteral(topics.ToString())
                           + "','" + EscapeSqlStringLiteral(model.readmeFileName) + "')");
            }

            bool success = ExecuteByText(sql.ToString());

            // On failure, bisect the range and retry each half; a one-row range is not retried.
            if (!success && startIndex + 1 < endIndex)
            {
                int midIndex = (startIndex + endIndex) / 2;
                StoreDataToDBCrawlerGitDetailDataPart(ref inputList, startIndex, midIndex);
                StoreDataToDBCrawlerGitDetailDataPart(ref inputList, midIndex, endIndex);
            }
        }

        /// <summary>
        /// Converts a value to text that is safe to embed between single quotes in a T-SQL string literal
        /// by doubling every single quote. A null value yields an empty string.
        /// </summary>
        /// <param name="value">Value to embed; converted with <c>ToString()</c>.</param>
        /// <returns>The escaped text, never null.</returns>
        private static string EscapeSqlStringLiteral(object value)
        {
            if (value == null)
            {
                return "";
            }
            string text = value.ToString();
            if (text == null)
            {
                return "";
            }
            return text.Replace("'", "''");
        }
Пример #2
0
        /// <summary>
        /// Crawls the repository pages listed in <c>Configuration.URLFile</c>, extracts their details, saves
        /// each readme through <c>FileUtils.SaveReadmeFile</c> and stores the results in the database in
        /// batches of <c>Configuration.DBInsertCountEveryTime</c> rows.
        /// </summary>
        /// <remarks>
        /// Each input line is split on tabs into repository path, impression count and click count.
        /// When the start index is 0, lines are skipped up to and including the one whose repository path
        /// equals the value returned by <c>DBUtils.getExistLastData()</c>; crawling begins on the line after it.
        /// Lines that cannot be parsed, crawled or resolved are logged and skipped so one bad line does not
        /// abort the run.
        /// </remarks>
        /// <param name="inputStrFirst">Passed to <c>GetIndex</c>, which yields the start line index.</param>
        /// <param name="InputStrSecond">Passed to <c>GetIndex</c>, which yields the end line index.</param>
        public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string InputStrSecond)
        {
            int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            if (startIndex < 0)
            {
                return;
            }
            DateTime startTime;
            // With an explicit positive start index there is no need to search for the resume point.
            bool     start         = startIndex > 0;
            string   existLastData = DBUtils.getExistLastData();

            if (existLastData == "")
            {
                Console.WriteLine("get the exist last data in DB error!!!");
                Logger.WriteLog("get the exist last data in DB error!!!");
                return;
            }
            List <string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
            List <DBCrawlerGitDetailDataModel> insertData = new List <DBCrawlerGitDetailDataModel>();

            Console.WriteLine("Lines Count: " + inputList.Count);
            // A non-positive end index means "process to the end of the file".
            endIndex = endIndex > 0 ? endIndex : inputList.Count;
            if (endIndex > inputList.Count)
            {
                // Clamp so that indexing inputList below cannot run past the last line.
                Logger.WriteLog("warning: end index " + endIndex + " exceeds line count " + inputList.Count + "; clamped.");
                endIndex = inputList.Count;
            }
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            startTime = DateTime.Now;
            for (int i = startIndex; i < endIndex; i++)
            {
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                if (!start)
                {
                    if (repositoryPath == existLastData)
                    {
                        start = true;
                        Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                        Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
                    }
                    // The matching line itself is already stored, so it is skipped as well.
                    continue;
                }

                // Validate the line before any network work: a malformed line is skipped, not fatal.
                if (eles.Length < 3)
                {
                    Logger.WriteLog("error: line " + i + " has " + eles.Length + " tab-separated field(s), expected at least 3; skipped.");
                    continue;
                }
                int impressionCount;
                int clickCount;
                if (!int.TryParse(eles[1], out impressionCount) || !int.TryParse(eles[2], out clickCount))
                {
                    Logger.WriteLog("error: line " + i + " has non-numeric counts \"" + eles[1] + "\" / \"" + eles[2] + "\"; skipped.");
                    continue;
                }

                // The readme file name prefix is the path after the root URL and its separator, with '/' -> '_'.
                int prefixStart = Configuration.RootURL.Length + 1;
                if (repositoryPath.Length < prefixStart)
                {
                    Logger.WriteLog("error: line " + i + " repositoryPath \"" + repositoryPath + "\" is shorter than the root URL; skipped.");
                    continue;
                }
                string readmePrefixName = repositoryPath.Substring(prefixStart).Replace("/", "_");

                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                string downloadURL         = "";
                if (downloadRelativeURL != null && downloadRelativeURL != "")
                {
                    downloadURL = Configuration.RootURL + downloadRelativeURL;
                }
                string        readmeSuffixName;
                string        repositoryContent;
                List <string> topicsList;
                string        readmeFileContent;
                try
                {
                    repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
                    topicsList        = HtmlResolve.getTopicsList(htmlContent);
                    readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                }
                catch (Exception e)
                {
                    // Any of the three resolve calls above may have thrown; the log prefix is kept as-is.
                    Logger.WriteLog("getReadmeContent error: " + e.Message);
                    Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
                    continue;
                }
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                if (readmeFileName == "")
                {
                    // The row is still stored, with an empty readme file name.
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                }
                DBCrawlerGitDetailDataModel model = new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName);

                insertData.Add(model);
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                {
                    // Flush a full batch and report how long it took to gather and store it.
                    DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
                    insertData = new List <DBCrawlerGitDetailDataModel>();
                    Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
                    Console.WriteLine("Line index: " + i);
                    DateTime now      = DateTime.Now;
                    int      timeCost = (int)(now - startTime).TotalSeconds;
                    Console.WriteLine("Time cost: " + timeCost + "s");
                    startTime = now;
                }
            }
            // Store whatever is left in the final, partially filled batch.
            if (insertData.Count > 0)
            {
                DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
            }
        }