Esempio n. 1
0
        /// <summary>
        /// Crawls the repository paths returned by <c>DBUtils.GetEmptyReadmeData()</c> that fall in
        /// the requested index range, saves the README extracted from each page through
        /// <c>FileUtils.SaveReadmeFile</c>, and records the saved file name through
        /// <c>DBUtils.UpdateEmptyReadmeData</c>.
        /// </summary>
        /// <remarks>
        /// An entry that cannot be crawled, parsed or saved is logged and skipped; it does not
        /// abort the rest of the run.
        /// </remarks>
        /// <param name="inputStrFirst">Forwarded to <c>GetIndex</c> together with <paramref name="InputStrSecond"/> to obtain the start and end indices.</param>
        /// <param name="InputStrSecond">Forwarded to <c>GetIndex</c> together with <paramref name="inputStrFirst"/> to obtain the start and end indices.</param>
        public static void UpdateDBReadmeName(string inputStrFirst, string InputStrSecond)
        {
            int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            // A negative start index means there is nothing to process
            // (presumably GetIndex's way of reporting invalid input -- TODO confirm).
            if (startIndex < 0)
            {
                return;
            }
            DateTime      startTime           = DateTime.Now;
            List <string> emptyReadmeDataList = DBUtils.GetEmptyReadmeData();
            int           timeCost            = (int)(DateTime.Now - startTime).TotalSeconds;

            Console.WriteLine("Get Empty Readme Data Time cost: " + timeCost + "s");
            Console.WriteLine("Empty Readme Data Line Count: " + emptyReadmeDataList.Count);
            // A non-positive end index means "up to the end of the list". A larger value is
            // clamped so the loop can never index past the data that was actually loaded.
            endIndex = endIndex > 0 ? Math.Min(endIndex, emptyReadmeDataList.Count) : emptyReadmeDataList.Count;
            // NOTE(review): the indices refer to emptyReadmeDataList, not to Configuration.URLFile;
            // the message text is kept unchanged in case anything depends on it.
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            int updateCount = 0;
            // Number of leading characters dropped from a path to build the file-name prefix:
            // RootURL plus one more character (presumably the '/' separator).
            int prefixStart = Configuration.RootURL.Length + 1;

            for (int i = startIndex; i < endIndex; i++)
            {
                string repositoryPath = emptyReadmeDataList[i];

                // Periodic progress report; updateCount covers the entries since the last report.
                if (i % Configuration.DBInsertCountEveryTime == 0)
                {
                    Console.WriteLine("Line index: " + i + "\t" + "updateCount: " + updateCount);
                    updateCount = 0;
                }
                // Substring would throw on a path shorter than the prefix; skip such entries.
                if (repositoryPath == null || repositoryPath.Length < prefixStart)
                {
                    Logger.WriteLog("error: repositoryPath \"" + repositoryPath + "\" is too short to derive a readme file name.");
                    continue;
                }
                string         readmePrefixName = repositoryPath.Substring(prefixStart).Replace("/", "_");
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
                string readmeSuffixName;
                string readmeFileContent;
                // Guarded like the crawl above so one unparsable page does not abort the run.
                try
                {
                    readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("getReadmeContent error: " + e.Message);
                    Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
                    continue;
                }
                if (string.IsNullOrEmpty(readmeFileContent))
                {
                    Logger.WriteLog("\"" + repositoryPath + "\" crawler readmeFileContent is empty!!!");
                    continue;
                }
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                // An empty (or null) file name is treated as a failed save; the DB row is left alone.
                if (string.IsNullOrEmpty(readmeFileName))
                {
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                }
                else
                {
                    DBUtils.UpdateEmptyReadmeData(repositoryPath, readmeFileName);
                    updateCount++;
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Reads tab-separated lines from <c>Configuration.URLFile</c>, crawls the repository named
        /// in the first field of each line, extracts its download URL, description, topics and
        /// README, and stores the results in batches of
        /// <c>Configuration.DBInsertCountEveryTime</c> through
        /// <c>DBUtils.StoreDataToDBCrawlerGitDetailDataPart</c>.
        /// </summary>
        /// <remarks>
        /// When the start index is 0, lines are skipped up to and including the one whose path
        /// equals <c>DBUtils.getExistLastData()</c>, and processing starts on the line after it.
        /// A line that is malformed, or whose page cannot be crawled or parsed, is logged and
        /// skipped; it does not abort the run.
        /// </remarks>
        /// <param name="inputStrFirst">Forwarded to <c>GetIndex</c> together with <paramref name="InputStrSecond"/> to obtain the start and end indices.</param>
        /// <param name="InputStrSecond">Forwarded to <c>GetIndex</c> together with <paramref name="inputStrFirst"/> to obtain the start and end indices.</param>
        public static void CrawlAndStoreCrawlerGitDetailData(string inputStrFirst, string InputStrSecond)
        {
            int[] indexData = GetIndex(inputStrFirst, InputStrSecond);
            int   startIndex = indexData[0], endIndex = indexData[1];

            // A negative start index means there is nothing to process
            // (presumably GetIndex's way of reporting invalid input -- TODO confirm).
            if (startIndex < 0)
            {
                return;
            }
            DateTime startTime;
            // With an explicit positive start index processing begins immediately; otherwise it
            // begins only after the last already-stored repository has been seen in the file.
            bool     start         = startIndex > 0;
            string   existLastData = DBUtils.getExistLastData();

            if (existLastData == "")
            {
                Console.WriteLine("get the exist last data in DB error!!!");
                Logger.WriteLog("get the exist last data in DB error!!!");
                return;
            }
            List <string> inputList = FileUtils.ReadFileLine(Configuration.URLFile);
            List <DBCrawlerGitDetailDataModel> insertData = new List <DBCrawlerGitDetailDataModel>();

            Console.WriteLine("Lines Count: " + inputList.Count);
            // A non-positive end index means "up to the end of the file". A larger value is
            // clamped so the loop can never index past the lines that were actually read.
            endIndex = endIndex > 0 ? Math.Min(endIndex, inputList.Count) : inputList.Count;
            Console.WriteLine("Crawler GitDetailData from line " + startIndex + " to line " + endIndex + " in file \"" + Configuration.URLFile + "\".");
            // Number of leading characters dropped from a path to build the file-name prefix:
            // RootURL plus one more character (presumably the '/' separator).
            int prefixStart = Configuration.RootURL.Length + 1;

            startTime = DateTime.Now;
            for (int i = startIndex; i < endIndex; i++)
            {
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                if (!start)
                {
                    if (repositoryPath == existLastData)
                    {
                        start = true;
                        Console.WriteLine("Crawl and store data to CrawlerGitDetailData start!!!");
                        Logger.WriteLog("Crawl and store data to CrawlerGitDetailData start!!!");
                    }
                    // The matching line itself is already stored, so it is skipped as well.
                    continue;
                }
                // Validate the line before use: a short or non-numeric line must not throw and
                // discard the batch that is still pending in insertData.
                if (eles.Length < 3)
                {
                    Logger.WriteLog("error: line " + i + " in file \"" + Configuration.URLFile + "\" has fewer than 3 tab-separated fields.");
                    continue;
                }
                int impressionCount;
                int clickCount;
                if (!int.TryParse(eles[1], out impressionCount) || !int.TryParse(eles[2], out clickCount))
                {
                    Logger.WriteLog("error: line " + i + " in file \"" + Configuration.URLFile + "\" has a non-integer impression or click count.");
                    continue;
                }
                // Substring would throw on a path shorter than the prefix; skip such lines.
                if (repositoryPath.Length < prefixStart)
                {
                    Logger.WriteLog("error: repositoryPath \"" + repositoryPath + "\" is too short to derive a readme file name.");
                    continue;
                }
                string         readmePrefixName = repositoryPath.Substring(prefixStart).Replace("/", "_");
                HttpStatusCode statusCode;
                Dictionary <string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }
                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                string downloadURL         = "";
                if (!string.IsNullOrEmpty(downloadRelativeURL))
                {
                    downloadURL = Configuration.RootURL + downloadRelativeURL;
                }
                string        readmeSuffixName;
                string        repositoryContent;
                List <string> topicsList;
                string        readmeFileContent;
                try
                {
                    repositoryContent = HtmlResolve.getRepositoryContent(htmlContent);
                    topicsList        = HtmlResolve.getTopicsList(htmlContent);
                    readmeFileContent = HtmlResolve.getReadmeContent(htmlContent, out readmeSuffixName);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("getReadmeContent error: " + e.Message);
                    Logger.WriteLog("the repositoryPath is \"" + repositoryPath + "\".");
                    continue;
                }
                string readmeFileName = FileUtils.SaveReadmeFile(readmePrefixName, readmeSuffixName, readmeFileContent);
                // A failed save is only logged; the row is still stored with the returned name.
                if (string.IsNullOrEmpty(readmeFileName))
                {
                    Logger.WriteLog("SaveReadmeFile error: readmePrefixName is \"" + readmePrefixName + "\" and readmeSuffixName is \"" + readmeSuffixName + "\".");
                }
                DBCrawlerGitDetailDataModel model = new DBCrawlerGitDetailDataModel(repositoryPath, downloadURL, impressionCount, clickCount, repositoryContent, topicsList, readmeFileName);

                insertData.Add(model);
                // Flush a full batch, then report the index reached and the time the batch took.
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                {
                    DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
                    insertData = new List <DBCrawlerGitDetailDataModel>();
                    Console.WriteLine("Store Data To DBCrawlerGitDetailData Part End!!!");
                    Console.WriteLine("Line index: " + i);
                    DateTime now      = DateTime.Now;
                    int      timeCost = (int)(now - startTime).TotalSeconds;
                    Console.WriteLine("Time cost: " + timeCost + "s");
                    startTime = now;
                }
            }
            // Store whatever is left over from the last, partially filled batch.
            if (insertData.Count > 0)
            {
                DBUtils.StoreDataToDBCrawlerGitDetailDataPart(ref insertData);
            }
        }