コード例 #1
0
ファイル: DBUtils.cs プロジェクト: lvjianjunljj/Crawler
        /// <summary>
        /// Inserts the rows of <paramref name="inputList"/> that fall in the half-open
        /// range [<paramref name="startIndex"/>, <paramref name="endIndex"/>) using a
        /// single multi-row INSERT statement. When <c>ExecuteByText</c> reports failure
        /// and the range contains more than one row, the range is halved and each half
        /// is retried recursively, so that rows which can be stored are not lost
        /// because of one failing row in the same batch.
        /// </summary>
        /// <param name="inputList">Rows to store; only the requested range is read.</param>
        /// <param name="startIndex">Inclusive index of the first row to insert.</param>
        /// <param name="endIndex">Exclusive index one past the last row to insert.</param>
        private static void StoreDataToDBGitDataPart(List<DBGitDataModel> inputList, int startIndex, int endIndex)
        {
            if (startIndex >= endIndex)
            {
                return;
            }

            // Build the VALUES tuples for the requested range only. Iterating the whole
            // list here would make every recursive retry resend the exact same statement,
            // defeating the purpose of splitting the range.
            // NOTE(review): values are spliced directly into the SQL text; this relies on
            // the model's string fields having been escaped before they reach this method.
            // ExecuteByText only accepts statement text, so parameters cannot be used here.
            List<string> rowValues = new List<string>(endIndex - startIndex);
            for (int i = startIndex; i < endIndex; i++)
            {
                DBGitDataModel model = inputList[i];
                rowValues.Add("('" + model.repositoryPath + "','" + model.downloadURL + "','" + model.impressionCount + "','" + model.clickCount + "','" + model.fileName + "','" + model.dirName + "')");
            }

            string dbInsertSql = "INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[fileName],[dirName])VALUES"
                                 + string.Join(",", rowValues.ToArray());

            bool success = ExecuteByText(dbInsertSql);

            // A single failing row cannot be split any further, so it is dropped.
            if (!success && startIndex + 1 < endIndex)
            {
                int midIndex = (startIndex + endIndex) / 2;
                StoreDataToDBGitDataPart(inputList, startIndex, midIndex);
                StoreDataToDBGitDataPart(inputList, midIndex, endIndex);
            }
        }
コード例 #2
0
ファイル: DBUtils.cs プロジェクト: lvjianjunljj/Crawler
        /// <summary>
        /// Opens a new connection, stores every element of <paramref name="inputList"/>
        /// with multi-row INSERT statements of at most
        /// <c>Configuration.DBExecuteCountEveryTime</c> rows each, and closes the
        /// connection again. The result of each <c>ExecuteByText</c> call is ignored.
        /// </summary>
        /// <param name="inputList">Rows to insert, in order.</param>
        public static void StoreDataToDBAll(List<DBGitDataModel> inputList)
        {
            Connection = new SqlConnection("Server=" + Configuration.DBserver + ";DataBase=" + Configuration.DBName + ";uid=" + Configuration.DBUID + ";pwd=" + Configuration.DBPWD);
            Connection.Open();
            try
            {
                // The statement prefix is identical for every batch, so build it once.
                string insertPrefix = "INSERT INTO [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "]([repositoryPath],[downloadURL],[impressionCount],[clickCount],[fileName],[dirName])VALUES";
                List<string> rowValues = new List<string>();

                for (int i = 0; i < inputList.Count; i++)
                {
                    DBGitDataModel model = inputList[i];
                    // NOTE(review): values are spliced directly into the SQL text; this
                    // relies on the model fields being escaped before they get here.
                    rowValues.Add("('" + model.repositoryPath + "','" + model.downloadURL + "','" + model.impressionCount + "','" + model.clickCount + "','" + model.fileName + "','" + model.dirName + "')");

                    // Flush as soon as a full batch has been collected.
                    if (rowValues.Count == Configuration.DBExecuteCountEveryTime)
                    {
                        ExecuteByText(insertPrefix + string.Join(",", rowValues.ToArray()));
                        rowValues.Clear();
                    }
                }

                // Flush the final, partially filled batch.
                if (rowValues.Count > 0)
                {
                    ExecuteByText(insertPrefix + string.Join(",", rowValues.ToArray()));
                }
            }
            finally
            {
                // Close the connection even when building or executing a batch throws.
                Connection.Close();
            }
        }
コード例 #3
0
ファイル: Program.cs プロジェクト: lvjianjunljj/Crawler
        /// <summary>
        /// Compares what is on disk with what the database knows and prints the
        /// differences to the console in four sections: local files and local
        /// directories that no database row refers to, followed by file names and
        /// directory names recorded in the database that were not found locally.
        /// </summary>
        /// <param name="inputStrFirst">Not used by this method.</param>
        /// <param name="InputStrSecond">Not used by this method.</param>
        public static void CheckZipFile(string inputStrFirst, string InputStrSecond)
        {
            HashSet<string> unmatchedLocalFiles = FileUtils.TraverseFile(Configuration.DownloadZipDir);
            HashSet<string> unmatchedLocalDirs  = FileUtils.TraverseFolder(Configuration.ZipExtractDir);

            Dictionary<string, DBGitDataModel> existDownLoadURLData;
            DBUtils.GetExistZipData(out existDownLoadURLData);

            List<string> missingFiles = new List<string>();
            List<string> missingDirs  = new List<string>();

            foreach (DBGitDataModel record in existDownLoadURLData.Values)
            {
                // Remove() returns false when the name was not in the set, i.e. the
                // database refers to something that does not exist locally.
                if (!unmatchedLocalFiles.Remove(record.fileName))
                {
                    missingFiles.Add(record.fileName);
                }

                if (!unmatchedLocalDirs.Remove(record.dirName))
                {
                    missingDirs.Add(record.dirName);
                }
            }

            // Whatever is still left in the local sets was never matched by a row.
            WriteNameSection("Extra local file: ", unmatchedLocalFiles);
            WriteNameSection("Extra local directory: ", unmatchedLocalDirs);
            WriteNameSection("Extra DB file: ", missingFiles);
            WriteNameSection("Extra DB directory: ", missingDirs);
        }

        /// <summary>
        /// Writes <paramref name="title"/> followed by one console line per name.
        /// </summary>
        private static void WriteNameSection(string title, IEnumerable<string> names)
        {
            Console.WriteLine(title);
            foreach (string name in names)
            {
                Console.WriteLine(name);
            }
        }
コード例 #4
0
ファイル: DBUtils.cs プロジェクト: lvjianjunljj/Crawler
        /// <summary>
        /// Reads every row of the configured table (ordered by <c>id</c>) and returns
        /// the set of all <c>repositoryPath</c> values found.
        /// </summary>
        /// <param name="existDownLoadURLData">
        /// Receives a map from <c>downloadURL</c> to the first row read with that URL;
        /// later rows with the same URL are not added to the map.
        /// </param>
        /// <returns>
        /// The repository paths read. If an exception occurs while querying or reading,
        /// its message is written to the console and the data collected up to that
        /// point is returned.
        /// </returns>
        public static HashSet<string> GetExistZipData(out Dictionary<string, DBGitDataModel> existDownLoadURLData)
        {
            existDownLoadURLData = new Dictionary<string, DBGitDataModel>();
            HashSet<string> pkNameSet = new HashSet<string>();

            // Opening happens outside the try block on purpose: a failure to connect
            // propagates to the caller instead of being reported as "no existing data".
            Connection = new SqlConnection("Server=" + Configuration.DBserver + ";DataBase=" + Configuration.DBName + ";uid=" + Configuration.DBUID + ";pwd=" + Configuration.DBPWD);
            Connection.Open();
            string dbQuerySql = "SELECT * FROM [" + Configuration.DBName + "].[dbo].[" + Configuration.TableName + "] order by id";

            try
            {
                // Dispose the command and reader deterministically instead of leaving
                // them to the garbage collector.
                using (SqlCommand cmd = new SqlCommand(dbQuerySql, Connection))
                {
                    cmd.CommandType = CommandType.Text;

                    using (SqlDataReader reader = cmd.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            string repositoryPath  = reader["repositoryPath"].ToString();
                            string downloadURL     = reader["downloadURL"].ToString();
                            int    impressionCount = Convert.ToInt32(reader["impressionCount"].ToString());
                            int    clickCount      = Convert.ToInt32(reader["clickCount"].ToString());
                            string fileName        = reader["fileName"].ToString();
                            string dirName         = reader["dirName"].ToString();

                            pkNameSet.Add(repositoryPath);

                            // Keep only the first row seen for each download URL.
                            if (!existDownLoadURLData.ContainsKey(downloadURL))
                            {
                                existDownLoadURLData.Add(downloadURL, new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName));
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
            finally
            {
                // Single close point for both the success and the failure path.
                Connection.Close();
            }

            return pkNameSet;
        }
コード例 #5
0
 /// <summary>
 /// Creates a detail record. The repository path, download URL, repository
 /// content and every topic are passed through
 /// <c>DBGitDataModel.CheckDBStringData</c> before being stored; the counts and
 /// <paramref name="readmeFileName"/> are stored as given.
 /// </summary>
 /// <param name="repositoryPath">Repository path; stored after the string check.</param>
 /// <param name="downloadURL">Download URL; stored after the string check.</param>
 /// <param name="impressionCount">Stored unchanged.</param>
 /// <param name="clickCount">Stored unchanged.</param>
 /// <param name="repositoryContent">Repository content; stored after the string check.</param>
 /// <param name="topicsList">
 /// May be null. The list instance itself is kept (not copied) and its elements
 /// are replaced in place, so the caller's list is modified as well.
 /// </param>
 /// <param name="readmeFileName">Stored unchanged.</param>
 public DBCrawlerGitDetailDataModel(
     string repositoryPath,
     string downloadURL,
     int impressionCount,
     int clickCount,
     string repositoryContent,
     List<string> topicsList,
     string readmeFileName)
 {
     this.repositoryPath    = DBGitDataModel.CheckDBStringData(repositoryPath);
     this.downloadURL       = DBGitDataModel.CheckDBStringData(downloadURL);
     this.impressionCount   = impressionCount;
     this.clickCount        = clickCount;
     this.repositoryContent = DBGitDataModel.CheckDBStringData(repositoryContent);
     this.readmeFileName    = readmeFileName;
     this.topicsList        = topicsList;

     // Nothing to rewrite when no topic list was supplied.
     if (this.topicsList == null)
     {
         return;
     }

     // Overwrite each topic with its checked form, in place.
     for (int topicIndex = 0; topicIndex < this.topicsList.Count; topicIndex++)
     {
         string checkedTopic = DBGitDataModel.CheckDBStringData(this.topicsList[topicIndex]);
         this.topicsList[topicIndex] = checkedTopic;
     }
 }
コード例 #6
0
ファイル: Program.cs プロジェクト: lvjianjunljj/Crawler
        /// <summary>
        /// Reads tab-separated lines (repository path, impression count, click count)
        /// from <c>Configuration.URLFile</c>, crawls every repository whose path is not
        /// already in the database, resolves its download URL, downloads and extracts
        /// the archive unless that URL is already known, and stores the resulting rows
        /// in batches of <c>Configuration.DBInsertCountEveryTime</c>.
        /// </summary>
        /// <remarks>
        /// Lines that cannot be processed are skipped rather than aborting the run:
        /// malformed lines and crawl failures are logged; a missing download URL or an
        /// empty extraction directory is skipped silently.
        /// </remarks>
        /// <param name="inputStrFirst">Not used by this method.</param>
        /// <param name="InputStrSecond">Not used by this method.</param>
        public static void CrawlAndStoreGitData(string inputStrFirst, string InputStrSecond)
        {
            List<string>         inputList  = FileUtils.ReadFileLine(Configuration.URLFile);
            List<DBGitDataModel> insertData = new List<DBGitDataModel>();
            Dictionary<string, DBGitDataModel> existDownLoadURLData;
            HashSet<string> existPKData = DBUtils.GetExistZipData(out existDownLoadURLData);

            for (int i = 0; i < inputList.Count; i++)
            {
                string[] eles           = inputList[i].Split(new char[] { '\t' });
                string   repositoryPath = eles[0];
                Console.WriteLine(repositoryPath);
                if (existPKData.Contains(repositoryPath))
                {
                    continue;
                }

                // A short or non-numeric line used to throw here and abort the whole
                // run, discarding the rows already collected in insertData. Log it and
                // move on, the same way crawl failures are handled below.
                int impressionCount;
                int clickCount;
                if (eles.Length < 3
                    || !int.TryParse(eles[1], out impressionCount)
                    || !int.TryParse(eles[2], out clickCount))
                {
                    Logger.WriteLog("error: malformed input line \"" + inputList[i] + "\"");
                    continue;
                }

                HttpStatusCode statusCode;
                Dictionary<string, string> header;
                string htmlContent;
                try
                {
                    htmlContent = CrawlerClass.Crawl(repositoryPath, out statusCode, out header);
                }
                catch (Exception e)
                {
                    Logger.WriteLog("error: crawl \"" + repositoryPath + "\" " + e.Message);
                    continue;
                }

                string downloadRelativeURL = HtmlResolve.GetGitDownloadURL(htmlContent);
                if (string.IsNullOrEmpty(downloadRelativeURL))
                {
                    continue;
                }

                string downloadURL = Configuration.RootURL + downloadRelativeURL;
                string fileName;
                string dirName;
                DBGitDataModel knownModel;
                if (existDownLoadURLData.TryGetValue(downloadURL, out knownModel))
                {
                    // The archive for this URL is already recorded; reuse its names
                    // instead of downloading and extracting it again.
                    fileName = knownModel.fileName;
                    dirName  = knownModel.dirName;
                }
                else
                {
                    fileName = CrawlerClass.HttpDownloadFile(downloadURL, Configuration.DownloadZipDir);
                    dirName  = FileUtils.ZipExtractToDirectory(Path.Combine(Configuration.DownloadZipDir, fileName), Configuration.ZipExtractDir);
                    if (dirName == "")
                    {
                        continue;
                    }
                }

                insertData.Add(new DBGitDataModel(repositoryPath, downloadURL, impressionCount, clickCount, fileName, dirName));
                if (insertData.Count == Configuration.DBInsertCountEveryTime)
                {
                    DBUtils.StoreDataToDBGitDataPart(insertData);
                    insertData = new List<DBGitDataModel>();
                    Console.WriteLine("Store Data To DBGitData Part End!!!");
                }
            }

            // Store the final, partially filled batch.
            if (insertData.Count > 0)
            {
                DBUtils.StoreDataToDBGitDataPart(insertData);
            }
        }