/// <summary>
/// Downloads <c>/robots.txt</c> from the host of the seed page and stores the
/// parsed result in <c>iRobot</c>.
/// </summary>
private void ParseRobot()
{
    // Build the robots.txt address from the seed page's host.
    var robotsUrl = new clsURL();
    robotsUrl.URL = clsParse.GetHostFromUrl(_iSeedPage) + "/robots.txt";

    var storageRoot = Application.StartupPath + Resources.frmCrawler_ReCreateSetting__storage_;
    var robotsText = clsDownloader.Download(robotsUrl, clsDownloader.DownloadUA.MozillaFirefox, storageRoot);

    iRobot = clsParse.ParseRobot(robotsText);
}
/// <summary>
/// Decides whether a discovered link should be added to the crawl queue.
/// </summary>
/// <param name="url">The link to evaluate.</param>
/// <returns>
/// <c>true</c> only when the link is inside the crawled web site, is in neither
/// the pending nor the finished queue, contains no <c>#</c> and no <c>=</c>,
/// and passes the robots rule check; otherwise <c>false</c>.
/// </returns>
private bool CheckURL(string url)
{
    var candidate = new clsURL { URL = url };

    // Links with a "#" (e.g. "#comment" fragments) or an "=" are rejected.
    var hasNoHash = !url.Contains("#");
    var hasNoEquals = !url.Contains("=");

    // 1. Must be inside the web site being crawled.
    if (!clsParse.CheckURLInsideWebSite(url, _iWebSite))
    {
        return false;
    }

    // 2. Must not already be waiting in the queue or already fetched.
    if (iQueue.CheckVisited(candidate))
    {
        return false;
    }

    if (iFinishedQueue.CheckVisited(candidate))
    {
        return false;
    }

    if (!hasNoHash || !hasNoEquals)
    {
        return false;
    }

    // 3. Must pass the robots rule check.
    return clsParse.CheckRobotRule(url, iRobot);
}
/// <summary>
/// Creates the crawler form from command-line arguments.
/// </summary>
/// <param name="args">
/// Optional <c>-politeness &lt;n&gt;</c> and <c>-maxpages &lt;n&gt;</c> flags;
/// the last element is taken as the seed URL. <c>-maxthreads</c> is accepted
/// but not applied.
/// </param>
/// <remarks>
/// A flag with no following value, or with a value that is not an integer, is
/// ignored and the field keeps its existing value, instead of throwing during
/// construction. An empty argument list is treated like an invalid seed URL.
/// </remarks>
public frmCrawler(string[] args)
{
    for (var i = 0; i < args.Length; i++)
    {
        // A flag is only honoured when a value actually follows it; reading
        // args[i + 1] unconditionally overruns the array when the flag is last.
        var hasValue = i + 1 < args.Length;
        int parsed;

        if (args[i] == "-politeness" && hasValue && int.TryParse(args[i + 1], out parsed))
        {
            _iPoliteness = parsed;
        }

        if (args[i] == "-maxpages" && hasValue && int.TryParse(args[i + 1], out parsed))
        {
            _iMaxPages = parsed;
        }

        // "-maxthreads" is recognised on the command line but has no effect.
    }

    // The seed URL is the final argument; guard against an empty argument list.
    if (args.Length > 0 && clsParse.CheckURL(args[args.Length - 1]))
    {
        _iSeedPage = args[args.Length - 1];
    }
    else
    {
        // NOTE(review): Close() is called before InitializeComponent() and the
        // constructor keeps running with _iSeedPage unset - confirm this is the
        // intended way to reject a bad seed URL.
        Close();
    }

    try
    {
        clsFileStorer.DeleteFolder(Application.StartupPath + "\\storage");
    }
    catch (Exception)
    {
        // Best effort: a failure to delete the storage folder is deliberately ignored.
    }

    InitializeComponent();
    cmdStart.Enabled = false;
    ReCreateSetting();

    var iSeedURL = new clsURL { URL = _iSeedPage };
    iQueue.AddURL(iSeedURL);

    _iWebSite = clsParse.GetHostFromUrl(_iSeedPage);
    AddLog("WebSite Host: " + _iWebSite);
}
/// <summary>
/// Downloads one page, marks it as finished, and queues every link found in it
/// that passes <c>CheckURL</c>.
/// </summary>
/// <param name="URL">The page to fetch.</param>
/// <remarks>
/// After queueing the links the calling thread sleeps for <c>_iPoliteness</c>
/// seconds. When no links are returned, or an exception occurs while processing
/// them, a "Network problem" message box is shown instead.
/// </remarks>
public void FetchWebpage(clsURL URL)
{
    var html = clsDownloader.Download(
        URL,
        clsDownloader.DownloadUA.MozillaFirefox,
        Application.StartupPath + Resources.frmCrawler_ReCreateSetting__storage_);
    var links = clsParse.CheckURLinHTML(html);

    // The page counts as finished whether or not any links were extracted.
    iFinishedQueue.AddURL(URL);

    try
    {
        if (links.Count == 0)
        {
            MessageBox.Show("Network problem. Fetch error in: " + URL.URL);
            return;
        }

        foreach (var link in links)
        {
            // Skip anything that is not suitable for crawling.
            if (!CheckURL(link))
            {
                continue;
            }

            iQueue.AddURL(new clsURL { URL = link });
        }

        // Politeness delay, in seconds, before the next fetch.
        Thread.Sleep(_iPoliteness * 1000);
    }
    catch (Exception)
    {
        // The URL could not be fetched or processed (typically a network
        // problem, especially for the seed URL).
        MessageBox.Show("Network problem. Fetch error in: " + URL.URL);
    }
}
/// <summary>
/// Compares two URL entries by their <c>URL</c> string.
/// </summary>
/// <param name="other">The entry to compare with; may be <c>null</c>.</param>
/// <returns>
/// <c>true</c> when <paramref name="other"/> is not <c>null</c> and its
/// <c>URL</c> equals this instance's <c>URL</c>; otherwise <c>false</c>.
/// </returns>
public bool Equals(clsURL other)
{
    // An equality check must answer "not equal" for null rather than throw.
    // ReferenceEquals sidesteps any user-defined == operator on clsURL.
    if (ReferenceEquals(other, null))
    {
        return false;
    }

    return other.URL == URL;
}
/// <summary>
/// Downloads the content at <paramref name="url"/> and, unless the address
/// contains "robots.txt", stores it in a file below <paramref name="StoragePath"/>.
/// </summary>
/// <param name="url">
/// The address to fetch. For stored pages its <c>FileSave</c> is set to the
/// path of the file that was written.
/// </param>
/// <param name="iUA">Selects the user-agent header sent with the request.</param>
/// <param name="StoragePath">
/// Root folder for stored pages; "Mobile\" or "NonMobile\" is appended
/// depending on <c>url.IsMobile</c>.
/// </param>
/// <returns>The downloaded text, or <c>null</c> when the download throws.</returns>
public static string Download(clsURL url, DownloadUA iUA, string StoragePath)
{
    string iUAString;
    switch (iUA)
    {
        case DownloadUA.MozillaFirefox:
            // NOTE(review): this literal is an MSIE 6.0 user-agent string even
            // though it is selected by the MozillaFirefox option.
            iUAString = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
            break;
        case DownloadUA.Chrome:
        case DownloadUA.MSIE:
        default:
            iUAString = "";
            break;
    }

    string iReturn;

    // Dispose the client, stream and reader even when the read fails.
    using (var client = new WebClient())
    {
        client.Headers.Add("user-agent", iUAString);

        try
        {
            using (Stream data = client.OpenRead(url.URL))
            using (var reader = new StreamReader(data))
            {
                iReturn = reader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: any download failure is reported as null.
            return null;
        }
    }

    // robots.txt responses are returned but never written to storage.
    // Ordinal, case-insensitive matching avoids culture-dependent lower-casing.
    if (url.URL.IndexOf("robots.txt", StringComparison.OrdinalIgnoreCase) < 0)
    {
        // The file name is the Base64 form of the URL's ASCII bytes.
        // NOTE(review): Base64 output can contain '/' and '+'; confirm that
        // clsFileStorer.StoreinFile copes with such names.
        byte[] toEncodeAsBytes = System.Text.Encoding.ASCII.GetBytes(url.URL);
        var iFileName = Convert.ToBase64String(toEncodeAsBytes);

        var subFolder = url.IsMobile ? "Mobile\\" : "NonMobile\\";
        var filePath = StoragePath + subFolder + iFileName;

        url.FileSave = filePath;
        clsFileStorer.StoreinFile(iReturn, filePath);
    }

    return iReturn;
}
/// <summary>
/// Returns a text label for the mobile flag of a URL entry.
/// </summary>
/// <param name="iURL">The entry whose <c>IsMobile</c> flag is read.</param>
/// <returns>"Mobile" when <c>IsMobile</c> is set; otherwise "NotMobile".</returns>
private static string GetMobileString(clsURL iURL)
{
    if (iURL.IsMobile)
    {
        return "Mobile";
    }

    return "NotMobile";
}