Esempio n. 1
0
        /// <summary>
        /// Downloads "/robots.txt" from the host of the seed page and stores the
        /// parsed result in <c>iRobot</c>.
        /// </summary>
        private void ParseRobot()
        {
            var robotsUrl = new clsURL();
            robotsUrl.URL = clsParse.GetHostFromUrl(_iSeedPage) + "/robots.txt";

            // Storage root handed to the downloader: start-up folder plus the
            // storage suffix kept in the resource file.
            var storageRoot = Application.StartupPath +
                              Resources.frmCrawler_ReCreateSetting__storage_;

            var robotsText = clsDownloader.Download(robotsUrl,
                                                    clsDownloader.DownloadUA.MozillaFirefox,
                                                    storageRoot);

            iRobot = clsParse.ParseRobot(robotsText);
        }
Esempio n. 2
0
        /// <summary>
        /// Decides whether a discovered URL should be added to the crawl queue.
        /// </summary>
        /// <param name="url">Candidate URL.</param>
        /// <returns>
        /// <c>true</c> only when the URL is inside the crawled web site, has not been
        /// seen in either queue, contains neither '#' nor '=', and passes the robot rules.
        /// </returns>
        private bool CheckURL(string url)
        {
            var candidate = new clsURL();
            candidate.URL = url;

            // '#' marks a fragment such as "#comment"; URLs containing '=' are rejected too.
            var containsHash  = url.Contains("#");
            var containsEqual = url.Contains("=");

            if (!clsParse.CheckURLInsideWebSite(url, _iWebSite))
            {
                return false;
            }

            // Already waiting in the pending queue?
            if (iQueue.CheckVisited(candidate))
            {
                return false;
            }

            // Already fetched?
            if (iFinishedQueue.CheckVisited(candidate))
            {
                return false;
            }

            if (containsHash || containsEqual)
            {
                return false;
            }

            return clsParse.CheckRobotRule(url, iRobot);
        }
Esempio n. 3
0
        /// <summary>
        /// Builds the crawler form from command-line arguments.
        /// </summary>
        /// <param name="args">
        /// Optional <c>-politeness N</c> and <c>-maxpages N</c> pairs, followed by the
        /// seed URL as the last element. <c>-maxthreads</c> is not acted on.
        /// </param>
        /// <exception cref="FormatException">A flag value is not a valid integer.</exception>
        /// <remarks>
        /// When no valid seed URL is supplied the form is closed and the constructor
        /// returns without initializing the UI or the queue.
        /// </remarks>
        public frmCrawler(string[] args)
        {
            var hasArgs = args != null && args.Length > 0;

            if (hasArgs)
            {
                // The last element is reserved for the seed URL. Stopping one short of
                // the end guarantees args[i + 1] exists, so a trailing flag with no
                // value can no longer index past the array.
                for (var i = 0; i < args.Length - 1; i++)
                {
                    if (args[i] == "-politeness")
                    {
                        _iPoliteness = int.Parse(args[i + 1]);
                    }
                    if (args[i] == "-maxpages")
                    {
                        _iMaxPages = int.Parse(args[i + 1]);
                    }
                }
            }

            if (!hasArgs || !clsParse.CheckURL(args[args.Length - 1]))
            {
                // Without a valid seed there is nothing to crawl. Stop here instead of
                // queueing a null URL and deriving a host name from it.
                // NOTE(review): Close() runs before the window handle exists; confirm
                // the caller copes with receiving an already-closed form.
                Close();
                return;
            }

            _iSeedPage = args[args.Length - 1];

            try
            {
                clsFileStorer.DeleteFolder(Application.StartupPath + "\\storage");
            }
            catch (Exception)
            {
                // Best effort: a missing or locked storage folder must not prevent start-up.
            }

            InitializeComponent();
            cmdStart.Enabled = false;
            ReCreateSetting();

            var iSeedURL = new clsURL {
                URL = _iSeedPage
            };

            iQueue.AddURL(iSeedURL);
            _iWebSite = clsParse.GetHostFromUrl(_iSeedPage);
            AddLog("WebSite Host: " + _iWebSite);
        }
Esempio n. 4
0
        /// <summary>
        /// Downloads one page, records it as finished, and queues every link found in
        /// it that passes <c>CheckURL</c>.
        /// </summary>
        /// <param name="URL">Page to fetch.</param>
        /// <remarks>
        /// The page is added to the finished queue before the result is inspected.
        /// If no links are returned, or anything throws while processing them, a
        /// message box reports a fetch error. After a successful pass the method
        /// sleeps for <c>_iPoliteness</c> seconds.
        /// </remarks>
        public void FetchWebpage(clsURL URL)
        {
            var storageRoot = Application.StartupPath +
                              Resources.frmCrawler_ReCreateSetting__storage_;
            var pageText = clsDownloader.Download(URL, clsDownloader.DownloadUA.MozillaFirefox,
                                                  storageRoot);
            var links = clsParse.CheckURLinHTML(pageText);

            iFinishedQueue.AddURL(URL);

            try
            {
                if (links.Count == 0)
                {
                    MessageBox.Show("Network problem. Fetch error in: " + URL.URL);
                    return;
                }

                foreach (var link in links)
                {
                    // Skip anything that is not suitable for crawling.
                    if (!CheckURL(link))
                    {
                        continue;
                    }

                    var queued = new clsURL();
                    queued.URL = link;
                    iQueue.AddURL(queued);
                }

                // Politeness delay, configured in seconds.
                Thread.Sleep(_iPoliteness * 1000);
            }
            catch (Exception)
            {
                // Reached, for example, when the download failed and no link list is
                // available (notably for the seed URL).
                MessageBox.Show("Network problem. Fetch error in: " + URL.URL);
            }
        }
Esempio n. 5
0
 /// <summary>
 /// Compares two URL entries by their <c>URL</c> text.
 /// </summary>
 /// <param name="other">Entry to compare with; may be <c>null</c>.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="other"/> is not <c>null</c> and its
 /// <c>URL</c> equals this instance's <c>URL</c>; otherwise <c>false</c>.
 /// </returns>
 public bool Equals(clsURL other)
 {
     // ReferenceEquals avoids any user-defined == operator and lets a null
     // argument compare as "not equal" instead of throwing.
     if (ReferenceEquals(other, null))
     {
         return false;
     }

     return other.URL == URL;
 }
Esempio n. 6
0
        /// <summary>
        /// Downloads the content at <paramref name="url"/> as text and, unless the
        /// address contains "robots.txt", stores it under <paramref name="StoragePath"/>.
        /// </summary>
        /// <param name="url">
        /// Entry whose <c>URL</c> is fetched. Its <c>FileSave</c> is set to the path
        /// the content is stored at.
        /// </param>
        /// <param name="iUA">Selects the user-agent header sent with the request.</param>
        /// <param name="StoragePath">
        /// Storage root; "Mobile\" or "NonMobile\" is appended directly, so it is
        /// expected to end with a path separator.
        /// </param>
        /// <returns>The downloaded text, or <c>null</c> when the download fails.</returns>
        public static string Download(clsURL url, DownloadUA iUA, string StoragePath)
        {
            string iUAString;

            switch (iUA)
            {
            case DownloadUA.MozillaFirefox:
                // NOTE(review): this is an MSIE 6 identification string despite the enum name.
                iUAString = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
                break;

            default:
                // Chrome and MSIE have no string assigned; an empty header is sent.
                iUAString = "";
                break;
            }

            string iReturn;

            // using blocks release the client, stream and reader on every path,
            // including when the read throws part-way through.
            using (var client = new WebClient())
            {
                client.Headers.Add("user-agent", iUAString);
                try
                {
                    using (Stream data = client.OpenRead(url.URL))
                    using (var reader = new StreamReader(data))
                    {
                        iReturn = reader.ReadToEnd();
                    }
                }
                catch (Exception)
                {
                    // Callers treat null as "download failed".
                    return(null);
                }
            }

            if (!url.URL.ToLower().Contains("robots.txt"))
            {
                // File name is the Base64 form of the ASCII-encoded URL.
                // NOTE(review): Base64 output can contain '/', which is a path
                // separator, and ASCII encoding replaces non-ASCII characters.
                // Confirm the naming scheme before changing it; other code may rely on it.
                byte[] toEncodeAsBytes = System.Text.Encoding.ASCII.GetBytes(url.URL);
                var    iFileName       = Convert.ToBase64String(toEncodeAsBytes);

                var iSubFolder = url.IsMobile ? "Mobile\\" : "NonMobile\\";
                var iSavePath  = StoragePath + iSubFolder + iFileName;

                url.FileSave = iSavePath;
                clsFileStorer.StoreinFile(iReturn, iSavePath);
            }

            return(iReturn);
        }
Esempio n. 7
0
 /// <summary>
 /// Maps the <c>IsMobile</c> flag of a URL entry to a label.
 /// </summary>
 /// <param name="iURL">Entry to inspect.</param>
 /// <returns>"Mobile" when <c>IsMobile</c> is set, otherwise "NotMobile".</returns>
 private static string GetMobileString(clsURL iURL)
 {
     // NOTE(review): storage folders elsewhere are named "NonMobile"; confirm
     // that "NotMobile" is the intended spelling here.
     if (iURL.IsMobile)
     {
         return "Mobile";
     }

     return "NotMobile";
 }