Exemple #1
0
        /// <summary>
        /// Initializes a spider with its default configuration: a cap of 99 outbound links,
        /// https links avoided, inside links disabled, back-link verification enabled,
        /// empty work lists, blank proxy settings, and a default set of avoid-patterns.
        /// </summary>
        public M4Spider()
        {
            // Link limits and behaviour switches.
            _maxOutboundLinks = 99;
            _numOutboundLinks = 0;
            _avoidHttps       = true;
            _insideLinks      = false;
            _verifyBackLinks  = true;

            // Every list starts out empty.
            _unSpidered    = new List<string>();
            _spideredURLs  = new List<string>();
            _obLinks       = new List<string>();
            _avoidPatterns = new List<string>();

            // No proxy and no back-link domain until the caller supplies them.
            _proxyDomain   = string.Empty;
            _proxyLogin    = string.Empty;
            _proxyPassword = string.Empty;
            _proxyPort     = 0;
            _backLinkDN    = string.Empty;

            SR      = new SpiderResults();
            PrxShut = new ProxyShuttler();

            // Default substrings registered with AvoidPatterns; ParsePage skips any link
            // that contains one of them.
            string[] defaultAvoidPatterns =
            {
                "google.",
                "yahoo.",
                "bing.",
                "altavista.",
                "princeton.edu",
                "amazon.com",
                "baidu.com",
                "planet-lab.edu"
            };

            foreach (string pattern in defaultAvoidPatterns)
            {
                AvoidPatterns.Add(pattern);
            }
        }
Exemple #2
0
        /// <summary>
        /// Parses one captured page: stores the URL parts of <paramref name="DN"/>, the page's
        /// meta keywords/description and its qualifying outbound hyperlinks in <c>SR</c>, then
        /// publishes the result through <c>_sr1</c> and updates the spider counters.
        /// </summary>
        /// <remarks>
        /// A link is added to <c>SR.HyperLinks</c> only when all of the following hold:
        /// it contains none of the <c>AvoidPatterns</c> substrings, a host can be extracted
        /// from it, <c>_insideLinks</c> is false, it does not match <c>BackLinkDN</c>, and,
        /// if <c>_verifyBackLinks</c> is set, the page it points to could be captured and
        /// does not match the verification pattern. Collection stops once
        /// <c>_numOutboundLinks</c> reaches <c>_maxOutboundLinks</c>.
        /// </remarks>
        /// <param name="strmRdr">
        /// Reader over the page source. When non-null it is always disposed by this method,
        /// including on the failure paths.
        /// </param>
        /// <param name="DN">Absolute URL of the page being parsed.</param>
        /// <returns>
        /// <c>true</c> when the page was processed; <c>false</c> when <paramref name="strmRdr"/>
        /// is null, <paramref name="DN"/> is not a valid URI, or an exception occurred
        /// (its message is stored in <c>_errMsg</c>).
        /// </returns>
        private bool ParsePage(StreamReader strmRdr, string DN)
        {
            const string HttpLinkPattern    = "href=\"(?<Link>http:.*?)\"";
            const string AnyLinkPattern     = "href=\"(?<Link>https?:.*?)\"";
            const string HostPattern        = @"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?(?<host>[a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])";
            const string VerifyTemplate     = @"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?(?<host>.*(domain))";
            const string KeywordPattern     = "<meta name=\"keywords\" content=\"(?<keywords>.*?)\">";
            const string DescriptionPattern = "<meta name=\"description\" content=\"(?<description>.*?)\">";

            SR.Init(_backLinkDN);

            if (strmRdr == null)
            {
                _numFailed++;
                return false;
            }

            try
            {
                // The reader is owned by this method: the using block releases it on every
                // exit path, including the invalid-URI return below.
                using (strmRdr)
                {
                    Uri thisUri;
                    try
                    {
                        thisUri = new Uri(DN);
                    }
                    catch (Exception URIex)
                    {
                        // An invalid URL is reported through _errMsg but is not counted in _numFailed.
                        _errMsg = URIex.Message;
                        return false;
                    }

                    SR.Host     = thisUri.Host;
                    SR.Path     = thisUri.PathAndQuery;
                    SR.Port     = thisUri.Port.ToString();
                    SR.Protocol = thisUri.Scheme;
                    SR.DDN      = _backLinkDN;

                    _errMsg = "";

                    string source = strmRdr.ReadToEnd();

                    // Close immediately: the link loop below may fetch many other pages and
                    // there is no reason to hold this page's stream open while it runs.
                    strmRdr.Close();

                    // When several meta tags match, the last one wins.
                    foreach (Match keywordMatch in Regex.Matches(source, KeywordPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline))
                    {
                        SR.Keywords = keywordMatch.Groups["keywords"].ToString();
                    }

                    // Matched case-insensitively, consistent with the keywords tag.
                    foreach (Match descriptionMatch in Regex.Matches(source, DescriptionPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline))
                    {
                        SR.Description = descriptionMatch.Groups["description"].ToString();
                    }

                    string          linkPattern = _avoidHttps ? HttpLinkPattern : AnyLinkPattern;
                    MatchCollection linkMatches = Regex.Matches(source, linkPattern, RegexOptions.Singleline);

                    if (linkMatches.Count > 0)
                    {
                        // NOTE(review): BackLinkDN is inserted into (and below used as) a regular
                        // expression without escaping, so '.' in a domain matches any character.
                        // NOTE(review): the template is anchored with '^' and matched without
                        // RegexOptions.Multiline, so it can only match at the very start of the
                        // fetched page - confirm this is the intended verification.
                        string   verifyPattern = VerifyTemplate.Replace("(domain)", BackLinkDN);
                        bool     limitReached  = false;
                        DateTime startTime     = DateTime.UtcNow;

                        foreach (Match linkMatch in linkMatches)
                        {
                            if (_numOutboundLinks >= _maxOutboundLinks)
                            {
                                limitReached = true;
                                break;
                            }

                            string link = linkMatch.Groups["Link"].ToString();

                            // Evaluated from scratch for every link so that an empty pattern
                            // list cannot carry over the verdict of a previous link.
                            bool avoid = false;
                            foreach (string item in AvoidPatterns)
                            {
                                if (link.IndexOf(item, StringComparison.OrdinalIgnoreCase) >= 0)
                                {
                                    avoid = true;
                                    break;
                                }
                            }

                            if (avoid)
                            {
                                continue;
                            }

                            // The host is computed per link (never reused from an earlier link)
                            // and matched case-insensitively because host names are not case-sensitive.
                            Match  hostMatch = Regex.Match(link, HostPattern, RegexOptions.IgnoreCase);
                            string host      = hostMatch.Success ? hostMatch.Groups["host"].ToString() : "";

                            if (host.Trim() == "" || _insideLinks)
                            {
                                continue;
                            }

                            // Links that match the back-link domain itself are not outbound links.
                            if (Regex.Match(link, BackLinkDN).Success)
                            {
                                continue;
                            }

                            if (!_verifyBackLinks)
                            {
                                SR.HyperLinks.Add(link);
                                _numOutboundLinks++;
                                _errMsg = "";
                                continue;
                            }

                            // Verification: fetch the linked page and test it against the verify pattern.
                            StreamReader strmrdr2 = CapturePage(link);
                            if (strmrdr2 == null)
                            {
                                _numFailed++;
                                continue;
                            }

                            try
                            {
                                using (strmrdr2)
                                {
                                    string linkedSource = strmrdr2.ReadToEnd();

                                    // The link is recorded when the linked page does NOT match.
                                    if (!Regex.Match(linkedSource, verifyPattern, RegexOptions.IgnoreCase).Success)
                                    {
                                        SR.HyperLinks.Add(link);
                                        _numOutboundLinks++;
                                        _errMsg = "";
                                    }
                                }
                            }
                            catch (Exception e3)
                            {
                                // A failure on one linked page must not abort the whole parse.
                                _errMsg = e3.Message;
                                _numFailed++;
                            }
                        }

                        // Timing is reported only when the loop ran over every link.
                        // NOTE(review): confirm whether runs cut short by the link cap should be reported too.
                        if (!limitReached)
                        {
                            float procTime = (float)(DateTime.UtcNow - startTime).TotalSeconds;
                            CycleTime("Backlinks in ParsePage", BackLinkDN, procTime);
                        }
                    }
                }

                _sr1 = SR;
                _numUnspidered--;
                _numSpidered++;
                _errMsg = "";
                return true;
            }
            catch (Exception e1)
            {
                _errMsg = e1.Message;
                _numFailed++;
                return false;
            }
        }