/// <summary>
/// Creates a spider with its default crawl settings: at most 99 outbound links,
/// http-only link matching, back-link verification switched on, no proxy configured,
/// and a built-in list of URL substrings whose links are skipped.
/// </summary>
public M4Spider()
{
    // Outbound-link budget and the running count checked against it.
    _maxOutboundLinks = 99;
    _numOutboundLinks = 0;

    // Behaviour switches read by ParsePage.
    _avoidHttps = true;
    _insideLinks = false;
    _verifyBackLinks = true;

    // Work lists start out empty.
    _unSpidered = new List<string>();
    _spideredURLs = new List<string>();
    _obLinks = new List<string>();
    _avoidPatterns = new List<string>();

    // No proxy and no back-link domain until the caller supplies them.
    _proxyDomain = "";
    _proxyLogin = "";
    _proxyPassword = "";
    _proxyPort = 0;
    _backLinkDN = "";

    SR = new SpiderResults();
    PrxShut = new ProxyShuttler();

    // Any link containing one of these substrings is ignored by ParsePage.
    string[] defaultAvoidPatterns = new string[]
    {
        "google.",
        "yahoo.",
        "bing.",
        "altavista.",
        "princeton.edu",
        "amazon.com",
        "baidu.com",
        "planet-lab.edu"
    };
    foreach (string avoidPattern in defaultAvoidPatterns)
    {
        AvoidPatterns.Add(avoidPattern);
    }
}
/// <summary>
/// Parses one fetched page: stores the page's URL parts, meta keywords and meta
/// description in <c>SR</c>, then collects outbound hyperlinks into
/// <c>SR.HyperLinks</c> until <c>_maxOutboundLinks</c> is reached.
/// </summary>
/// <param name="strmRdr">
/// Reader over the page source. It is always closed by this method, including when
/// <paramref name="DN"/> cannot be parsed. A null reader counts as a failed page.
/// </param>
/// <param name="DN">Absolute URL of the page being parsed.</param>
/// <returns>
/// <c>true</c> when the page was processed (the result is published to <c>_sr1</c> and the
/// spidered/unspidered counters are adjusted); <c>false</c> when the reader is null, the URL
/// is invalid, or an exception occurs while parsing (the message is left in <c>_errMsg</c>).
/// </returns>
private bool ParsePage(StreamReader strmRdr, string DN)
{
    SR.Init(_backLinkDN);

    if (strmRdr == null)
    {
        _numFailed++;
        return false;
    }

    Uri thisUri;
    try
    {
        thisUri = new Uri(DN);
    }
    catch (Exception uriEx)
    {
        _errMsg = uriEx.Message;
        // The success path disposes the reader, so release it here too instead of leaking it.
        strmRdr.Close();
        return false;
    }

    SR.Host = thisUri.Host;
    SR.Path = thisUri.PathAndQuery;
    SR.Port = thisUri.Port.ToString();
    SR.Protocol = thisUri.Scheme;
    SR.DDN = _backLinkDN;

    try
    {
        const string HttpOnlyLinkPattern = "href=\"(?<Link>http:.*?)\"";
        const string HttpAndHttpsLinkPattern = "href=\"(?<Link>https?:.*?)\"";
        const string HostPattern = @"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?(?<host>[a-z0-9\-._~%]+|\[[a-z0-9\-._~%!$&'()*+,;=:]+\])";
        const string VerifyPatternTemplate = @"^[a-z][a-z0-9+\-.]*://([a-z0-9\-._~%!$&'()*+,;=]+@)?(?<host>.*(domain))";
        const string KeywordPattern = "<meta name=\"keywords\" content=\"(?<keywords>.*?)\">";
        const string DescriptionPattern = "<meta name=\"description\" content=\"(?<description>.*?)\">";

        _errMsg = "";

        // Read the whole page up front; the using block closes the reader.
        string source;
        using (strmRdr)
        {
            source = strmRdr.ReadToEnd();
        }

        // When a page carries several matching meta tags, the last one wins.
        foreach (Match keywordMatch in Regex.Matches(source, KeywordPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline))
        {
            SR.Keywords = keywordMatch.Groups["keywords"].Value;
        }

        // Matched case-insensitively, consistent with the keywords tag above.
        foreach (Match descriptionMatch in Regex.Matches(source, DescriptionPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline))
        {
            SR.Description = descriptionMatch.Groups["description"].Value;
        }

        string linkPattern = _avoidHttps ? HttpOnlyLinkPattern : HttpAndHttpsLinkPattern;
        MatchCollection linkMatches = Regex.Matches(source, linkPattern, RegexOptions.Singleline);

        if (linkMatches.Count > 0)
        {
            DateTime startTime = DateTime.UtcNow;
            bool limitReached = false;
            string verifyPattern = null; // built on first use, once the back-link domain is needed

            foreach (Match linkMatch in linkMatches)
            {
                if (_numOutboundLinks >= _maxOutboundLinks)
                {
                    limitReached = true;
                    break;
                }

                string link = linkMatch.Groups["Link"].Value;

                // Per-link state is local to the iteration so that one link's outcome
                // can never leak into the next.
                bool avoid = false;
                foreach (string avoidPattern in AvoidPatterns)
                {
                    if (link.IndexOf(avoidPattern, StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        avoid = true;
                        break;
                    }
                }
                if (avoid)
                {
                    continue;
                }

                // Host names are case-insensitive, and the pattern only lists lowercase letters.
                Match hostMatch = Regex.Match(link, HostPattern, RegexOptions.IgnoreCase);
                string host = hostMatch.Success ? hostMatch.Groups["host"].Value : "";
                if (host.Trim().Length == 0)
                {
                    continue;
                }

                // NOTE(review): when _insideLinks is true no link is recorded at all
                // (behaviour kept from the original) - confirm this is intended.
                if (_insideLinks)
                {
                    continue;
                }

                // The domain is matched literally; unescaped, its dots would match any character.
                string escapedBackLinkDomain = Regex.Escape(BackLinkDN);
                if (Regex.IsMatch(link, escapedBackLinkDomain))
                {
                    continue; // link points back into the back-link domain, not outbound
                }

                if (!_verifyBackLinks)
                {
                    SR.HyperLinks.Add(link);
                    _numOutboundLinks++;
                    _errMsg = "";
                    continue;
                }

                StreamReader linkedReader = CapturePage(link);
                if (linkedReader == null)
                {
                    _numFailed++;
                    continue;
                }

                try
                {
                    string linkedSource;
                    using (linkedReader)
                    {
                        linkedSource = linkedReader.ReadToEnd();
                    }

                    if (verifyPattern == null)
                    {
                        verifyPattern = VerifyPatternTemplate.Replace("(domain)", escapedBackLinkDomain);
                    }

                    // NOTE(review): the pattern is anchored with ^ and run without
                    // RegexOptions.Multiline, so it can only match when the fetched text
                    // itself begins with a URL on the back-link domain; the link is
                    // recorded when there is NO match. Kept as in the original - confirm
                    // the intended direction of this check.
                    if (!Regex.IsMatch(linkedSource, verifyPattern, RegexOptions.IgnoreCase))
                    {
                        SR.HyperLinks.Add(link);
                        _numOutboundLinks++;
                        _errMsg = "";
                    }
                }
                catch (Exception linkEx)
                {
                    // A linked page that cannot be read must not abort the whole parse.
                    _errMsg = linkEx.Message;
                    _numFailed++;
                }
            }

            // As in the original, timing is reported only when the loop ran to completion,
            // not when it stopped early on the outbound-link cap.
            if (!limitReached)
            {
                float procTime = (float)(DateTime.UtcNow - startTime).TotalSeconds;
                CycleTime("Backlinks in ParsePage", BackLinkDN, procTime);
            }
        }

        _sr1 = SR;
        _numUnspidered--;
        _numSpidered++;
        _errMsg = "";
        return true;
    }
    catch (Exception parseEx)
    {
        _errMsg = parseEx.Message;
        _numFailed++;
        return false;
    }
}