/// <summary>
/// Decides whether a link discovered on a page should be queued for crawling.
/// </summary>
/// <param name="url">The candidate link.</param>
/// <param name="nav_from">
/// Navigation record the link was found under. Its <c>nav_from</c> member is
/// overwritten with "img" when <paramref name="url"/> contains "edgesuite.net".
/// </param>
/// <returns><c>true</c> when none of the exclusion rules below rejects the link.</returns>
private static bool shouldVisit(string url, nav nav_from)
{
    // Stop at the configured depth limit, and never queue product ("prd-") pages.
    if (nav_from.depth >= maxDepth || url.Contains("prd-"))
    {
        return false;
    }

    // Links on edgesuite.net get their origin re-labelled as "img".
    // NOTE(review): if nav is a reference type this also changes the caller's object,
    // and it affects the catalog checks further down - confirm this is intended.
    if (url.Contains("edgesuite.net"))
    {
        nav_from.nav_from = "img";
    }

    // Icons and stylesheets are never crawled.
    bool isIgnoredFileType = url.Contains(".ico") || url.Contains(".css");
    if (isIgnoredFileType)
    {
        return false;
    }

    // Only kohls.com / kohlsecommerce.com addresses are followed.
    bool isKohlsAddress = url.Contains("kohls.com") || url.Contains("kohlsecommerce.com");
    if (!isKohlsAddress)
    {
        return false;
    }

    // Catalog links found on a catalog.jsp page are skipped; those pages were seeded already.
    if (nav_from.nav_from.Contains("catalog.jsp") && url.Contains("catalog"))
    {
        return false;
    }

    // Pages excluded purely by their address:
    //   catalog.jsp  - should only be added via the seed thread
    //   cs.kohls.com, search pages and .shtml pages
    bool isExcludedPage = url.Contains("catalog.jsp")
        || url.Contains("cs.kohls.com")
        || url.Contains("search.jsp")
        || url.Contains("/search/")
        || url.Contains(".shtml");
    if (isExcludedPage)
    {
        return false;
    }

    // A catalog page must not lead to another catalog page.
    // TODO - other "catalog/" pages are still allowed; the original note says the
    // query string has to be stripped before they can be filtered here.
    bool isCatalogToCatalog = url.Contains("catalog") && nav_from.nav_from.Contains("catalog");
    return !isCatalogToCatalog;
}
/// <summary>
/// Processes one crawl target. Akamai-hosted media links are run through the image
/// check and recorded; every other URL is fetched via <c>getHtml</c>, scrubbed of
/// markup that should not be scanned for links, checked for a few error markers,
/// recorded as visited and finally handed to <c>createManifest</c>.
/// </summary>
/// <param name="url">The address to process.</param>
/// <param name="nav_from">Navigation record describing where <paramref name="url"/> was found.</param>
private static void getPage(string url, nav nav_from)
{
    // Nothing to do for a URL that was already handled.
    if (checkVisited(url))
    {
        return;
    }

    // Akamaized media is recognised by "media" together with the edgesuite.net host.
    bool isAkamaiMedia = url.Contains("media") && url.Contains("edgesuite.net");

    // With image checking switched off, akamaized media is skipped entirely.
    if (!checkImages && isAkamaiMedia)
    {
        return;
    }

    if (isAkamaiMedia && !url.Contains(".swf"))
    {
        // A true result from checkImage is handled as a missing ("coming soon") image.
        if (checkImage(url))
        {
            // TEST: make sure the referring page really contains this image link;
            // if it does not, the manifest that produced the link is wrong.
            string referrerHtml = getHtml(nav_from.nav_from);
            if (!referrerHtml.Contains(url))
            {
                logger.Debug("Error with manifest creation. URL: " + url + " - Nav_From: " + nav_from);
                return;
            }

            // Images referenced from catalog.jsp pages are only recorded when catalogCheckFlag is set.
            if (!nav_from.nav_from.Contains("catalog.jsp") || catalogCheckFlag)
            {
                addMissingImage(url, nav_from.nav_from);
            }

            logger.Debug("Image coming soon:" + url + " - " + nav_from);
            missingImageCount++;
        }
        else
        {
            imageCount++;
        }

        pagesVisited.TryAdd(url, nav_from.nav_from);
        return;
    }

    string pageHtml = getHtml(url);

    // Product pages ("prd-" in the address) are tracked by their product code.
    if (url.Contains("prd-"))
    {
        Match productMatch = Regex.Match(url, @"prd[^/]+", RegexOptions.Singleline);
        if (productMatch.Success)
        {
            string productCode = productMatch.Value;
            try
            {
                productsVisited.TryAdd(productCode, "");
            }
            catch (Exception e)
            {
                // Previously swallowed silently; keep going but leave a trace.
                logger.Debug("Error adding product to productsVisited: " + productCode);
                logger.Debug(e.Message);
            }
        }
    }

    logger.Debug("Starting to regex HTML received Size: " + pageHtml.Length);

    // Scrub rules as { pattern, replacement }. They are applied strictly in this order,
    // each one to the output of the previous one, all with RegexOptions.Singleline.
    string[][] scrubRules =
    {
        // Breadcrumb blocks (two markup variants).
        new string[] { "<div id=\"breadcrumb\"" + ".+?" + "</div><!-- #dimensions -->", "" },
        new string[] { "<div class=\"clearfix\" id=\"breadcrumb\"" + ".+?" + "<ul id=\"image-size-toggle\"", "" },
        // Explicit port numbers.
        new string[] { @":80", "" },
        new string[] { @":443", "" },
        // Canonical link tag.
        new string[] { "<link rel=\"canonical.+?>", "" },
        // NOTE(review): ".*" is greedy and Singleline lets it cross newlines, so this spans
        // from the first '+ to the last +' in the page - confirm that is intended.
        new string[] { "'[+].*[+]'", "(IGNORE)" },
        // "MORE TO CONSIDER" sections.
        new string[] { "MORE TO CONSIDER" + ".+?" + "<!-- ForEach Ends -->", "" },
        new string[] { "MORE TO CONSIDER" + ".+?" + "<!-- Right panel Ends -->", "" },
        // Inline javascript blocks.
        new string[] { "<script type=\"text/javascript\"" + ".+?" + "</script>", "" },
        // Session ids (long form first, then the shorter form).
        new string[] { ";jsessionid=[a-zA-Z0-9]+!\\-*[0-9]+!\\-*[0-9]+", "" },
        new string[] { ";jsessionid=[a-zA-Z0-9]+!\\-*[0-9]+", "" },
        // WS / S query parameters.
        new string[] { "WS=0\\&", "" },
        new string[] { "\\&WS=0", "" },
        new string[] { @"&S=\d", "" },
        // Navigation list and related / more-results section.
        new string[] { "<ul id=\"navigation\">" + ".+?" + "</noscript>", "" },
        new string[] { "<!-- BEGIN RELATED -->" + ".+?" + "<!-- END MORE-RESULTS -->", "" },
        // cs.kohls.com links, removed now to speed up building the manifest.
        // NOTE(review): without RegexOptions.Multiline "$" only matches at the end of the
        // whole input, so this rule may rarely match - confirm the intended anchor.
        new string[] { @"http(s)?:\/\/cs\.kohls\.com\/.+?;$", "" },
    };

    foreach (string[] rule in scrubRules)
    {
        pageHtml = Regex.Replace(pageHtml, rule[0], rule[1], RegexOptions.Singleline);
    }

    logger.Debug("Regex complete. Size: " + pageHtml.Length);

    if (pageHtml.Contains("wrongSite is assigned"))
    {
        logger.Debug("wrongSite ERROR: " + url);
    }

    // Dead-link detection: a page containing this marker is reported as DEAD.
    if (pageHtml.Contains(@"</span>1 – 0"))
    {
        logger.Debug("NEW DEAD LINK FOUND ON PAGE: " + url);
        logger.Debug("LINKED FROM PAGE: " + nav_from.nav_from);
        addError("DEAD", url, nav_from.nav_from);
    }

    // Report <img> tags whose src attribute is empty.
    // NOTE(review): "[^s]+" excludes the literal letter 's' (it is not "\s"), so tags with
    // an 's' before src, e.g. class="...", are not reported - confirm the intended pattern.
    foreach (Match emptyImage in Regex.Matches(pageHtml, @"<img[^s]+src=""""", RegexOptions.Singleline))
    {
        logger.Error("IMAGE NOT FOUND: " + emptyImage.Value);
    }

    // Done processing this url: count it and remember it.
    Interlocked.Increment(ref pagesVisitedCount);
    pagesVisited.TryAdd(url, nav_from.nav_from);

    // Queue the links found in the scrubbed page.
    createManifest(pageHtml, url, nav_from);
}
/// <summary>
/// Scans the text of a page for href / src / css url(...) references and queues
/// every link that passes <c>shouldVisit</c> into <c>pagesToVisit</c>.
/// </summary>
/// <param name="pageText">Page text to scan for links.</param>
/// <param name="url">Address of the page <paramref name="pageText"/> belongs to; relative links are resolved against it.</param>
/// <param name="nav_from">Navigation record of the page; queued links are recorded one level deeper.</param>
private static void createManifest(string pageText, string url, nav nav_from)
{
    // Every link queued from this page shares one nav record, one level deeper.
    nav currNav = new nav(url, nav_from.depth + 1);

    // Script and stylesheet addresses are not scanned for links.
    if (Regex.IsMatch(url, @"(js\b|css\b)", RegexOptions.Singleline))
    {
        return;
    }

    // Base for resolving relative links; built lazily on the first relative link.
    Uri pageUri = null;

    const string pattern = @"((href|src)=""[^""]+|url\((""|')[^(""|')]+)";
    foreach (Match m in Regex.Matches(pageText, pattern, RegexOptions.Singleline))
    {
        string textMatch = m.ToString().Replace("href=\"", "");

        // Skip placeholder links, fragment links and the mobile site.
        if (textMatch.Contains("/null") || textMatch.Contains("#") || textMatch.Contains("mobile.kohls.com"))
        {
            continue;
        }

        // Strip the attribute / function wrappers so only the address remains.
        textMatch = textMatch
            .Replace("url(\"", "")
            .Replace("url('", "")
            .Replace("src=\"", "")
            .Replace("javascript:launchCorporate('", "")
            .Replace("')", "");

        // TODO - this is temp, move to prd-id for checking product pages.
        // Product links are queued without their query string.
        if (textMatch.Contains("prd-") && textMatch.Contains("?"))
        {
            textMatch = textMatch.Substring(0, textMatch.IndexOf("?"));
        }

        // Skip javascript links, the omniture include, and links carrying a jsessionid
        // (the latter are only accepted when scanning exactly "http://www.kohls.com").
        if (textMatch.Contains("javascript")
            || (textMatch.Contains("jsessionid") && url != "http://www.kohls.com")
            || textMatch.Contains("inc_omniture_akamai.jsp"))
        {
            continue;
        }

        // Anything without "www." or "http" is treated as relative to the current page.
        if (!textMatch.Contains("www.") && textMatch.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
        {
            if (pageUri == null)
            {
                pageUri = new Uri(url);
            }

            // TryCreate leaves the out value null on failure; skip the link instead of
            // dereferencing it, so one malformed href cannot abort the whole page.
            Uri resolved;
            if (!Uri.TryCreate(pageUri, textMatch, out resolved))
            {
                logger.Debug("Skipping unresolvable link: " + textMatch + " - from: " + url);
                continue;
            }

            textMatch = resolved.ToString();
        }

        // Check if we should visit this url.
        if (!shouldVisit(textMatch, currNav))
        {
            continue;
        }

        // Final check: a resolved link can still end up containing "/null".
        if (textMatch.Contains(@"/null"))
        {
            continue;
        }

        try
        {
            if (pagesToVisit.TryAdd(textMatch, currNav))
            {
                logger.Debug("queueing: " + textMatch + " - from: " + url);
            }
            else
            {
                // Report the link that was rejected, not just the page it was found on.
                logger.Debug("Queue failed, duplicate detected: " + textMatch + " - from: " + url);
            }
        }
        catch (Exception e)
        {
            logger.Debug("Queue failed at tryAdd, look into this as shouldVisit should have caught it");
            logger.Debug(e.Message);
        }
    }
}