/// <summary>
/// Builds the website path name associated with a root Uri for the given extraction scope.
/// The scope-dependent name is passed through <c>HtmlFileUtils.GetPathValidChars</c> before being returned.
/// </summary>
/// <param name="scope">
/// Domain: the name is the base domain of <paramref name="rootUri"/>.
/// SubDomain: the name is the sub domain of <paramref name="rootUri"/>.
/// Any other value (including Path): the sub domain followed by the root path, with every "/" replaced by "_".
/// </param>
/// <param name="rootUri">Root Uri of the website.</param>
/// <returns>The result of <c>HtmlFileUtils.GetPathValidChars</c> applied to the scope-dependent name.</returns>
public static string GetWebsitePathFromUri(ExtractionScope scope, Uri rootUri)
{
    string rawName;

    if (scope == ExtractionScope.Domain)
    {
        rawName = GetBaseDomain(rootUri);
    }
    else if (scope == ExtractionScope.SubDomain)
    {
        rawName = GetSubDomain(rootUri);
    }
    else
    {
        // ExtractionScope.Path, and fallback for any other scope value
        string subDomain = GetSubDomain(rootUri);
        string flattenedRootPath = GetRootPath(rootUri.AbsolutePath).Replace("/", "_");
        rawName = subDomain + flattenedRootPath;
    }

    return HtmlFileUtils.GetPathValidChars(rawName);
}
/// <summary>
/// Method called to decide which pages to crawl
/// </summary>
/// <param name="scope">
/// Domain: the candidate is accepted when its base domain equals the base domain of the root Uri.
/// SubDomain: the candidate is accepted when its sub domain equals the sub domain of the root Uri.
/// Any other value (including Path): the sub domains must be equal AND the candidate's absolute path
/// must start with the root path computed from the root Uri's absolute path.
/// </param>
/// <param name="candidateUri">Uri of the page which could be crawled.</param>
/// <param name="rootUri">Root Uri of the website being crawled.</param>
/// <returns>true if the candidate Uri is inside the extraction scope, false otherwise.</returns>
public static bool ShouldCrawlUri(ExtractionScope scope, Uri candidateUri, Uri rootUri)
{
    switch (scope)
    {
        case ExtractionScope.Domain:
            return GetBaseDomain(candidateUri) == GetBaseDomain(rootUri);

        case ExtractionScope.SubDomain:
            return GetSubDomain(candidateUri) == GetSubDomain(rootUri);

        // ExtractionScope.Path, and fallback for any other scope value
        default:
            if (GetSubDomain(candidateUri) != GetSubDomain(rootUri))
            {
                return false;
            }
            // URL paths are technical identifiers, not linguistic text : compare them
            // ordinally so that the result does not depend on the current culture
            return candidateUri.AbsolutePath.StartsWith(GetRootPath(rootUri.AbsolutePath), StringComparison.Ordinal);
    }
}
/// <summary>
/// Parses one parameter written as "key=value" and stores the value in the matching property.
/// The key is matched case-insensitively; key and value are trimmed of surrounding whitespace.
/// Recognized keys : scope, rooturl, storagedir, maxduration, maxpagecount, minuniquetext,
/// maxsizeondisk, mincrawldelay, excludeurls (each excludeurls value is added to UrlPatternsToExclude).
/// </summary>
/// <param name="keyValueParam">Parameter text in the form "key=value".</param>
/// <exception cref="Exception">
/// Thrown when there is no '=' in the text, when '=' is the last character,
/// when the key is unknown, or when the value of the scope key is not domain, subdomain or path.
/// Exceptions thrown by <c>new Uri(value)</c> and <c>Int32.Parse(value)</c> on malformed values are not caught here.
/// </exception>
public void ParseParam(string keyValueParam)
{
    int equalsIndex = keyValueParam.IndexOf('=');
    if (equalsIndex < 0 || equalsIndex == (keyValueParam.Length - 1))
    {
        throw new Exception("Syntax error in params at : " + keyValueParam);
    }

    // Split on the FIRST '=' only : a value can legitimately contain '=' itself
    // (for example a root url with a query string, or an url pattern to exclude)
    // Invariant lowercasing keeps key matching independent of the current culture
    string key = keyValueParam.Substring(0, equalsIndex).Trim().ToLowerInvariant();
    string value = keyValueParam.Substring(equalsIndex + 1).Trim();

    switch (key)
    {
        case "scope":
            switch (value.ToLowerInvariant())
            {
                case "domain":
                    Scope = ExtractionScope.Domain;
                    break;
                case "subdomain":
                    Scope = ExtractionScope.SubDomain;
                    break;
                case "path":
                    Scope = ExtractionScope.Path;
                    break;
                default:
                    throw new Exception("Invalid value for key scope at : " + keyValueParam);
            }
            break;
        case "rooturl":
            RootUrl = new Uri(value);
            break;
        case "storagedir":
            StorageDir = value;
            break;
        case "maxduration":
            MaxDuration = Int32.Parse(value);
            break;
        case "maxpagecount":
            MaxPageCount = Int32.Parse(value);
            break;
        case "minuniquetext":
            MinUniqueText = Int32.Parse(value);
            break;
        case "maxsizeondisk":
            MaxSizeOnDisk = Int32.Parse(value);
            break;
        case "mincrawldelay":
            MinCrawlDelay = Int32.Parse(value);
            break;
        case "excludeurls":
            UrlPatternsToExclude.Add(value);
            break;
        default:
            throw new Exception("Invalid parameter at : " + keyValueParam);
    }
}