/// <summary>
/// Creates a crawler structure from the supplied options and validates them.
/// </summary>
/// <param name="webStructureOptions">
/// Crawl configuration. Its <c>RootWebsite</c> must pass <c>OnionCr.UrlValidator</c>,
/// <c>MaxSubpages</c> must be within 1..100 and <c>MaxHostCount</c> within 1..100000.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="webStructureOptions"/> is null.</exception>
/// <exception cref="ArgumentException">The root website is not a valid URL.</exception>
/// <exception cref="ArgumentOutOfRangeException">A numeric limit is outside its allowed range.</exception>
public WebStructure(WebStructureOptions webStructureOptions)
{
    // Named limits keep the range checks and their messages in sync.
    const int MinLimit = 1;
    const int SubpagesUpperLimit = 100;
    const int HostCountUpperLimit = 100000;

    if (webStructureOptions is null)
    {
        throw new ArgumentNullException(nameof(webStructureOptions));
    }

    Options = webStructureOptions;

    if (!OnionCr.UrlValidator(Options.RootWebsite))
    {
        throw new ArgumentException(
            $"Invalid root page. '{Options.RootWebsite}'",
            nameof(webStructureOptions));
    }

    // The enforced upper bound is 100; the previous message wrongly advertised 1000.
    if (Options.MaxSubpages < MinLimit || Options.MaxSubpages > SubpagesUpperLimit)
    {
        throw new ArgumentOutOfRangeException(
            nameof(webStructureOptions),
            $"Subpages must be between {MinLimit} and {SubpagesUpperLimit}");
    }

    if (Options.MaxHostCount < MinLimit || Options.MaxHostCount > HostCountUpperLimit)
    {
        throw new ArgumentOutOfRangeException(
            nameof(webStructureOptions),
            $"Maximum page count must be between {MinLimit} and {HostCountUpperLimit}");
    }
}
/// <summary>
/// Extracts URLs from a block of text and keeps only those accepted by
/// <c>OnionCr.UrlValidator</c>.
/// </summary>
/// <param name="InputString">Text to scan for URLs.</param>
/// <returns>
/// Onion URLs first, followed by surface-web URLs when <c>Options.includeClearWeb</c>
/// is set, followed by IP-host URLs when <c>Options.includeIpHosts</c> is set.
/// Duplicates are not removed.
/// </returns>
public List<string> ExtractUrls(string InputString)
{
    var collected = new List<string>();

    // Onion links are always gathered.
    collected.AddRange(
        OnionCr.ExtractOnionUrls(InputString)
               .Where(candidate => OnionCr.UrlValidator(candidate)));

    if (Options.includeClearWeb)
    {
        collected.AddRange(
            OnionCr.ExtractSurfaceUrls(InputString)
                   .Where(candidate => OnionCr.UrlValidator(candidate)));
    }

    if (Options.includeIpHosts)
    {
        collected.AddRange(
            OnionCr.ExtractIpUrls(InputString)
                   .Where(candidate => OnionCr.UrlValidator(candidate)));
    }

    return collected;
}
/// <summary>
/// Runs the crawl loop: repeatedly takes the URL at the head of <c>queuedLinks</c>,
/// downloads it, records it in <c>WebStructureObject</c> and queues the links found on it.
/// </summary>
/// <param name="continueExistingCrawl">
/// When <c>true</c>, resumes from the URLs already in the queue (or reports that the queue
/// is empty and returns). When <c>false</c>, the root website is appended to the queue
/// as the seed before crawling starts.
/// </param>
/// <returns>A task that completes when the queue is drained, the host limit is reached,
/// or cancellation is observed.</returns>
/// <remarks>
/// Progress and errors are reported through <c>RaiseEventResponse</c>. Any exception raised
/// while processing a URL is reported and that URL is dropped; the loop then moves on.
/// </remarks>
public async Task CrawlAsync(bool continueExistingCrawl = false)
{
    if (continueExistingCrawl)
    {
        if (queuedLinks.Count == 0)
        {
            RaiseEventResponse("There is'nt any url in queue. Please re-initialize.");
            return;
        }

        RaiseEventResponse("Continuing the crawl process.");
    }
    else
    {
        // Fresh crawl: the root website is its own parent.
        var seedQueue = new QueuedUrl();
        seedQueue.url = Options.RootWebsite;
        seedQueue.parent = Options.RootWebsite;
        queuedLinks.Add(seedQueue);
    }

    cancellationTokenSource = new CancellationTokenSource();

    while (queuedLinks.Count > 0)
    {
        if (cancellationTokenSource.IsCancellationRequested)
        {
            cancellationTokenSource = null;
            RaiseEventResponse("Crawling is Cancelled.");
            return;
        }

        try
        {
            // Stop once the number of distinct hosts recorded reaches the configured maximum.
            int knownHostCount = WebStructureObject.Select(site => site.Host).Distinct().Count();
            if (knownHostCount >= Options.MaxHostCount)
            {
                break;
            }

            // Read the queue head once; it is only removed at the end of the iteration.
            var current = queuedLinks[0];
            string url = current.url;

            // Computed once per iteration instead of once per stored site inside a predicate.
            var host = GetHost(url);

            var crawledSite = new OnionSite(host);
            var htmlContent = await DownloadWebsiteAsync(url);
            var externalLinksList = ExtractExternalUrls(url, htmlContent);
            var subLinksList = ExtractSubUrls(url, htmlContent).Take(Options.MaxSubpages).ToList();

            // NOTE(review): the title is requested by URL, not from the HTML downloaded above -
            // confirm that GetPageTitle is meant to receive a URL.
            crawledSite.title = OnionCr.GetPageTitle(url);

            // Only external links whose host has not been recorded yet are stored on the site.
            crawledSite.externalLinks.AddRange(
                externalLinksList.Where(link => !isWebStrObjContains_hostOf(link)).ToList());
            crawledSite.parentLink = current.parent;

            if (isWebStrObjContains_hostOf(url))
            {
                // Host seen before: attach this page beneath the first recorded site for that host.
                OnionSite hostSite = WebStructureObject.First(site => site.Host == host);
                hostSite.subLinks.Add(crawledSite);
                crawledSite.parentSite = hostSite;
            }
            else
            {
                // New host: queue its sub pages (if the queue has room) and record the site.
                var subLinksQueue = ConvertToQueuedUrl(url, subLinksList);
                if (queuedLinks.Count <= Options.maxQueue)
                {
                    queuedLinks.AddRange(subLinksQueue);
                }

                WebStructureObject.Add(crawledSite);
            }

            // External links are queued for every page, again only while the queue has room.
            var externalLinksQueue = ConvertToQueuedUrl(url, externalLinksList);
            if (queuedLinks.Count <= Options.maxQueue)
            {
                queuedLinks.AddRange(externalLinksQueue);
            }

            queuedLinks.RemoveAt(0);
        }
        catch (Exception ex)
        {
            // Best effort: drop the failing URL, report the error and carry on with the next one.
            queuedLinks.RemoveAt(0);
            RaiseEventResponse("ERROR:" + ex.Message);
        }
    }

    RaiseEventResponse("DONE CRAWLING.");
}