Пример #1
0
 /// <summary>
 /// Stores the supplied options in <c>Options</c> and validates them.
 /// </summary>
 /// <param name="webStructureOptions">
 /// Crawl settings. <c>RootWebsite</c> must pass <c>OnionCr.UrlValidator</c>,
 /// <c>MaxSubpages</c> must be in 1..100 and <c>MaxHostCount</c> in 1..100000.
 /// </param>
 /// <exception cref="ArgumentException">
 /// Thrown when any of the three validated option values is rejected.
 /// </exception>
 public WebStructure(WebStructureOptions webStructureOptions)
 {
     // Assigned before validation, as callers may already rely on Options being set.
     Options = webStructureOptions;

     if (!OnionCr.UrlValidator(Options.RootWebsite))
     {
         throw new ArgumentException($"Invalid root page. '{Options.RootWebsite}'");
     }

     if (Options.MaxSubpages < 1 || Options.MaxSubpages > 100)
     {
         // The message now states the limit that is actually enforced (100, not 1000).
         throw new ArgumentException("Subpages must be between 1 and 100");
     }

     if (Options.MaxHostCount < 1 || Options.MaxHostCount > 100000)
     {
         throw new ArgumentException("Maximum page count must be between 1 and 100000");
     }
 }
Пример #2
0
        /// <summary>
        /// Gathers the URLs found in <paramref name="InputString"/>, keeping only those
        /// accepted by <c>OnionCr.UrlValidator</c>.
        /// </summary>
        /// <param name="InputString">Text that is handed to the <c>OnionCr</c> extractors.</param>
        /// <returns>
        /// Results of <c>OnionCr.ExtractOnionUrls</c> first, followed by those of
        /// <c>OnionCr.ExtractSurfaceUrls</c> when <c>Options.includeClearWeb</c> is set and
        /// those of <c>OnionCr.ExtractIpUrls</c> when <c>Options.includeIpHosts</c> is set.
        /// </returns>
        public List<string> ExtractUrls(string InputString)
        {
            // Onion URLs are always collected.
            var validOnionUrls = OnionCr.ExtractOnionUrls(InputString)
                                        .Where(candidate => OnionCr.UrlValidator(candidate));
            var collected = new List<string>(validOnionUrls);

            if (Options.includeClearWeb)
            {
                var validSurfaceUrls = OnionCr.ExtractSurfaceUrls(InputString)
                                              .Where(candidate => OnionCr.UrlValidator(candidate));
                collected.AddRange(validSurfaceUrls);
            }

            if (Options.includeIpHosts)
            {
                var validIpUrls = OnionCr.ExtractIpUrls(InputString)
                                         .Where(candidate => OnionCr.UrlValidator(candidate));
                collected.AddRange(validIpUrls);
            }

            return collected;
        }
Пример #3
0
        /// <summary>
        /// Works through <c>queuedLinks</c> from the front: downloads each URL, records it as an
        /// <c>OnionSite</c> and appends the links found on the page to the queue. The loop ends
        /// when the queue is empty, when the number of distinct hosts in
        /// <c>WebStructureObject</c> reaches <c>Options.MaxHostCount</c>, or when cancellation
        /// has been requested on <c>cancellationTokenSource</c>.
        /// </summary>
        /// <param name="continueExistingCrawl">
        /// When <c>true</c>, processing resumes with the links already queued; if the queue is
        /// empty a notification is raised and the method returns. When <c>false</c>,
        /// <c>Options.RootWebsite</c> is added to the queue before processing starts.
        /// </param>
        /// <returns>A task that completes when the crawl loop stops.</returns>
        public async Task CrawlAsync(bool continueExistingCrawl = false)
        {
            if (continueExistingCrawl)
            {
                if (queuedLinks.Count > 0)
                {
                    RaiseEventResponse("Continuing the crawl process.");
                }
                else
                {
                    RaiseEventResponse("There is'nt any url in queue. Please re-initialize.");
                    return;
                }
            }
            else
            {
                // Fresh crawl: the root page is queued with itself as parent.
                var seedQueue = new QueuedUrl();
                seedQueue.url    = Options.RootWebsite;
                seedQueue.parent = Options.RootWebsite;
                queuedLinks.Add(seedQueue);
            }

            cancellationTokenSource = new CancellationTokenSource();
            while (queuedLinks.Count > 0)
            {
                if (cancellationTokenSource.IsCancellationRequested)
                {
                    cancellationTokenSource = null;
                    RaiseEventResponse("Crawling is Cancelled.");
                    return;
                }

                try
                {
                    // Stop as soon as the number of distinct hosts collected reaches the limit.
                    if (WebStructureObject.Select(x => x.Host).Distinct().Count() >= Options.MaxHostCount)
                    {
                        break;
                    }

                    string currentUrl = queuedLinks[0].url;

                    // Resolved once per iteration instead of once per element in the lookup below.
                    var currentHost = GetHost(currentUrl);

                    var newOnionSite = new OnionSite(currentHost);

                    var htmlContent = await DownloadWebsiteAsync(currentUrl);

                    var externalLinksList = ExtractExternalUrls(currentUrl, htmlContent);
                    var subLinksList      = ExtractSubUrls(currentUrl, htmlContent).Take(Options.MaxSubpages).ToList();

                    // NOTE(review): GetPageTitle is given the URL, not the downloaded HTML —
                    // confirm that this is the input it expects.
                    newOnionSite.title = OnionCr.GetPageTitle(currentUrl);
                    newOnionSite.externalLinks.AddRange(externalLinksList.Where(x => !isWebStrObjContains_hostOf(x)));
                    newOnionSite.parentLink = queuedLinks[0].parent;

                    if (isWebStrObjContains_hostOf(currentUrl))
                    {
                        // Host already recorded: attach this page beneath the existing entry.
                        OnionSite existingSite = WebStructureObject.First(x => x.Host == currentHost);
                        existingSite.subLinks.Add(newOnionSite);
                        newOnionSite.parentSite = existingSite;
                    }
                    else
                    {
                        // New host: queue its sub-pages (unless the queue is over its limit)
                        // and record the site as a top-level entry.
                        var subLinksQueue = ConvertToQueuedUrl(currentUrl, subLinksList);
                        if (queuedLinks.Count <= Options.maxQueue)
                        {
                            queuedLinks.AddRange(subLinksQueue);
                        }

                        WebStructureObject.Add(newOnionSite);
                    }

                    // Links that lead to other hosts are queued in both cases.
                    var externalLinksQueue = ConvertToQueuedUrl(currentUrl, externalLinksList);
                    if (queuedLinks.Count <= Options.maxQueue)
                    {
                        queuedLinks.AddRange(externalLinksQueue);
                    }

                    queuedLinks.RemoveAt(0);
                }
                catch (Exception ex)
                {
                    // Drop the failing URL so the loop cannot get stuck on it, then report.
                    queuedLinks.RemoveAt(0);
                    RaiseEventResponse("ERROR:" + ex.Message);
                }
            }

            RaiseEventResponse("DONE CRAWLING.");
        }