Code Example #1
        public void newRootURL(string url, int max_thread_number, bool externalURLactivated)
        {
            if (URLexist(url))
            {
                return; // skip the root url since it already exists
            }

            // lRootURLs is also read from crawler continuations on worker threads,
            // so guard it like the other shared collections
            lock (lRootURLs)
                lRootURLs.Add(new URLRootData(url, max_thread_number, externalURLactivated));

            URLChildData newRootData = new URLChildData(url, CrawledType.internalURL, url);

            lock (dToBeCrawledURLs)  // 2019103027
                dToBeCrawledURLs.Add(newRootData.url, newRootData);
        }
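For context, a hedged usage sketch: the CrawlerManager class name below is an assumption, but the call shape follows the signature above, and the duplicate call illustrates the URLexist() guard.

        // Hypothetical usage; CrawlerManager is an assumed host-class name
        var manager = new CrawlerManager();
        manager.newRootURL("https://example.com", max_thread_number: 4, externalURLactivated: false);
        manager.newRootURL("https://example.com", 4, false); // no-op: URLexist() filters duplicates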
Code Example #2
File: URLCrawler.cs Project: simayseyrek/WebCrawler
        // Parses the HTML document and sorts each link into internal or external
        private void ParseHTML()
        {
            // SelectNodes returns null when the page has no matching anchors,
            // so guard against it before enumerating
            var links = htmlDoc.DocumentNode.SelectNodes("//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                // Obtain the URL
                HtmlAttribute att = link.Attributes["href"];

                // Resolve the href against the parent URL so relative (internal)
                // links gain their host part; skip values that are not valid URIs
                if (!Uri.TryCreate(parentURLData.uriURL, att.Value, out Uri crawled_url))
                {
                    continue;
                }
                // TODO: check item type (png, zip, etc.)

                // Decide whether the url is internal or external relative to the base URL
                CrawledType type = CrawledType.externalURL;
                if (Uri.Compare(parentURLData.uriURL, crawled_url, UriComponents.Host, UriFormat.SafeUnescaped, StringComparison.OrdinalIgnoreCase) == 0)
                {
                    type = CrawledType.internalURL;
                }

                // Add the child url to the parent's list
                URLChildData child = new URLChildData(crawled_url.AbsoluteUri, type, parentURLData.rootURL);
                parentURLData.childrenURLs.Add(child);
            }
        }
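The same HtmlAgilityPack pattern can be tried in isolation. Below is a minimal, self-contained sketch, not the project's code: the page URL is a placeholder, while HtmlWeb.Load and SelectNodes are standard HtmlAgilityPack APIs.

using System;
using HtmlAgilityPack;

class LinkExtractorDemo
{
    static void Main()
    {
        var baseUri = new Uri("https://example.com/");      // placeholder page
        var doc = new HtmlWeb().Load(baseUri.AbsoluteUri);  // fetch and parse

        // Same XPath as ParseHTML; SelectNodes can return null on link-free pages
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return;
        }

        foreach (var a in anchors)
        {
            if (Uri.TryCreate(baseUri, a.Attributes["href"].Value, out var link))
            {
                bool isInternal = string.Equals(baseUri.Host, link.Host, StringComparison.OrdinalIgnoreCase);
                Console.WriteLine($"{(isInternal ? "internal" : "external")}: {link.AbsoluteUri}");
            }
        }
    }
}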
Code Example #3
 // Promotes a crawled child URL into a parent: carries over the error count
 // and starts with an empty list of children
 public URLParentData(URLChildData child) : base(child.url, child.rootURL) // 2019103012
 {
     errorCounter = child.errorCounter;
     childrenURLs = new List<URLChildData>();
 }
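The constructor above implies a shape for the underlying data classes. The following is an inferred sketch pieced together from the fields these four examples touch; it is an assumption, not the project's actual definitions.

using System;

// Inferred sketch only; the real definitions in simayseyrek/WebCrawler may differ
public enum CrawledType  { internalURL, externalURL }
public enum CrawlerState { Crawling, Crawled, Failed }

// Assumed base class carrying the (url, rootURL) pair the constructor forwards
public class URLData
{
    public string url;
    public string rootURL;
    public Uri    uriURL;   // used by ParseHTML to resolve relative links

    public URLData(string url, string rootURL)
    {
        this.url     = url;
        this.rootURL = rootURL;
        this.uriURL  = new Uri(url);
    }
}

public class URLChildData : URLData
{
    public CrawledType type;
    public int         errorCounter;
    public DateTime    urlDisabledTime;

    public URLChildData(string url, CrawledType type, string rootURL) : base(url, rootURL)
    {
        this.type = type;
    }
}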
Code Example #4
        private void startSingleCrawler(URLChildData childToBecomeParent)
        {
            // transfer url from toBeCrawled to crawling
            lock (dToBeCrawledURLs) // 2019103027
                dToBeCrawledURLs.Remove(childToBecomeParent.url);
            lock (dCrawlingURLs)    // 2019103027
                dCrawlingURLs.Add(childToBecomeParent.url, childToBecomeParent);

            URLParentData parent  = new URLParentData(childToBecomeParent);
            URLCrawler    crawler = new URLCrawler(parent);

            // create a new task for crawler operation
            var urlCrawlerTask = Task.Factory.StartNew(() =>
            {
                crawler.start();
            }).ContinueWith(taskInfo =>
            {
                if (parent.state == CrawlerState.Crawled)
                {
                    // transfer url from the crawling dict to the crawled dict
                    lock (dCrawlingURLs) // 2019103027
                        dCrawlingURLs.Remove(childToBecomeParent.url);
                    lock (dCrawledURLs)  // 2019103027
                        dCrawledURLs.Add(parent.url, parent);

                    // loop over each child URL; if it is not known yet, queue it in toBeCrawled
                    foreach (URLChildData child in parent.childrenURLs)
                    {
                        // 2019103026
                        // the root entry exists because newRootURL registered it;
                        // lock the list since this continuation runs on a worker thread
                        bool externalURLactivated;
                        lock (lRootURLs)
                            externalURLactivated = lRootURLs.Where(rootData => rootData.rootURL == child.rootURL)
                                                            .Select(rootData => rootData.externalActivated)
                                                            .First();

                        // discards external URLs if externalURL checkbox is not activated
                        if (child.type == CrawledType.externalURL && !externalURLactivated)
                        {
                            continue;
                        }

                        if (!URLexist(child.url))
                        {
                            lock (dToBeCrawledURLs)
                                dToBeCrawledURLs.Add(child.url, child);
                        }
                    }
                }
                else
                {
                    childToBecomeParent.errorCounter = parent.errorCounter;

                    // handle CrawlerState.Failed
                    if (childToBecomeParent.errorCounter >= 3)
                    {
                        childToBecomeParent.urlDisabledTime = DateTime.Now;
                        childToBecomeParent.errorCounter    = 0;
                        lock (disabledURLs)
                            disabledURLs.Add(childToBecomeParent);
                    }
                    else
                    {
                        // put the url back into toBeCrawled for another attempt
                        lock (dToBeCrawledURLs)
                            dToBeCrawledURLs.Add(childToBecomeParent.url, childToBecomeParent);
                    }
                    // remove url from crawling list
                    lock (dCrawlingURLs)
                        dCrawlingURLs.Remove(childToBecomeParent.url);
                }
            });
            Tuple<string, Task, string> newTask = new Tuple<string, Task, string>(childToBecomeParent.rootURL, urlCrawlerTask, childToBecomeParent.url);

            tl_activelyRunningTasks.Add(newTask);
        }
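Every stage transfer above takes explicit locks on the shared dictionaries. One alternative worth sketching: ConcurrentDictionary from System.Collections.Concurrent makes each individual add or remove atomic without locks. The field names mirror the example; whether the project's wider invariants tolerate the non-atomic two-step move is an assumption.

using System.Collections.Concurrent;

// Sketch: the same toBeCrawled -> crawling transfer with concurrent collections
private readonly ConcurrentDictionary<string, URLChildData> dToBeCrawledURLs = new ConcurrentDictionary<string, URLChildData>();
private readonly ConcurrentDictionary<string, URLChildData> dCrawlingURLs    = new ConcurrentDictionary<string, URLChildData>();

private void moveToCrawling(URLChildData child)
{
    // Each call is atomic on its own, so no lock statements are needed,
    // though the two calls together are still not one atomic transaction
    dToBeCrawledURLs.TryRemove(child.url, out _);
    dCrawlingURLs.TryAdd(child.url, child);
}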