Esempio n. 1
0
        /// <summary>
        /// Builds the window and fills its controls from the given crawled URL record.
        /// </summary>
        /// <param name="parent">Record whose title, URLs, timestamps and child URLs are displayed.</param>
        public CrawledWindow(URLParentData parent)
        {
            InitializeComponent();

            // Straight copies of the record's fields into the text boxes.
            tb_title.Text = parent.title;
            tb_url.Text = parent.url;
            tb_rootURL.Text = parent.rootURL;
            tb_crawledTime.Text = parent.urlCrawledTime.ToString();

            // Duration is the tick difference between the crawled and the registered
            // timestamps, shown as a number of seconds followed by " s".
            long elapsedTicks = parent.urlCrawledTime.Ticks - parent.urlRegisteredTime.Ticks;
            TimeSpan elapsed = new TimeSpan(elapsedTicks);
            tb_duration.Text = string.Format("{0} s", elapsed.TotalSeconds);

            // One list entry per child URL, in the order the record holds them.
            foreach (URLChildData childEntry in parent.childrenURLs)
            {
                lb_children.Items.Add(childEntry.url);
            }
        }
Esempio n. 2
0
 // 2019103002
 /// <summary>
 /// Creates a crawler for the given parent URL record and prepares a fresh
 /// <c>HtmlWeb</c> and <c>HtmlDocument</c> instance for it.
 /// </summary>
 /// <param name="parentURLData">Parent URL record stored on the crawler (not a plain URL string).</param>
 public URLCrawler(URLParentData parentURLData)
 {
     // The two helper objects do not depend on the argument, so they are created first.
     htmlWeb = new HtmlWeb();
     htmlDoc = new HtmlDocument();

     // "this." is required: the parameter has the same name as the field.
     this.parentURLData = parentURLData;
 }
Esempio n. 3
0
        /// <summary>
        /// Moves a queued URL into the crawling collection, starts a <c>URLCrawler</c> for it
        /// on a task, and attaches a continuation that files the result: on success the URL
        /// goes to the crawled collection and its children are queued; otherwise it is either
        /// re-queued or, after repeated errors, put on the disabled list.
        /// The resulting task is recorded in <c>tl_activelyRunningTasks</c>.
        /// </summary>
        /// <param name="childToBecomeParent">Queued URL entry that is crawled as a new parent.</param>
        private void startSingleCrawler(URLChildData childToBecomeParent)
        {
            // Number of accumulated errors at which a URL is disabled instead of re-queued.
            const int maxErrorsBeforeDisable = 3;

            // Transfer the URL from toBeCrawled to crawling. It is registered as crawling
            // BEFORE it leaves the queue so it is never missing from both collections at once
            // (presumably URLexist consults these collections - confirm).
            lock (dCrawlingURLs)    // 2019103027
            {
                dCrawlingURLs.Add(childToBecomeParent.url, childToBecomeParent);
            }
            lock (dToBeCrawledURLs) // 2019103027
            {
                dToBeCrawledURLs.Remove(childToBecomeParent.url);
            }

            URLParentData parent  = new URLParentData(childToBecomeParent);
            URLCrawler    crawler = new URLCrawler(parent);

            // Create a new task for the crawler operation; the continuation does the bookkeeping.
            // NOTE(review): an exception thrown by crawler.start() is not inspected here; the
            // continuation only looks at parent.state - confirm start() always updates it.
            var urlCrawlerTask = Task.Factory.StartNew(() =>
            {
                crawler.start();
            }).ContinueWith(taskInfo =>
            {
                if (parent.state == CrawlerState.Crawled)
                {
                    // Transfer the URL from crawling to crawled, adding before removing for
                    // the same "never in no collection" reason as above. The indexer is used
                    // so a URL that is already recorded is overwritten instead of throwing a
                    // duplicate-key exception that nobody would observe inside this continuation.
                    lock (dCrawledURLs)  // 2019103027
                    {
                        dCrawledURLs[parent.url] = parent;
                    }
                    lock (dCrawlingURLs) // 2019103027
                    {
                        dCrawlingURLs.Remove(childToBecomeParent.url);
                    }

                    // Loop over each child URL and queue the ones that are not known yet.
                    foreach (URLChildData child in parent.childrenURLs)
                    {
                        // 2019103026
                        // External URLs are discarded unless the matching root has external
                        // crawling activated. The lookup is only needed for external children,
                        // and FirstOrDefault() yields false when no root matches, where
                        // First() used to throw and abort the whole loop.
                        if (child.type == CrawledType.externalURL)
                        {
                            bool externalURLactivated = lRootURLs
                                .Where(rootData => rootData.rootURL == child.rootURL)
                                .Select(rootData => rootData.externalActivated)
                                .FirstOrDefault();

                            if (!externalURLactivated)
                            {
                                continue;
                            }
                        }

                        if (URLexist(child.url))
                        {
                            continue;
                        }

                        lock (dToBeCrawledURLs)
                        {
                            // Re-check under the lock: another continuation may have queued
                            // the same URL between the URLexist() call and this point.
                            if (!dToBeCrawledURLs.ContainsKey(child.url))
                            {
                                dToBeCrawledURLs.Add(child.url, child);
                            }
                        }
                    }
                }
                else
                {
                    // Carry the error count accumulated on the parent back to the queue entry.
                    childToBecomeParent.errorCounter = parent.errorCounter;

                    if (childToBecomeParent.errorCounter >= maxErrorsBeforeDisable)
                    {
                        // Too many errors: reset the counter, stamp the time and disable the URL.
                        childToBecomeParent.urlDisabledTime = DateTime.Now;
                        childToBecomeParent.errorCounter    = 0;
                        lock (disabledURLs)
                        {
                            disabledURLs.Add(childToBecomeParent);
                        }
                    }
                    else
                    {
                        // Put the URL back into toBeCrawled so it is retried later.
                        lock (dToBeCrawledURLs)
                        {
                            if (!dToBeCrawledURLs.ContainsKey(childToBecomeParent.url))
                            {
                                dToBeCrawledURLs.Add(childToBecomeParent.url, childToBecomeParent);
                            }
                        }
                    }

                    // Remove the URL from the crawling collection.
                    lock (dCrawlingURLs)
                    {
                        dCrawlingURLs.Remove(childToBecomeParent.url);
                    }
                }
            });

            // Record (root URL, task, URL) for the running crawl.
            Tuple<string, Task, string> newTask = new Tuple<string, Task, string>(childToBecomeParent.rootURL, urlCrawlerTask, childToBecomeParent.url);

            tl_activelyRunningTasks.Add(newTask);
        }