Exemplo n.º 1
0
        /// <summary>
        /// Handles a url string that was parsed out of a webpage.
        /// The url is dropped when it parses to <c>Url.Empty</c>, when domain
        /// traversal is off and its host is not one of the session seed hosts,
        /// or when the session's OnUrl callback returns false.
        /// With a crawler, the url is checked against (and added to) that crawler's
        /// cache and passed to <c>ManagerUpdate.Control.AddSingle</c> together with
        /// either <c>OnUrlWebPage</c> or <c>OnUrlAsset</c>. Without a crawler, the
        /// read stat is updated and the url is reported to the session as new.
        /// </summary>
        /// <param name="urlStr">The raw url text to parse.</param>
        /// <param name="crawler">The crawler the url came from, or null when there is none.</param>
        internal void OnUrl(string urlStr, Crawler crawler)
        {
            Url parsed = Url.Parse(urlStr);

            // an invalid url is represented by Url.Empty - nothing to do
            if (parsed == Url.Empty)
            {
                return;
            }

            // unless domains may be traversed, only seed hosts are accepted
            bool hostAllowed = _session.TraverseDomains
                || _session.SeedHosts.Contains(parsed.Host.ToLowercase());
            if (!hostAllowed)
            {
                return;
            }

            // an optional session callback may veto the url
            if (_session.OnUrl != null)
            {
                if (!_session.OnUrl(parsed))
                {
                    return;
                }
            }

            bool fromCrawler = crawler != null;
            if (!fromCrawler)
            {
                // no crawler: update the read stat and report the url as new
                Stats.UpdateRead();
                _session.OnNewUrl(parsed);
                return;
            }

            // skip urls this crawler's cache already holds, remember the rest
            var seen = _caches[crawler];
            if (seen.Contains(parsed))
            {
                return;
            }
            seen.Add(parsed);

            // dispatch on whether the extension counts as a web page
            if (!Url.IsWebPageExtension(parsed.Extension))
            {
                ManagerUpdate.Control.AddSingle(OnUrlAsset, crawler, parsed);
            }
            else
            {
                ManagerUpdate.Control.AddSingle(OnUrlWebPage, crawler, parsed);
            }
        }
Exemplo n.º 2
0
        //-------------------------------------------//

        /// <summary>
        /// Commits the buffered urls and the host state through the session.
        /// The pending new/old url collections are swapped for fresh ones while
        /// the lock is held, and the changed/committing flags are cleared. The
        /// captured collections are then forwarded outside the lock: each new url
        /// to <c>_session.OnNewUrl</c>, each old url to <c>_session.OnUrlParsed</c>,
        /// followed by <c>_session.OnHostUpdate(this)</c>.
        /// </summary>
        private void Commit()
        {
            ArrayRig<Url> newUrls;
            ArrayRig<Url> oldUrls;

            _lock.Take();
            try
            {
                // capture the pending collections and replace them with empty ones
                newUrls = _newUrls;
                _newUrls = new ArrayRig<Url>();

                oldUrls = _oldUrls;
                _oldUrls = new ArrayRig<Url>();

                _changed    = false;
                _committing = false;
            }
            finally
            {
                // always release, so a failure while swapping cannot leave the lock held
                _lock.Release();
            }

            // iterate new urls
            foreach (Url url in newUrls)
            {
                // add to new urls table
                _session.OnNewUrl(url);
            }

            // iterate old urls
            foreach (Url url in oldUrls)
            {
                // add to old urls table
                _session.OnUrlParsed(url);
            }

            // commit host score changes
            _session.OnHostUpdate(this);

            // NOTE(review): presumably completes a dispose that was requested while
            // this commit was pending - confirm against Dispose()
            if (Disposed)
            {
                Dispose();
            }
        }