/// <summary>
/// Called when a url has been parsed from a webpage.
/// </summary>
internal void OnUrl(string urlStr, Crawler crawler) {
  
  // parse the url
  Url url = Url.Parse(urlStr);
  
  // was the url valid? no, skip
  if (url == Url.Empty) {
    //Log.Debug("Parsed an invalid url '" + urlStr + "'.");
    return;
  }
  
  // if domains aren't traversed and the host isn't a seed host, skip the url
  if (!_session.TraverseDomains && !_session.SeedHosts.Contains(url.Host.ToLowerInvariant())) {
    return;
  }
  
  // if the on url callback is set and it rejects the url, skip it
  if (_session.OnUrl != null && !_session.OnUrl(url)) {
    return;
  }
  
  // is the url from a crawler?
  if (crawler != null) {
    
    // has the url been parsed recently by this crawler? yes, skip
    var cache = _caches[crawler];
    if (cache.Contains(url)) {
      return;
    }
    
    // add the url to the skip cache
    cache.Add(url);
    
    // does the extension indicate a web page? queue the matching handler
    if (Url.IsWebPageExtension(url.Extension)) {
      ManagerUpdate.Control.AddSingle(OnUrlWebPage, crawler, url);
    } else {
      ManagerUpdate.Control.AddSingle(OnUrlAsset, crawler, url);
    }
    
  } else {
    
    // record a read in the stats
    Stats.UpdateRead();
    
    // add the url to the new urls table
    _session.OnNewUrl(url);
    
  }
  
}
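//-------------------------------------------//

// A minimal sketch of the kind of per-crawler skip cache assumed by the
// '_caches[crawler]' lookup above. The actual cache type isn't shown in this
// section, so this bounded FIFO set (the name, capacity, and eviction policy
// are all hypothetical) only illustrates the Contains/Add contract the
// method relies on.
// requires: using System.Collections.Generic;
internal class UrlSkipCache {
  
  private readonly HashSet<Url> _seen = new HashSet<Url>();
  private readonly Queue<Url> _order = new Queue<Url>();
  private readonly int _capacity;
  
  public UrlSkipCache(int capacity = 1024) {
    _capacity = capacity;
  }
  
  /// <summary>
  /// Has the url been seen recently?
  /// </summary>
  public bool Contains(Url url) {
    return _seen.Contains(url);
  }
  
  /// <summary>
  /// Remember the url, evicting the oldest entry once capacity is reached.
  /// </summary>
  public void Add(Url url) {
    if (_seen.Count >= _capacity) {
      _seen.Remove(_order.Dequeue());
    }
    if (_seen.Add(url)) {
      _order.Enqueue(url);
    }
  }
  
}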
//-------------------------------------------//

/// <summary>
/// Commit urls and score to the DB.
/// </summary>
private void Commit() {
  
  // swap the url buffers for empty ones and snapshot state while the lock is held
  _lock.Take();
  ArrayRig<Url> newUrls = _newUrls;
  _newUrls = new ArrayRig<Url>();
  ArrayRig<Url> oldUrls = _oldUrls;
  _oldUrls = new ArrayRig<Url>();
  // snapshot the current score
  int score = _score;
  _changed = false;
  _committing = false;
  _lock.Release();
  
  // add each new url to the new urls table
  foreach (Url url in newUrls) {
    _session.OnNewUrl(url);
  }
  
  // add each parsed url to the old urls table
  foreach (Url url in oldUrls) {
    _session.OnUrlParsed(url);
  }
  
  // commit host score changes
  _session.OnHostUpdate(this);
  
  // if disposal was requested while committing, dispose now
  if (Disposed) {
    Dispose();
  }
  
}
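//-------------------------------------------//

// Commit above uses a swap-under-lock pattern: the buffers are exchanged for
// empty ones while the lock is held, then drained after the lock is released,
// so producers are only blocked for the duration of the swap. A minimal
// generic sketch of the same idea, substituting List<T> and the standard
// 'lock' statement for the framework's ArrayRig<T> and '_lock' (the class
// and member names here are hypothetical).
// requires: using System; using System.Collections.Generic;
internal class CommitBuffer<T> {
  
  private readonly object _sync = new object();
  private List<T> _pending = new List<T>();
  
  /// <summary>
  /// Called by producers; contends with Drain only for the brief swap.
  /// </summary>
  public void Add(T item) {
    lock (_sync) {
      _pending.Add(item);
    }
  }
  
  /// <summary>
  /// Swap in a fresh buffer, then commit the old one outside the lock.
  /// </summary>
  public void Drain(Action<T> commit) {
    List<T> batch;
    lock (_sync) {
      batch = _pending;
      _pending = new List<T>();
    }
    foreach (T item in batch) {
      commit(item);
    }
  }
  
}

// usage sketch: drain buffered urls into the session tables
//   _newUrlBuffer.Drain(url => _session.OnNewUrl(url));
//   _oldUrlBuffer.Drain(url => _session.OnUrlParsed(url));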