Exemple #1
0
        /// <summary>
        ///     Seeds <paramref name="cookieContainer"/> with the cookies that InternetGetCookieEx reports for
        ///     <paramref name="absoluteUri"/>.  The cookies are registered under the bare domain address and
        ///     under its "www." host, but only for an address the container holds no cookies for yet.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI whose cookies are looked up.</param>
        /// <param name="cookieContainer">The container to populate.  Nothing is done when it is <c>null</c>.</param>
        public override void GetCookies(string absoluteUri, CookieContainer cookieContainer)
        {
            // The parameter is passed by value: a container created in here could never reach the caller,
            // so populating a throw-away instance would only waste the cookie lookups.
            if (cookieContainer == null)
            {
                return;
            }

            try
            {
                lock (_cookieContainerLock)
                {
                    Uri requestUri = new Uri(absoluteUri);

                    //scheme + registered domain, e.g. http://msn.com
                    Uri domainUri = new Uri(requestUri.Scheme + Uri.SchemeDelimiter + UserDefinedFunctions.ExtractDomain(absoluteUri).Value);

                    AddCookiesIfAbsent(absoluteUri, domainUri, cookieContainer);

                    //the same address with a "www." host, e.g. http://www.msn.com
                    Uri wwwUri = new Uri(domainUri.Scheme + Uri.SchemeDelimiter + "www." + domainUri.Host);

                    AddCookiesIfAbsent(absoluteUri, wwwUri, cookieContainer);
                }
            }
            catch (Exception)
            {
                //best effort: a malformed URI or an unparsable cookie header must not abort the caller...
            }
        }

        /// <summary>
        ///     Adds the cookies reported for <paramref name="absoluteUri"/> to <paramref name="cookieContainer"/>
        ///     under <paramref name="targetUri"/>, unless the container already returns cookies for that address.
        /// </summary>
        private void AddCookiesIfAbsent(string absoluteUri, Uri targetUri, CookieContainer cookieContainer)
        {
            if (cookieContainer.GetCookies(targetUri).Count != 0)
            {
                return;
            }

            string cookieHeaders = InternetGetCookieEx(absoluteUri);

            if (cookieHeaders == null)
            {
                return;
            }

            cookieContainer.Add(targetUri, BuildCookieCollection(cookieHeaders));
        }
        /// <summary>
        ///     Populates <paramref name="document"/> with the index fields for a discovery and hands it to
        ///     AddDocument together with <paramref name="absoluteUri"/> and <paramref name="strength"/>.
        /// </summary>
        /// <param name="document">The document the fields are added to.</param>
        /// <param name="discoveryID">Stored as "discoveryid" and used as the numeric part of "indexkey".</param>
        /// <param name="discoveryType">Stored lower-cased as "discoverytype"; its first letter prefixes "indexkey".</param>
        /// <param name="absoluteUri">Stored as "absoluteuri" and classified into domain/extension/host/scheme fields.</param>
        /// <param name="contentToIndex">Indexed (not stored) as "text"; also the input of the title match.</param>
        /// <param name="codePage">Stored as "codepage".</param>
        /// <param name="fullTextIndexType">Stored as "fulltextindextype".</param>
        /// <param name="strength">Passed through to AddDocument.</param>
        /// <param name="discoveryPath">Stored (not indexed) as "discoverypath".</param>
        /// <param name="threadNumber">Not used by this implementation.</param>
        protected override void CreateDocument(Document document, long discoveryID, DiscoveryType discoveryType, string absoluteUri, string contentToIndex, int codePage, string fullTextIndexType, float strength, string discoveryPath, int threadNumber)
        {
            //a bare bones example of what you could do to add a new field to the index...
            //if (discoveryType == DiscoveryType.WebPage)
            //{
            //    HtmlDocument htmlDocument = new HtmlDocument();

            //    htmlDocument.LoadHtml(contentToIndex);

            //    HtmlNode htmlNode = htmlDocument.DocumentNode.SelectSingleNode("/html/body");

            //    string body = htmlNode.InnerText;

            //    document.Add(new Field("body", "Mike", Field.Store.NO, Field.Index.UN_TOKENIZED));
            //}

            //Lower-case with the invariant culture: these values are index keys, and a culture-sensitive
            //ToLower() would turn "Image" into "ımage" (dotless i) when the thread culture is Turkish.
            string discoveryTypeName = discoveryType.ToString().ToLowerInvariant();

            document.Add(new Field("indexkey", discoveryTypeName.Substring(0, 1) + discoveryID, Field.Store.YES, Field.Index.NOT_ANALYZED));

            document.Add(new Field("discoveryid", discoveryID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("discoverytype", discoveryTypeName, Field.Store.YES, Field.Index.NOT_ANALYZED));

            //Discovery
            document.Add(new Field("absoluteuri", absoluteUri, Field.Store.YES, Field.Index.ANALYZED));

            //core fields
            document.Add(new Field("text", contentToIndex, Field.Store.NO, Field.Index.ANALYZED));
            document.Add(new Field("codepage", codePage.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("title", _title.Match(contentToIndex).Groups["Title"].Value.Trim(), Field.Store.YES, Field.Index.ANALYZED));

            //DiscoveryPath
            document.Add(new Field("discoverypath", discoveryPath, Field.Store.YES, Field.Index.NO));

            //AbsoluteUri Classification
            document.Add(new Field("domain", UserDefinedFunctions.ExtractDomain(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("extension", UserDefinedFunctions.ExtractExtension(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("host", UserDefinedFunctions.ExtractHost(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("scheme", UserDefinedFunctions.ExtractScheme(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));

            //FullTextIndexType - used to store the extension that can be used with the default IIS MIME types configuration... (.pl images cannot be served without MIME type modification...)
            document.Add(new Field("fulltextindextype", fullTextIndexType, Field.Store.YES, Field.Index.NOT_ANALYZED));

            AddDocument(document, absoluteUri, strength);
        }
Exemple #3
0
        /// <summary>
        ///     Adds <paramref name="cookieCollection"/> to <paramref name="cookieContainer"/> twice: once under the
        ///     scheme + domain address derived from <paramref name="absoluteUri"/>, and once under the same address
        ///     with a "www." host.  Does nothing when either the container or the collection is <c>null</c>.
        /// </summary>
        /// <param name="absoluteUri">The absolute URI the cookies belong to.</param>
        /// <param name="cookieContainer">The container that receives the cookies.</param>
        /// <param name="cookieCollection">The cookies to add.</param>
        public override void UpdateCookies(string absoluteUri, CookieContainer cookieContainer, CookieCollection cookieCollection)
        {
            try
            {
                lock (_cookieContainerLock)
                {
                    if (cookieContainer == null || cookieCollection == null)
                    {
                        return;
                    }

                    string scheme = new Uri(absoluteUri).Scheme;

                    //scheme + registered domain...
                    Uri domainUri = new Uri(scheme + Uri.SchemeDelimiter + UserDefinedFunctions.ExtractDomain(absoluteUri).Value);
                    cookieContainer.Add(domainUri, cookieCollection);

                    //...and the "www." variant of the same address.
                    Uri wwwUri = new Uri(domainUri.Scheme + Uri.SchemeDelimiter + "www." + domainUri.Host);
                    cookieContainer.Add(wwwUri, cookieCollection);
                }
            }
            catch (Exception)
            {
                //failures are deliberately ignored...
            }
        }
Exemple #4
0
 /// <summary>
 ///     Crawl-request-completed handler.  Extracts the domain of the completed discovery and compares it
 ///     with "nbc.com"; no action is taken for either outcome.
 /// </summary>
 /// <param name="sender">The crawl request that completed.</param>
 private void Engine_CrawlRequestCompleted2(CrawlRequest <ArachnodeDAO> sender)
 {
     string completedDomain = UserDefinedFunctions.ExtractDomain(sender.Discovery.Uri.AbsoluteUri).Value;

     if (completedDomain == "nbc.com")
     {
         return;
     }

     //placeholder: nothing is done for other domains...
 }
        /// <summary>
        ///     Attaches a per-domain Politeness record to the crawl request and, depending on
        ///     <paramref name="politenessState"/>, either decides whether the HTTP request may be issued now
        ///     or records the statistics of a request that has completed or was canceled.
        /// </summary>
        /// <param name="crawlRequest">The crawl request being throttled; when <c>null</c> the method just returns <c>true</c>.</param>
        /// <param name="politenessState">The phase of the HTTP request this call reports.</param>
        /// <param name="arachnodeDAO">Passed through to ResubmitCrawlRequest when the request has to wait.</param>
        /// <returns>
        ///     <c>false</c> when the state is HttpWebRequestRequested and the request was resubmitted because a crawl delay,
        ///     the active-request limit or the auto-throttle delay applies; otherwise <c>true</c>.
        /// </returns>
        public override bool ManagePoliteness(CrawlRequest <TArachnodeDAO> crawlRequest, PolitenessState politenessState, IArachnodeDAO arachnodeDAO)
        {
            //resolve the Politeness for this request once: reuse the cached one for the domain, or create and cache a new one...
            if (crawlRequest != null && crawlRequest.Politeness == null)
            {
                string domain = UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri).Value;

                //politeness/throttling can operate per host (cars.msn.com) or per domain (msn.com)...
                //string host = UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri).Value;
                //domain = host;

                crawlRequest.Politeness = _cache.GetPoliteness(domain);

                if (crawlRequest.Politeness == null)
                {
                    crawlRequest.Politeness = new Politeness(domain);
                    crawlRequest.Politeness.FirstHttpWebRequest = DateTime.Now;

                    _cache.AddPoliteness(crawlRequest.Politeness);
                }
            }

            if (crawlRequest != null && crawlRequest.Politeness != null)
            {
                //phase 1: a request is about to be made - decide whether it may proceed...
                if (politenessState == PolitenessState.HttpWebRequestRequested)
                {
                    //an explicit crawl delay is configured and has not yet elapsed since the last completed request...
                    if ((crawlRequest.Politeness.CrawlDelayInMilliseconds != 0 && DateTime.Now.Subtract(crawlRequest.Politeness.LastHttpWebRequestCompleted).TotalMilliseconds < crawlRequest.Politeness.CrawlDelayInMilliseconds))
                    {
                        ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);

                        return(false);
                    }

                    //too many requests already in flight for this Politeness...
                    //NOTE(review): this check runs outside _lock while the increment below runs inside it, so two
                    //threads could presumably both pass the check before either increments - confirm whether the limit is meant to be strict.
                    if (crawlRequest.Politeness.ActiveHttpWebRequests >= crawlRequest.Politeness.MaximumActiveHttpWebRequests)
                    {
                        ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);

                        return(false);
                    }

                    //auto-throttling: applies only when enabled globally, enabled on this Politeness (set after a cancel, see below)
                    //and no explicit crawl delay is configured...
                    if (ApplicationSettings.AutoThrottleHttpWebRequests)
                    {
                        if (crawlRequest.Politeness.AutoThrottleHttpWebRequests)
                        {
                            if (crawlRequest.Politeness.CrawlDelayInMilliseconds == 0)
                            {
                                //no request has completed yet: use the last requested time as the baseline...
                                if (crawlRequest.Politeness.LastHttpWebRequestCompleted == DateTime.MinValue)
                                {
                                    crawlRequest.Politeness.LastHttpWebRequestCompleted = crawlRequest.Politeness.LastHttpWebRequestRequested;
                                }

                                double millisecondsBetweenLastCanceledAndLastCompleted = crawlRequest.Politeness.LastHttpWebRequestCanceled.Subtract(crawlRequest.Politeness.LastHttpWebRequestCompleted).TotalMilliseconds;
                                double millisecondsBetweenNowAndLastRequested          = DateTime.Now.Subtract(crawlRequest.Politeness.LastHttpWebRequestRequested).TotalMilliseconds;

                                //seed the auto-throttle delay from the gap between the last cancel and the last completion (only when the cancel is the later one)...
                                if (crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds == 0 && millisecondsBetweenLastCanceledAndLastCompleted > 0)
                                {
                                    crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds = millisecondsBetweenLastCanceledAndLastCompleted;
                                }

                                //the auto-throttle delay has not elapsed since the last request: refresh the delay and wait...
                                if (crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds > millisecondsBetweenNowAndLastRequested)
                                {
                                    if (millisecondsBetweenLastCanceledAndLastCompleted > 0)
                                    {
                                        crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds = millisecondsBetweenLastCanceledAndLastCompleted;
                                    }

                                    ResubmitCrawlRequest(crawlRequest, true, arachnodeDAO);

                                    return(false);
                                }
                                else
                                {
                                    //the request may proceed: shrink the delay by 10% for the next evaluation...
                                    crawlRequest.Politeness.AutoThrottleCrawlDelayInMilliseconds *= 0.9;
                                }
                            }
                        }
                    }

                    //the request is allowed: count it as active and remember when it was requested...
                    lock (_lock)
                    {
                        crawlRequest.Politeness.ActiveHttpWebRequests++;
                    }

                    crawlRequest.Politeness.LastHttpWebRequestRequested = DateTime.Now;

                    return(true);
                }

                //phase 2: any state other than HttpWebRequestRequested - the request is no longer active...
                lock (_lock)
                {
                    crawlRequest.Politeness.ActiveHttpWebRequests--;

                    if (crawlRequest.Politeness.ActiveHttpWebRequests < 0)
                    {
                        //shouldn't occur...
                        crawlRequest.Politeness.ActiveHttpWebRequests = 0;
                    }
                }

                //per-DiscoveryType statistics: timestamp + counter for completed/canceled, downloaded bytes and response time...
                switch (crawlRequest.DataType.DiscoveryType)
                {
                case DiscoveryType.File:
                    switch (politenessState)
                    {
                    case PolitenessState.HttpWebRequestCompleted:
                        crawlRequest.Politeness.LastFileHttpWebRequestCompleted = DateTime.Now;
                        crawlRequest.Politeness.TotalFileHttpWebRequestsCompleted++;
                        break;

                    case PolitenessState.HttpWebRequestCanceled:
                        crawlRequest.Politeness.LastFileHttpWebRequestCanceled = DateTime.Now;
                        crawlRequest.Politeness.TotalFileHttpWebRequestsCanceled++;
                        break;
                    }
                    if (crawlRequest.Data != null)
                    {
                        crawlRequest.Politeness.TotalFileDownloadedBytes += crawlRequest.Data.LongLength;
                    }
                    crawlRequest.Politeness.TotalFileHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
                    break;

                case DiscoveryType.Image:
                    switch (politenessState)
                    {
                    case PolitenessState.HttpWebRequestCompleted:
                        crawlRequest.Politeness.LastImageHttpWebRequestCompleted = DateTime.Now;
                        crawlRequest.Politeness.TotalImageHttpWebRequestsCompleted++;
                        break;

                    case PolitenessState.HttpWebRequestCanceled:
                        crawlRequest.Politeness.LastImageHttpWebRequestCanceled = DateTime.Now;
                        crawlRequest.Politeness.TotalImageHttpWebRequestsCanceled++;
                        break;
                    }
                    if (crawlRequest.Data != null)
                    {
                        crawlRequest.Politeness.TotalImageDownloadedBytes += crawlRequest.Data.LongLength;
                    }
                    crawlRequest.Politeness.TotalImageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
                    break;

                case DiscoveryType.WebPage:
                    switch (politenessState)
                    {
                    case PolitenessState.HttpWebRequestCompleted:
                        crawlRequest.Politeness.LastWebPageHttpWebRequestCompleted = DateTime.Now;
                        crawlRequest.Politeness.TotalWebPageHttpWebRequestsCompleted++;
                        break;

                    case PolitenessState.HttpWebRequestCanceled:
                        crawlRequest.Politeness.LastWebPageHttpWebRequestCanceled = DateTime.Now;
                        crawlRequest.Politeness.TotalWebPageHttpWebRequestsCanceled++;
                        break;
                    }
                    if (crawlRequest.Data != null)
                    {
                        crawlRequest.Politeness.TotalWebPageDownloadedBytes += crawlRequest.Data.LongLength;
                    }
                    crawlRequest.Politeness.TotalWebPageHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
                    break;
                }

                //overall statistics, independent of the DiscoveryType...
                switch (politenessState)
                {
                case PolitenessState.HttpWebRequestCompleted:
                    crawlRequest.Politeness.LastHttpWebRequestCompleted = DateTime.Now;
                    crawlRequest.Politeness.TotalHttpWebRequestsCompleted++;
                    break;

                case PolitenessState.HttpWebRequestCanceled:
                    crawlRequest.Politeness.LastHttpWebRequestCanceled = DateTime.Now;
                    crawlRequest.Politeness.TotalHttpWebRequestsCanceled++;

                    //a canceled request switches auto-throttling on for this Politeness (evaluated in phase 1 above)...
                    crawlRequest.Politeness.AutoThrottleHttpWebRequests = true;
                    break;
                }

                if (crawlRequest.Data != null)
                {
                    crawlRequest.Politeness.TotalDownloadedBytes += crawlRequest.Data.LongLength;
                }
                crawlRequest.Politeness.TotalHttpWebResponseTime += crawlRequest.HttpWebResponseTime;
            }

            return(true);
        }
Exemple #6
0
        /// <summary>
        ///     Determines whether <paramref name="absoluteUri"/> is restricted for the specified crawl request,
        ///     i.e. whether it falls outside the boundaries selected by <paramref name="uriClassificationType"/>.
        /// </summary>
        /// <param name = "crawlRequest">
        ///     The crawl request.  Its Discovery URI is the reference for the domain/extension/file extension/host/scheme checks;
        ///     its Originator URI (or its Parent URI when Originator is null) is the reference for the directory checks.
        ///     IsStorable is set to <c>false</c> in the "level up/down without directory flags" case at depth 1.
        /// </param>
        /// <param name = "absoluteUri">The absolute URI being tested.</param>
        /// <param name = "uriClassificationType">Bitwise combination of UriClassificationType values, as a short.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
        /// </returns>
        protected override bool IsRestricted(CrawlRequest <TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
        {
            if (uriClassificationType == (short)UriClassificationType.None)
            {
                return(false);
            }

            //each classification flag that is set requires the corresponding part of both URIs to match...
            if ((uriClassificationType & (short)UriClassificationType.Domain) == (short)UriClassificationType.Domain)
            {
                if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Extension) == (short)UriClassificationType.Extension)
            {
                if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.FileExtension) == (short)UriClassificationType.FileExtension)
            {
                if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Host) == (short)UriClassificationType.Host)
            {
                if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
                {
                    return(true);
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Scheme) == (short)UriClassificationType.Scheme)
            {
                if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
                {
                    return(true);
                }
            }

            //directory based restrictions...
            if (uriClassificationType >= (short)UriClassificationType.OriginalDirectoryLevelUp)
            {
                string crawlRequestOriginatorAbsoluteUriDirectory;

                if (crawlRequest.Originator == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Parent.Uri.LocalPath));
                }
                else
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Originator.Uri.LocalPath));
                }

                string absoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath));

                //Path.GetDirectoryName returns null for a root path: treat it as the root directory...
                if (crawlRequestOriginatorAbsoluteUriDirectory == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = "\\";
                }

                if (absoluteUriDirectory == null)
                {
                    absoluteUriDirectory = "\\";
                }

                bool isLevelUp = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelUp) == (short)UriClassificationType.OriginalDirectoryLevelUp;
                bool isLevelDown = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelDown) == (short)UriClassificationType.OriginalDirectoryLevelDown;
                bool isOriginalDirectory = (uriClassificationType & (short)UriClassificationType.OriginalDirectory) == (short)UriClassificationType.OriginalDirectory;
                bool isOriginalDirectoryLevel = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevel) == (short)UriClassificationType.OriginalDirectoryLevel;

                //directory depth == number of '\' separators in the directory name...
                int originatorDepth = crawlRequestOriginatorAbsoluteUriDirectory.Length - crawlRequestOriginatorAbsoluteUriDirectory.Replace("\\", string.Empty).Length;
                int absoluteUriDepth = absoluteUriDirectory.Length - absoluteUriDirectory.Replace("\\", string.Empty).Length;

                if (isLevelUp)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory)
                        {
                            //same depth or deeper: only the originator's own directory (or the root) is allowed...
                            if (originatorDepth <= absoluteUriDepth)
                            {
                                if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory && absoluteUriDirectory != "\\")
                                {
                                    return(true);
                                }

                                return(false);
                            }
                        }

                        if (isOriginalDirectoryLevel)
                        {
                            if (originatorDepth < absoluteUriDepth)
                            {
                                return(true);
                            }
                        }
                    }
                    else
                    {
                        if (originatorDepth <= absoluteUriDepth)
                        {
                            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                            {
                                return(true);
                            }

                            if (crawlRequest.CurrentDepth == 1)
                            {
                                crawlRequest.IsStorable = false;

                                return(false);
                            }
                        }
                    }
                }

                if (isLevelDown)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory)
                        {
                            if (originatorDepth >= absoluteUriDepth)
                            {
                                if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                                {
                                    return(true);
                                }
                            }

                            //deeper directories must live below the originator's directory...
                            if (!absoluteUriDirectory.StartsWith(crawlRequestOriginatorAbsoluteUriDirectory))
                            {
                                return(true);
                            }
                        }

                        if (isOriginalDirectoryLevel)
                        {
                            if (originatorDepth > absoluteUriDepth)
                            {
                                return(true);
                            }
                        }
                    }
                    else
                    {
                        if (originatorDepth >= absoluteUriDepth)
                        {
                            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                            {
                                return(true);
                            }

                            if (crawlRequest.CurrentDepth == 1)
                            {
                                crawlRequest.IsStorable = false;

                                return(false);
                            }
                        }
                    }
                }

                //The strict "same directory"/"same level" rules apply only when NEITHER direction flag is set.
                //The previous condition tested OriginalDirectoryLevelDown twice, so these rules also ran for
                //OriginalDirectoryLevelUp and restricted every shallower directory the flag is checked for above.
                if (!isLevelUp && !isLevelDown)
                {
                    if (isOriginalDirectory)
                    {
                        if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                        {
                            return(true);
                        }
                    }

                    if (isOriginalDirectoryLevel)
                    {
                        if (originatorDepth != absoluteUriDepth)
                        {
                            return(true);
                        }
                    }
                }
            }

            return(false);
        }