Exemple #1
0
        /// <summary>
        ///     Looks up the priority registered for the host portion of a URI.
        /// </summary>
        /// <param name = "absoluteUri">The absolute URI whose host is extracted and looked up.</param>
        /// <returns>
        ///     The value stored in the priorities table for the extracted host (which may itself be <c>null</c>),
        ///     or 0 when the table has no entry for that host.
        /// </returns>
        public override double? GetPriorityForHost(string absoluteUri)
        {
            // The lookup key is the host extracted from the URI, not the URI itself.
            var host = UserDefinedFunctions.ExtractHost(absoluteUri).Value;

            double? priority;
            bool isKnownHost = _priorities.TryGetValue(host, out priority);

            if (!isKnownHost)
            {
                // Hosts without an entry get the neutral priority.
                return 0;
            }

            return priority;
        }
Exemple #2
0
        /// <summary>
        ///     Looks up the strength recorded for the host portion of a URI.
        /// </summary>
        /// <param name = "absoluteUri">The absolute URI whose host is extracted and looked up.</param>
        /// <returns>
        ///     The value stored in the most-popular-hosts table for the extracted host (which may itself be <c>null</c>),
        ///     or 0 when the table has no entry for that host.
        /// </returns>
        public override double? GetStrengthForHost(string absoluteUri)
        {
            var hostKey = UserDefinedFunctions.ExtractHost(absoluteUri).Value;

            double? hostStrength;

            if (_hyperLinks_MOST_POPULAR_HOSTS_BY_HOSTS.TryGetValue(hostKey, out hostStrength))
            {
                return hostStrength;
            }
            else
            {
                // Unknown hosts carry no strength.
                return 0;
            }
        }
Exemple #3
0
        /// <summary>
        ///     Determines whether the response held by a crawl request was the result of a redirect.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose web client holds the request and response to inspect.</param>
        /// <returns>
        ///     <c>true</c> if the response status code is 300-303, 307 or 308, or if the host of the response URI
        ///     differs from the host of the request URI; otherwise <c>false</c>. Also <c>false</c> when the crawl
        ///     request has no web client or the web client has no response.
        /// </returns>
        public override bool WasCrawlRequestRedirected(CrawlRequest <TArachnodeDAO> crawlRequest)
        {
            // Nothing to inspect until a response has been received.
            if (crawlRequest.WebClient == null || crawlRequest.WebClient.HttpWebResponse == null)
            {
                return false;
            }

            //http://msdn.microsoft.com/en-us/library/system.net.httpstatuscode.aspx
            var statusCode = (int)crawlRequest.WebClient.HttpWebResponse.StatusCode;

            // 304 (Not Modified), 305 (Use Proxy) and 306 (unused) are deliberately not counted.
            // 308 (Permanent Redirect) is the permanent counterpart of 307 and is counted alongside it.
            if ((statusCode >= 300 && statusCode <= 303) || statusCode == 307 || statusCode == 308)
            {
                return true;
            }

            // A redirect that was followed automatically ends with a non-3xx status, so it can only be
            // detected by the response arriving from a different host than the one that was requested.
            var requestedHost = UserDefinedFunctions.ExtractHost(crawlRequest.WebClient.HttpWebRequest.RequestUri.AbsoluteUri).Value;
            var respondingHost = UserDefinedFunctions.ExtractHost(crawlRequest.WebClient.HttpWebResponse.ResponseUri.AbsoluteUri).Value;

            return requestedHost != respondingHost;
        }
        /// <summary>
        ///     Populates an index document with the fields describing a discovery and hands it to <c>AddDocument</c>.
        /// </summary>
        /// <param name = "document">The document to which the fields are added.</param>
        /// <param name = "discoveryID">The ID of the discovery; stored as "discoveryid" and as the numeric part of "indexkey".</param>
        /// <param name = "discoveryType">The type of the discovery; its lower-cased name is stored as "discoverytype" and its first letter prefixes "indexkey".</param>
        /// <param name = "absoluteUri">The absolute URI of the discovery; also the source of the domain, extension, host and scheme fields.</param>
        /// <param name = "contentToIndex">The content indexed as "text" (not stored) and searched for the title.</param>
        /// <param name = "codePage">The code page, stored as "codepage".</param>
        /// <param name = "fullTextIndexType">The full text index type, stored as "fulltextindextype".</param>
        /// <param name = "strength">The strength passed through to <c>AddDocument</c>.</param>
        /// <param name = "discoveryPath">The discovery path; stored but not indexed.</param>
        /// <param name = "threadNumber">The thread number; not used by this implementation.</param>
        protected override void CreateDocument(Document document, long discoveryID, DiscoveryType discoveryType, string absoluteUri, string contentToIndex, int codePage, string fullTextIndexType, float strength, string discoveryPath, int threadNumber)
        {
            //a bare bones example of what you could do to add a new field to the index...
            //if (discoveryType == DiscoveryType.WebPage)
            //{
            //    HtmlDocument htmlDocument = new HtmlDocument();

            //    htmlDocument.LoadHtml(contentToIndex);

            //    HtmlNode htmlNode = htmlDocument.DocumentNode.SelectSingleNode("/html/body");

            //    string body = htmlNode.InnerText;

            //    document.Add(new Field("body", "Mike", Field.Store.NO, Field.Index.UN_TOKENIZED));
            //}

            // Index terms must not depend on the thread culture: a culture-sensitive ToLower() turns a leading
            // 'I' into a dotless 'ı' under Turkish/Azeri cultures, which would change the index key.
            string discoveryTypeName = discoveryType.ToString().ToLowerInvariant();

            // Key = first letter of the discovery type followed by the discovery ID.
            document.Add(new Field("indexkey", discoveryTypeName.Substring(0, 1) + discoveryID, Field.Store.YES, Field.Index.NOT_ANALYZED));

            document.Add(new Field("discoveryid", discoveryID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("discoverytype", discoveryTypeName, Field.Store.YES, Field.Index.NOT_ANALYZED));

            //Discovery
            document.Add(new Field("absoluteuri", absoluteUri, Field.Store.YES, Field.Index.ANALYZED));

            //core fields
            document.Add(new Field("text", contentToIndex, Field.Store.NO, Field.Index.ANALYZED));
            document.Add(new Field("codepage", codePage.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("title", _title.Match(contentToIndex).Groups["Title"].Value.Trim(), Field.Store.YES, Field.Index.ANALYZED));

            //DiscoveryPath
            document.Add(new Field("discoverypath", discoveryPath, Field.Store.YES, Field.Index.NO));

            //AbsoluteUri Classification
            document.Add(new Field("domain", UserDefinedFunctions.ExtractDomain(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("extension", UserDefinedFunctions.ExtractExtension(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("host", UserDefinedFunctions.ExtractHost(absoluteUri).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field("scheme", UserDefinedFunctions.ExtractScheme(absoluteUri, false).Value, Field.Store.YES, Field.Index.NOT_ANALYZED));

            //FullTextIndexType - used to store the extension that can be used with the default IIS MIME types configuration... (.pl images cannot be served without MIME type modification...)
            document.Add(new Field("fulltextindextype", fullTextIndexType, Field.Store.YES, Field.Index.NOT_ANALYZED));

            AddDocument(document, absoluteUri, strength);
        }
Exemple #5
0
        /// <summary>
        ///     Determines whether a discovered URI is restricted relative to the crawl request that produced it.
        /// </summary>
        /// <remarks>
        ///     <paramref name = "uriClassificationType" /> is read as a bit mask of <c>UriClassificationType</c> values.
        ///     For each of Domain, Extension, FileExtension, Host and Scheme that is set, the corresponding part of
        ///     <paramref name = "absoluteUri" /> must equal the same part of the crawl request's own discovery URI.
        ///     The OriginalDirectory* flags compare the directory of <paramref name = "absoluteUri" /> with the directory
        ///     of the originator's URI (or the parent's URI when there is no originator), by value and by depth
        ///     (number of path separators). In two branches the method also sets <c>crawlRequest.IsStorable</c> to
        ///     <c>false</c> when the crawl request is at depth 1.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "absoluteUri">The absolute URI.</param>
        /// <param name = "uriClassificationType">Type of the URI classification.</param>
        /// <returns>
        ///     <c>true</c> if the specified crawl request is restricted; otherwise, <c>false</c>.
        /// </returns>
        protected override bool IsRestricted(CrawlRequest <TArachnodeDAO> crawlRequest, string absoluteUri, short uriClassificationType)
        {
            if (uriClassificationType == (short)UriClassificationType.None)
            {
                return false;
            }

            if ((uriClassificationType & (short)UriClassificationType.Domain) == (short)UriClassificationType.Domain)
            {
                if (UserDefinedFunctions.ExtractDomain(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractDomain(absoluteUri))
                {
                    return true;
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Extension) == (short)UriClassificationType.Extension)
            {
                if (UserDefinedFunctions.ExtractExtension(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractExtension(absoluteUri, false))
                {
                    return true;
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.FileExtension) == (short)UriClassificationType.FileExtension)
            {
                if (UserDefinedFunctions.ExtractFileExtension(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractFileExtension(absoluteUri))
                {
                    return true;
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Host) == (short)UriClassificationType.Host)
            {
                if (UserDefinedFunctions.ExtractHost(crawlRequest.Discovery.Uri.AbsoluteUri) != UserDefinedFunctions.ExtractHost(absoluteUri))
                {
                    return true;
                }
            }

            if ((uriClassificationType & (short)UriClassificationType.Scheme) == (short)UriClassificationType.Scheme)
            {
                if (UserDefinedFunctions.ExtractScheme(crawlRequest.Discovery.Uri.AbsoluteUri, false) != UserDefinedFunctions.ExtractScheme(absoluteUri, false))
                {
                    return true;
                }
            }

            // NOTE(review): this is a numeric comparison, so it relies on the OriginalDirectory* members having
            // the largest values in UriClassificationType - confirm against the enum declaration.
            if (uriClassificationType >= (short)UriClassificationType.OriginalDirectoryLevelUp)
            {
                string crawlRequestOriginatorAbsoluteUriDirectory;

                // The reference directory comes from the originator, or from the parent when there is no originator.
                // NOTE(review): the path is HTML-encoded before Path.GetDirectoryName, presumably to neutralise
                // characters that Path.GetDirectoryName rejects - confirm.
                if (crawlRequest.Originator == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Parent.Uri.LocalPath));
                }
                else
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(crawlRequest.Originator.Uri.LocalPath));
                }

                string absoluteUriDirectory = Path.GetDirectoryName(HttpUtility.HtmlEncode(new Uri(absoluteUri).LocalPath));

                // A null directory is normalised to the root separator.
                if (crawlRequestOriginatorAbsoluteUriDirectory == null)
                {
                    crawlRequestOriginatorAbsoluteUriDirectory = "\\";
                }

                if (absoluteUriDirectory == null)
                {
                    absoluteUriDirectory = "\\";
                }

                // Directory depth = number of backslashes in the (normalised) directory string.
                int originatorDepth = crawlRequestOriginatorAbsoluteUriDirectory.Length - crawlRequestOriginatorAbsoluteUriDirectory.Replace("\\", string.Empty).Length;
                int absoluteUriDepth = absoluteUriDirectory.Length - absoluteUriDirectory.Replace("\\", string.Empty).Length;

                bool isLevelUp = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelUp) == (short)UriClassificationType.OriginalDirectoryLevelUp;
                bool isLevelDown = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevelDown) == (short)UriClassificationType.OriginalDirectoryLevelDown;
                bool isOriginalDirectory = (uriClassificationType & (short)UriClassificationType.OriginalDirectory) == (short)UriClassificationType.OriginalDirectory;
                bool isOriginalDirectoryLevel = (uriClassificationType & (short)UriClassificationType.OriginalDirectoryLevel) == (short)UriClassificationType.OriginalDirectoryLevel;

                if (isLevelUp)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory)
                        {
                            // Same depth or deeper than the originator: only the originator's own directory
                            // or the root is accepted, and the decision is final either way.
                            if (originatorDepth <= absoluteUriDepth)
                            {
                                if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory && absoluteUriDirectory != "\\")
                                {
                                    return true;
                                }

                                return false;
                            }
                        }

                        if (isOriginalDirectoryLevel)
                        {
                            // Deeper than the originator is not "up".
                            if (originatorDepth < absoluteUriDepth)
                            {
                                return true;
                            }
                        }
                    }
                    else
                    {
                        if (originatorDepth <= absoluteUriDepth)
                        {
                            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                            {
                                return true;
                            }

                            // The URI is the crawl request's own URI: at depth 1 it is allowed but marked as not storable.
                            if (crawlRequest.CurrentDepth == 1)
                            {
                                crawlRequest.IsStorable = false;

                                return false;
                            }
                        }
                    }
                }

                if (isLevelDown)
                {
                    if (isOriginalDirectory || isOriginalDirectoryLevel)
                    {
                        if (isOriginalDirectory)
                        {
                            // Same depth or shallower than the originator: must be exactly the originator's directory.
                            if (originatorDepth >= absoluteUriDepth)
                            {
                                if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                                {
                                    return true;
                                }
                            }

                            // Deeper directories must sit underneath the originator's directory.
                            if (!absoluteUriDirectory.StartsWith(crawlRequestOriginatorAbsoluteUriDirectory))
                            {
                                return true;
                            }
                        }

                        if (isOriginalDirectoryLevel)
                        {
                            // Shallower than the originator is not "down".
                            if (originatorDepth > absoluteUriDepth)
                            {
                                return true;
                            }
                        }
                    }
                    else
                    {
                        if (originatorDepth >= absoluteUriDepth)
                        {
                            if (crawlRequest.Discovery.Uri.AbsoluteUri != absoluteUri)
                            {
                                return true;
                            }

                            // The URI is the crawl request's own URI: at depth 1 it is allowed but marked as not storable.
                            if (crawlRequest.CurrentDepth == 1)
                            {
                                crawlRequest.IsStorable = false;

                                return false;
                            }
                        }
                    }
                }

                // Exact-match rules apply only when NEITHER direction flag is set. The original condition tested
                // OriginalDirectoryLevelDown twice, so these rules also ran with LevelUp set and restricted every
                // shallower directory, which defeated the LevelUp flag.
                if (!isLevelUp && !isLevelDown)
                {
                    if (isOriginalDirectory)
                    {
                        if (crawlRequestOriginatorAbsoluteUriDirectory != absoluteUriDirectory)
                        {
                            return true;
                        }
                    }

                    if (isOriginalDirectoryLevel)
                    {
                        if (originatorDepth != absoluteUriDepth)
                        {
                            return true;
                        }
                    }
                }
            }


            return false;
        }