Ejemplo n.º 1
0
        /// <summary>
        /// Builds the website path string for a root URI, according to the extraction scope.
        /// </summary>
        /// <param name="scope">Extraction scope that selects which part of the URI is used.</param>
        /// <param name="rootUri">Root URI of the website.</param>
        /// <returns>
        /// The result of <c>HtmlFileUtils.GetPathValidChars</c> applied to:
        /// the base domain for the Domain scope, the sub-domain for the SubDomain scope,
        /// and for any other scope the sub-domain followed by the root path in which
        /// every "/" has been replaced by "_".
        /// </returns>
        public static string GetWebsitePathFromUri(ExtractionScope scope, Uri rootUri)
        {
            if (scope == ExtractionScope.Domain)
            {
                return HtmlFileUtils.GetPathValidChars(GetBaseDomain(rootUri));
            }

            if (scope == ExtractionScope.SubDomain)
            {
                return HtmlFileUtils.GetPathValidChars(GetSubDomain(rootUri));
            }

            // ExtractionScope.Path, and any other value, falls through to here.
            string subDomain         = GetSubDomain(rootUri);
            string flattenedRootPath = GetRootPath(rootUri.AbsolutePath).Replace("/", "_");

            return HtmlFileUtils.GetPathValidChars(subDomain + flattenedRootPath);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Method called to decide which pages to crawl.
        /// </summary>
        /// <param name="scope">Extraction scope that selects the comparison rule.</param>
        /// <param name="candidateUri">URI that is a candidate for crawling.</param>
        /// <param name="rootUri">Root URI the candidate is compared against.</param>
        /// <returns>
        /// Domain scope: true when both URIs have the same base domain.
        /// SubDomain scope: true when both URIs have the same sub-domain.
        /// Any other scope: true when both URIs have the same sub-domain and the
        /// candidate's absolute path starts with the root path derived from
        /// <paramref name="rootUri"/>.
        /// </returns>
        public static bool ShouldCrawlUri(ExtractionScope scope, Uri candidateUri, Uri rootUri)
        {
            switch (scope)
            {
            case ExtractionScope.Domain:
                return GetBaseDomain(candidateUri) == GetBaseDomain(rootUri);

            case ExtractionScope.SubDomain:
                return GetSubDomain(candidateUri) == GetSubDomain(rootUri);

            // ExtractionScope.Path, and any other value.
            default:
                if (GetSubDomain(candidateUri) != GetSubDomain(rootUri))
                {
                    return false;
                }

                // URL paths are not linguistic text: compare them ordinally so the
                // result does not depend on the current thread culture.
                return candidateUri.AbsolutePath.StartsWith(
                    GetRootPath(rootUri.AbsolutePath),
                    StringComparison.Ordinal);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Parses one "key=value" parameter and stores the value in the matching setting.
        /// </summary>
        /// <remarks>
        /// The key is matched case-insensitively. Recognized keys are: scope, rooturl,
        /// storagedir, maxduration, maxpagecount, minuniquetext, maxsizeondisk,
        /// mincrawldelay and excludeurls (each excludeurls value is added to
        /// UrlPatternsToExclude). Key and value are trimmed of surrounding whitespace.
        /// </remarks>
        /// <param name="keyValueParam">
        /// Parameter text of the form "key=value". Only the first '=' separates the key
        /// from the value, so the value itself may contain '=' (for example a URL with
        /// a query string).
        /// </param>
        /// <exception cref="Exception">
        /// The text has no '=', ends with '=', uses an unknown key, or gives an
        /// unknown value for the scope key.
        /// </exception>
        /// <exception cref="FormatException">
        /// A numeric setting is not a valid integer (thrown by Int32.Parse), or the
        /// rooturl value is not a valid URI (UriFormatException, thrown by the Uri constructor).
        /// </exception>
        /// <exception cref="OverflowException">
        /// A numeric setting is outside the Int32 range (thrown by Int32.Parse).
        /// </exception>
        public void ParseParam(string keyValueParam)
        {
            int equalsIndex = keyValueParam.IndexOf('=');

            if (equalsIndex < 0 || equalsIndex == (keyValueParam.Length - 1))
            {
                throw new Exception("Syntax error in params at : " + keyValueParam);
            }

            // Cut at the FIRST '=' only: splitting on every '=' would silently drop
            // everything after a second '=' inside the value (e.g. "rooturl=http://host/?a=b").
            // Invariant lower-casing keeps key matching independent of the current culture.
            string key   = keyValueParam.Substring(0, equalsIndex).Trim().ToLowerInvariant();
            string value = keyValueParam.Substring(equalsIndex + 1).Trim();

            switch (key)
            {
            case "scope":
                switch (value.ToLowerInvariant())
                {
                case "domain":
                    Scope = ExtractionScope.Domain;
                    break;

                case "subdomain":
                    Scope = ExtractionScope.SubDomain;
                    break;

                case "path":
                    Scope = ExtractionScope.Path;
                    break;

                default:
                    throw new Exception("Invalid value for key scope at : " + keyValueParam);
                }
                break;

            case "rooturl":
                RootUrl = new Uri(value);
                break;

            case "storagedir":
                StorageDir = value;
                break;

            case "maxduration":
                MaxDuration = Int32.Parse(value);
                break;

            case "maxpagecount":
                MaxPageCount = Int32.Parse(value);
                break;

            case "minuniquetext":
                MinUniqueText = Int32.Parse(value);
                break;

            case "maxsizeondisk":
                MaxSizeOnDisk = Int32.Parse(value);
                break;

            case "mincrawldelay":
                MinCrawlDelay = Int32.Parse(value);
                break;

            case "excludeurls":
                UrlPatternsToExclude.Add(value);
                break;

            default:
                throw new Exception("Invalid parameter at : " + keyValueParam);
            }
        }