예제 #1
0
 /// <summary>
 /// Initializes the settings: remembers the base URI, configures the extension and
 /// domain validators, and stores the crawl depth.
 /// </summary>
 /// <param name="baseUri">Base URI stored for later use by the validators.</param>
 /// <param name="extensions">Value handed to <c>ConfigureExtensionValidator</c>.</param>
 /// <param name="levels">Depth limit; any negative value is stored as <see cref="int.MaxValue"/>.</param>
 /// <param name="restrictions">Value handed to <c>ConfigureDomainValidator</c>.</param>
 public PageLoaderSettings(Uri baseUri, string extensions, int levels, DomainRestriction restrictions)
 {
     this.baseUri = baseUri;

     ConfigureExtensionValidator(extensions);
     ConfigureDomainValidator(restrictions);

     // A negative depth is treated as "effectively unlimited".
     if (levels < 0)
     {
         Levels = int.MaxValue;
     }
     else
     {
         Levels = levels;
     }
 }
예제 #2
0
            /// <summary>
            /// Appends to <c>domainValidator</c> the URI check that corresponds to the requested restriction.
            /// </summary>
            /// <param name="restrictions">
            /// <c>SameDomainOnly</c> adds a host-equality check against the base URI,
            /// <c>SubdomainsOnly</c> adds a <see cref="Uri.IsBaseOf(Uri)"/> check,
            /// and any other value adds a check that accepts every URI.
            /// </param>
            private void ConfigureDomainValidator(DomainRestriction restrictions)
            {
                if (restrictions == DomainRestriction.SameDomainOnly)
                {
                    // Accept only URIs whose host equals the base URI's host.
                    domainValidator += candidate => baseUri.Host == candidate.Host;
                }
                else if (restrictions == DomainRestriction.SubdomainsOnly)
                {
                    // Accept only URIs for which the base URI is a base.
                    domainValidator += candidate => baseUri.IsBaseOf(candidate);
                }
                else
                {
                    // No restriction requested: every URI passes.
                    domainValidator += candidate => true;
                }
            }
예제 #3
0
 /// <summary>
 /// Initializes a new <see cref="RestrictionHelper"/> that keeps the supplied
 /// base URL, domain restriction and extension list.
 /// </summary>
 /// <param name="baseUrl">Base URL stored in <c>_baseUrl</c>.</param>
 /// <param name="restriction">Domain restriction stored in <c>_domainRestriction</c>.</param>
 /// <param name="extensions">Extensions stored in <c>_extensions</c> (the reference is kept, not copied).</param>
 public RestrictionHelper(Uri baseUrl, DomainRestriction restriction, IEnumerable <string> extensions)
 {
     this._extensions        = extensions;
     this._domainRestriction = restriction;
     this._baseUrl           = baseUrl;
 }
예제 #4
0
        /// <summary>
        /// Downloads the page at <paramref name="urlPath"/>, saves it, and while
        /// <paramref name="analysisnLevel"/> is positive recursively downloads the pages it links to.
        /// </summary>
        /// <param name="urlPath">Absolute URL of the page to download.</param>
        /// <param name="folderPath">Folder passed to <c>CreateFolder</c> together with the URL host.</param>
        /// <param name="analysisnLevel">Remaining link depth; links are not followed when it is zero or negative.</param>
        /// <param name="domainRestriction">
        /// With <c>InInitialURLOnly</c>, links whose host differs from the current page's host are skipped.
        /// </param>
        /// <param name="traicingMode">When <c>true</c>, progress messages are sent to the notifier.</param>
        /// <returns>A task that completes once this page and all followed links have been processed.</returns>
        /// <remarks>
        /// Failures to parse a URL or to load a page are reported through the notifier (regardless of
        /// <paramref name="traicingMode"/>) and are not rethrown.
        /// </remarks>
        public static async Task DownloadAsync(
            string urlPath,
            string folderPath,
            int analysisnLevel = 1,
            DomainRestriction domainRestriction = DomainRestriction.NoRestriction,
            bool traicingMode = false)
        {
            if (traicingMode)
            {
                notifier.Notify($"\nAnalise on level {analysisnLevel}");
            }

            string result;
            Uri    uri;

            try
            {
                uri = new Uri(urlPath);
            }
            catch (Exception e)
            {
                notifier.Notify($"----Some problems with path ({urlPath}).");
                notifier.Notify($"----{e.Message}");
                return;
            }

            try
            {
                if (traicingMode)
                {
                    notifier.Notify($"Processing {uri}");
                }
                result = await GetDataAsync(uri);
            }
            catch (Exception e)
            {
                notifier.Notify($"----Some problems while loading from {uri} are occured.");
                notifier.Notify($"----{e.Message}");
                return;
            }

            if (traicingMode)
            {
                notifier.Notify($"Create new directory: {folderPath}");
            }
            folderPath = CreateFolder(folderPath, uri.Host);

            var filePath = CreateAndFillFile(uri, folderPath, result);

            if (traicingMode)
            {
                notifier.Notify($"Write to {filePath}");
            }

            // Depth exhausted: the page itself is saved, but its links are not followed.
            if (analysisnLevel <= 0)
            {
                return;
            }

            var nodes = GetHtmlNodes(result);

            if (nodes == null)
            {
                return;
            }

            foreach (var node in nodes)
            {
                var reference = node.GetAttributeValue("href", string.Empty);

                // A node without a usable href has nothing to follow; move on to the next link.
                if (string.IsNullOrEmpty(reference))
                {
                    continue;
                }

                Uri newUri;
                try
                {
                    newUri = new Uri(reference);
                }
                catch (UriFormatException e)
                {
                    notifier.Notify($"----Some problems with path ({reference}).");
                    notifier.Notify($"----{e.Message}");

                    // One malformed link must not abort the remaining links of this page.
                    continue;
                }

                if (domainRestriction == DomainRestriction.InInitialURLOnly && newUri.Host != uri.Host)
                {
                    if (traicingMode)
                    {
                        notifier.Notify($"The url isn't in initial url: {reference}");
                    }

                    // Skip only this foreign link; later links may still be on the allowed host.
                    continue;
                }

                await DownloadAsync(reference, folderPath, analysisnLevel - 1, domainRestriction, traicingMode);
            }
        }
예제 #5
0
 /// <summary>
 /// Initializes a new <see cref="DomainConstraint"/> instance.
 /// </summary>
 /// <param name="baseUri">Base URI stored in <c>_baseUri</c>.</param>
 /// <param name="typeConstraint">Kind of domain restriction stored in <c>_typeConstraint</c>.</param>
 public DomainConstraint(Uri baseUri, DomainRestriction typeConstraint)
 {
     this._typeConstraint = typeConstraint;
     this._baseUri        = baseUri;
 }
예제 #6
0
        /// <summary>
        /// Initializes a loader: parses the start URL, makes sure the target directory exists,
        /// starts with an empty visited list and builds the loader settings.
        /// </summary>
        /// <param name="startFrom">URL string parsed into the base <see cref="Uri"/>.</param>
        /// <param name="saveTo">Path of the output directory; it is created when it does not exist.</param>
        /// <param name="extensions">Forwarded to <c>PageLoaderSettings</c>.</param>
        /// <param name="levels">Forwarded to <c>PageLoaderSettings</c>.</param>
        /// <param name="restriction">Forwarded to <c>PageLoaderSettings</c>.</param>
        public PageLoader(string startFrom, string saveTo, string extensions, int levels, DomainRestriction restriction)
        {
            var startUri     = new Uri(startFrom);
            var targetFolder = new DirectoryInfo(saveTo);

            // Create the output directory only when it is missing.
            if (targetFolder.Exists == false)
            {
                targetFolder.Create();
            }

            baseUrl   = startUri;
            directory = targetFolder;
            visited   = new List <string>();
            settings  = new PageLoaderSettings(startUri, extensions, levels, restriction);
        }
예제 #7
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/>, saves it under <paramref name="storage"/>, and while
        /// <paramref name="analysisnLevel"/> is positive recursively downloads the pages it links to.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <param name="analysisnLevel">Remaining link depth; links are not followed when it is zero or negative.</param>
        /// <param name="domainRestriction">
        /// With <c>InCurrentURLOnly</c>, links whose host differs from the current page's host are skipped.
        /// </param>
        /// <param name="storage">Folder passed to <c>CreateFolder</c> together with the URL host.</param>
        /// <param name="showStatusInConsole">Value passed to <c>writer.SetStatus</c>.</param>
        /// <returns>A task that completes once this page and all followed links have been processed.</returns>
        /// <remarks>
        /// URL parsing and page loading failures are reported through the writer and are not rethrown.
        /// Links that cannot be turned into an absolute URI are skipped silently.
        /// </remarks>
        public static async Task StartToDownloadAsync(
            string url,
            int analysisnLevel = 1,
            DomainRestriction domainRestriction = DomainRestriction.InCurrentURLOnly,
            string storage           = "C:/tmp",
            bool showStatusInConsole = false)
        {
            // Applied on every call, including the recursive ones, always with the same value.
            writer.SetStatus(showStatusInConsole);
            writer.Write(string.Format("\nAnalyze on level {0}", analysisnLevel));

            string result;
            Uri    uri;

            try
            {
                uri = new Uri(url);
            }
            catch (Exception e)
            {
                writer.Write(string.Format("->Error occured in path ({0}).", url));
                writer.Write(string.Format("->{0}", e.Message));
                return;
            }

            try
            {
                writer.Write(string.Format("Starting to analyze the {0}", uri));
                result = await GetDataAsync(uri);
            }
            catch (Exception e)
            {
                writer.Write(string.Format("->Some problems while loading from {0} are occured.", uri));
                writer.Write(string.Format("->{0}", e.Message));
                return;
            }

            #region saving in files

            writer.Write(string.Format("Create new directory: {0}", storage));
            storage = CreateFolder(storage, uri.Host);

            var filePath = CreateAndFillFile(uri, storage, result);

            writer.Write(string.Format("Saved to {0}", filePath));

            #endregion

            // Depth exhausted: the page itself is saved, but its links are not followed.
            if (analysisnLevel <= 0)
            {
                return;
            }

            var nodes = GetHtmlNodes(result);
            if (nodes == null)
            {
                return;
            }

            // Scheme + authority (keeps a non-default port), used as the base for root-relative links.
            var currentHostwithSheme = uri.GetLeftPart(UriPartial.Authority);

            // Current page URL up to (not including) its last slash, used as the base for ".shtml" links.
            var pageText         = uri.ToString();
            var currentPageSlash = pageText.LastIndexOf("/", StringComparison.Ordinal);
            var currPage         = pageText.Substring(0, currentPageSlash);

            foreach (var node in nodes)
            {
                // get links from nodes
                var reference = node.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrEmpty(reference))
                {
                    continue;
                }

                Uri newUri;
                if (reference.StartsWith("//", StringComparison.Ordinal))
                {
                    // Protocol-relative link: it names its own host and only inherits the scheme.
                    reference = uri.Scheme + ":" + reference;
                }
                else if (reference.StartsWith("/", StringComparison.Ordinal))
                {
                    // Root-relative link on the current site.
                    reference = currentHostwithSheme + reference;
                }
                else if (reference.EndsWith(".shtml", StringComparison.Ordinal)
                         && !Uri.TryCreate(reference, UriKind.Absolute, out newUri))
                {
                    // Page-relative ".shtml" link; already-absolute URLs must not be prefixed.
                    reference = currPage + "/" + reference;
                }

                // Anything that still is not an absolute URI is ignored and the loop goes on.
                if (!Uri.TryCreate(reference, UriKind.Absolute, out newUri))
                {
                    continue;
                }

                if ((domainRestriction == DomainRestriction.InCurrentURLOnly) && (newUri.Host != uri.Host))
                {
                    writer.Write(string.Format("The url isn't in initial url: {0}", reference));
                    // go ahead
                    continue;
                }

                // Do not recurse into the site root itself.
                if (reference != currentHostwithSheme + "/")
                {
                    await StartToDownloadAsync(reference, analysisnLevel - 1, domainRestriction, storage, showStatusInConsole);
                }
            }
        }