/// <summary>
/// Creates the settings for a page-loading run rooted at <paramref name="baseUri"/>.
/// </summary>
/// <param name="baseUri">Base URI stored on the instance.</param>
/// <param name="extensions">Extension specification handed to <c>ConfigureExtensionValidator</c>.</param>
/// <param name="levels">Number of levels; any negative value is stored as <see cref="int.MaxValue"/>.</param>
/// <param name="restrictions">Domain restriction handed to <c>ConfigureDomainValidator</c>.</param>
public PageLoaderSettings(Uri baseUri, string extensions, int levels, DomainRestriction restrictions)
{
    this.baseUri = baseUri;

    // Validators are configured after the base URI has been stored.
    ConfigureExtensionValidator(extensions);
    ConfigureDomainValidator(restrictions);

    // A negative level count is mapped to the largest possible value.
    if (levels < 0)
    {
        Levels = int.MaxValue;
    }
    else
    {
        Levels = levels;
    }
}
/// <summary>
/// Appends to <c>domainValidator</c> the predicate that corresponds to the requested restriction.
/// </summary>
/// <param name="restrictions">Restriction mode selecting which predicate is added.</param>
private void ConfigureDomainValidator(DomainRestriction restrictions)
{
    switch (restrictions)
    {
        case DomainRestriction.SameDomainOnly:
            // Accept only candidates whose host equals the base URI's host.
            this.domainValidator += candidate => this.baseUri.Host == candidate.Host;
            break;

        case DomainRestriction.SubdomainsOnly:
            // NOTE(review): Uri.IsBaseOf tests whether the base URI is a base of the candidate;
            // confirm this is the intended meaning of "subdomains".
            this.domainValidator += candidate => this.baseUri.IsBaseOf(candidate);
            break;

        default:
            // Any other restriction value accepts every candidate.
            this.domainValidator += candidate => true;
            break;
    }
}
/// <summary>
/// Creates a helper that keeps the base URL, the domain restriction and the extension list.
/// </summary>
/// <param name="baseUrl">Base URL stored on the instance.</param>
/// <param name="restriction">Domain restriction stored on the instance.</param>
/// <param name="extensions">Extension sequence stored on the instance; the reference is kept as-is, not copied.</param>
public RestrictionHelper(Uri baseUrl, DomainRestriction restriction, IEnumerable<string> extensions)
{
    this._extensions = extensions;
    this._domainRestriction = restriction;
    this._baseUrl = baseUrl;
}
/// <summary>
/// Downloads the page at <paramref name="urlPath"/>, stores it on disk and then processes
/// the links found on the page recursively.
/// </summary>
/// <param name="urlPath">Absolute URL of the page to download.</param>
/// <param name="folderPath">
/// Folder passed to <c>CreateFolder</c> together with the URL host; the path returned by
/// <c>CreateFolder</c> is used for this page and for the recursive calls.
/// </param>
/// <param name="analysisnLevel">Remaining depth; links are followed only while this is greater than zero.</param>
/// <param name="domainRestriction">
/// When <c>InInitialURLOnly</c>, links whose host differs from the host of the current page are skipped.
/// </param>
/// <param name="traicingMode">
/// When <c>true</c>, progress messages are sent to the notifier. Error messages are sent regardless.
/// </param>
/// <returns>A task that completes when this page and every followed link have been processed.</returns>
public static async Task DownloadAsync(
    string urlPath,
    string folderPath,
    int analysisnLevel = 1,
    DomainRestriction domainRestriction = DomainRestriction.NoRestriction,
    bool traicingMode = false)
{
    if (traicingMode)
    {
        notifier.Notify($"\nAnalise on level {analysisnLevel}");
    }

    string result;
    Uri uri;
    try
    {
        uri = new Uri(urlPath);
    }
    catch (Exception e)
    {
        notifier.Notify($"----Some problems with path ({urlPath}).");
        notifier.Notify($"----{e.Message}");
        return;
    }

    try
    {
        if (traicingMode)
        {
            notifier.Notify($"Processing {uri}");
        }

        result = await GetDataAsync(uri);
    }
    catch (Exception e)
    {
        notifier.Notify($"----Some problems while loading from {uri} are occured.");
        notifier.Notify($"----{e.Message}");
        return;
    }

    if (traicingMode)
    {
        notifier.Notify($"Create new directory: {folderPath}");
    }

    folderPath = CreateFolder(folderPath, uri.Host);
    var filePath = CreateAndFillFile(uri, folderPath, result);
    if (traicingMode)
    {
        notifier.Notify($"Write to {filePath}");
    }

    // Depth exhausted: the page is saved but its links are not followed.
    if (analysisnLevel <= 0)
    {
        return;
    }

    var nodes = GetHtmlNodes(result);
    if (nodes == null)
    {
        return;
    }

    foreach (var node in nodes)
    {
        var reference = node.GetAttributeValue("href", string.Empty);

        // Nodes without a usable href are skipped before any parsing is attempted.
        if (string.IsNullOrEmpty(reference))
        {
            continue;
        }

        Uri newUri;
        try
        {
            // Resolve against the current page so relative links are followed too.
            newUri = new Uri(uri, reference);
        }
        catch (Exception e)
        {
            // One bad link must not stop the remaining links from being processed.
            notifier.Notify($"----Some problems with path ({reference}).");
            notifier.Notify($"----{e.Message}");
            continue;
        }

        if (domainRestriction == DomainRestriction.InInitialURLOnly && newUri.Host != uri.Host)
        {
            if (traicingMode)
            {
                notifier.Notify($"The url isn't in initial url: {reference}");
            }

            continue;
        }

        // Links that point back at the current page (e.g. fragment-only hrefs) would only
        // download the same document again.
        if (newUri.Equals(uri))
        {
            continue;
        }

        await DownloadAsync(newUri.AbsoluteUri, folderPath, analysisnLevel - 1, domainRestriction, traicingMode);
    }
}
/// <summary>
/// Creates a <see cref="DomainConstraint"/> from a base URI and a restriction type.
/// </summary>
/// <param name="baseUri">Base URI kept by this constraint.</param>
/// <param name="typeConstraint">Kind of domain restriction kept by this constraint.</param>
public DomainConstraint(Uri baseUri, DomainRestriction typeConstraint)
{
    this._typeConstraint = typeConstraint;
    this._baseUri = baseUri;
}
/// <summary>
/// Creates a loader that starts at <paramref name="startFrom"/> and uses the
/// <paramref name="saveTo"/> directory, creating that directory when it does not exist yet.
/// </summary>
/// <param name="startFrom">URL string parsed into the base <see cref="Uri"/>.</param>
/// <param name="saveTo">Path of the target directory.</param>
/// <param name="extensions">Extension specification forwarded to <c>PageLoaderSettings</c>.</param>
/// <param name="levels">Level count forwarded to <c>PageLoaderSettings</c>.</param>
/// <param name="restriction">Domain restriction forwarded to <c>PageLoaderSettings</c>.</param>
public PageLoader(string startFrom, string saveTo, string extensions, int levels, DomainRestriction restriction)
{
    Uri startUri = new Uri(startFrom);
    DirectoryInfo targetDirectory = new DirectoryInfo(saveTo);

    // Make sure the target directory is present before anything else is set up.
    if (targetDirectory.Exists == false)
    {
        targetDirectory.Create();
    }

    baseUrl = startUri;
    directory = targetDirectory;
    visited = new List<string>();
    settings = new PageLoaderSettings(startUri, extensions, levels, restriction);
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, saves it on disk and then does the same,
/// recursively, for the links found on that page.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
/// <param name="analysisnLevel">Remaining depth; links are followed only while this is greater than zero.</param>
/// <param name="domainRestriction">
/// When <c>InCurrentURLOnly</c>, links whose host differs from the host of the current page are skipped.
/// </param>
/// <param name="storage">
/// Folder passed to <c>CreateFolder</c> together with the URL host; the path returned by
/// <c>CreateFolder</c> is used for this page and for the recursive calls.
/// </param>
/// <param name="showStatusInConsole">Value forwarded to <c>writer.SetStatus</c>.</param>
/// <returns>A task that completes when this page and every followed link have been processed.</returns>
public static async Task StartToDownloadAsync(
    string url,
    int analysisnLevel = 1,
    DomainRestriction domainRestriction = DomainRestriction.InCurrentURLOnly,
    string storage = "C:/tmp",
    bool showStatusInConsole = false)
{
    // The status is re-applied on every recursive call, always with the same value.
    writer.SetStatus(showStatusInConsole);
    writer.Write(string.Format("\nAnalyze on level {0}", analysisnLevel));

    string result;
    Uri uri;
    try
    {
        uri = new Uri(url);
    }
    catch (Exception e)
    {
        writer.Write(string.Format("->Error occured in path ({0}).", url));
        writer.Write(string.Format("->{0}", e.Message));
        return;
    }

    try
    {
        writer.Write(string.Format("Starting to analyze the {0}", uri));
        result = await GetDataAsync(uri);
    }
    catch (Exception e)
    {
        writer.Write(string.Format("->Some problems while loading from {0} are occured.", uri));
        writer.Write(string.Format("->{0}", e.Message));
        return;
    }

    // Save the downloaded page to disk.
    writer.Write(string.Format("Create new directory: {0}", storage));
    storage = CreateFolder(storage, uri.Host);
    var filePath = CreateAndFillFile(uri, storage, result);
    writer.Write(string.Format("Saved to {0}", filePath));

    // Depth exhausted: the page is saved but its links are not followed.
    if (analysisnLevel <= 0)
    {
        return;
    }

    var nodes = GetHtmlNodes(result);
    if (nodes == null)
    {
        return;
    }

    // Root of the current site (scheme, host and port); links to it are not followed.
    var siteRoot = new Uri(uri, "/");

    foreach (var node in nodes)
    {
        var reference = node.GetAttributeValue("href", string.Empty);
        if (string.IsNullOrEmpty(reference))
        {
            continue;
        }

        // Resolve the href against the current page. This covers absolute, root-relative,
        // page-relative and protocol-relative links; anything that cannot be resolved is ignored.
        Uri newUri;
        if (!Uri.TryCreate(uri, reference, out newUri))
        {
            continue;
        }

        if ((domainRestriction == DomainRestriction.InCurrentURLOnly) && (newUri.Host != uri.Host))
        {
            writer.Write(string.Format("The url isn't in initial url: {0}", reference));
            continue;
        }

        // Skip the site root and links back to the current page (e.g. fragment-only hrefs),
        // which would only download an already saved document again.
        if (newUri.Equals(siteRoot) || newUri.Equals(uri))
        {
            continue;
        }

        await StartToDownloadAsync(newUri.AbsoluteUri, analysisnLevel - 1, domainRestriction, storage, showStatusInConsole);
    }
}