/// <summary>
/// Builds the lists of html, js, css and image files for the snapshot selected by
/// <paramref name="homePageDate"/> and stores them under the "available" key of the
/// corresponding <c>DomainLists</c> collections.
/// </summary>
/// <param name="homePageDate">Snapshot date forwarded to <c>UpdateHomePageUrl</c>.</param>
/// <returns>
/// A task that completes once the lists are stored, or earlier (leaving <c>DomainLists</c>
/// untouched) when fetching the home page fails or cancellation has been requested.
/// </returns>
public async Task GetFilesListAsync(string homePageDate)
{
    UpdateHomePageUrl(homePageDate);

    var domainDirectoryExists = Directory.Exists(DomainDirectory);
    if (!domainDirectoryExists)
    {
        Directory.CreateDirectory(DomainDirectory);
    }

    // Fetch the snapshot home page into the domain directory on a worker thread.
    var homePagePath = DomainDirectory + StartPage;
    var isHomePageFetched = await Task.Run(() => Helpers.GetFile(DomainHomePage, homePagePath), _cancellationToken);
    if (!isHomePageFetched)
    {
        return;
    }

    // Clean the fetched page and read it back; an empty string stands in for "cancelled".
    var homePageHtml = await Task.Run(() =>
    {
        try
        {
            // if operation was canceled throw an exception
            _cancellationToken.ThrowIfCancellationRequested();

            Correction.RemoveWbmCodeFromHtml(homePagePath);
            return File.ReadAllText(homePagePath);
        }
        catch (OperationCanceledException)
        {
            return string.Empty;
        }
    }, _cancellationToken);

    if (_cancellationToken.IsCancellationRequested)
    {
        return;
    }

    // The four lists are independent of each other, so they are collected concurrently.
    var htmlTask = Task.Run(() => GetFilesList("html", "html", homePageHtml), _cancellationToken);
    var jsTask = Task.Run(() => GetFilesList("js", "js", homePageHtml), _cancellationToken);
    var cssTask = Task.Run(() => GetFilesList("css", "css", homePageHtml), _cancellationToken);
    var imgsTask = Task.Run(() => GetFilesList("imgs", "images"), _cancellationToken);

    // Results come back in the same order the tasks are passed in.
    var collected = await Task.WhenAll(htmlTask, jsTask, cssTask, imgsTask);

    if (_cancellationToken.IsCancellationRequested)
    {
        return;
    }

    DomainLists.ClearFilesLists();
    DomainLists.HtmlFilesList["available"].AddRange(collected[0]);
    DomainLists.JsFilesList["available"].AddRange(collected[1]);
    DomainLists.CssFilesList["available"].AddRange(collected[2]);
    DomainLists.ImgsList["available"].AddRange(collected[3]);
}
/// <summary>
/// Saves the file behind <paramref name="url"/> below <c>DomainDirectory</c>, post-processes it
/// and appends a <c>{ url, "Yes" | "No" }</c> row to <c>DomainLists.DownloadedFilesList</c>.
/// </summary>
/// <remarks>
/// Post-processing steps, in order:
/// <list type="number">
/// <item>A saved web page whose <c>//p</c> texts contain "Got an HTTP 302 response at crawl time"
/// is replaced by a small page that redirects to "/"; no further corrections are applied to it.</item>
/// <item>Otherwise, html/htm files are passed to <c>Correction.RemoveWbmCodeFromHtml</c>.</item>
/// <item>Otherwise, css/js/htm/html files are passed to <c>Correction.AllUrlsInFile</c>.</item>
/// </list>
/// </remarks>
/// <param name="url">Url to a file or page</param>
/// <returns>
/// True if the file was downloaded successfully, or if the download was skipped because the file
/// already exists and the overwrite mode is not <c>OverwriteExisting</c>; false if the download
/// reported failure or threw.
/// </returns>
private bool SaveFile(string url)
{
    var pathAndName = Helpers.GetPathAndNameFromUrl(url, StartPage);
    var fullPath = DomainDirectory + pathAndName["path"];
    var fullPathName = fullPath + pathAndName["name"];

    // Stays true when an existing file is deliberately left alone (IgnoreExisting / default mode).
    var isFileSaved = true;
    try
    {
        if (!Directory.Exists(fullPath))
        {
            Directory.CreateDirectory(fullPath);
        }

        switch (_overwriteMode)
        {
            case Constants.Settings.OverwriteMode.OverwriteExisting:
                isFileSaved = Helpers.GetFile(url, fullPathName);
                break;

            case Constants.Settings.OverwriteMode.IgnoreExisting:
            default:
                // index.html is always fetched again, even when it already exists.
                if (!File.Exists(fullPathName) || fullPathName.Contains("index.html"))
                {
                    isFileSaved = Helpers.GetFile(url, fullPathName);
                }
                break;
        }
    }
    catch (Exception)
    {
        // Best effort: any failure while creating the directory or fetching counts as "not saved".
        isFileSaved = false;
    }

    var isWbmRedirectPage = false;
    if (isFileSaved && Helpers.IsWebPage(fullPathName))
    {
        try
        {
            var htmlPage = File.ReadAllText(fullPathName);
            var paragraphs = Helpers.GetNodesAttributes(htmlPage, "//p", "innerText");
            isWbmRedirectPage = paragraphs.Any(item => item.Equals(@"Got an HTTP 302 response at crawl time", StringComparison.OrdinalIgnoreCase));
            if (isWbmRedirectPage)
            {
                // Replace the archived redirect notice with a page that sends the visitor to the site root.
                var homePageRedirect = @"<!DOCTYPE html><html><head></head><body><script type=""text/javascript"">window.location = ""/"";</script></body></html>";
                File.WriteAllText(fullPathName, homePageRedirect);
            }
        }
        catch (Exception)
        {
            // If the page cannot be inspected or rewritten, treat it as a regular page.
            isWbmRedirectPage = false;
        }
    }

    if (isFileSaved && !isWbmRedirectPage)
    {
        // File-name suffixes are not linguistic text: compare ordinally so the result does not
        // depend on the current culture.
        var isHtmlFile = fullPathName.EndsWith("htm", StringComparison.Ordinal)
                         || fullPathName.EndsWith("html", StringComparison.Ordinal);
        var isStyleOrScript = fullPathName.EndsWith("css", StringComparison.Ordinal)
                              || fullPathName.EndsWith("js", StringComparison.Ordinal);

        if (isHtmlFile)
        {
            Correction.RemoveWbmCodeFromHtml(fullPathName);
        }

        if (isStyleOrScript || isHtmlFile)
        {
            Correction.AllUrlsInFile(fullPathName, DomainName);
        }
    }

    // Record the outcome for this url regardless of success.
    var row = new string[] { url, isFileSaved ? "Yes" : "No" };
    DomainLists.DownloadedFilesList.Add(row);

    return isFileSaved;
}
/// <summary>
/// Collects the urls of one category of files and reports the number found through
/// <c>_progress</c> (when a progress reporter is set).
/// </summary>
/// <param name="type">
/// Category to collect: "html" (a/area href values), "js" (script src values),
/// "css" (link href values) or "imgs" (urls taken from the response of <c>DomainAllFilesUrl</c>).
/// Any other value yields an empty list.
/// </param>
/// <param name="listType">Label sent together with the count in every progress report.</param>
/// <param name="htmlPage">Html text scanned by the "html", "js" and "css" categories; not used by "imgs".</param>
/// <returns>
/// The collected urls. For "imgs" the list is empty when the request or its parsing fails, and may
/// be partial and uncorrected when cancellation is requested while urls are being resolved.
/// </returns>
private List<string> GetFilesList(string type, string listType, string htmlPage = "")
{
    var list = new List<string>();

    // Single place for the progress payload: (DomainName, (listType, count)).
    Action<int> reportCount = count =>
    {
        if (_progress != null)
        {
            _progress.Report(new KeyValuePair<string, KeyValuePair<string, int>>(DomainName, new KeyValuePair<string, int>(listType, count)));
        }
    };

    switch (type)
    {
        case "html":
            // create the sitemap
            list = Helpers.GetNodesAttributes(htmlPage, "//a", "href");
            list.AddRange(Helpers.GetNodesAttributes(htmlPage, "//area", "href"));

            // drop "/web/" links that do not mention this domain
            list.RemoveAll(item => !item.Contains(DomainName) && item.Contains("/web/"));
            list = Correction.UrlsInList(list, DomainHomePage, "html", DomainName);

            // get the homepage from the current selected snapshot
            list.Add(DomainHomePage);

            // remove duplicates
            list = list.Distinct().ToList();
            reportCount(list.Count);
            break;

        case "js":
            list = Helpers.GetNodesAttributes(htmlPage, "//script", "src").Distinct().ToList();
            list = Correction.UrlsInList(list, DomainHomePage, "js", DomainName);
            reportCount(list.Count);
            break;

        case "css":
            list = Helpers.GetNodesAttributes(htmlPage, "//link", "href").Distinct().ToList();
            list = Correction.UrlsInList(list, DomainHomePage, "css", DomainName);
            reportCount(list.Count);
            break;

        case "imgs":
            try
            {
                // if operation was canceled throw an exception
                _cancellationToken.ThrowIfCancellationRequested();

                var res = Helpers.CreateRequest(DomainAllFilesUrl);
                var tableStart = res.LastIndexOf("<table id=\"resultsUrl\">", StringComparison.Ordinal);
                if (tableStart < 0)
                {
                    // No results table in the response: nothing to collect.
                    break;
                }

                var body = res.Substring(tableStart);
                var imgsFilesUrls = Helpers.GetList(body, Constants.Patterns.ImgsFilesPattern, "url")
                    .Select(item => item.Contains("/web/*/")
                        ? item.Replace("/web/*/", Constants.ArchiveUrls.GetTimeMapUrl)
                        : item)
                    .ToList();

                // Urls that now point at the time map still have to be resolved one by one;
                // the others can be used as they are.
                var imgsWithTimeMap = imgsFilesUrls.Where(item => item.Contains(Constants.ArchiveUrls.GetTimeMapUrl)).ToList();
                list.AddRange(imgsFilesUrls.Where(item => !item.Contains(Constants.ArchiveUrls.GetTimeMapUrl)));

                // get the correct images urls
                var imgsList = list;
                var counter = imgsList.Count;

                // List<T> and ++ are not safe for concurrent writers, so every update of the
                // shared list and counter made by the parallel loop goes through this lock.
                var gate = new object();

                Parallel.ForEach(imgsWithTimeMap, imgUrl =>
                {
                    try
                    {
                        // if operation was canceled throw an exception
                        _cancellationToken.ThrowIfCancellationRequested();

                        // The lookup runs outside the lock so lookups still proceed in parallel.
                        var item = GetUrlFromTimeMap(imgUrl);

                        int processed;
                        lock (gate)
                        {
                            if (!string.IsNullOrEmpty(item))
                            {
                                imgsList.Add(item);
                            }

                            processed = ++counter;
                        }

                        reportCount(processed);
                    }
                    catch (OperationCanceledException)
                    {
                        return;
                    }
                    catch (Exception)
                    {
                        // Best effort: one url that cannot be resolved must not abort the others.
                    }
                });

                if (!_cancellationToken.IsCancellationRequested)
                {
                    reportCount(counter);
                    list = Correction.UrlsInList(imgsList, DomainHomePage, "img", DomainName);
                }
            }
            catch (OperationCanceledException)
            {
                // Cancelled: return whatever has been collected so far.
            }
            catch (Exception)
            {
                // Request or parsing failed: report no images rather than failing the whole run.
                list = new List<string>();
            }
            break;
    }

    return list;
}