/// <summary>
/// Recursively builds the HtmlDocument tree below <paramref name="tree"/>.
/// Every inner link of the node that has not been seen before is downloaded,
/// attached as a child document and then expanded in turn.
/// </summary>
/// <param name="tree">Node whose children are discovered and attached; it is modified in place.</param>
/// <param name="hashedDocs">
/// Links that have already been visited. Despite the name, the entries are the link
/// strings themselves, not hashes. Newly visited links are appended to this list.
/// </param>
/// <param name="LinkPath">Passed through unchanged to <c>ScraperUtilities.getInnerLinks</c>.</param>
/// <param name="rootURL">Passed through unchanged to <c>ScraperUtilities.getInnerLinks</c>.</param>
/// <returns>The same <paramref name="tree"/> instance, with its descendants populated.</returns>
private static HtmlDocumentTree createHtmlDocTreeSubroutine(HtmlDocumentTree tree, List<string> hashedDocs, string LinkPath, string rootURL)
{
    List<string> innerLinks = ScraperUtilities.getInnerLinks(tree.Node, LinkPath, rootURL);
    List<HtmlDocumentTree> newChildren = new List<HtmlDocumentTree>();

    foreach (string link in innerLinks)
    {
        if (hashedDocs.Contains(link))
        {
            // Already visited: skipping it keeps cyclic links from recursing forever.
            continue;
        }

        hashedDocs.Add(link);

        // The download is awaited synchronously because this method is synchronous.
        Task<string> download = ScraperUtilities.AsyncUrlToTask(link);
        download.Wait();

        var doc = new HtmlDocument();
        doc.LoadHtml(download.Result);
        newChildren.Add(new HtmlDocumentTree(doc, link));
    }

    // Attach every new child before descending, so all links found at this level
    // are marked as visited before any deeper page is processed.
    foreach (HtmlDocumentTree child in newChildren)
    {
        tree.ChildDocuments.Add(child);
    }

    // Expand every new child. The previous code returned from inside this loop,
    // which expanded only the first child and handed back a descendant instead of
    // the node that was passed in.
    foreach (HtmlDocumentTree child in newChildren)
    {
        createHtmlDocTreeSubroutine(child, hashedDocs, LinkPath, rootURL);
    }

    return tree;
}
/// <summary>
/// Downloads the page at <c>rootURL</c>, builds <c>htmlDocumentTree</c> from it and
/// returns the contents of <c>urlList</c> as text. Exists to keep the controller's
/// get method small.
/// </summary>
/// <returns>Every entry of <c>urlList</c>, each followed by a newline character.</returns>
public string RunDisplay()
{
    // Fetch the root page, blocking until the download has finished.
    Task<string> download = ScraperUtilities.AsyncUrlToTask(rootURL);
    download.Wait();

    var rootDocument = new HtmlDocument();
    rootDocument.LoadHtml(download.Result);

    htmlDocumentTree = ScraperUtilities.createHtmlDocTree(rootDocument, rootURL, LinkPath, rootURL);

    // The rendered tree text is not part of the returned value; the call is still
    // made, and its result is dropped.
    ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);

    // Only the URL listing is returned.
    var listing = new System.Text.StringBuilder();
    foreach (var x in urlList)
    {
        listing.Append(x + "\n");
    }

    return listing.ToString();
}
/// <summary>
/// Entry point for building the HtmlDocument tree. Wraps the root document in a
/// tree node, seeds the visited-link list with the root's own URL and hands both
/// to <c>createHtmlDocTreeSubroutine</c>.
/// </summary>
/// <param name="root">Already loaded document used as the starting node.</param>
/// <param name="url">URL associated with <paramref name="root"/>; recorded as visited up front.</param>
/// <param name="LinkPath">Forwarded unchanged to the recursive builder.</param>
/// <param name="rootURL">Forwarded unchanged to the recursive builder.</param>
/// <returns>Whatever <c>createHtmlDocTreeSubroutine</c> returns for the starting node.</returns>
public static HtmlDocumentTree createHtmlDocTree(HtmlDocument root, string url, string LinkPath, string rootURL)
{
    var startNode = new HtmlDocumentTree(root, url);

    // The starting URL counts as visited so the crawl does not fetch it again.
    var visitedLinks = new List<string> { url };

    return createHtmlDocTreeSubroutine(startNode, visitedLinks, LinkPath, rootURL);
}
/// <summary>
/// Wrapper function for displaying the HtmlDocument tree. Starts the recursive walk
/// at level 0 with an empty label, a fresh URL list and an empty accumulator.
/// </summary>
/// <param name="tree">Root of the tree to walk.</param>
/// <returns>The string produced by <c>displayHtmlDocumentTreeSubroutine</c> for <paramref name="tree"/>.</returns>
public static string displayHtmlDocumentTree(HtmlDocumentTree tree)
{
    // The previous version first concatenated all child URLs into a local string
    // that was never read; that dead loop has been removed.
    List<string> urlList = new List<string>();
    return displayHtmlDocumentTreeSubroutine(tree, "", 0, urlList, "");
}
/// <summary>
/// Recursively walks the HtmlDocument tree. Every visited node's URL is appended to
/// <paramref name="urlList"/>, and one line per leaf node (a node without child
/// documents) is appended to the returned text.
/// </summary>
/// <param name="tree">Node currently being visited.</param>
/// <param name="result">Label written for this node if it is a leaf; recursive calls pass the child's URL.</param>
/// <param name="level">Depth of <paramref name="tree"/>; only incremented and passed on to recursive calls.</param>
/// <param name="urlList">Receives the URL of every visited node, in visiting order.</param>
/// <param name="exeLinks">Text accumulated so far; the return value extends it.</param>
/// <returns><paramref name="exeLinks"/> followed by one newline-terminated label for every leaf at or below <paramref name="tree"/>.</returns>
static string displayHtmlDocumentTreeSubroutine(
    HtmlDocumentTree tree,
    string result,
    int level,
    List<string> urlList,
    string exeLinks)
{
    // Each node is recorded once (leaves used to be added a second time).
    urlList.Add(tree.Url);

    if (tree.ChildDocuments == null || tree.ChildDocuments.Count == 0)
    {
        // Leaf: return here so a null child collection is never enumerated.
        return exeLinks + result + "\n";
    }

    foreach (var child in tree.ChildDocuments)
    {
        // Strings are immutable and passed by value, so the accumulated text has to
        // be taken from the return value; otherwise everything appended further
        // down the tree is lost.
        exeLinks = displayHtmlDocumentTreeSubroutine(child, child.Url, level + 1, urlList, exeLinks);
    }

    return exeLinks;
}