Example #1
        /// <summary>
        /// Recursively builds the HtmlDocument tree below the given node.
        /// </summary>
        /// <param name="tree">Subtree root whose children are discovered and attached.</param>
        /// <param name="hashedDocs">Links already visited, used to avoid fetching the same page twice.</param>
        /// <param name="LinkPath">Link path handed through to ScraperUtilities.getInnerLinks.</param>
        /// <param name="rootURL">Root URL handed through to ScraperUtilities.getInnerLinks.</param>
        private static HtmlDocumentTree createHtmlDocTreeSubroutine(HtmlDocumentTree tree, List<string> hashedDocs, string LinkPath, string rootURL)
        {
            List<string>           innerLinks    = ScraperUtilities.getInnerLinks(tree.Node, LinkPath, rootURL);
            List<HtmlDocumentTree> childrenToAdd = new List<HtmlDocumentTree>();

            foreach (string link in innerLinks)
            {
                // Only fetch pages we have not visited yet.
                if (!hashedDocs.Contains(link))
                {
                    hashedDocs.Add(link);
                    Task<string> task = ScraperUtilities.AsyncUrlToTask(link);
                    task.Wait(); // block until the page body has been downloaded
                    var doc = new HtmlDocument();
                    doc.LoadHtml(task.Result);
                    childrenToAdd.Add(new HtmlDocumentTree(doc, link));
                }
            }
            // Attach each new child, then recurse so the subtree below it is
            // built as well; every sibling must be expanded, not just the first.
            foreach (HtmlDocumentTree child in childrenToAdd)
            {
                tree.ChildDocuments.Add(child);
                createHtmlDocTreeSubroutine(child, hashedDocs, LinkPath, rootURL);
            }
            return tree;
        }
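The HtmlDocumentTree type itself is not shown on this page. A minimal sketch of the shape both examples assume, inferred only from the calls above (the two-argument constructor, the Node property, and the ChildDocuments list), with HtmlDocument coming from HtmlAgilityPack, might look like this:

        using System.Collections.Generic;
        using HtmlAgilityPack;

        // Hypothetical reconstruction, inferred from usage: one parsed page
        // plus the pages linked from it.
        public class HtmlDocumentTree
        {
            public HtmlDocument Node { get; }                      // parsed HTML of this page
            public string Url { get; }                             // the link it was fetched from
            public List<HtmlDocumentTree> ChildDocuments { get; }  // child pages reachable from Node

            public HtmlDocumentTree(HtmlDocument node, string url)
            {
                Node           = node;
                Url            = url;
                ChildDocuments = new List<HtmlDocumentTree>();
            }
        }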
Example #2
        /// <summary>
        /// Builds the document tree for rootURL and renders it as plain text;
        /// extracted so the controller's GET action stays small.
        /// </summary>
        /// <returns>The rendered tree followed by the collected URLs, one per line.</returns>
        public string RunDisplay()
        {
            // Download the root page and block until the body is available.
            Task<string> task = ScraperUtilities.AsyncUrlToTask(rootURL);
            task.Wait();

            var testDoc = new HtmlDocument();
            testDoc.LoadHtml(task.Result);

            // Build the full document tree starting from the root page.
            htmlDocumentTree = ScraperUtilities.createHtmlDocTree(testDoc, rootURL, LinkPath, rootURL);

            string result = ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);

            // Append every URL collected during the crawl, one per line.
            foreach (var url in urlList)
            {
                result += url + "\n";
            }

            return result;
        }
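Because RunDisplay appears to back a controller action, blocking with task.Wait() (and reading task.Result) can deadlock under a synchronization context. A sketch of the download step rewritten with await, assuming ScraperUtilities.AsyncUrlToTask returns a started Task<string> holding the page body and that System.Threading.Tasks is in scope:

        // Sketch only: awaits the download instead of blocking on Wait()/Result.
        // RunDisplayAsync is a hypothetical name, not part of the original class.
        public async Task<string> RunDisplayAsync()
        {
            string html = await ScraperUtilities.AsyncUrlToTask(rootURL);

            var testDoc = new HtmlDocument();
            testDoc.LoadHtml(html);

            htmlDocumentTree = ScraperUtilities.createHtmlDocTree(testDoc, rootURL, LinkPath, rootURL);
            return ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);
        }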