Пример #1
0
        /// <summary>
        /// Recursively builds the HtmlDocument tree below <paramref name="tree"/>.
        /// Every inner link of the node that has not been visited yet is downloaded,
        /// attached to the node as a child document and then expanded in turn.
        /// </summary>
        /// <param name="tree">Node whose inner links are expanded; its ChildDocuments collection is appended to in place.</param>
        /// <param name="hashedDocs">Links that have already been visited; every newly followed link is added to it so it is fetched only once.</param>
        /// <param name="LinkPath">Passed through unchanged to <c>ScraperUtilities.getInnerLinks</c>.</param>
        /// <param name="rootURL">Passed through unchanged to <c>ScraperUtilities.getInnerLinks</c>.</param>
        /// <returns>The same <paramref name="tree"/> instance that was passed in, after its descendants have been attached.</returns>
        private static HtmlDocumentTree createHtmlDocTreeSubroutine(HtmlDocumentTree tree, List<string> hashedDocs, string LinkPath, string rootURL)
        {
            List<string> innerLinks = ScraperUtilities.getInnerLinks(tree.Node, LinkPath, rootURL);
            List<HtmlDocumentTree> newChildren = new List<HtmlDocumentTree>();

            foreach (string link in innerLinks)
            {
                if (hashedDocs.Contains(link))
                {
                    // Already visited elsewhere in the tree; skipping it also prevents cycles.
                    continue;
                }

                // Mark the link as visited before fetching so that it is never requested twice.
                hashedDocs.Add(link);

                // This method is synchronous, so the download is awaited by blocking.
                Task<string> download = ScraperUtilities.AsyncUrlToTask(link);
                download.Wait();

                var doc = new HtmlDocument();
                doc.LoadHtml(download.Result);
                newChildren.Add(new HtmlDocumentTree(doc, link));
            }

            foreach (HtmlDocumentTree child in newChildren)
            {
                tree.ChildDocuments.Add(child);
            }

            // Expand EVERY new child. Returning from inside this loop would stop after
            // the first child and hand a descendant back to the caller instead of the node
            // it asked to have expanded.
            foreach (HtmlDocumentTree child in newChildren)
            {
                createHtmlDocTreeSubroutine(child, hashedDocs, LinkPath, rootURL);
            }

            return tree;
        }
Пример #2
0
        /// <summary>
        /// Downloads the page at <c>rootURL</c>, builds the HtmlDocument tree from it
        /// (stored in <c>htmlDocumentTree</c>) and returns the entries of <c>urlList</c>,
        /// one per line. Introduced to keep the controller's get method small.
        /// </summary>
        /// <returns>Every entry of <c>urlList</c> followed by a newline; an empty string when the list has no entries.</returns>
        public string RunDisplay()
        {
            // This method is synchronous, so the download is awaited by blocking.
            Task<string> task = ScraperUtilities.AsyncUrlToTask(rootURL);
            task.Wait();

            var rootDoc = new HtmlDocument();
            rootDoc.LoadHtml(task.Result);

            htmlDocumentTree = ScraperUtilities.createHtmlDocTree(rootDoc, rootURL, LinkPath, rootURL);

            // NOTE(review): the string returned here was previously stored and then
            // immediately overwritten, so it never reached the caller. The call itself is
            // kept in case it has side effects that are not visible from this method —
            // confirm, and drop it if it has none.
            ScraperUtilities.displayHtmlDocumentTree(htmlDocumentTree);

            // Only the plain list of urls is returned.
            var lines = new System.Text.StringBuilder();
            foreach (var x in urlList)
            {
                lines.Append(x + "\n");
            }

            return lines.ToString();
        }
Пример #3
0
        /// <summary>
        /// Entry point for building the HtmlDocument tree: wraps the root document in a
        /// tree node, seeds the list of visited links with the root's url and hands over
        /// to the recursive builder.
        /// </summary>
        /// <param name="root">Document that becomes the root node of the tree.</param>
        /// <param name="url">Url stored on the root node and recorded as already visited.</param>
        /// <param name="LinkPath">Forwarded unchanged to <c>createHtmlDocTreeSubroutine</c>.</param>
        /// <param name="rootURL">Forwarded unchanged to <c>createHtmlDocTreeSubroutine</c>.</param>
        /// <returns>Whatever <c>createHtmlDocTreeSubroutine</c> returns for the root node.</returns>
        public static HtmlDocumentTree createHtmlDocTree(HtmlDocument root, string url, string LinkPath, string rootURL)
        {
            var rootNode = new HtmlDocumentTree(root, url);

            // The root's own url counts as visited from the start.
            var visitedLinks = new List<string> { url };

            return createHtmlDocTreeSubroutine(rootNode, visitedLinks, LinkPath, rootURL);
        }
Пример #4
0
        /// <summary>
        /// Wrapper function for displaying the HtmlDocument tree. Starts the recursive
        /// walk at <paramref name="tree"/> with an empty accumulator and a fresh url list.
        /// </summary>
        /// <param name="tree">Root node of the tree to display.</param>
        /// <returns>The string that <c>displayHtmlDocumentTreeSubroutine</c> returns for the root node.</returns>
        public static string displayHtmlDocumentTree(HtmlDocumentTree tree)
        {
            // The subroutine's signature requires a url list; nothing here reads it after the call.
            List<string> urlList = new List<string>();

            // Root is walked at level 0 with empty text and an empty accumulator.
            return displayHtmlDocumentTreeSubroutine(tree, "", 0, urlList, "");
        }
Пример #5
0
        /// <summary>
        /// Recursively walks the HtmlDocument tree. Every visited node's url is recorded
        /// once in <paramref name="urlList"/>; for every node without child documents,
        /// <paramref name="result"/> plus a newline is appended to the accumulated string.
        /// </summary>
        /// <param name="tree">Node to visit.</param>
        /// <param name="result">Text appended (followed by a newline) when <paramref name="tree"/> has no child documents; recursive calls pass the child's url.</param>
        /// <param name="level">Depth of <paramref name="tree"/> below the starting node; only incremented and passed on, not otherwise used.</param>
        /// <param name="urlList">Receives the url of every node visited.</param>
        /// <param name="exeLinks">Text accumulated so far; the return value starts with it.</param>
        /// <returns><paramref name="exeLinks"/> followed by one line for every childless node in the subtree rooted at <paramref name="tree"/>.</returns>
        static string displayHtmlDocumentTreeSubroutine(
            HtmlDocumentTree tree, string result, int level, List<string> urlList, string exeLinks)
        {
            urlList.Add(tree.Url);

            // A node without children ends the walk on this branch. Returning here also
            // keeps a null ChildDocuments collection from being enumerated below.
            if (tree.ChildDocuments == null || tree.ChildDocuments.Count == 0)
            {
                return exeLinks + result + "\n";
            }

            foreach (var child in tree.ChildDocuments)
            {
                // Strings are immutable and passed by value, so the accumulated text has
                // to be taken from the return value; ignoring it loses the whole subtree.
                exeLinks = displayHtmlDocumentTreeSubroutine(child, child.Url, level + 1, urlList, exeLinks);
            }

            return exeLinks;
        }