Example #1
        public void CrawlData()
        {
            FDOTService _service = new FDOTService();

            while (_service.IsLinkCrawlRemain() > 0)
            {
                var linkList = _service.GetUrlForLinkCrawl();
                if (linkList != null)
                {
                    foreach (var link in linkList)
                    {
                        ThreadParameters tp = new ThreadParameters
                        {
                            URL = link.FullUrl
                        };
                        string ext = Path.GetExtension(link.FullUrl) ?? "";
                        bool skip = ext != "" && skipExtension.Contains(ext);
                        if (skip)
                        {
                            // Skip this file format.
                            Console.WriteLine("--------Skipped Extension " + ext);
                        }
                        else
                        {
                            CrawlAllLinks(link);
                        }

                        // Update url status as IsLinkCrawled = true;
                        _service.UpdateLinkCrawled(link);

                        if (skip)
                        {
                            // Skip this file format.
                            Console.WriteLine("--------Skipped Extension " + ext);
                        }
                        else
                        {
                            CrawlContents(link);
                        }

                        // Update url status as IsDataCrawled = true;
                        _service.UpdateHtmlContentCrawled(link);
                    }
                }
            }
        }
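
The example assumes a skipExtension collection defined elsewhere in the class; the source does not show it. A minimal sketch of what that field and a small helper around the repeated test could look like (the concrete extension list and the ShouldSkip name are assumptions, not taken from the original):

        // Hypothetical sketch: the file extensions the crawler skips.
        // The original source does not show this field, so the contents
        // listed here are assumptions. Requires System.Collections.Generic
        // and System.IO.
        private static readonly HashSet<string> skipExtension =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                ".pdf", ".jpg", ".png", ".zip", ".doc", ".xls"
            };

        // Consolidates the extension test used twice in CrawlData.
        private static bool ShouldSkip(string url)
        {
            string ext = Path.GetExtension(url) ?? "";
            return ext != "" && skipExtension.Contains(ext);
        }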
Example #2
        private void CrawlContents(UrlModel model)
        {
            try
            {
                if (model.FullUrl.Contains("../"))
                {
                    string file2 = Path.GetFileName(model.FullUrl);
                    model.FullUrl = model.FullUrl.Replace(file2, "");
                    Console.WriteLine("----Removed file name " + file2);
                }

                Console.WriteLine("Content Crawling " + model.FullUrl);
                TraceService("Crawling Started: --------Links--------, URL:" + model.FullUrl + "\n");

                string html = Helper.GetWebSiteContent(model.FullUrl);
                if (html == "callback")
                {
                    Console.WriteLine("Content Crawling " + model.FullUrl2);
                    TraceService("Crawling Started: --------Links--------, URL:" + model.FullUrl2 + "\n");
                    html = Helper.GetWebSiteContent(model.FullUrl2);
                }

                HtmlAgilityPack.HtmlDocument doc = Helper.LoadHtml(html);
                FDOTService crawldata = new FDOTService();
                var content = doc.DocumentNode.SelectSingleNode("//div[@id='content']");
                var header = doc.DocumentNode.SelectSingleNode("//div[@id='header']");
                var footer = doc.DocumentNode.SelectSingleNode("//div[@id='footer']");

                if (content != null)
                {
                    var data = new UrlModel()
                    {
                        Id          = model.Id,
                        HtmlContent = content.OuterHtml,
                        // Header/footer may be missing on some pages; avoid a
                        // NullReferenceException.
                        Header      = header?.OuterHtml,
                        Footer      = footer?.OuterHtml,
                        IsHtmlPage  = true
                    };

                    crawldata.UpdateHtmlData(data);

                    TraceService("Data Inserted : -------- html data-------- , URL:" + model.FullUrl + "\n");
                }
            }
            catch (Exception ex)
            {
                TraceService("Error: --------html data-------- URL:" + model.FullUrl + " - " + ex.Message + "\n");
            }
        }
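
Helper.GetWebSiteContent is not shown in the source. Judging from how both examples call it, it returns the page HTML on success and the sentinel string "callback" on failure, which the caller treats as a cue to retry with FullUrl2. A minimal sketch under those assumptions (the HttpClient-based body is mine, not the original implementation):

        // Hypothetical sketch of the helper the examples above depend on:
        // returns the page HTML, or the sentinel "callback" when the fetch
        // fails. Requires System.Net.Http.
        private static readonly HttpClient client = new HttpClient();

        public static string GetWebSiteContent(string url)
        {
            try
            {
                // Blocking wait keeps the sketch close to the callers above,
                // which use the method synchronously.
                return client.GetStringAsync(url).GetAwaiter().GetResult();
            }
            catch (HttpRequestException)
            {
                // Sentinel the callers test for before falling back to FullUrl2.
                return "callback";
            }
        }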
Example #3
        private void CrawlAllLinks(UrlModel model)
        {
            string url = model.FullUrl;

            try
            {
                Console.WriteLine("Link Crawling " + model.FullUrl);
                TraceService("Crawling Started: --------Links--------, URL:" + url + "\n");

                string html = Helper.GetWebSiteContent(model.FullUrl);
                if (html == "callback")
                {
                    Console.WriteLine("Link Crawling " + model.FullUrl2);
                    TraceService("Crawling Started: --------Links--------, URL:" + model.FullUrl2 + "\n");
                    url  = model.FullUrl2;
                    html = Helper.GetWebSiteContent(model.FullUrl2);
                }
                HtmlAgilityPack.HtmlDocument doc = Helper.LoadHtml(html);
                FDOTService crawldata = new FDOTService();
                XmlDocument xmldoc = new XmlDocument();
                List<UrlModel> linkList = new List<UrlModel>();
                List<CrawlDirectoryModel> directoryList = new List<CrawlDirectoryModel>();
                var list = doc.DocumentNode.SelectSingleNode("//body");

                Uri    myUri = new Uri(url);
                string host  = myUri.Host;

                var nodes = list.SelectNodes(".//a");
                if (nodes != null)
                {
                    Console.WriteLine("Total links: " + nodes.Count);
                    for (int i = 0; i < nodes.Count; i++)
                    {
                        string link = nodes[i].GetAttributeValue("href", "");
                        if (link == "")
                        {
                            // Anchor without an href attribute; nothing to follow.
                            continue;
                        }
                        string fullLink  = "";
                        string fullLink2 = "";
                        //string ext = Path.GetExtension(link);
                        //if (ext == null) ext = "";
                        //if (skipExtension.Contains(ext) && ext != "")
                        //{
                        //    string pageName = Path.GetFileName(url);
                        //}

                        // Absolute links are taken as-is; everything else is
                        // resolved against the current url.
                        if (link.StartsWith("http://") || link.StartsWith("https://"))
                        {
                            fullLink = link;
                        }
                        else
                        {
                            fullLink  = (url.Replace(Path.GetFileName(url), "") + "/" + link).Replace("///", "/../");
                            fullLink2 = url + "/" + link;
                        }
                        bool isInDomain = fullLink.Contains(host) && !link.Contains("mailto:");
                        if (isInDomain)
                        {
                            // Record the in-domain root directory once.
                            if (!directoryList.Any(st => st.Name == ""))
                            {
                                directoryList.Add(new CrawlDirectoryModel()
                                {
                                    Name = ""
                                });
                            }
                        }
                        linkList.Add(new UrlModel()
                        {
                            PageUrl    = link,
                            FullUrl    = fullLink,
                            FullUrl2   = fullLink2,
                            IsInDomain = isInDomain
                        });
                    }
                    xmldoc = GenerateXmlForPageUrls(linkList);
                    crawldata.InsertLinks(linkList);

                    TraceService("Data Inserted : -------- herf-------- , URL:" + url + "\n");
                }
            }
            catch (Exception ex)
            {
                TraceService("Error: --------href-------- URL:" + url + " - " + ex.Message + "\n");
            }
        }
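
The relative-link handling above builds fullLink by string concatenation and a "///" rewrite, which is fragile for paths containing "../". The framework's Uri class already implements standard relative-reference resolution, so an alternative worth considering is a small helper like the following (the ResolveLink name is mine, not from the source):

        // Hypothetical alternative to the string-based URL joining above:
        // Uri applies RFC 3986 resolution, including "../" segments.
        private static string ResolveLink(string baseUrl, string href)
        {
            Uri baseUri = new Uri(baseUrl);
            // Absolute hrefs override the base; relative hrefs are resolved
            // against it.
            Uri resolved = new Uri(baseUri, href);
            return resolved.AbsoluteUri;
        }

For example, ResolveLink("http://example.com/a/b/page.html", "../c.html") yields "http://example.com/a/c.html", which is the result the "/../" rewriting above tries to approximate.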