Пример #1
0
        /// <summary>
        /// Inserts or updates the <c>tblMainUrls</c> row that corresponds to a crawled URL.
        /// </summary>
        /// <remarks>
        /// The row is looked up by the hash of <c>crawledResult.Url</c>. When a row already exists,
        /// its discover date and link depth are preserved and the remaining values are copied onto
        /// the tracked entity; otherwise a new row is added. The whole operation runs under
        /// <c>_lockDatabaseAdd</c>.
        /// </remarks>
        /// <param name="crawledResult">
        /// Result of a crawl attempt. It is mutated here: the URL hash, host URL, parent hash and,
        /// on success, the compressed source code and compression percent are filled in.
        /// </param>
        private static void saveCrawlInDatabase(crawlingResult crawledResult)
        {
            lock (_lockDatabaseAdd)
            {
                using (var context = new DBCrawling())
                {
                    crawledResult.UrlHash = crawledResult.Url.ComputeHashOfOurSystem();
                    crawledResult.HostUrl = crawledResult.Url.returnRootUrl();
                    var vrResult = context.tblMainUrls.SingleOrDefault(b => b.UrlHash == crawledResult.UrlHash);

                    // The incoming ParentUrlHash value is replaced by its hash.
                    crawledResult.ParentUrlHash = crawledResult.ParentUrlHash.ComputeHashOfOurSystem();

                    if (crawledResult.blcrawlSuccess == true)
                    {
                        crawledResult.IsCrawled = true;
                        if (!string.IsNullOrEmpty(crawledResult.SourceCode))
                        {
                            double dblOriginalSourceCodeLength = crawledResult.SourceCode.Length;
                            crawledResult.SourceCode = crawledResult.SourceCode.CompressString();

                            double dblCompressionPercent = Math.Floor(
                                (crawledResult.SourceCode.Length / dblOriginalSourceCodeLength) * 100);

                            // The compressed text can be longer than the original (e.g. very short
                            // pages), which can push the ratio above 255 and make Convert.ToByte
                            // throw OverflowException. Clamp to the largest storable value instead.
                            crawledResult.CompressionPercent = Convert.ToByte(
                                Math.Min(dblCompressionPercent, byte.MaxValue));
                        }
                        crawledResult.CrawlTryCounter = 0;
                    }

                    tblMainUrl finalObject = crawledResult.converToBaseMainUrlClass();

                    if (vrResult != null)
                    {
                        // Update the existing row in place by copying properties onto the tracked
                        // entity; deleting and re-inserting the row would cost an extra round trip.
                        finalObject.DiscoverDate   = vrResult.DiscoverDate;
                        finalObject.LinkDepthLevel = vrResult.LinkDepthLevel;

                        if (crawledResult.blcrawlSuccess == false)
                        {
                            // Failed attempt: continue counting from the stored value.
                            finalObject.CrawlTryCounter = vrResult.CrawlTryCounter;
                            finalObject.CrawlTryCounter++;
                        }
                        else
                        {
                            // Successful attempt: make the reset explicit so the stored counter
                            // does not overwrite it.
                            finalObject.CrawlTryCounter = 0;
                        }

                        finalObject.CopyProperties(vrResult);
                    }
                    else
                    {
                        context.tblMainUrls.Add(finalObject);
                    }

                    context.SaveChanges();
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Collects the http/https links and the page title from a parsed HTML document.
        /// </summary>
        /// <remarks>
        /// Every <c>&lt;a href&gt;</c> is resolved against the crawled page's URL, stripped of its
        /// fragment part and appended to <c>lstDiscoveredLinks</c>. The list is then de-duplicated
        /// and limited to entries shorter than 201 characters. The HTML-decoded, trimmed content of
        /// the first <c>&lt;title&gt;</c> element (or <c>null</c> when there is none) is stored in
        /// <c>PageTile</c>.
        /// </remarks>
        /// <param name="myCrawlingResult">Crawl result whose <c>Url</c> is used as the base URI; receives the links and title.</param>
        /// <param name="doc">Parsed HTML document of the crawled page.</param>
        private static void extractLinks(crawlingResult myCrawlingResult, HtmlDocument doc)
        {
            var baseUri = new Uri(myCrawlingResult.Url);

            // XPath: every anchor element that carries an href attribute.
            var vrNodes = doc.DocumentNode.SelectNodes("//a[@href]");

            if (vrNodes != null)
            {
                foreach (HtmlNode link in vrNodes)
                {
                    HtmlAttribute att = link.Attributes["href"];

                    // Resolve relative paths against the page URL. TryCreate is used so that one
                    // malformed href is skipped instead of throwing UriFormatException and
                    // aborting extraction for the whole page.
                    Uri absoluteUri;
                    if (!Uri.TryCreate(baseUri, att.Value.decodeUrl(), out absoluteUri))
                    {
                        continue;
                    }

                    // Only web links are of interest (skips mailto:, javascript:, ftp:, ...).
                    if (absoluteUri.Scheme != Uri.UriSchemeHttp && absoluteUri.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    // Drop the fragment so that "page#a" and "page#b" count as the same URL.
                    myCrawlingResult.lstDiscoveredLinks.Add(absoluteUri.ToString().Split('#').FirstOrDefault());
                }
            }

            myCrawlingResult.lstDiscoveredLinks = myCrawlingResult.lstDiscoveredLinks
                .Distinct()
                .Where(pr => pr.Length < 201)
                .ToList();

            var vrDocTitle = doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();

            vrDocTitle = System.Net.WebUtility.HtmlDecode(vrDocTitle);

            myCrawlingResult.PageTile = vrDocTitle;
        }
Пример #3
0
        /// <summary>
        /// Downloads a single page, stores the crawl outcome and, on success, queues the links
        /// found on the page for future crawling.
        /// </summary>
        /// <param name="srUrlToCrawl">Absolute URL of the page to download.</param>
        /// <param name="irUrlDepthLevel">Link depth of this URL; stored on the crawl result.</param>
        /// <param name="_srParentUrl">URL of the page on which this URL was discovered; ignored when null or empty.</param>
        /// <param name="_dtDiscoverDate">When the URL was discovered; ignored when equal to <see cref="DateTime.MinValue"/>.</param>
        public static void crawlPage(string srUrlToCrawl, int irUrlDepthLevel, string _srParentUrl, DateTime _dtDiscoverDate)
        {
            crawlingResult crawlResult = new crawlingResult();

            crawlResult.Url = srUrlToCrawl;

            // Previously the depth argument was ignored, so the depth of discovered child links
            // was derived from the default value instead of from this page's depth. The cast
            // mirrors the (short) cast used where child link depths are assigned.
            crawlResult.LinkDepthLevel = (short)irUrlDepthLevel;

            if (!string.IsNullOrEmpty(_srParentUrl))
            {
                // Holds the raw parent URL at this point, despite the property name.
                crawlResult.ParentUrlHash = _srParentUrl;
            }
            if (_dtDiscoverDate != DateTime.MinValue)
            {
                crawlResult.DiscoverDate = _dtDiscoverDate;
            }

            Stopwatch swTimerCrawling = Stopwatch.StartNew();

            HtmlWeb wbClient = new HtmlWeb();//you should use httpwebrequest for more control and better performance

            wbClient.AutoDetectEncoding = true;
            wbClient.BrowserTimeout     = new TimeSpan(0, 2, 0);

            // Only read after a successful load, so no placeholder document is allocated.
            HtmlAgilityPack.HtmlDocument doc = null;

            try
            {
                doc = wbClient.Load(crawlResult.Url);
                crawlResult.SourceCode = doc.Text;
            }
            catch (Exception E)
            {
                // A failed download is recorded and logged; the attempt is still saved below.
                crawlResult.blcrawlSuccess = false;
                logError(E, "crawlPage");
            }

            // Counts attempts, successful or not.
            Interlocked.Increment(ref irCrawledUrlCount);

            swTimerCrawling.Stop();
            crawlResult.FetchTimeMS      = Convert.ToInt32(swTimerCrawling.ElapsedMilliseconds);
            crawlResult.LastCrawlingDate = DateTime.Now;
            saveCrawlInDatabase(crawlResult);

            if (crawlResult.blcrawlSuccess)
            {
                extractLinks(crawlResult, doc);
                saveDiscoveredLinksInDatabaseForFutureCrawling(crawlResult);
            }
        }
Пример #4
0
        /// <summary>
        /// Adds every discovered link that is not yet present in <c>tblMainUrls</c> as a new row
        /// so that it can be crawled later.
        /// </summary>
        /// <remarks>
        /// Links are identified by their hash. New rows get the crawled page's hash as parent and
        /// a link depth one greater than the crawled page's. All inserts are committed with a
        /// single <c>SaveChanges</c> call while holding <c>_lockDatabaseAdd</c>.
        /// </remarks>
        /// <param name="crawlResult">Crawl result whose <c>lstDiscoveredLinks</c> are to be stored.</param>
        private static void saveDiscoveredLinksInDatabaseForFutureCrawling(crawlingResult crawlResult)
        {
            lock (_lockDatabaseAdd)
            {
                using (var context = new DBCrawling())
                {
                    HashSet<string> hsProcessedUrls = new HashSet<string>();

                    foreach (var vrPerLink in crawlResult.lstDiscoveredLinks)
                    {
                        var vrHashedLink = vrPerLink.ComputeHashOfOurSystem();

                        // Add returns false when the hash was already seen in this batch. Recording
                        // every hash up front (not only newly inserted ones) also avoids repeating
                        // the database lookup for duplicates that already exist in the table.
                        if (!hsProcessedUrls.Add(vrHashedLink))
                        {
                            continue;
                        }

                        bool blAlreadyStored = context.tblMainUrls.Any(databaseRecord => databaseRecord.UrlHash == vrHashedLink);

                        if (blAlreadyStored)
                        {
                            continue;
                        }

                        crawlingResult newLinkCrawlingResult = new crawlingResult();
                        newLinkCrawlingResult.Url            = vrPerLink.normalizeUrl();
                        newLinkCrawlingResult.HostUrl        = newLinkCrawlingResult.Url.returnRootUrl();
                        newLinkCrawlingResult.UrlHash        = vrHashedLink; // reuse the hash computed above
                        newLinkCrawlingResult.ParentUrlHash  = crawlResult.UrlHash;
                        newLinkCrawlingResult.LinkDepthLevel = (short)(crawlResult.LinkDepthLevel + 1);
                        context.tblMainUrls.Add(newLinkCrawlingResult.converToBaseMainUrlClass());
                        Interlocked.Increment(ref irDiscoveredUrlCount);
                    }

                    context.SaveChanges();
                }
            }
        }