Code example #1 — File: UrlCrawlr.cs, Project: IeanD/WebCrawlr
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying. Any failure during the crawl is recorded
        ///     in the error table rather than propagated.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            // Guard clause: skip URLs disallowed by the crawler's rules.
            if (!data.ChkIfUriAllowed(url))
            {
                return;
            }

            try
            {
                var web     = new HtmlWeb();
                var currDoc = web.Load(url);

                // All anchor elements; their href attributes are candidate URLs to queue.
                var urlNodes = currDoc.DocumentNode.Descendants("a").ToList();

                // FirstOrDefault: a page without a <title> element should not abort the
                // crawl (First() would throw and wrongly log the page as an error).
                var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                                   .FirstOrDefault()?.InnerText ?? string.Empty;

                // Look for a <meta name="pubdate" content="..."> element to date the page.
                var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                     .Where(y => y.Attributes.Contains("name")
                                              && y.Attributes["name"].Value == "pubdate")
                                     .ToList();

                // TryParse: a malformed pubdate value should leave the date null,
                // not turn an otherwise-crawlable page into an error record.
                DateTime? urlLastMod = null;
                if (urlLastModNode.Count > 0
                    && DateTime.TryParse(urlLastModNode.First().Attributes["content"].Value, out DateTime parsedLastMod))
                {
                    urlLastMod = parsedLastMod;
                }

                // Queue every link that actually carries an href attribute.
                foreach (var urlNode in urlNodes)
                {
                    if (urlNode.Attributes.Contains("href"))
                    {
                        ChkAndAddUrl(urlNode.Attributes["href"].Value, url, urlLastMod, ref data, ref storage);
                    }
                }

                // Count this URL as indexed only the first time it is seen.
                if (!data.AddedUrls.Contains(url))
                {
                    data.AddedUrls.Add(url);
                    data.NumUrlsIndexed++;
                }
                data.NumUrlsCrawled++;

                // Persist the crawled URL and the updated crawl/index counters.
                FoundUrl      finishedUrl = new FoundUrl(urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                UrlTableCount newCount    = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
                storage.UrlTable.Execute(TableOperation.InsertOrReplace(finishedUrl));
                storage.UrlTable.Execute(TableOperation.InsertOrReplace(newCount));

                // Maintain a rolling window of the ten most recently crawled URLs.
                if (data.LastTenUrls.Count == 10)
                {
                    data.LastTenUrls.Dequeue();
                }
                data.LastTenUrls.Enqueue(url);
            }
            catch (Exception ex)
            {
                // Record the failure against the URL so it can be inspected later.
                ErrorUrl errorUrl = new ErrorUrl(url, ex.ToString());
                storage.ErrorTable.Execute(TableOperation.InsertOrReplace(errorUrl));
            }
        }
Code example #2
        /// <summary>
        ///     Crawls a given URL, queueing all found URLs and storing information about
        ///     the given URL for later querying. Also indexes the page under each
        ///     alphanumeric word of its title.
        /// </summary>
        /// <param name="data">
        ///     Crawler data helper. Ref.
        /// </param>
        /// <param name="storage">
        ///     Crawler azure storage helper. Ref.
        /// </param>
        /// <param name="url">
        ///     The given URL to crawl.
        /// </param>
        public static void CrawlUrl(ref CrawlrDataHelper data, ref CrawlrStorageManager storage, string url)
        {
            // Guard clause: skip URLs disallowed by the crawler's rules.
            if (!data.ChkIfUriAllowed(url))
            {
                return;
            }

            var web     = new HtmlWeb();
            var currDoc = web.Load(url);

            // All anchor elements; their href attributes are candidate URLs to queue.
            var urlNodes = currDoc.DocumentNode.Descendants("a").ToList();

            // FirstOrDefault: a page without a <title> element should not crash the crawl
            // (First() would throw InvalidOperationException).
            var urlPageTitle = currDoc.DocumentNode.Descendants("title")
                               .FirstOrDefault()?.InnerText ?? string.Empty;

            // Look for a <meta name="pubdate" content="..."> element to date the page.
            var urlLastModNode = currDoc.DocumentNode.Descendants("meta")
                                 .Where(y => y.Attributes.Contains("name")
                                          && y.Attributes["name"].Value == "pubdate")
                                 .ToList();

            // TryParse: a malformed pubdate value should leave the date null, not throw.
            DateTime? urlLastMod = null;
            if (urlLastModNode.Count > 0
                && DateTime.TryParse(urlLastModNode.First().Attributes["content"].Value, out DateTime parsedLastMod))
            {
                urlLastMod = parsedLastMod;
            }

            // Queue every link that actually carries an href attribute.
            // NOTE(review): this variant passes null (not urlLastMod) to ChkAndAddUrl,
            // unlike the sibling implementation — preserved as-is; confirm intent.
            foreach (var urlNode in urlNodes)
            {
                if (urlNode.Attributes.Contains("href"))
                {
                    ChkAndAddUrl(urlNode.Attributes["href"].Value, url, null, ref data, ref storage);
                }
            }

            if (!data.AddedUrls.Contains(url))
            {
                data.AddedUrls.Add(url);
            }
            data.NumUrlsCrawled++;

            // Index the page under each alphanumeric word of its title.
            foreach (string s in urlPageTitle.Split(' '))
            {
                string plainText = Regex.Replace(s.ToLower(), "[^a-zA-Z0-9]", "");
                if (plainText != "")
                {
                    IndexedUrl wordToUrl = new IndexedUrl(plainText, urlPageTitle, (urlLastMod != null ? urlLastMod.ToString() : "NULL"), url);
                    storage.UrlTable.Execute(TableOperation.InsertOrReplace(wordToUrl));
                    data.NumUrlsIndexed++;
                }
            }

            // Persist the updated crawl/index counters.
            UrlTableCount newCount = new UrlTableCount(data.NumUrlsCrawled, data.NumUrlsIndexed);
            storage.UrlTable.Execute(TableOperation.InsertOrReplace(newCount));

            // Maintain a rolling window of the ten most recently crawled URLs.
            if (data.LastTenUrls.Count == 10)
            {
                data.LastTenUrls.Dequeue();
            }
            data.LastTenUrls.Enqueue(url);
        }