Example #1
0
        private static void GetSiteLinks(RssSource rs, string strContent)
        {
            string strUrlRule = rs.strArticleUrlPattern;

            strUrlRule = strUrlRule.Replace(".", "\\.");
            strUrlRule = strUrlRule.Replace("*", ".*?");

            HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strContent);

            if (rs.strArticleUrlRangeCssPath != "")
            {
                IEnumerable <HtmlNode> NodesUrlContent = htmlDoc.DocumentNode.QuerySelectorAll(rs.strArticleUrlRangeCssPath);
                if (NodesUrlContent.Count() > 0)
                {
                    string strReturnPage = NodesUrlContent.ToArray()[0].InnerHtml;//进一步缩小范围
                    htmlDoc = GetHtmlDocument(strReturnPage);
                }
            }

            string            baseUrl = GetUrlLeftPart(rs.strSiteUrl);
            DocumentWithLinks links   = htmlDoc.GetLinks();



            foreach (string link in links.Links.Union(links.References))
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }


                string decodedLink = link;

                string normalizedLink = GetNormalizedLink(baseUrl, decodedLink);


                if (string.IsNullOrEmpty(normalizedLink))
                {
                    continue;
                }

                MatchCollection matchs = Regex.Matches(normalizedLink, strUrlRule, RegexOptions.Singleline);
                if (matchs.Count > 0)
                {
                    string strLinkText = "";

                    foreach (string strTemp in links.m_dicLink2Text.Keys)
                    {
                        if (strTemp.Contains(normalizedLink))
                        {
                            strLinkText = links.m_dicLink2Text[strTemp];
                            break;
                        }
                    }

                    if (strLinkText == "")
                    {
                        if (links.m_dicLink2Text.Keys.Contains(link))
                        {
                            strLinkText = links.m_dicLink2Text[link].TrimEnd().TrimStart();
                        }
                        if (links.m_dicLink2Text.Keys.Contains(link.ToLower()))
                        {
                            strLinkText = links.m_dicLink2Text[link.ToLower()].TrimEnd().TrimStart();
                        }
                    }
                    SaveUrlToDB(normalizedLink, strLinkText);
                    Console.WriteLine(normalizedLink);
                }
            }
            return;
        }
Example #2
0
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {


            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            string stepUri = Uri.UnescapeDataString(propertyBag.Step.Uri.AbsoluteUri);
            if (stepUri.Length > 396)
            {
                stepUri = stepUri.Substring(0, 396);
            }
            var crawlHistory = AspectF.Define.
               Return<CrawlHistory, NCrawlerEntitiesDbServices>(
                   e => e.CrawlHistory.Where(m => m.Key == stepUri).FirstOrDefault());

            if (crawlHistory == null)
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", stepUri);
                });
                return;
            }
            try
            {
                if (propertyBag.StatusCode != HttpStatusCode.OK)
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }

                if (!IsHtmlContent(propertyBag.ContentType))
                {
                    AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                    {
                        e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                        //CrawlQueue result = e.CrawlQueue.FirstOrDefault(q => q.Key == crawlHistory.Key);
                        //if (!result.IsNull())
                        //{
                        //    e.DeleteObject(result);
                        //    e.SaveChanges();
                        //}
                    });
                    return;
                }
                HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
                using (Stream reader = propertyBag.GetResponse())
                {
                    Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                    reader.Seek(0, SeekOrigin.Begin);
                    if (!documentEncoding.IsNull())
                    {
                        htmlDoc.Load(reader, documentEncoding, true);
                    }
                    else
                    {
                        htmlDoc.Load(reader, true);
                    }

                    //string content = reader.ReadToEnd();
                    //resultHtmlContent = content;
                }
                //string steplUri = propertyBag.ResponseUri.OriginalString;


                string orginalHtmlContent = htmlDoc.DocumentNode.OuterHtml;
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                DocumentWithLinks links = htmlDoc.GetLinks();



                //string urlRegex = @"^http://www.bbc.co.uk/food/recipes/[^#/]+$";
                List<string> recipeRegex = null;
                var jsonStr = cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite") as string;
                if (jsonStr == null)
                {
                    using (var stream = new StreamReader(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite.txt", Encoding.UTF8))
                    {
                        jsonStr = stream.ReadToEnd();
                        var policy = new CacheItemPolicy();
                        policy.Priority = CacheItemPriority.NotRemovable;
                        policy.AbsoluteExpiration = DateTimeOffset.Now.AddDays(1);
                        cache.Set(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite", jsonStr, policy);
                        Console.WriteLine("cache --" + AppDomain.CurrentDomain.BaseDirectory + " :" + cache.Get(AppDomain.CurrentDomain.BaseDirectory + "OriginalWebSite"));
                    }
                }
                var json = JsonConvert.DeserializeObject<OriginalWebSiteTxt>(jsonStr);
                if (json.RecipeRegex != null && json.RecipeRegex.Count > 0)
                {
                    recipeRegex = json.RecipeRegex;
                }
                bool needToStore = false;

                if (recipeRegex != null)
                {
                    foreach (var regex in recipeRegex)
                    {
                        if (Regex.IsMatch(propertyBag.Step.Uri.AbsoluteUri, regex, RegexOptions.IgnoreCase))
                        {
                            needToStore = true;
                            break;
                        }
                    }
                }
                else
                {
                    needToStore = true;
                }

                if (needToStore)
                {
                    //string folderPath = "D:/CrawlerManager/CrawlerData";
                    //string instanceFolderPath = folderPath + "/" + crawlHistory.GroupId;
                    //string path = folderPath + "/" + crawlHistory.GroupId + "/" + string.Format("{0}.txt", crawlHistory.Id);
                    //if (!Directory.Exists(folderPath))
                    //{
                    //    Directory.CreateDirectory(folderPath);
                    //}
                    //if (!Directory.Exists(instanceFolderPath))
                    //{
                    //    Directory.CreateDirectory(instanceFolderPath);
                    //}

                    //if (!File.Exists(path))
                    //{
                    //    try
                    //    {

                    //        using (StreamWriter sw = File.CreateText(path))
                    //        {
                    //            sw.WriteLine(orginalHtmlContent);
                    //        }

                    //    }
                    //    catch (Exception ex)
                    //    {
                    //        log4net.Config.XmlConfigurator.Configure();
                    //        log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                    //        log.Error(ex);
                    //    }
                    //}
                    var folderHelper = new FolderHelper();
                    var path = folderHelper.GetFolderPathToStore(crawlHistory.GroupId) + "/" + string.Format("{0}.txt", crawlHistory.Id);
                    Console.Write(path);

                    if (!File.Exists(path))
                    {
                        try
                        {
                            using (StreamWriter sw = File.CreateText(path))
                            {
                                sw.WriteLine(orginalHtmlContent);
                            }

                        }
                        catch (Exception ex)
                        {
                            log4net.Config.XmlConfigurator.Configure();
                            log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                            log.Error(ex);
                        }
                    }
                    //}
                }



                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                });

                foreach (string link in links.Links.Union(links.References))
                {
                    if (link.IsNullOrEmpty() || link.Length > 396)
                    {
                        continue;
                    }

                    string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                    string normalizedLink = "";
                    try
                    {
                        normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }
                    
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }
                    if (link.Contains("page="))
                    {
                        var a = 1;
                    }


                    crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                        propertyBag.Step, new Dictionary<string, object>
                        {
                            {Resources.PropertyBagKeyOriginalUrl, link},
                            {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                        });
                }

            }
            catch (Exception ex)
            {
                AspectF.Define.Do<NCrawlerEntitiesDbServices>(e =>
                {
                    e.ExecuteStoreCommand("delete Crawlqueue where [key] ={0}", crawlHistory.Key);
                });
                log4net.Config.XmlConfigurator.Configure();
                log4net.ILog log = log4net.LogManager.GetLogger("logger-name");
                log.Error(ex);
            }
        }
Example #3
0
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
                NotNull(crawler, "crawler").
                NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
                {
                    OptionAddDebuggingAttributes = false,
                    OptionAutoCloseOnEnd = true,
                    OptionFixNestedTags = true,
                    OptionReadEncoding = true
                };
            using (Stream reader = propertyBag.GetResponse())
            {
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            string originalContent = htmlDoc.DocumentNode.OuterHtml;
            if (HasTextStripRules || HasSubstitutionRules)
            {
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
            // Extract Title
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                    Select(n => n.InnerText).
                    ToArray()).Trim();
            }

            // Extract Meta Data
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                    let content = entry.Attributes["content"]
                    where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                    select name.Value + ": " + content.Value).ToArray();
            }

            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();
            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                    propertyBag.Step, new Dictionary<string, object>
                        {
                            {Resources.PropertyBagKeyOriginalUrl, link},
                            {Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri}
                        });
            }
        }
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define
				.NotNull(crawler, nameof(crawler))
				.NotNull(propertyBag, nameof(propertyBag));

			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return Task.FromResult(true);
			}

			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			HtmlDocument htmlDoc = new HtmlDocument
			{
				OptionAddDebuggingAttributes = false,
				OptionAutoCloseOnEnd = true,
				OptionFixNestedTags = true,
				OptionReadEncoding = true
			};

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
				ms.Seek(0, SeekOrigin.Begin);
				if (!documentEncoding.IsNull())
				{
					htmlDoc.Load(ms, documentEncoding, true);
				}
				else
				{
					htmlDoc.Load(ms, true);
				}
			}

			string originalContent = htmlDoc.DocumentNode.OuterHtml;
			if (HasTextStripRules || HasSubstitutionRules)
			{
				string content = StripText(originalContent);
				content = Substitute(content, propertyBag.Step);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			propertyBag["HtmlDoc"].Value = htmlDoc;

			HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
			// Extract Title
			if (!nodes.IsNull())
			{
				propertyBag.Title = string.Join(";", nodes.
					Select(n => n.InnerText).
					ToArray()).Trim();
			}

			// Extract Meta Data
			nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
			if (!nodes.IsNull())
			{
				propertyBag["Meta"].Value = (
					from entry in nodes
					let name = entry.Attributes["name"]
					let content = entry.Attributes["content"]
					where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
					select $"{name.Value}: {content.Value}").ToArray();
			}

			// Extract text
			propertyBag.Text = htmlDoc.ExtractText().Trim();
			if (HasLinkStripRules || HasTextStripRules)
			{
				string content = StripLinks(originalContent);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

			// Extract Head Base
			nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
			if (!nodes.IsNull())
			{
				baseUrl = nodes
					.Select(entry => new {entry, href = entry.Attributes["href"]})
					.Where(arg => !arg.href.IsNull()
						&& !arg.href.Value.IsNullOrEmpty()
						&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
					.Select(t =>
					{
						if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
						{
							return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
						}

						return t.href.Value;
					})
					.AddToEnd(baseUrl)
					.FirstOrDefault();
			}

			// Extract Links
			DocumentWithLinks links = htmlDoc.GetLinks();
			foreach (string link in links.Links.Union(links.References))
			{
				if (link.IsNullOrEmpty())
				{
					continue;
				}

				string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
				string normalizedLink = NormalizeLink(baseUrl, decodedLink);
				if (normalizedLink.IsNullOrEmpty())
				{
					continue;
				}

				crawler.Crawl(new Uri(normalizedLink), propertyBag);
			}

			return Task.FromResult(true);
		}
        protected bool SaveUrlToDB(string strVisitUrl, string strReturnPage, DoWorkEventArgs e)
        {
            try
            {
                string strUrlFilterRule = GetUrlFilterRule();
                HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);
                string strUrlCss = GetUrlCss();
                if (strUrlCss != "")
                {
                    IEnumerable <HtmlNode> NodesUrlContent = htmlDoc.DocumentNode.QuerySelectorAll(strUrlCss);
                    if (NodesUrlContent.Count() > 0)
                    {
                        strReturnPage = NodesUrlContent.ToArray()[0].InnerHtml;//进一步缩小范围
                        htmlDoc       = GetHtmlDocument(strReturnPage);
                    }
                }
                string            baseUrl = new Uri(strVisitUrl).GetLeftPart(UriPartial.Authority);
                DocumentWithLinks links   = htmlDoc.GetLinks();


                foreach (string link in links.Links.Union(links.References))
                {
                    if (string.IsNullOrEmpty(link))
                    {
                        continue;
                    }

                    string decodedLink = link;

                    string normalizedLink = GetNormalizedLink(baseUrl, decodedLink);


                    if (string.IsNullOrEmpty(normalizedLink))
                    {
                        continue;
                    }

                    MatchCollection matchs = Regex.Matches(normalizedLink, strUrlFilterRule, RegexOptions.Singleline);
                    if (matchs.Count > 0)
                    {
                        string strLinkText = "";

                        if (links.m_dicLink2Text.Keys.Contains(normalizedLink))
                        {
                            strLinkText = links.m_dicLink2Text[normalizedLink];
                        }

                        if (strLinkText == "")
                        {
                            if (links.m_dicLink2Text.Keys.Contains(link))
                            {
                                strLinkText = links.m_dicLink2Text[link].TrimEnd().TrimStart();
                            }
                        }

                        lstThisTimesUrls.Add(normalizedLink);

                        if (!m_dicUrl2Title.Keys.Contains(normalizedLink))
                        {
                            m_dicUrl2Title.Add(normalizedLink, strLinkText);
                        }
                        m_wd.AddUrlQueue(normalizedLink);
                    }
                }
            }
            catch (Exception ex)
            {
            }
            bool bNoArticle = CheckArticles(lstThisTimesUrls);

            return(bNoArticle);
        }
Example #6
0
        protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List <DetailLink> list)
        {
            Dictionary <string, string> m_dicLink2Text = new Dictionary <string, string>();
            string strUrlFilterRule = s.ListPattern;

            //strUrlFilterRule = ParseUrl(strUrlFilterRule);
            HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);

            // string baseUrl = new Uri(strVisitUrl).GetLeftPart(UriPartial.Authority);
            string            baseUrl   = GetUrlLeftPart(s.ListUrl);
            DocumentWithLinks links     = htmlDoc.GetLinks();
            bool          bNoArticle    = true;
            List <string> lstRevomeSame = new List <string>();

            //  int nCountPerPage = 0;
            //  bool bExistFind = false;
            //  List<string> lstNeedDownLoad = new List<string>();
            foreach (string link in links.Links.Union(links.References))
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                //string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string decodedLink = link;
                //if (decodedLink != link)
                //{
                //    int a = 1;
                //}
                //Console.WriteLine(decodedLink);
                string normalizedLink = GetNormalizedLink(baseUrl, decodedLink);
                //Console.WriteLine(normalizedLink);

                if (string.IsNullOrEmpty(normalizedLink))
                {
                    continue;
                }

                MatchCollection matchs = Regex.Matches(normalizedLink, strUrlFilterRule, RegexOptions.Singleline);
                if (matchs.Count > 0)
                {
                    string strLinkText = "";

                    foreach (string strTemp in links.m_dicLink2Text.Keys)
                    {
                        if (strTemp.Contains(normalizedLink))
                        {
                            strLinkText = links.m_dicLink2Text[strTemp];
                            break;
                        }
                    }
                    //if (links.m_dicLink2Text.Keys.Contains(normalizedLink))
                    //    strLinkText = links.m_dicLink2Text[normalizedLink];

                    if (strLinkText == "")
                    {
                        if (links.m_dicLink2Text.Keys.Contains(link))
                        {
                            strLinkText = links.m_dicLink2Text[link].TrimEnd().TrimStart();
                        }
                        if (links.m_dicLink2Text.Keys.Contains(link.ToLower()))
                        {
                            strLinkText = links.m_dicLink2Text[link.ToLower()].TrimEnd().TrimStart();
                        }
                    }

                    if (lstRevomeSame.Contains(normalizedLink))
                    {
                        continue;
                    }
                    else
                    {
                        lstRevomeSame.Add(normalizedLink);
                    }


                    //bool bRet = AddLayerNodeToSaveUrlToDB(m_strWholeDbName, normalizedLink, ref strLinkText);
                    DetailLink lnk = new DetailLink();
                    lnk.Template = s;
                    lnk.Url      = normalizedLink;
                    lnk.Title    = strLinkText;
                    list.Add(lnk);
                }
                //Console.WriteLine(" uri is " + normalizedLink.ToString());
            }



            return;
        }