Exemple #1
0
        /// <summary>
        /// Parses the HTML of a page, collects every link that matches
        /// <c>m_strCnblogsUrlFilterRule</c>, logs each matching link together with
        /// its link text, and passes the collected links to <c>CheckArticles</c>.
        /// </summary>
        /// <param name="strVisitUrl">
        /// URL of the visited page (per the parameter name). This method does not
        /// read it; the parameter is kept so existing callers continue to compile.
        /// </param>
        /// <param name="strReturnPage">HTML text that is loaded into the HtmlAgilityPack document.</param>
        /// <returns>
        /// Whatever <c>CheckArticles</c> returns for the collected links.
        /// NOTE(review): presumably <c>true</c> means "no article found" - confirm against CheckArticles.
        /// </returns>
        protected bool TakeUrls(string strVisitUrl, string strReturnPage)
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };
            htmlDoc.LoadHtml(strReturnPage);

            DocumentWithLinks links = htmlDoc.GetLinks();

            List <string> lstThisTimesUrls = new List <string>();

            // Union() removes duplicates, so a link that appears in both Links and
            // References is processed (and logged) only once.
            foreach (string link in links.Links.Union(links.References))
            {
                // Skip empty entries and anything the URL filter rule rejects.
                // IsMatch stops at the first hit instead of enumerating every match.
                if (string.IsNullOrEmpty(link) ||
                    !Regex.IsMatch(link, m_strCnblogsUrlFilterRule, RegexOptions.Singleline))
                {
                    continue;
                }

                // Single lookup; links without a recorded text are logged with an empty text.
                string strLinkText;
                if (!links.m_dicLink2Text.TryGetValue(link, out strLinkText))
                {
                    strLinkText = "";
                }

                PrintLog(strLinkText + "\n");
                PrintLog(link + "\n");

                lstThisTimesUrls.Add(link);
            }

            return CheckArticles(lstThisTimesUrls);
        }