Beispiel #1
0
        /// <summary>
        /// Fetches the detail page behind <paramref name="lnk"/>, stores it as a
        /// <c>SpiderArticle</c>, and then adds a draft <c>BlogPost</c> carrying the same
        /// content followed by a link back to the original page.
        /// </summary>
        /// <param name="lnk">Link to process; its Template, Url, Title and Abstract are read.</param>
        /// <param name="sb">Log buffer; a success line is appended once the article is inserted.</param>
        private static void savePageDetail( DetailLink lnk, StringBuilder sb )
        {
            SpiderTemplate template = lnk.Template;
            string url = lnk.Url;
            string title = lnk.Title;
            string summary = lnk.Abstract;

            // Skip the link when isPageExist reports true for this url.
            if (isPageExist( url, sb )) return;

            String pageBody = new PagedDetailSpider().GetContent( url, template, sb );

            // GetContent returning null means there is nothing to store.
            if (pageBody == null) return;

            SpiderArticle pd = new SpiderArticle();
            pd.Title = title;
            pd.Url = strUtil.SubString( url, 250 );
            pd.Abstract = summary;
            pd.Body = pageBody;
            pd.SpiderTemplate = template;

            // The first image found in the body (if any) becomes the article picture.
            MatchCollection matchs = Regex.Matches( pageBody, RegPattern.Img, RegexOptions.Singleline );
            if (matchs.Count > 0) {
                pd.IsPic = 1;
                pd.PicUrl = matchs[0].Groups[1].Value;
            }

            pd.insert();

            sb.AppendLine( "保存成功..." + lnk.Title + "_" + lnk.Url );

            // Remove every occurrence of the text "font-size" (case-insensitive) from the body.
            pageBody = Regex.Replace(pageBody, "font-size", "", RegexOptions.IgnoreCase);

            // Link back to the source page. The attribute value is quoted (with embedded
            // double quotes escaped) so urls containing spaces or '>' do not break the markup,
            // and the full url is used because pd.Url has been passed through
            // strUtil.SubString( url, 250 ) and may no longer be the complete address.
            string linkTarget = (url ?? "").Replace( "\"", "&quot;" );
            string strArcitleLink = "<div class=\"ArcitleLink\"><a href=\"" + linkTarget + "\">原文链接</a></div>";
            pageBody = pageBody + strArcitleLink;

            // Look up the blog category by AppId; category 1 is the fallback when none is found.
            // NOTE(review): template.IsDelete is reused below as app/owner/creator id — confirm this is intended.
            Maticsoft.BLL.BlogCategory bllBlogCategory = new Maticsoft.BLL.BlogCategory();
            DataSet ds = bllBlogCategory.GetList("AppId = '" + template.IsDelete.ToString() + "'");
            int nCateID = 1;
            if (ds != null && ds.Tables.Count > 0 && ds.Tables[0].Rows.Count > 0)
            {
                // Convert.ToInt32 also accepts other integral column types, where a direct
                // (int) unboxing cast would throw InvalidCastException.
                nCateID = Convert.ToInt32( ds.Tables[0].Rows[0]["Id"] );
            }

            BlogPost data = new BlogPost();

            data.CategoryId = nCateID;
            data.Title = title;
            data.Abstract = summary;
            data.Content = pageBody;
            data.AccessStatus = 0;
            data.CommentCondition = 0;
            data.SaveStatus = 1; // draft
            data.Created = System.DateTime.Now.Date;
            data.IsTop = 0;
            data.IsPick = 0;
            data.IsPic = 0;
            data.Ip = "";
            data.OwnerId = template.IsDelete;
            data.OwnerUrl = template.SiteName;
            data.OwnerType = "wojilu.Members.Users.Domain.User";
            data.CreatorUrl = template.SiteName;
            data.AppId = template.IsDelete;
            data.CreatorId = template.IsDelete;

            Maticsoft.BLL.BlogPost bll = new Maticsoft.BLL.BlogPost();
            bll.Add(data);
        }
Beispiel #2
0
        /// <summary>
        /// Builds a <c>DetailLink</c> from one match found on a list page, or returns null
        /// when the match should be skipped.
        /// </summary>
        /// <param name="match">
        /// Match whose group 1 is read as the url, group 2 as the title (markup is stripped)
        /// and, when the match has one, group 3 as the summary.
        /// </param>
        /// <param name="s">Template supplying ListPattern (url filter) and SiteUrl (base for non-http urls).</param>
        /// <returns>The populated link, or null for filtered-out urls and titles.</returns>
        private static DetailLink getDetailLink( Match match, SpiderTemplate s )
        {
            string url = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // Check whether the url satisfies the user-defined wildcard-style pattern.
            if (Regex.IsMatch( url, ParseUrl( s.ListPattern ), RegexOptions.Singleline ) == false) {
                return null;
            }

            // Script pseudo-links and in-page anchors are never detail pages. The scheme test
            // is ordinal and case-insensitive so "JavaScript:" is rejected as well.
            if (url.IndexOf( "javascript:", System.StringComparison.OrdinalIgnoreCase ) >= 0) return null;
            if (url.StartsWith( "#", System.StringComparison.Ordinal )) return null;

            // Strip tags from the title, then drop empty titles and "more" style navigation links.
            title = Regex.Replace( title, "<.+?>", "" );
            if (strUtil.IsNullOrEmpty( title )) return null;
            if (title == "更多") return null;
            if (title == "more") return null;
            if (title == "更多&gt;&gt;") return null;

            // Groups[0] is the whole match, so group 3 exists only when Count is greater than 3.
            string summary = "";
            if (match.Groups.Count > 3) summary = match.Groups[3].Value;

            // Urls that do not start with "http" (any casing) are joined onto the site url.
            if (url.StartsWith( "http", System.StringComparison.OrdinalIgnoreCase ) == false) url = strUtil.Join( s.SiteUrl, url );

            DetailLink lnk = new DetailLink();
            lnk.Template = s;
            lnk.Url = url;
            lnk.Title = title;
            lnk.Abstract = summary;

            return lnk;
        }
Beispiel #3
0
        /// <summary>
        /// Fetches the detail page behind <paramref name="lnk"/> and inserts it as a
        /// <c>SpiderArticle</c>, logging a success line to <paramref name="sb"/>.
        /// </summary>
        /// <param name="lnk">Link to process; its Template, Url, Title and Abstract are read.</param>
        /// <param name="sb">Log buffer, also handed to isPageExist and GetContent.</param>
        private static void savePageDetail( DetailLink lnk, StringBuilder sb )
        {
            SpiderTemplate tpl = lnk.Template;
            string pageUrl = lnk.Url;
            string pageTitle = lnk.Title;
            string pageAbstract = lnk.Abstract;

            // Skip the link when isPageExist reports true for this url.
            if (isPageExist( pageUrl, sb )) {
                return;
            }

            PagedDetailSpider spider = new PagedDetailSpider();
            String content = spider.GetContent( pageUrl, tpl, sb );

            // A null body means there is nothing to store.
            if (content == null) {
                return;
            }

            SpiderArticle article = new SpiderArticle {
                Title = pageTitle,
                Url = strUtil.SubString( pageUrl, 200 ),
                Abstract = pageAbstract,
                Body = content,
                SpiderTemplate = tpl
            };

            // The first image found in the body (if any) becomes the article picture.
            Match firstImage = Regex.Match( content, RegPattern.Img, RegexOptions.Singleline );
            if (firstImage.Success) {
                article.IsPic = 1;
                article.PicUrl = firstImage.Groups[1].Value;
            }

            article.insert();

            string logLine = "保存成功..." + lnk.Title + "_" + lnk.Url;
            sb.AppendLine( logLine );
        }
Beispiel #4
0
        /// <summary>
        /// Extracts the links from a downloaded list page, keeps those whose normalized url
        /// matches the template's ListPattern, and appends one <c>DetailLink</c> per distinct
        /// url to <paramref name="list"/>. Nothing in this method itself writes to a database.
        /// </summary>
        /// <param name="strReturnPage">Page content handed to GetHtmlDocument.</param>
        /// <param name="s">Template supplying ListPattern (regex filter) and ListUrl (base url source).</param>
        /// <param name="list">Receives the links that pass the filter, in encounter order.</param>
        protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List<DetailLink> list)
        {
            string strUrlFilterRule = s.ListPattern;
            HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);

            string baseUrl = GetUrlLeftPart(s.ListUrl);
            DocumentWithLinks links = htmlDoc.GetLinks();

            // Normalized urls already emitted; a set keeps the duplicate check cheap.
            HashSet<string> seenLinks = new HashSet<string>();

            foreach (string link in links.Links.Union(links.References))
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                string normalizedLink = GetNormalizedLink(baseUrl, link);
                if (string.IsNullOrEmpty(normalizedLink))
                {
                    continue;
                }

                // Only urls matching the template's pattern are of interest.
                if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
                {
                    continue;
                }

                // Add returns false for an url that was already emitted; checking before the
                // text lookup avoids doing that lookup for duplicates.
                if (!seenLinks.Add(normalizedLink))
                {
                    continue;
                }

                // Prefer the text of the first recorded link whose key contains the normalized url.
                string strLinkText = "";
                foreach (string strTemp in links.m_dicLink2Text.Keys)
                {
                    if (strTemp.Contains(normalizedLink))
                    {
                        strLinkText = links.m_dicLink2Text[strTemp];
                        break;
                    }
                }

                // Fall back to the raw link, then its lower-cased form (the latter wins when both exist).
                if (strLinkText == "")
                {
                    if (links.m_dicLink2Text.Keys.Contains(link))
                        strLinkText = links.m_dicLink2Text[link].Trim();
                    if (links.m_dicLink2Text.Keys.Contains(link.ToLower()))
                        strLinkText = links.m_dicLink2Text[link.ToLower()].Trim();
                }

                DetailLink lnk = new DetailLink();
                lnk.Template = s;
                lnk.Url = normalizedLink;
                lnk.Title = strLinkText;
                list.Add(lnk);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Builds a <c>DetailLink</c> from one match found on a list page, or returns null
        /// when the match should be skipped.
        /// </summary>
        /// <param name="match">
        /// Match whose group 1 is read as the url, group 2 as the title (markup is stripped)
        /// and, when the match has one, group 3 as the summary.
        /// </param>
        /// <param name="s">Template supplying SiteUrl, the base for urls that do not start with "http".</param>
        /// <returns>The populated link, or null for filtered-out urls and titles.</returns>
        private static DetailLink getDetailLink( Match match, SpiderTemplate s )
        {
            string url = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // Script pseudo-links and in-page anchors are never detail pages. The scheme test
            // is ordinal and case-insensitive so "JavaScript:" is rejected as well.
            if (url.IndexOf( "javascript:", System.StringComparison.OrdinalIgnoreCase ) >= 0) return null;
            if (url.StartsWith( "#", System.StringComparison.Ordinal )) return null;

            // Strip tags from the title, then drop empty titles and "more" style navigation links.
            title = Regex.Replace( title, "<.+?>", "" );
            if (strUtil.IsNullOrEmpty( title )) return null;
            if (title == "更多") return null;
            if (title == "more") return null;
            if (title == "更多&gt;&gt;") return null;

            // Groups[0] is the whole match, so group 3 exists only when Count is greater than 3.
            string summary = "";
            if (match.Groups.Count > 3) summary = match.Groups[3].Value;

            // Urls that do not start with "http" (any casing) are joined onto the site url.
            if (url.StartsWith( "http", System.StringComparison.OrdinalIgnoreCase ) == false) url = strUtil.Join( s.SiteUrl, url );

            DetailLink lnk = new DetailLink();
            lnk.Template = s;
            lnk.Url = url;
            lnk.Title = title;
            lnk.Abstract = summary;

            return lnk;
        }