Beispiel #1
0
        /// <summary>
        /// Builds a <c>DetailLink</c> from a single list-page regex match, or returns
        /// null when the match should be skipped.
        /// </summary>
        /// <param name="match">
        /// Match whose group 1 holds the URL, group 2 the title (may contain markup)
        /// and, when the pattern defines it, group 3 the summary.
        /// </param>
        /// <param name="s">
        /// Template providing the URL filter (<c>ListPattern</c>, passed through
        /// <c>ParseUrl</c>) and the site root (<c>SiteUrl</c>) used for relative URLs.
        /// </param>
        /// <returns>
        /// The populated link, or null when the URL does not satisfy the template
        /// pattern, is a script/fragment link, or the title is empty or a "more" label.
        /// </returns>
        private static DetailLink getDetailLink(Match match, SpiderTemplate s)
        {
            string url   = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // The URL must satisfy the user-defined wildcard-style pattern.
            if (!Regex.IsMatch(url, ParseUrl(s.ListPattern), RegexOptions.Singleline))
            {
                return(null);
            }

            // URL schemes are case-insensitive, so "JavaScript:" must be rejected too.
            if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return(null);
            }
            // Pure in-page fragment links carry no detail page.
            if (url.StartsWith("#", System.StringComparison.Ordinal))
            {
                return(null);
            }

            // Strip any markup from the title before inspecting it.
            title = Regex.Replace(title, "<.+?>", "");
            if (strUtil.IsNullOrEmpty(title))
            {
                return(null);
            }
            // "More" style navigation links are not articles.
            if (title == "更多" || title == "more" || title == "更多&gt;&gt;")
            {
                return(null);
            }

            string summary = "";

            // Groups[0] is the whole match, so group 3 exists only when Count > 3.
            if (match.Groups.Count > 3)
            {
                summary = match.Groups[3].Value;
            }

            // Anything that does not start with "http" is resolved against the site root.
            if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
            {
                url = strUtil.Join(s.SiteUrl, url);
            }

            DetailLink lnk = new DetailLink();

            lnk.Template = s;
            lnk.Url      = url;
            lnk.Title    = title;
            lnk.Abstract = summary;

            return(lnk);
        }
Beispiel #2
0
        /// <summary>
        /// Collects the detail links found on a list page by applying the template's
        /// <c>ListPattern</c> to the page text.
        /// </summary>
        /// <param name="s">Template whose <c>ListPattern</c> is used as the regex.</param>
        /// <param name="page">Text of the list page; null or empty yields an empty list.</param>
        /// <param name="sb">Log buffer that receives the raw match count.</param>
        /// <returns>
        /// Accepted links, visited from the last match to the first; entries rejected by
        /// <c>getDetailLink</c> or whose URL is longer than 100 characters are left out.
        /// </returns>
        public static List <DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
        {
            List <DetailLink> results = new List <DetailLink>();

            if (!strUtil.IsNullOrEmpty(page))
            {
                MatchCollection found = Regex.Matches(page, s.ListPattern, RegexOptions.Singleline);

                sb.AppendLine("共抓取到链接:" + found.Count);

                // Walk the matches backwards, from the last one to the first.
                int position = found.Count;
                while (--position >= 0)
                {
                    DetailLink candidate = getDetailLink(found[position], s);

                    // Keep only usable links whose URL is at most 100 characters long.
                    if (candidate != null && candidate.Url.Length <= 100)
                    {
                        results.Add(candidate);
                    }
                }
            }

            return(results);
        }
Beispiel #3
0
        /// <summary>
        /// Collects the detail links found on a list page by applying the global
        /// <c>SpiderConfig.ListLinkPattern</c> to the page text.
        /// </summary>
        /// <param name="s">Template handed to <c>getDetailLink</c> for every match.</param>
        /// <param name="page">Text of the list page; null or empty yields an empty list.</param>
        /// <param name="sb">Log buffer that receives the number of accepted links.</param>
        /// <returns>
        /// Accepted links, visited from the last match to the first; entries rejected by
        /// <c>getDetailLink</c> or whose URL is longer than 100 characters are left out.
        /// </returns>
        public static List <DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
        {
            List <DetailLink> results = new List <DetailLink>();

            if (!strUtil.IsNullOrEmpty(page))
            {
                // Grab every link on the page.
                MatchCollection found = Regex.Matches(page, SpiderConfig.ListLinkPattern, RegexOptions.Singleline);

                if (found.Count == 0)
                {
                    logger.Error("list link match count=0");
                }

                // Walk the matches backwards, from the last one to the first.
                int position = found.Count;
                while (--position >= 0)
                {
                    DetailLink candidate = getDetailLink(found[position], s);

                    // Keep only usable links whose URL is at most 100 characters long.
                    if (candidate != null && candidate.Url.Length <= 100)
                    {
                        results.Add(candidate);
                    }
                }

                sb.AppendLine("共抓取到链接:" + results.Count);
            }

            return(results);
        }
Beispiel #4
0
        /// <summary>
        /// Builds a <c>DetailLink</c> from a single list-page regex match, or returns
        /// null when the match should be skipped.
        /// </summary>
        /// <param name="match">
        /// Match whose group 1 holds the URL, group 2 the title (may contain markup)
        /// and, when the pattern defines it, group 3 the summary.
        /// </param>
        /// <param name="s">Template providing the site root (<c>SiteUrl</c>) used for relative URLs.</param>
        /// <returns>
        /// The populated link, or null when the URL is a script/fragment link or the
        /// title is empty or a "more" label.
        /// </returns>
        private static DetailLink getDetailLink(Match match, SpiderTemplate s)
        {
            string url   = match.Groups[1].Value;
            string title = match.Groups[2].Value;

            // URL schemes are case-insensitive, so "JavaScript:" must be rejected too.
            if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return(null);
            }
            // Pure in-page fragment links carry no detail page.
            if (url.StartsWith("#", System.StringComparison.Ordinal))
            {
                return(null);
            }

            // Strip any markup from the title before inspecting it.
            title = Regex.Replace(title, "<.+?>", "");
            if (strUtil.IsNullOrEmpty(title))
            {
                return(null);
            }
            // "More" style navigation links are not articles.
            if (title == "更多" || title == "more" || title == "更多&gt;&gt;")
            {
                return(null);
            }

            string summary = "";

            // Groups[0] is the whole match, so group 3 exists only when Count > 3.
            if (match.Groups.Count > 3)
            {
                summary = match.Groups[3].Value;
            }

            // Anything that does not start with "http" is resolved against the site root.
            if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
            {
                url = strUtil.Join(s.SiteUrl, url);
            }

            DetailLink lnk = new DetailLink();

            lnk.Template = s;
            lnk.Url      = url;
            lnk.Title    = title;
            lnk.Abstract = summary;

            return(lnk);
        }
Beispiel #5
0
        /// <summary>
        /// Creates a copy of this layer. The copy starts as a memberwise clone, receives a
        /// fresh ID, and gets its own clones of the action buttons, items, optional
        /// buttons/links and layer style.
        /// </summary>
        /// <returns>The cloned instance.</returns>
        protected virtual object CloneOverride()
        {
            var copy = (iLayer)MemberwiseClone();

            // The copy must not share the identity of the original.
            copy.ID = Guid.NewGuid().ToString();

            var buttonClones = ActionButtons.Select(button => button.Clone());
            copy.ActionButtons = new List <Button>(buttonClones);

            // While cloning the items, re-point the copy's FocusedItem at the clone that
            // corresponds to the original's focused item. That item is either one of the
            // items themselves or an element of an item that is itself an IList.
            var itemClones = Items.Select(item =>
            {
                var itemCopy = item.Clone();
                if (item == FocusedItem)
                {
                    copy.FocusedItem = itemCopy;
                }
                else
                {
                    var children = item as IList;
                    if (children != null)
                    {
                        int position = children.IndexOf(FocusedItem);
                        if (position >= 0)
                        {
                            // Same position inside the cloned list.
                            copy.FocusedItem = ((IList)itemCopy)[position];
                        }
                    }
                }
                return(itemCopy);
            });
            copy.Items = new List <iLayerItem>(itemClones);

            // Optional members are cloned only when present.
            if (BackButton != null)
            {
                copy.BackButton = BackButton.Clone();
            }
            if (CompositeLayerLink != null)
            {
                copy.CompositeLayerLink = CompositeLayerLink.Clone();
            }
            if (CompositeActionButton != null)
            {
                copy.CompositeActionButton = CompositeActionButton.Clone();
            }
            if (DetailLink != null)
            {
                copy.DetailLink = DetailLink.Clone();
            }
            // Assigned as-is (not cloned), so the copy shares this member with the original.
            if (FieldValuesRequested != null)
            {
                copy.FieldValuesRequested = FieldValuesRequested;
            }
            copy.LayerStyle = LayerStyle.Clone();

            return(copy);
        }
Beispiel #6
0
        /// <summary>
        /// Downloads the detail page behind a link and stores it as a <c>SpiderArticle</c>.
        /// Nothing is stored when <c>isPageExist</c> reports the URL as already present or
        /// when no page body could be retrieved.
        /// </summary>
        /// <param name="lnk">Link describing the page (template, URL, title, abstract).</param>
        /// <param name="sb">Log buffer passed to the helpers and appended to on success.</param>
        private static void savePageDetail(DetailLink lnk, StringBuilder sb)
        {
            SpiderTemplate sourceTemplate = lnk.Template;
            string         pageUrl        = lnk.Url;
            string         pageTitle      = lnk.Title;
            string         pageSummary    = lnk.Abstract;

            if (isPageExist(pageUrl, sb))
            {
                return;
            }

            String content = new PagedDetailSpider().GetContent(pageUrl, sourceTemplate, sb);
            if (content == null)
            {
                return;
            }

            SpiderArticle article = new SpiderArticle();
            article.Title          = pageTitle;
            article.Url            = strUtil.SubString(pageUrl, 200);
            article.Abstract       = pageSummary;
            article.Body           = content;
            article.SpiderTemplate = sourceTemplate;

            // If the body contains an image, flag the article and record the first
            // image's captured URL (group 1 of the image pattern).
            MatchCollection images = Regex.Matches(content, RegPattern.Img, RegexOptions.Singleline);
            if (images.Count > 0)
            {
                article.IsPic  = 1;
                article.PicUrl = images[0].Groups[1].Value;
            }

            article.insert();

            sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);
        }
Beispiel #7
0
        /// <summary>
        /// Downloads the detail page behind a link, stores it as a <c>SpiderArticle</c>
        /// and additionally creates a draft <c>BlogPost</c> with the same content plus a
        /// link back to the original page. Nothing is stored when <c>isPageExist</c>
        /// reports the URL as already present or when no page body could be retrieved.
        /// </summary>
        /// <param name="lnk">Link describing the page (template, URL, title, abstract).</param>
        /// <param name="sb">Log buffer passed to the helpers and appended to on success.</param>
        private static void savePageDetail(DetailLink lnk, StringBuilder sb)
        {
            SpiderTemplate template = lnk.Template;
            string         url      = lnk.Url;
            string         title    = lnk.Title;
            string         summary  = lnk.Abstract;

            if (isPageExist(url, sb))
            {
                return;
            }

            String pageBody = new PagedDetailSpider().GetContent(url, template, sb);

            if (pageBody == null)
            {
                return;
            }

            SpiderArticle pd = new SpiderArticle();

            pd.Title          = title;
            pd.Url            = strUtil.SubString(url, 250);
            pd.Abstract       = summary;
            pd.Body           = pageBody;
            pd.SpiderTemplate = template;

            // If the body contains an image, flag the article and record the first
            // image's captured URL (group 1 of the image pattern).
            MatchCollection matchs = Regex.Matches(pageBody, RegPattern.Img, RegexOptions.Singleline);

            if (matchs.Count > 0)
            {
                pd.IsPic  = 1;
                pd.PicUrl = matchs[0].Groups[1].Value;
            }

            pd.insert();

            sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);

            // Remove every occurrence of the text "font-size" from the body used for the blog post.
            pageBody = Regex.Replace(pageBody, "font-size", "", RegexOptions.IgnoreCase);

            // The href value is quoted (and embedded double quotes escaped) so that URLs
            // containing whitespace, '>' or quotes cannot break out of the attribute.
            string linkUrl        = (pd.Url ?? "").Replace("\"", "&quot;");
            string strArcitleLink = "<div class=\"ArcitleLink\"><a href=\"" + linkUrl + "\">原文链接</a></div>";

            pageBody = pageBody + strArcitleLink;

            // Look up the blog category by AppId; fall back to category 1 when none is found.
            // NOTE(review): the filter is built by string concatenation. That is harmless
            // only if template.IsDelete is numeric; confirm, or switch to a parameterized query.
            Maticsoft.BLL.BlogCategory bllBlogCategory = new Maticsoft.BLL.BlogCategory();
            DataSet ds      = bllBlogCategory.GetList("AppId = '" + template.IsDelete.ToString() + "'");
            int     nCateID = 1;

            if (ds.Tables[0].Rows.Count > 0)
            {
                nCateID = (int)ds.Tables[0].Rows[0]["Id"];
            }

            // NOTE(review): template.IsDelete is used below as owner, creator and app id;
            // confirm that this field is intentionally repurposed.
            BlogPost data = new BlogPost();

            data.CategoryId       = nCateID;
            data.Title            = title;
            data.Abstract         = summary;
            data.Content          = pageBody;
            data.AccessStatus     = 0;
            data.CommentCondition = 0;
            data.SaveStatus       = 1; // draft
            data.Created          = System.DateTime.Now.Date;
            data.IsTop            = 0;
            data.IsPick           = 0;
            data.IsPic            = 0;
            data.Ip         = "";
            data.OwnerId    = template.IsDelete;
            data.OwnerUrl   = template.SiteName;
            data.OwnerType  = "wojilu.Members.Users.Domain.User";
            data.CreatorUrl = template.SiteName;
            data.AppId      = template.IsDelete;
            data.CreatorId  = template.IsDelete;

            Maticsoft.BLL.BlogPost bll = new Maticsoft.BLL.BlogPost();
            bll.Add(data);
        }
Beispiel #8
0
        /// <summary>
        /// Extracts every link from a downloaded list page, keeps those whose normalized
        /// URL matches the template's <c>ListPattern</c>, and appends one
        /// <c>DetailLink</c> per distinct URL to <paramref name="list"/>.
        /// </summary>
        /// <param name="strReturnPage">HTML text of the downloaded page.</param>
        /// <param name="s">
        /// Template supplying the URL filter (<c>ListPattern</c>) and the list URL
        /// (<c>ListUrl</c>) from which the base URL is derived.
        /// </param>
        /// <param name="list">Destination list that receives the accepted links.</param>
        protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List <DetailLink> list)
        {
            string strUrlFilterRule = s.ListPattern;

            HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);

            string            baseUrl = GetUrlLeftPart(s.ListUrl);
            DocumentWithLinks links   = htmlDoc.GetLinks();

            // URLs already emitted for this page; a hash set gives constant-time duplicate checks.
            HashSet <string> seenLinks = new HashSet <string>();

            foreach (string link in links.Links.Union(links.References))
            {
                if (string.IsNullOrEmpty(link))
                {
                    continue;
                }

                string normalizedLink = GetNormalizedLink(baseUrl, link);

                if (string.IsNullOrEmpty(normalizedLink))
                {
                    continue;
                }

                if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
                {
                    continue;
                }

                // Skip duplicates before doing the (comparatively expensive) text lookup.
                if (!seenLinks.Add(normalizedLink))
                {
                    continue;
                }

                // First choice: the text of the first recorded link whose key contains the normalized URL.
                string strLinkText = "";

                foreach (string strTemp in links.m_dicLink2Text.Keys)
                {
                    if (strTemp.Contains(normalizedLink))
                    {
                        strLinkText = links.m_dicLink2Text[strTemp];
                        break;
                    }
                }

                // Fallback: look the raw link up directly, then its lower-cased form
                // (the lower-cased entry wins when both exist).
                if (strLinkText == "")
                {
                    string text;

                    if (links.m_dicLink2Text.TryGetValue(link, out text))
                    {
                        strLinkText = text.Trim();
                    }
                    if (links.m_dicLink2Text.TryGetValue(link.ToLowerInvariant(), out text))
                    {
                        strLinkText = text.Trim();
                    }
                }

                DetailLink lnk = new DetailLink();
                lnk.Template = s;
                lnk.Url      = normalizedLink;
                lnk.Title    = strLinkText;
                list.Add(lnk);
            }
        }