Esempio n. 1
0
        /// <summary>
        /// 修改用户
        /// </summary>
        /// <param name="obj">用户对象</param>
        /// <returns></returns>
        public int FixReleaseInfo(ModelReleaseInfo obj)
        {
            string sql = @"UPDATE SET ReleaseInfo Title=@Title,Contexts=@Contexts,ReleaseDate=@ReleaseDate,
                                InfoSource=@InfoSource,KeyWords=@KeyWords,ReleaseName=@ReleaseName,
                                CollectDate=@CollectDate,Snapshot=@Snapshot
                        WHERE uid=@uid";

            List<MySqlParameter> par = new List<MySqlParameter>();
            par.Add(new MySqlParameter("@uid", obj.Uid));
            par.Add(new MySqlParameter("@Title", obj.Title));
            par.Add(new MySqlParameter("@Contexts", obj.Contexts));
            par.Add(new MySqlParameter("@RleaseDate", obj.ReleaseDate));
            par.Add(new MySqlParameter("@InfoSource", obj.InfoSource));
            par.Add(new MySqlParameter("@KeyWords", obj.KeyWords));
            par.Add(new MySqlParameter("@ReleaseName", obj.ReleaseName));
            par.Add(new MySqlParameter("@CollectDate", obj.CollectDate));
            par.Add(new MySqlParameter("@Snapshot", obj.Snapshot));

            try
            {
                DataBaseServer.MySqlCmd dbobj = new DataBaseServer.MySqlCmd();
                return dbobj.ExecuteNonQueryInt(sql, par);
            }
            catch (Exception ex)
            {
                throw new Exception("新建失败,位置:FixReleaseInfo.原因:" + ex.Message);
            }
        }
Esempio n. 2
0
        /// <summary>
        /// 好搜(360,有道都是使用的好搜的搜索结果)
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <param name="kid"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseHaosouWeb(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "result", true);
                if (nodes != null && nodes.Count > 0 && nodes[0].IsElement())
                {
                    MIL.Html.HtmlNodeCollection resultNodes = (nodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-list", true);
                    if (resultNodes != null && resultNodes.Count > 0)
                    {
                        foreach (MIL.Html.HtmlNode result in resultNodes)
                        {
                            //<div class="res-rich so-rich-news clearfix">
                            if ((result is MIL.Html.HtmlElement) && (result as MIL.Html.HtmlElement).Nodes != null)
                            {
                                MIL.Html.HtmlNodeCollection contextNodes_Rich = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-rich so-rich-news clearfix", true);
                                MIL.Html.HtmlNodeCollection contextNodes_Image = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-rich res-image clearfix", true);
                                if ((contextNodes_Rich != null && contextNodes_Rich.Count > 0) || (contextNodes_Image != null && contextNodes_Image.Count > 0))
                                {
                                    ModelReleaseInfo mri = new ModelReleaseInfo();
                                    #region 标题、超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (result as MIL.Html.HtmlElement).Nodes.FindByName("res-title");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        if (titleNodes[0].IsElement())
                                        {
                                            MIL.Html.HtmlNodeCollection aNodes = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                            if (aNodes != null && aNodes.Count > 0 && aNodes[0].IsElement())
                                            {
                                                string title = "";
                                                foreach (MIL.Html.HtmlNode t in (aNodes[0] as MIL.Html.HtmlElement).Nodes)
                                                {
                                                    if (t.IsText())
                                                    {
                                                        title += (t as MIL.Html.HtmlText).Text;
                                                    }
                                                    else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                    {
                                                        title += (t as MIL.Html.HtmlElement).Text;
                                                    }
                                                }
                                                string href = (aNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                                                mri.Title = title;
                                                mri.InfoSource = href;
                                            }
                                        }
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection context = null;
                                    MIL.Html.HtmlNodeCollection link = null;
                                    if (contextNodes_Image != null && contextNodes_Image.Count > 0)
                                    {
                                        MIL.Html.HtmlNodeCollection resCommNodes = (contextNodes_Image[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-comm-con");
                                        if (resCommNodes != null && resCommNodes.Count > 0)
                                        {
                                            context = (resCommNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("p");
                                            link = (resCommNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-linkinfo");
                                        }
                                        else
                                        {
                                            context = (contextNodes_Image[0] as MIL.Html.HtmlElement).Nodes.FindByName("p");
                                            link = (contextNodes_Image[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-linkinfo");
                                        }
                                    }
                                    if (contextNodes_Rich != null && contextNodes_Rich.Count > 0)
                                    {
                                        MIL.Html.HtmlNodeCollection resCommNodes = (contextNodes_Rich[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-comm-con");
                                        if (resCommNodes != null && resCommNodes.Count > 0)
                                        {
                                            //内容直接写在res-comm-con的div下
                                            context = resCommNodes;
                                            link = (resCommNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-linkinfo");
                                        }
                                        else
                                        {
                                            context = (contextNodes_Image[0] as MIL.Html.HtmlElement).Nodes.FindByName("div");
                                            if (context != null && context.Count > 0 && context[0].IsElement())
                                                link = (context[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-linkinfo");

                                        }
                                    }
                                    if (context != null && context.Count > 0)
                                    {
                                        string text = "";
                                        foreach (MIL.Html.HtmlNode c in (context[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (c.IsText())
                                            {
                                                text += (c as MIL.Html.HtmlText).Text;
                                            }
                                            else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                            {
                                                text += (c as MIL.Html.HtmlElement).Text;
                                            }
                                        }
                                        mri.Contexts = text;
                                    }
                                    if (link != null && link.Count > 0)
                                    {
                                        //<cite>www.7y7.com/yule/8...&nbsp;2015-04-14</cite>
                                        MIL.Html.HtmlNodeCollection citeNodes = (link[0] as MIL.Html.HtmlElement).Nodes.FindByName("cite");
                                        if (citeNodes != null && citeNodes.Count > 0)
                                        {
                                            if (citeNodes[0].IsElement())
                                            {
                                                string date = (citeNodes[0] as MIL.Html.HtmlElement).Text;
                                                byte[] space = new byte[] { 0xc2, 0xa0 };
                                                string UTFSpace = Encoding.GetEncoding("UTF-8").GetString(space);
                                                string txt = date.Replace(UTFSpace, " ").Trim();
                                                if (txt.IndexOf(" ") > 0)
                                                {
                                                    txt = txt.Substring(txt.IndexOf(" ") + 1).Trim();
                                                }
                                                mri.ReleaseDate = FormateDate(txt);
                                            }
                                        }

                                        MIL.Html.HtmlNodeCollection cacheNodes = (link[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "m");
                                        if (cacheNodes != null && cacheNodes.Count > 0)
                                        {
                                            if (cacheNodes[0].IsElement() && (cacheNodes[0] as MIL.Html.HtmlElement).Name == "a")
                                            {
                                                if ((cacheNodes[0] as MIL.Html.HtmlElement).Attributes != null && (cacheNodes[0] as MIL.Html.HtmlElement).Attributes["href"] != null)
                                                {
                                                    string href = (cacheNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                    mri.Snapshot = href;
                                                }
                                            }
                                        }
                                        MIL.Html.HtmlNodeCollection mingpianNodes = (link[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "mingpian mingpian-box");
                                        if (mingpianNodes == null || mingpianNodes.Count == 0)
                                        {
                                            mingpianNodes = (link[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "mingpian");
                                        }
                                        if (mingpianNodes != null && mingpianNodes.Count > 0)
                                        {
                                            if (mingpianNodes[0].IsElement() && (mingpianNodes[0] as MIL.Html.HtmlElement).Name == "a")
                                            {
                                                string src = (mingpianNodes[0] as MIL.Html.HtmlElement).Text;
                                                if (!string.IsNullOrEmpty(src))
                                                {
                                                    mri.ReleaseName = src;
                                                }
                                            }
                                        }
                                    }

                                    #endregion
                                    #region 其他杂项
                                    mri.KeyWords = keyword;
                                    mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                    mri.Kid = kid;
                                    mri.Sheng = "";
                                    mri.Shi = "";
                                    mri.Xian = "";
                                    if (!string.IsNullOrEmpty(mri.ReleaseName))
                                    {
                                        mri.WebName = mri.ReleaseName;
                                    }
                                    else
                                    {
                                        mri.WebName = "全网";
                                    }
                                    mri.Pid = 0;
                                    //mri.Part = GetParts(mri.Contexts);
                                    mri.Comments = (int)WebSourceType.HaosouWeb;
                                    mri.Reposts = 0;
                                    #endregion
                                    #region 报告进度
                                    OnReportCactchProcess(mri);
                                    #endregion

                                    if (!string.IsNullOrEmpty(mri.Title))
                                    {
                                        webDatas.Add(mri);
                                    }
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析中搜网页搜索时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 3
0
        /// <summary>
        /// 添加用户
        /// </summary>
        /// <param name="obj">用户对象</param>
        /// <returns></returns>
        public int InsReleaseInfo(ModelReleaseInfo obj)
        {
            string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot)
                            VALUES(@Title,@Contexts,@RleaseDate,@InfoSource,@KeyWords,@ReleaseName,@CollectDate,@Snapshot) ";

            List<MySqlParameter> par = new List<MySqlParameter>();
            par.Add(new MySqlParameter("@Title", obj.Title));
            par.Add(new MySqlParameter("@Contexts", obj.Contexts));
            par.Add(new MySqlParameter("@RleaseDate", obj.ReleaseDate));
            par.Add(new MySqlParameter("@InfoSource", obj.InfoSource));
            par.Add(new MySqlParameter("@KeyWords", obj.KeyWords));
            par.Add(new MySqlParameter("@ReleaseName", obj.ReleaseName));
            par.Add(new MySqlParameter("@CollectDate", obj.CollectDate));
            par.Add(new MySqlParameter("@Snapshot", obj.Snapshot));

            try
            {
                MySqlCmd dbobj = new MySqlCmd();
                return dbobj.ExecuteNonQueryInt(sql, par);

            }
            catch (Exception ex)
            {
                throw new Exception("新建失败,位置:InsReleaseInfo.原因:" + ex.Message);
            }
        }
Esempio n. 4
0
        /// <summary>
        /// 解析百度贴吧
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseBaiduTieba(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                #endregion
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "s_post", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        MIL.Html.HtmlNodeCollection titleNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "p_title");
                        ModelReleaseInfo mri = new ModelReleaseInfo();
                        #region 标题与超链
                        if (titleNodes != null && titleNodes.Count > 0)
                        {
                            //title中包含<em>节点,需要进行处理
                            string title = "";
                            MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                            if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                {
                                    if (t.IsText())
                                    {
                                        title += (t as MIL.Html.HtmlText).Text;
                                    }
                                    else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        title += (t as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                            mri.Title = title;
                            mri.InfoSource = "http://tieba.baidu.com/" + href;
                        }

                        #endregion
                        #region 内容简介
                        MIL.Html.HtmlNodeCollection contextNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "p_content");
                        if (contextNodes != null && contextNodes.Count > 0)
                        {
                            string context = "";
                            MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                            if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                {
                                    if (c.IsText())
                                    {
                                        context += (c as MIL.Html.HtmlText).Text;
                                    }
                                    else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        context += (c as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            mri.Contexts = context;
                        }

                        #endregion
                        #region 来源,发表时间
                        MIL.Html.HtmlNodeCollection authorNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "p_violet");
                        if (authorNodes != null && authorNodes.Count > 0)
                        {
                            string author = (authorNodes[0] as MIL.Html.HtmlElement).Text;
                            mri.ReleaseName = author;
                        }
                        MIL.Html.HtmlNodeCollection publishNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "p_green p_date");
                        if (publishNodes != null && publishNodes.Count > 0)
                        {
                            string date = (publishNodes[0] as MIL.Html.HtmlElement).Text;
                            mri.ReleaseDate = date;
                        }
                        #endregion
                        #region 快照
                        mri.Snapshot = "";
                        #endregion
                        #region 其他杂项
                        mri.KeyWords = keyword;
                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                        mri.Kid = kid;
                        mri.Sheng = "";
                        mri.Shi = "";
                        mri.Xian = "";
                        mri.WebName = "贴吧";
                        mri.Pid = 5;
                        //mri.Part = GetParts(mri.Contexts);
                        mri.Comments = (int)WebSourceType.BaiduTieba;
                        mri.Reposts = 0;
                        #endregion
                        #region 2015.8.13 新增获取网址正文
                        if (!string.IsNullOrEmpty(mri.InfoSource))
                        {
                            string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                            string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                            //分析关键字前100,后50个字符
                            string formatContexts = GetContexts(noHtmlContexts, keyword);
                            if (!string.IsNullOrEmpty(formatContexts))
                            {
                                mri.Contexts = formatContexts;
                            }
                        }
                        #endregion
                        #region 报告进度
                        OnReportCactchProcess(mri);
                        #endregion
                        if (!string.IsNullOrEmpty(mri.Title))
                        {
                            webDatas.Add(mri);
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析贴吧搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 5
0
 public void OnReportCactchProcess(ModelReleaseInfo mri)
 {
     if (this.ReportCatchProcess != null)
         this.ReportCatchProcess(mri);
 }
Esempio n. 6
0
        /// <summary>
        /// 中搜
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <param name="kid"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseZhongsouWeb(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "content-net-ul content-zonghe-ul", true);//content-net-ul
                if (nodes == null || (nodes != null && nodes.Count == 0))
                {
                    nodes = doc.Nodes.FindByAttributeNameValue("class", "content-net-ul", true);
                }
                if (nodes != null && nodes.Count > 0 && nodes[0].IsElement())
                {
                    MIL.Html.HtmlNodeCollection resultNodes = (nodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("li");
                    if (resultNodes != null && resultNodes.Count > 0)
                    {
                        foreach (MIL.Html.HtmlNode result in resultNodes)
                        {
                            if ((result is MIL.Html.HtmlElement)
                                && ((result as MIL.Html.HtmlElement).Attributes == null || ((result as MIL.Html.HtmlElement).Attributes != null && (result as MIL.Html.HtmlElement).Attributes.Count == 0))
                                && (result as MIL.Html.HtmlElement).Nodes != null)
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();
                                #region 标题、超链
                                MIL.Html.HtmlNodeCollection titleNodes = (result as MIL.Html.HtmlElement).Nodes.FindByName("h3");
                                if (titleNodes != null && titleNodes.Count > 0)
                                {
                                    //title中包含<strong>节点,需要进行处理
                                    if (titleNodes[0].IsElement())
                                    {
                                        MIL.Html.HtmlNodeCollection aNodes = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                        if (aNodes != null && aNodes.Count > 0 && aNodes[0].IsElement())
                                        {
                                            string title = "";
                                            foreach (MIL.Html.HtmlNode t in (aNodes[0] as MIL.Html.HtmlElement).Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                            string href = (aNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                                            mri.Title = title;
                                            mri.InfoSource = href;
                                            //中搜的没有发布者,暂时使用数据的超级链接代替
                                            mri.ReleaseName = href;
                                            mri.ReleaseDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                        }
                                    }
                                }
                                #endregion
                                #region 内容简介
                                MIL.Html.HtmlNodeCollection contentNodes = (result as MIL.Html.HtmlElement).Nodes.FindByName("p");
                                if (contentNodes != null && contentNodes.Count > 0)
                                {
                                    string context = "";
                                    if (contentNodes[0].IsElement())
                                    {
                                        foreach (MIL.Html.HtmlNode c in (contentNodes[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (c.IsText())
                                            {
                                                context += (c as MIL.Html.HtmlText).Text;
                                            }
                                            else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                            {
                                                context += (c as MIL.Html.HtmlElement).Text;
                                            }
                                        }
                                    }
                                    mri.Contexts = context;
                                }

                                #endregion
                                #region 其他杂项
                                mri.Snapshot = "";
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                if (!string.IsNullOrEmpty(mri.ReleaseName))
                                {
                                    mri.WebName = mri.ReleaseName;
                                }
                                else
                                {
                                    mri.WebName = "全网";
                                }
                                mri.Pid = 0;
                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.ZhongsouWeb;
                                mri.Reposts = 0;
                                #endregion
                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion
                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion

                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析中搜网页搜索时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 7
0
        /// <summary>
        /// 解析搜狗微信
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseSogouWeixin(string html, string keyword, int kid, CookieContainer cookies, string strCookies)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                //<div class="wx-rb wx-rb3" id="sogou_vr_11002601_box_0" d="ab735a258a90e8e1-6bee54fcbd896b2a-edc12dfac5e9cc56cc54e1c5029bc26f">
                //    <div class="img_box2">
                //        <a target="_blank" href="http://mp.weixin.qq.com/s?__biz=MjM5MzAwNTYyOA==&amp;mid=208327244&amp;idx=2&amp;sn=b717e3bec57a0ae878a0930148f95ef1&amp;3rd=MzA3MDU4NTYzMw==&amp;scene=6#rd" style="width:80px;height:80px;display:block;border:1px solid #ebebeb;overflow:hidden;" id="sogou_vr_11002601_img_0"><img style="visibility: visible; border: none; height: 80px; margin-left: -18.5px;" onload="vrImgLoad(this, 'fit', 80, 80)" onerror="imgErr(this.parentNode)" src="http://img01.store.sogou.com/net/a/04/link?appid=100520031&amp;url=http://mmbiz.qpic.cn/mmbiz/ymIic4RlQO3Hd2ouOx2RjQ9VCzpXmtev3RFRicZVrHVibM7SBZNmY7h0IcO4GlozCjfKIJIrIibAqqeQQfAGNGTdbg/0"></a>
                //    </div>
                //    <div class="txt-box">
                //        <h4>
                //            <a target="_blank" href="http://mp.weixin.qq.com/s?__biz=MjM5MzAwNTYyOA==&amp;mid=208327244&amp;idx=2&amp;sn=b717e3bec57a0ae878a0930148f95ef1&amp;3rd=MzA3MDU4NTYzMw==&amp;scene=6#rd" id="sogou_vr_11002601_title_0">马化腾:“引入移动互联网治<em><!--red_beg-->雾霾<!--red_end--></em>”</a>
                //        </h4>
                //        <p id="sogou_vr_11002601_summary_0">马化腾:“引入移动互联网治<em>雾霾</em>”来源:新京报(2015/3/5)“任何人在任何时候都应该平等、方便、无障碍地获取和使用信息.”全国人大代表、腾讯公司CEO马化腾在今年两会期间共发出四项议案,均与互联网相关.他建议加快移动互联网在民生领域的普及和应用,通过互...</p>
                //        <script>
                //                    (function(vrid,rank){
                //                        var id = 'sogou_vr_'+vrid+'_summary'+'_'+rank;
                //                        document.getElementById(id).innerHTML = cutLength(document.getElementById(id).innerHTML,240);
                //                })('11002601', '0');
                //        </script>
                //        <div class="s-p" t="1426037274">
                //            <a id="weixin_account" target="_blank" class="zhz" href="/gzh?openid=oIWsFtyE27izVYkKuEP1w4S1XpNQ" title="李志青环境经济工作室" i="oIWsFtyE27izVYkKuEP1w4S1XpNQ"><script>document.write(cutLength('李志青环境经济工作室', 16))</script>李志青环境经济...</a>
                //            <script>vrTimeHandle552write('1426037274')</script>
                //            09:27
                //            <span id="btn_share" class="fx on"><a class="fx-a" href="#" key="1">分享</a><div class="fx-pos" style="display: none"><em class="ico-sj"></em><a id="btn_share_xl" class="xl" href="#"><span></span></a><a target="_blank" id="btn_share_qzone" class="qq2" href="http://sns.qzone.qq.com/cgi-bin/qzshare/cgi_qzshare_onekey?source=shareqq&amp;url=http%3A%2F%2Fmp.weixin.qq.com%2Fs%3F__biz%3DMjM5MzAwNTYyOA%3D%3D%26mid%3D208327244%26idx%3D2%26sn%3Db717e3bec57a0ae878a0930148f95ef1%263rd%3DMzA3MDU4NTYzMw%3D%3D%26scene%3D6%23rd&amp;summary=%E3%80%80&amp;title=%E9%A9%AC%E5%8C%96%E8%85%BE%3A%E2%80%9C%E5%BC%95%E5%85%A5%E7%A7%BB%E5%8A%A8%E4%BA%92%E8%81%94%E7%BD%91%E6%B2%BB%E9%9B%BE%E9%9C%BE%E2%80%9D&amp;pics=http%3A%2F%2Fimg01.store.sogou.com%2Fnet%2Fa%2F04%2Flink%3Fappid%3D100520031%26url%3Dhttp%3A%2F%2Fmmbiz.qpic.cn%2Fmmbiz%2FymIic4RlQO3Hd2ouOx2RjQ9VCzpXmtev3RFRicZVrHVibM7SBZNmY7h0IcO4GlozCjfKIJIrIibAqqeQQfAGNGTdbg%2F0"><span></span></a><a target="_blank" id="btn_share_qq" class="qq" href="http://connect.qq.com/widget/shareqq/index.html?source=shareqq&amp;url=http%3A%2F%2Fmp.weixin.qq.com%2Fs%3F__biz%3DMjM5MzAwNTYyOA%3D%3D%26mid%3D208327244%26idx%3D2%26sn%3Db717e3bec57a0ae878a0930148f95ef1%263rd%3DMzA3MDU4NTYzMw%3D%3D%26scene%3D6%23rd&amp;summary=%E3%80%80&amp;title=%E9%A9%AC%E5%8C%96%E8%85%BE%3A%E2%80%9C%E5%BC%95%E5%85%A5%E7%A7%BB%E5%8A%A8%E4%BA%92%E8%81%94%E7%BD%91%E6%B2%BB%E9%9B%BE%E9%9C%BE%E2%80%9D&amp;desc=%E5%88%9A%E7%9C%8B%E5%88%B0%E8%BF%99%E7%AF%87%E6%96%87%E7%AB%A0%E4%B8%8D%E9%94%99%EF%BC%8C%E6%8E%A8%E8%8D%90%E7%BB%99%E4%BD%A0%E7%9C%8B%E7%9C%8B%EF%BD%9E"><span></span></a></div></span><span id="btn_favorite" class="sc"><a class="sc-a" href="#" key="0">收藏</a><div style="display:none" class="sc-pos" key="0"><em class="ico-sj"></em><p>您确定要取消该收藏?</p><span class="sc-btn"><a id="btn_confirm" i="" class="a2" href="#">确定</a><a id="btn_cancel" href="#">再想想</a></span></div><div style="display:none" class="sc-pos sc-pos-v1"><em class="ico-sj"></em><p>收藏成功!</p><p class="p2">在"<a id="btn_favorite" target="_blank" href="/share?stype=2">我的收藏</a>"可查看所有收藏内容。</p></div><div style="display:none" class="sc-pos sc-pos-wrr"><em class="ico-sj"></em><i class="ico-wrr"></i>收藏失败!请稍后再试。</div></span>
                //        </div>
                //    </div>
                //</div>
                #endregion
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "wx-rb wx-rb3", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        MIL.Html.HtmlNodeCollection weixinNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "txt-box");
                        ModelReleaseInfo mri = new ModelReleaseInfo();
                        #region 标题与超链
                        if (weixinNodes != null && weixinNodes.Count > 0)
                        {
                            MIL.Html.HtmlNodeCollection titleNodes = (n as MIL.Html.HtmlElement).Nodes.FindByName("h4");
                            if (titleNodes != null && titleNodes.Count > 0)
                            {
                                //title中包含<em>节点,需要进行处理
                                string title = "";
                                MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                {
                                    foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                    {
                                        if (t.IsText())
                                        {
                                            title += (t as MIL.Html.HtmlText).Text;
                                        }
                                        else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                        {
                                            title += (t as MIL.Html.HtmlElement).Text;
                                        }
                                    }
                                }
                                string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                mri.Title = title;
                                //mri.InfoSource = href;
                                //2016.4.17 微信的地址需要添加前缀  http://weixin.sogou.com
                                if (href.StartsWith("http://mp.weixin.qq.com", StringComparison.OrdinalIgnoreCase))
                                {
                                    mri.InfoSource = href;
                                }
                                else
                                {
                                    mri.InfoSource = string.Format("{0}{1}", "http://weixin.sogou.com", href);
                                    HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(mri.InfoSource);
                                    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
                                    req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36";
                                    req.Headers.Add("Accept-Encoding", "gzip, deflate, sdch");
                                    req.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
                                    //req.CookieContainer = cookies; //使用已经保存的cookies 方法一
                                    req.Headers.Add("Cookie", strCookies); //使用已经保存的cookies 方法二

                                    HttpWebResponse response = (HttpWebResponse)req.GetResponse();
                                    if (response.ResponseUri != null)
                                    {
                                        mri.InfoSource = response.ResponseUri.AbsoluteUri;
                                    }
                                }
                            }
                        }

                        #endregion
                        #region 内容简介
                        MIL.Html.HtmlNodeCollection contextNodes = (n as MIL.Html.HtmlElement).Nodes.FindByName("p");
                        if (contextNodes != null && contextNodes.Count > 0)
                        {
                            string context = "";
                            MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                            if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                {
                                    if (c.IsText())
                                    {
                                        context += (c as MIL.Html.HtmlText).Text;
                                    }
                                    else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        context += (c as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            mri.Contexts = context;
                        }

                        #endregion
                        #region 来源,发表时间
                        MIL.Html.HtmlNodeCollection authorNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "s-p");
                        if (authorNodes != null && authorNodes.Count > 0)
                        {
                            string author = ((authorNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                            mri.ReleaseName = author;

                            string date = (authorNodes[0] as MIL.Html.HtmlElement).Text;
                            date = ((authorNodes[0] as MIL.Html.HtmlElement).Nodes[1] as MIL.Html.HtmlElement).Text;
                            if (date.IndexOf('\'') >= 0 && date.LastIndexOf('\'') > date.IndexOf('\''))
                            {
                                date = date.Substring(date.IndexOf('\'') + 1, date.LastIndexOf('\'') - date.IndexOf('\'') - 1);
                                Int64 d;
                                if (Int64.TryParse(date, out d))
                                {
                                    date = new DateTime(new DateTime(1970, 1, 1, 8, 0, 0).Ticks + Int64.Parse(date) * 10000000).ToString("yyyy-MM-dd HH:mm:dd");
                                    mri.ReleaseDate = date;
                                }
                            }
                        }
                        #endregion
                        #region 快照
                        mri.Snapshot = "";
                        #endregion
                        #region 其他杂项
                        mri.KeyWords = keyword;
                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                        mri.Kid = kid;
                        mri.Sheng = "";
                        mri.Shi = "";
                        mri.Xian = "";
                        mri.WebName = "微信";
                        mri.Pid = 6;
                        //mri.Part = GetParts(mri.Contexts);
                        mri.Comments = (int)WebSourceType.SogouWeixin;
                        mri.Reposts = 0;
                        #endregion
                        #region 2015.8.13 新增获取网址正文
                        if (!string.IsNullOrEmpty(mri.InfoSource))
                        {
                            string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                            string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                            //分析关键字前100,后50个字符
                            string formatContexts = GetContexts(noHtmlContexts, keyword);
                            if (!string.IsNullOrEmpty(formatContexts))
                            {
                                mri.Contexts = formatContexts;
                            }
                        }
                        #endregion
                        #region 报告进度
                        OnReportCactchProcess(mri);
                        #endregion
                        if (!string.IsNullOrEmpty(mri.Title))
                        {
                            webDatas.Add(mri);
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析搜狗微信搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 8
0
        public List<ModelReleaseInfo> ParseSogouNews(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "results", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        foreach (MIL.Html.HtmlNode news in (n as MIL.Html.HtmlElement).Nodes)
                        {
                            if (news.IsElement() && (news as MIL.Html.HtmlElement).Name == "div")
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();

                                if (news.IsElement() && (news as MIL.Html.HtmlElement).Attributes["class"].Value == "rb")
                                {
                                    #region 标题与超链、来源,发表时间
                                    MIL.Html.HtmlNodeCollection titleNodes = (news as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "pt");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        if (titleNodes[0].IsElement() && (titleNodes[0] as MIL.Html.HtmlElement).Nodes.Count > 0)
                                        {
                                            MIL.Html.HtmlNodeCollection herfNodes = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "pp");
                                            if (herfNodes != null && herfNodes.Count > 0)
                                            {
                                                string title = "";
                                                if (herfNodes[0].IsElement() && (herfNodes[0] as MIL.Html.HtmlElement).Nodes != null && (herfNodes[0] as MIL.Html.HtmlElement).Nodes.Count > 0)
                                                {
                                                    //<b> </b>
                                                    MIL.Html.HtmlNode tNode = herfNodes[0];
                                                    if (((herfNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Name == "b")
                                                    {
                                                        tNode = (herfNodes[0] as MIL.Html.HtmlElement).Nodes[0];
                                                    }
                                                    foreach (MIL.Html.HtmlNode t in (tNode as MIL.Html.HtmlElement).Nodes)
                                                    {
                                                        if (t.IsText())
                                                        {
                                                            title += (t as MIL.Html.HtmlText).Text;
                                                        }
                                                        else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                        {
                                                            title += (t as MIL.Html.HtmlElement).Text;
                                                        }
                                                    }
                                                }
                                                //title中包含<em>节点,需要进行处理
                                                string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Title = title;
                                                mri.InfoSource = href;
                                            }
                                        }
                                        MIL.Html.HtmlNodeCollection citeNodes = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("cite");
                                        if (citeNodes != null && citeNodes.Count > 0)
                                        {
                                            if (citeNodes[0].IsElement() && (citeNodes[0] as MIL.Html.HtmlElement).Attributes != null && (citeNodes[0] as MIL.Html.HtmlElement).Attributes["title"] != null)
                                            {
                                                string srcTitle = (citeNodes[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                                                mri.ReleaseName = srcTitle;

                                                string srcTxt = "";
                                                if (citeNodes[0].IsElement())
                                                {
                                                    srcTxt = (citeNodes[0] as MIL.Html.HtmlElement).Text;
                                                }
                                                else if (citeNodes[0].IsText())
                                                {
                                                    srcTxt = (citeNodes[0] as MIL.Html.HtmlText).Text;
                                                }

                                                byte[] space = new byte[] { 0xc2, 0xa0 };
                                                string UTFSpace = Encoding.GetEncoding("UTF-8").GetString(space);
                                                string txt = srcTxt.Replace(UTFSpace, " ").Trim().Replace(srcTitle, "").Trim();
                                                mri.ReleaseDate = FormateDate(txt);
                                            }

                                        }
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection contextNodes = (news as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "ft");
                                    if (contextNodes != null && contextNodes.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }

                                    #endregion
                                }

                                #region 其他杂项
                                mri.Snapshot = "";
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                if (!string.IsNullOrEmpty(mri.ReleaseName))
                                {
                                    mri.WebName = mri.ReleaseName;
                                }
                                else
                                {
                                    mri.WebName = "主流媒体";
                                }
                                mri.Pid = 4;
                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.SogouNews;
                                mri.Reposts = 0;
                                #endregion

                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion
                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion

                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析搜狗博客搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 9
0
 public void Tieba_ReportCatchProcess(ModelReleaseInfo mri)
 {
 }
Esempio n. 10
0
 public void parse_ReportCatchProcess(ModelReleaseInfo mri)
 {
 }
Esempio n. 11
0
 public void Blog_ReportCatchProcess(ModelReleaseInfo mri)
 {
 }
Esempio n. 12
0
 public void BaiduWeb_ReportCatchProcess(ModelReleaseInfo mri)
 {
 }
Esempio n. 13
0
        public String GetInsertStr(ModelReleaseInfo mri)
        {
            string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot,webName,pid,part,reposts,comments,kid,sheng,shi,xian)
                            VALUES('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}','{13}','{14}','{15}','{16}'); ";

            mri.Title = filtRiskChar(mri.Title);
            mri.Contexts = filtRiskChar(mri.Contexts);
            //统一处理一下发布时间
            string date = mri.ReleaseDate;
            string[] formats = {"yyyy-MM-dd HH:mm:ss","yyyy-M-dd HH:mm:ss","yyyy-M-d HH:mm:ss","yyyy-MM-d HH:mm:ss",
                    "yyyy-MM-dd HH:mm","yyyy-MM-dd hh:mm","yyyy-MM-dd H:mm","yyyy-MM-dd h:mm","yyyy-MM-dd HH:m","yyyy-MM-dd hh:m","yyyy-MM-dd h:m",
                    "yyyy-MM-dd hh:mm:ss","yyyy-MM-dd hh:mm:s","yyyy-MM-dd hh:m:s","yyyy-MM-dd hh:m:ss","yyyy-MM-dd h:mm:ss","yyyy-MM-dd h:mm:s","yyyy-MM-dd h:m:s","yyyy-MM-dd h:m:ss",
                    "yyyy-MM-dd HH:mm:s","yyyy-MM-dd HH:m:s","yyyy-MM-dd HH:m:ss","yyyy-MM-dd H:mm:ss","yyyy-MM-dd H:mm:s","yyyy-MM-dd H:m:s","yyyy-MM-dd H:m:ss",
                    "yyyy-M-dd HH:mm","yyyy-M-dd hh:mm","yyyy-M-dd H:mm","yyyy-M-dd h:mm","yyyy-M-dd HH:m","yyyy-M-dd hh:m","yyyy-M-dd h:m",
                    "yyyy-M-dd hh:mm:ss","yyyy-M-dd hh:mm:s","yyyy-M-dd hh:m:s","yyyy-M-dd hh:m:ss","yyyy-M-dd h:mm:ss","yyyy-M-dd h:mm:s","yyyy-M-dd h:m:s","yyyy-M-dd h:m:ss",
                    "yyyy-M-dd HH:mm:s","yyyy-M-dd HH:m:s","yyyy-M-dd HH:m:ss","yyyy-M-dd H:mm:ss","yyyy-M-dd H:mm:s","yyyy-M-dd H:m:s","yyyy-M-dd H:m:ss",
                    "yyyy-M-d HH:mm","yyyy-M-d hh:mm","yyyy-M-d H:mm","yyyy-M-d h:mm","yyyy-M-d HH:m","yyyy-M-d hh:m","yyyy-M-d h:m",
                    "yyyy-M-d hh:mm:ss","yyyy-M-d hh:mm:s","yyyy-M-d hh:m:s","yyyy-M-d hh:m:ss","yyyy-M-d h:mm:ss","yyyy-M-d h:mm:s","yyyy-M-d h:m:s","yyyy-M-d h:m:ss",
                    "yyyy-M-d HH:mm:s","yyyy-M-d HH:m:s","yyyy-M-d HH:m:ss","yyyy-M-d H:mm:ss","yyyy-M-d H:mm:s","yyyy-M-d H:m:s","yyyy-M-d H:m:ss",
                    "yyyy-MM-d HH:mm","yyyy-MM-d hh:mm","yyyy-MM-d H:mm","yyyy-MM-d h:mm","yyyy-MM-d HH:m","yyyy-MM-d hh:m","yyyy-MM-d h:m",
                    "yyyy-MM-d hh:mm:ss","yyyy-MM-d hh:mm:s","yyyy-MM-d hh:m:s","yyyy-MM-d hh:m:ss","yyyy-MM-d h:mm:ss","yyyy-MM-d h:mm:s","yyyy-MM-d h:m:s","yyyy-MM-d h:m:ss",
                    "yyyy-MM-d HH:mm:s","yyyy-MM-d HH:m:s","yyyy-MM-d HH:m:ss","yyyy-MM-d H:mm:ss","yyyy-MM-d H:mm:s","yyyy-MM-d H:m:s","yyyy-MM-d H:m:ss",
                    "yyyy-MM-dd","yyyy-M-dd","yyyy-M-d","yyyy-MM-d"};

            DateTime dateValue;
            if (DateTime.TryParseExact(date, formats,
                System.Globalization.DateTimeFormatInfo.CurrentInfo, System.Globalization.DateTimeStyles.None, out dateValue))
            {
                date = dateValue.ToString("yyyy-MM-dd HH:mm:ss");
            }
            else
            {
                date = mri.CollectDate;
            }
            sql = string.Format(sql, mri.Title, mri.Contexts, date, mri.InfoSource, mri.KeyWords,
                mri.ReleaseName, mri.CollectDate, mri.Snapshot, mri.WebName, mri.Pid, mri.Part, mri.Reposts, mri.Comments, mri.Kid, mri.Sheng, mri.Shi, mri.Xian);
            return sql;
        }
Esempio n. 14
0
        /// <summary>
        /// 解析新浪微博搜索
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseSinaWeibo(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                //新浪微博的数据进行了加密处理,需要对数据进行解密操作
                string decodeHtml = SinWeiboDecode(html);
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(decodeHtml);

                #region 网页源码样例
                #endregion
                #region 解析网站源码
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "WB_cardwrap S_bg2 clearfix", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        MIL.Html.HtmlNodeCollection contenNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "content clearfix", true);
                        if (contenNodes != null && contenNodes.Count > 0)
                        {
                            try
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();
                                MIL.Html.HtmlNodeCollection feedContent = (contenNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "feed_content wbcon", true);
                                if (feedContent != null && feedContent.Count > 0)
                                {
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection wTexta = (feedContent[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "W_texta W_fb");
                                    if (wTexta != null && wTexta.Count > 0)
                                    {
                                        string title = (wTexta[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                                        string href = (wTexta[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        mri.Title = title;
                                        mri.ReleaseName = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection commentTxt = (feedContent[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "comment_txt");
                                    if (commentTxt != null && commentTxt.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = (commentTxt[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }
                                    #endregion
                                }
                                #region 发表时间
                                MIL.Html.HtmlNodeCollection publishNodes = (contenNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "feed_from W_textb", false);
                                if (publishNodes != null && publishNodes.Count > 0)
                                {
                                    MIL.Html.HtmlNodeCollection dateNodes = (publishNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "W_textb");
                                    if (dateNodes != null && dateNodes.Count > 0)
                                    {
                                        string date = (dateNodes[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                                        mri.ReleaseDate = date;
                                    }
                                }
                                #endregion
                                #region 其他杂项
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                mri.WebName = "微博";
                                mri.Pid = 3;
                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.SinWeibo;
                                mri.Reposts = 0;
                                #endregion
                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion
                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion
                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                            catch (Exception ex1)
                            {
                                Comm.WriteErrorLog("解析新浪微博搜索页时报错:" + ex1.Message);
                                Comm.WriteErrorLog(ex1.StackTrace);
                            }
                        }
                        else
                        {
                            //相关文章
                            MIL.Html.HtmlNodeCollection linkFeedNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "search_shortlink2 nopic clearfix", true);
                            if (linkFeedNodes != null && linkFeedNodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode linkFeed in linkFeedNodes)
                                {
                                    try
                                    {
                                        ModelReleaseInfo mri = new ModelReleaseInfo();
                                        #region 标题与超链
                                        MIL.Html.HtmlNodeCollection pNodes = (linkFeed as MIL.Html.HtmlElement).Nodes.FindByName("p", false);
                                        if (pNodes != null && pNodes.Count > 0)
                                        {
                                            MIL.Html.HtmlNodeCollection aNodes = (pNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "W_texta W_fb", false);
                                            if (aNodes != null && aNodes.Count > 0)
                                            {
                                                string title = (aNodes[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                                                string href = (aNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Title = title;
                                                mri.InfoSource = href;
                                            }
                                        }
                                        #endregion
                                        #region 内容和发布者,日期
                                        MIL.Html.HtmlNodeCollection divNodes = (linkFeed as MIL.Html.HtmlElement).Nodes.FindByName("div", false);
                                        if (divNodes != null && divNodes.Count > 0)
                                        {
                                            MIL.Html.HtmlNodeCollection linkNodes = (divNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "link_con");
                                            if (linkNodes != null && linkNodes.Count > 0)
                                            {
                                                #region "内容"
                                                MIL.Html.HtmlNodeCollection pTitleNodes = (linkNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "link_info W_textb", false);
                                                if (pTitleNodes != null && pTitleNodes.Count > 0)
                                                {
                                                    string context = "";
                                                    MIL.Html.HtmlElement contextElement = (pTitleNodes[0] as MIL.Html.HtmlElement);
                                                    if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                                    {
                                                        foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                                        {
                                                            if (c.IsText())
                                                            {
                                                                context += (c as MIL.Html.HtmlText).Text;
                                                            }
                                                            else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                            {
                                                                context += (c as MIL.Html.HtmlElement).Text;
                                                            }
                                                        }
                                                    }
                                                    mri.Contexts = context;

                                                }
                                                #endregion
                                                #region 发布者,日期
                                                MIL.Html.HtmlNodeCollection footNodes = (linkNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "link_action clearfix W_linkb W_textb", false);
                                                if (footNodes != null && footNodes.Count > 0)
                                                {
                                                    MIL.Html.HtmlNodeCollection linkAcNodes = (footNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "linkAC_from");
                                                    if (linkAcNodes != null && linkAcNodes.Count > 0)
                                                    {
                                                        if (linkAcNodes.Count >= 2)
                                                        {
                                                            if ((linkAcNodes[0] as MIL.Html.HtmlElement).Nodes != null && (linkAcNodes[0] as MIL.Html.HtmlElement).Nodes.Count > 0)
                                                            {
                                                                //<span class="linkAC_from"><a>新浪网</a></span>
                                                                if ((linkAcNodes[0] as MIL.Html.HtmlElement).Nodes[0].IsElement())
                                                                {
                                                                    mri.ReleaseName = ((linkAcNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Text;
                                                                }
                                                                else if ((linkAcNodes[0] as MIL.Html.HtmlElement).Nodes[0].IsText())
                                                                {
                                                                    mri.ReleaseName = ((linkAcNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlText).Text;
                                                                }
                                                            }
                                                            else
                                                            {
                                                                //<span class="linkAC_from">新浪网 发布</span>
                                                                if (linkAcNodes[0].IsElement())
                                                                {
                                                                    mri.ReleaseName = (linkAcNodes[0] as MIL.Html.HtmlElement).Text;
                                                                }
                                                                else if (linkAcNodes[0].IsText())
                                                                {
                                                                    mri.ReleaseName = (linkAcNodes[0] as MIL.Html.HtmlText).Text;
                                                                }
                                                            }

                                                            string date = (linkAcNodes[1] as MIL.Html.HtmlElement).Text;
                                                            if (date.IndexOf("年") > 0 && date.IndexOf("月") > 0 && date.IndexOf("日") > 0)
                                                            {
                                                                date = date.Replace("年", "-").Replace("月", "-").Replace("日", "");
                                                            }
                                                            else if (date.IndexOf("年") == -1 && date.IndexOf("月") > 0 && date.IndexOf("日") > 0)
                                                            {
                                                                date = DateTime.Now.Year.ToString() + "-" + date.Replace("月", "-").Replace("日", "");
                                                            }
                                                            else
                                                            {
                                                                if (date.Contains("天前"))
                                                                {
                                                                    int offset = int.Parse(date.Substring(0, date.IndexOf("天前")).Trim());
                                                                    date = DateTime.Now.AddDays(offset * -1).ToString("yyyy-MM-dd");
                                                                }
                                                                else if (date.Contains("小时前"))
                                                                {
                                                                    int offset = int.Parse(date.Substring(0, date.IndexOf("小时前")).Trim());
                                                                    date = DateTime.Now.AddHours(offset * -1).ToString("yyyy-MM-dd");
                                                                }
                                                                else if (date.Contains("时前"))
                                                                {
                                                                    int offset = int.Parse(date.Substring(0, date.IndexOf("时前")).Trim());
                                                                    date = DateTime.Now.AddHours(offset * -1).ToString("yyyy-MM-dd");
                                                                }
                                                                else if (date.Contains("分") && date.Contains("前"))
                                                                {
                                                                    int offset = int.Parse(date.Substring(0, date.IndexOf("分")).Trim());
                                                                    date = DateTime.Now.AddMinutes(offset * -1).ToString("yyyy-MM-dd");
                                                                }
                                                            }
                                                            mri.ReleaseDate = date;
                                                        }

                                                    }
                                                }
                                                #endregion
                                            }
                                        }
                                        #endregion
                                        #region 其他杂项
                                        mri.KeyWords = keyword;
                                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                        mri.Kid = kid;
                                        mri.Sheng = "";
                                        mri.Shi = "";
                                        mri.Xian = "";
                                        mri.WebName = "微博";
                                        mri.Pid = 3;
                                        //mri.Part = GetParts(mri.Contexts);
                                        mri.Comments = (int)WebSourceType.SinWeibo;
                                        mri.Reposts = 0;
                                        #endregion
                                        #region 2015.8.13 新增获取网址正文
                                        if (!string.IsNullOrEmpty(mri.InfoSource))
                                        {
                                            string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                            string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                            //分析关键字前100,后50个字符
                                            string formatContexts = GetContexts(noHtmlContexts, keyword);
                                            if (!string.IsNullOrEmpty(formatContexts))
                                            {
                                                mri.Contexts = formatContexts;
                                            }
                                        }
                                        #endregion
                                        #region 报告进度
                                        OnReportCactchProcess(mri);
                                        #endregion

                                        if (!string.IsNullOrEmpty(mri.Title))
                                        {
                                            webDatas.Add(mri);
                                        }
                                    }
                                    catch (Exception ex2)
                                    {
                                        Comm.WriteErrorLog("解析新浪微博搜索页时报错:" + ex2.Message);
                                        Comm.WriteErrorLog(ex2.StackTrace);
                                    }
                                }
                            }
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析新浪微博搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 15
0
 public void Weixin_ReportCatchProcess(ModelReleaseInfo mri)
 {
 }
Esempio n. 16
0
        /// <summary>
        /// 解析搜狗论坛搜索
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseSogouBBS(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                //<div class="rb">
                //<h3 class="pt">
                //<a name="dttl" target="_blank" href="http://www.douban.com/group/topic/73012913/" id="sogou_vr__7"><em><!--red_beg-->王学兵<!--red_end--></em>张博被抓啦?</a>
                //</h3>
                //<div class="bloginfo">回复:42  发帖时间:2015-03-10</div>
                //<div class="ft" id="cacheresult_summary_7"><em><!--red_beg-->王学兵<!--red_end--></em>被抓啦? 没人来扒嚒? 好像是真的,其余的演员是谁啊? 有人说是张博 对他的印象还停留在鼻子上带个环的牛魔王 最帅牛魔王。。可惜了 尼玛 我买了他话剧的票啊。 还好...</div>
                //<div class="fb">
                //<cite id="cacheresult_info_7">豆瓣 - www.douban.com - 2015-3-10</cite> - <a target="_blank" style="color: #666666;" href="/websnapshot?ie=utf8&amp;url=http%3A%2F%2Fwww.douban.com%2Fgroup%2Ftopic%2F73012913%2F&amp;did=4353f151360295fa-19fb356d1006e9a2-a79bdc52c64c17d84ef3760ce2d90ecf&amp;k=700a30e8de1c5980883fd5c164cf8fe7&amp;encodedQuery=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;query=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;&amp;p=40040100&amp;dp=1&amp;w=01020400&amp;m=0&amp;st=1" id="sogou_snapshot_7">  快照</a> - <a name="sogou_preview_links" style="color: #666666;" href="javascript:void(null);" id="sogou_preview_7" onclick="sogou_preview(this,'7');return false;" sogou_preview_title="<em><!--red_beg-->王学兵<!--red_end--></em>张博被抓啦?" sogou_preview_link="http://www.douban.com/group/topic/73012913/" url="/websnapshot?ie=utf8&amp;preview=1&amp;url=http%3A%2F%2Fwww.douban.com%2Fgroup%2Ftopic%2F73012913%2F&amp;did=4353f151360295fa-19fb356d1006e9a2-a79bdc52c64c17d84ef3760ce2d90ecf&amp;k=700a30e8de1c5980883fd5c164cf8fe7&amp;encodedQuery=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;query=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;&amp;p=40040100&amp;dp=1&amp;title=%E7%8E%8B%E5%AD%A6%E5%85%B5%E5%BC%A0%E5%8D%9A%E8%A2%AB%E6%8A%93%E5%95%A6%EF%BC%9F&amp;st=1">预览</a>
                //</div>
                //</div>

                //<div id="rb_8" class="rb">  <h3 class="pt">  <!--awbg8-->  <a name="dttl" target="_blank" href="http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc" id="uigs__8"><!--awbg8--><em><!--red_beg-->王学兵<!--red_end--></em>涉毒被抓 - 搜狐社区</a>  </h3>  <div class="ft" id="cacheresult_summary_8">  <!--summary_beg-->坚持到永远虎妞 坚持到永远虎妞 坚持到永远虎妞 someone 淡烟疏雨杏花天 someone 坚持到永远虎妞 坚持到永远虎妞 兔妈1975 坚持到永远虎妞 someone 蔷薇雨花落 蔷薇雨花落 心想梦成小顺子 坚持到永远虎妞 心想梦成小顺子 青蛙oO<!--summary_end--></div>  <div class="fb">  <cite id="cacheresult_info_8">搜狐母婴 - club.baobao.sohu.co...&nbsp;-&nbsp;2天前</cite>  <!--resultinfodate:2015-3-10-->&nbsp;-&nbsp;<!--resultsnap_beg--><a target="_blank" style="color: #666666;" href="/websnapshot?ie=utf8&amp;url=http%3A%2F%2Fclub.baobao.sohu.com%2Fmom_daugh%2Fthread%2F31gdnmg7ltc&amp;did=30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd&amp;k=26b5166c09911a1fde756d4a6baf7a16&amp;encodedQuery=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;query=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;&amp;p=40040100&amp;dp=1&amp;w=01020400&amp;m=0&amp;st=1" id="sogou_snapshot_8">快照<!--resultsnap_end--></a>  &nbsp;-&nbsp;<a name="sogou_preview_links" style="color: #666666;" href="javascript:void(null);" id="sogou_preview_8" onclick="sogou_preview(this,'8');return false;" sogou_preview_title="<em><!--red_beg-->王学兵<!--red_end--></em>涉毒被抓 - 搜狐社区" sogou_preview_link="http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc" url="/websnapshot?ie=utf8&amp;preview=1&amp;url=http%3A%2F%2Fclub.baobao.sohu.com%2Fmom_daugh%2Fthread%2F31gdnmg7ltc&amp;did=30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd&amp;k=26b5166c09911a1fde756d4a6baf7a16&amp;encodedQuery=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;query=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;&amp;p=40040100&amp;dp=1&amp;title=%E7%8E%8B%E5%AD%A6%E5%85%B5%E6%B6%89%E6%AF%92%E8%A2%AB%E6%8A%93+-+%E6%90%9C%E7%8B%90%E7%A4%BE%E5%8C%BA&amp;st=1">预览</a>  <div class="fb-remark"><a href="javascript:void(0);" class="vr-sp-evaicon" title="点评"><i></i></a><a href="javascript:void(0);" class="vr-sp-collect" title="收藏"><i></i></a><span style="display:none;" from="java" id="30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd" zanurl="http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc" zantitle="%3Cem%3E%3C!--red_beg--%3E%E7%8E%8B%E5%AD%A6%E5%85%B5%3C!--red_end--%3E%3C/em%3E%E6%B6%89%E6%AF%92%E8%A2%AB%E6%8A%93%20-%20%E6%90%9C%E7%8B%90%E7%A4%BE%E5%8C%BA" zandocid="30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd"></span></div><script>initEndorseShow2({"docid":"30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd","count":"0"},{"score":"0","total":"0","docid":"30b30b6c3013fff8-413eedc0ea50c10e-f27e2646a3d448dca9697951702d6bdd"});</script></div>  <div class="r-sech ext_query" style="" id="sogou_vr__sq_ext_8"><span>推荐您搜索:</span><a target="_blank" href="http://www.sogou.com/web?query=%E7%8E%8B%E5%AD%A6%E5%85%B5%20%E6%90%9C%E7%8B%90%E7%A4%BE%E5%8C%BA" id="sogou_vr__sq_ext_a_0_8">王学兵 搜狐社区</a><a target="_blank" href="http://www.sogou.com/web?query=%E7%8E%8B%E5%AD%A6%E5%85%B5%E6%B6%89%E6%AF%92%E8%A2%AB%E6%8A%93" id="sogou_vr__sq_ext_a_1_8">王学兵涉毒被抓</a></div>  <div class="r-sech site_query" style="display: none;" id="sogou_vr__sq_ext_site_8" site="" ext="王学兵 搜狐社区;王学兵涉毒被抓"><span>推荐您在<a target="_blank" href="http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc" id="sogou_vr__sq_ext_site_url_8">http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc</a>站内搜索: </span><a target="_blank" href="http://club.baobao.sohu.com/mom_daugh/thread/31gdnmg7ltc" id="sogou_vr__sq_ext_stie_a_8"></a></div></div>
                //<div class="vrwrap" style="width:548px">
                //<h3 class="vrTitle">
                //<a target="_blank" href="http://bbs.shangdu.com/t/20150310/01001001684597/684597-1.htm" id="sogou_vr_30000909_9"><em><!--red_beg-->王学兵<!--red_end--></em>涉毒被抓……_商都社区</a>
                //</h3>
                //<div class="strBox">
                //<div class="str_div" id="sogou_vr_30000909_pic_9">
                //<a target="_blank" class="str_img size_120_80" id="sogou_vr_30000909_pic_a_9" href="http://bbs.shangdu.com/t/20150310/01001001684597/684597-1.htm"><img alt="" id="sogou_vr_30000909_pic_img_9" src="http://img01.sogoucdn.com/net/a/04/link?&amp;url=http%3A%2F%2Fbbsimg.shangdu.com%2FUserFiles%2FImage%2F653%2F61751653%2F1425968947558.jpg&amp;appid=100520124&amp;referer=http://bbs.shangdu.com/t/20150310/01001001684597/684597-1.htm"
                //style="top: -17.572px;"></a>
                //</div>
                //<div class="str_info_div">
                //<p class="str_info">
                //<span class="pink-color" style="display:none" id="sogou_vr_30000909_pink_9">[图文]</span>广告载入中... 广告载入中... 【楼主】 还有谁…… [HUAWEI T8950客户端软件]贴 手机客户端下载: 用户最新文章 广告载入中... 广告载入中... 广告载入中... 回复:<em><!--red_beg-->王学兵<!--red_end--></em>涉毒被...</p>
                //<div class="fb">
                //<cite id="cacheresult_info_9">商都BBS - bbs.shangdu.com - 2天前</cite>&nbsp;-&nbsp;<a target="_blank" style="color: #666666;" href="/websnapshot?ie=utf8&amp;url=http%3A%2F%2Fbbs.shangdu.com%2Ft%2F20150310%2F01001001684597%2F684597-1.htm&amp;did=e9fc19c65cd2272a-d1870cb49bf799d7-31857de99ffe3d54d661d75d7e88abc1&amp;k=cb1ce12d0348d24eb37534155017d22e&amp;encodedQuery=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;query=%E7%8E%8B%E5%AD%A6%E5%85%B5&amp;&amp;p=40040100&amp;dp=1&amp;w=01020400&amp;m=0&amp;st=1" id="sogou_snapshot_9">快照</a>
                //<div class="fb-remark">
                //</div>
                //</div>
                //</div>
                //</div>
                //<div class="r-sech ext_query" style="display: none;" id="sogou_vr_30000909_sq_ext_9">

                //</div>
                //</div>
                #endregion
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "results", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        foreach (MIL.Html.HtmlNode blog in (n as MIL.Html.HtmlElement).Nodes)
                        {
                            if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Name == "div")
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();
                                if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Attributes["class"].Value == "rb")
                                {
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "pt");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        //title中包含<em>节点,需要进行处理
                                        string title = "";
                                        MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        mri.Title = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection contextNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "ft");
                                    if (contextNodes != null && contextNodes.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }

                                    #endregion
                                    #region 来源,发表时间 快照
                                    MIL.Html.HtmlNodeCollection authorNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "fb");
                                    if (authorNodes != null && authorNodes.Count > 0)
                                    {
                                        foreach (MIL.Html.HtmlNode child in (authorNodes[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "cite")
                                            {
                                                //<cite id="cacheresult_info_0">新浪博客 - blog.sina.com.cn/s/...&nbsp;-&nbsp;2013-1-28</cite>
                                                string txt = (child as MIL.Html.HtmlElement).Text;
                                                //解析出来源与时间
                                                txt = GetSogouAuthorAndDate(txt);
                                                if (txt.IndexOf(',') >= 0)
                                                {
                                                    mri.ReleaseName = txt.Substring(0, txt.IndexOf(','));
                                                    mri.ReleaseDate = txt.Substring(txt.IndexOf(',') + 1);
                                                }
                                            }
                                            else if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "a" && (child as MIL.Html.HtmlElement).Attributes["id"].Value.Contains("snapshot"))
                                            {
                                                #region 快照
                                                string snapShot = (child as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Snapshot = "http://www.sogou.com" + snapShot;
                                                #endregion
                                            }
                                        }

                                    }
                                    #endregion
                                }
                                else if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Attributes["class"].Value == "vrwrap")
                                {
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "vrTitle");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        //title中包含<em>节点,需要进行处理
                                        string title = "";
                                        MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        mri.Title = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection contextNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "str_info");
                                    if (contextNodes != null && contextNodes.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }

                                    #endregion
                                    #region 来源,发表时间 快照
                                    MIL.Html.HtmlNodeCollection authorNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "fb");
                                    if (authorNodes != null && authorNodes.Count > 0)
                                    {
                                        foreach (MIL.Html.HtmlNode child in (authorNodes[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "cite")
                                            {
                                                //<cite id="cacheresult_info_9">商都BBS - bbs.shangdu.com - 2天前</cite>
                                                string txt = (child as MIL.Html.HtmlElement).Text;
                                                //解析出来源与时间
                                                txt = GetSogouAuthorAndDate(txt);
                                                if (txt.IndexOf(',') >= 0)
                                                {
                                                    mri.ReleaseName = txt.Substring(0, txt.IndexOf(','));
                                                    mri.ReleaseDate = txt.Substring(txt.IndexOf(',') + 1);
                                                }
                                            }
                                            else if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "a" && (child as MIL.Html.HtmlElement).Attributes["id"].Value.Contains("snapshot"))
                                            {
                                                #region 快照
                                                string snapShot = (child as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Snapshot = "http://www.sogou.com" + snapShot;
                                                #endregion
                                            }
                                        }
                                    }
                                    #endregion
                                }
                                #region 其他杂项
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                mri.WebName = "论坛";
                                mri.Pid = 2;
                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.SogouBBS;
                                mri.Reposts = 0;
                                #endregion
                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion

                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion
                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析搜狗论坛搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 17
0
        /// <summary>
        /// 得到网站的新闻类数据
        /// </summary>
        private void GetWebNewsInfo()
        {
            lbWeb.Text = "";
            lbWeb.Visible = true;
            //相似链接
            string Similar = "";

            DataBaseServer.MySqlCmd cmd = new MySqlCmd();

            //得到相似表
            DataTable dtXs = new DataTable();
            dtXs = cmd.GetTabel("Select * from WebAddress WHERE pid=0");

            dtParts = cmd.GetTabel("SELECT * FROM partword");

            DataTable dtkey = new DataTable();
            dtkey = cmd.GetTabel("select * from Keywords");

            //相似表中的被抓取网址
            string webInfo = "";

            //要过滤链接中首页的正则
            string strTopFormat = "http://.+/";
            List<string> strTop = new List<string>();
            sb = new StringBuilder();
            sb.Append("");
            string filterStr = "";

            #region 读取相似度表中的数据据,循环抓取
            for (int xs = 0; xs < dtXs.Rows.Count; xs++)
            {
                lbWeb.ForeColor = Color.DarkBlue;
                lbWeb.Text = "正在搜索:" + dtXs.Rows[xs]["name"].ToString();
                //读取相似表中要抓取的网址
                webInfo = getHtml(dtXs.Rows[xs]["url"].ToString(), "");
                //读取相似链接
                Similar = dtXs.Rows[xs]["likeurl"].ToString();

                //取出
                //string[] strA = HtmlUtil.GetElementsByTagName(webInfo, "a");
                List<string> strList = HtmlUtil.GetElementsByTagNameList(webInfo, "a");

                string strURLformat = "http://.[^\"]+";

                TbReleaseInfo ri = new TbReleaseInfo();

                string[] strA = GetLIstDate(strList.Distinct());
                #region 逐个链接判断
                //循环时判断是否要验证
                bool isThere = false;

                for (int i = 0; i < strA.Length; i++)
                {
                    if (Program.ProClose == true) break;
                    Application.DoEvents();
                    try
                    {
                        //得到目标网址中的所有链接,如果未得到,那么就继续读取下一个
                        strA[i] = HtmlUtil.GetListByHtml(dtXs.Rows[xs]["url"].ToString(), strA[i], strURLformat)[0];
                        //处理含有单引号的链接
                        strA[i] = UrlCl(strA[i]);

                        //处理单引号的链接
                        if (strA[i].IndexOf("'") != -1)
                        {
                            strA[i] = GetstringByHtmlArray(strA[i], "http://.[^\']+");
                        }
                    }
                    catch (Exception)
                    {
                        continue;
                    }
                    //得到相似值,大于0.70的认为相同,并开始抓取
                    if (HtmlUtil.getSimilarDegree(Similar, strA[i]) >= 0.60)
                    {
                        //判断这个链接是否已经在库中或者列表中,如果存在,此次就不再执行
                        strTop = HtmlUtil.GetListByHtmlArray(strA[i], strTopFormat);
                        if (strTop.Count != 0)
                        {
                            //if (strTop[0] == "http://blog.sohu.com/")
                            continue;//同新闻,如果将首页去掉
                        }

                        if (isThere)
                        {
                            continue;
                        }
                        else
                        {
                            //if (strA[i] == "http://news.ifeng.com/mainland/detail_2013_10/18/30459577_0.shtml'>[详细]</a>")
                            //{
                            //    strA[i] = strA[i];
                            //}

                            if (UrlThereare(strA[i], this.dtnewsinfo, dtWebNewsInfo, true) != 0) { isThere = true; continue; }
                        }

                        //得到此链接的源码
                        webInfo = getHtml(strA[i], "");
                        if (webInfo.Length == 0) { continue; }

                        //创建数据对象
                        ModelReleaseInfo newsInfo = new ModelReleaseInfo();

                        try
                        {
                            //流水+1
                            newsInfo.Uid = this.dvAll.Rows.Count + 1;

                            //标题
                            string[] strT = HtmlUtil.GetElementsByTagName(webInfo, "title");
                            if (strT.Length == 0)
                            {
                                continue;
                            }
                            else
                            {
                                newsInfo.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(webInfo, "title")[0]);
                            }

                            //得到正文,以P标签来区分
                            string[] strContext = HtmlUtil.GetElementsByTagName(webInfo, "p");
                            newsInfo.Contexts = "";
                            for (int j = 0; j < strContext.Length; j++)
                            {
                                //循环累加正文信息
                                newsInfo.Contexts += HtmlUtil.NoHTML(strContext[j]);
                            }

                            //如果正文信息为空,那么将无法做关键字对照,此条数据舍弃
                            if (newsInfo.Contexts.Length == 0)
                            {
                                continue;
                            }

                            //网站链接
                            newsInfo.InfoSource = strA[i].Trim();

                            //关键字的设置
                            newsInfo.KeyWords = "";
                            for (int j = 0; j < dtkey.Rows.Count; j++)
                            {
                                Application.DoEvents();
                                if (newsInfo.Contexts.IndexOf(dtkey.Rows[j][1].ToString()) > 0)
                                { newsInfo.KeyWords += dtkey.Rows[j][1].ToString() + ","; }
                                else
                                {

                                }
                            }
                            if (newsInfo.KeyWords.Length == 0) { continue; }
                            newsInfo.KeyWords = newsInfo.KeyWords.Substring(0, newsInfo.KeyWords.Length - 1);

                            //收集日期
                            newsInfo.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));

                            //发布人和发布日期暂时无法取到,手工赋值为空
                            newsInfo.ReleaseDate = "";
                            newsInfo.ReleaseName = "";

                            //网页快照,这里为用户指定生成,如果未选择生成,那么为空
                            newsInfo.Snapshot = "";

                            //网站名
                            newsInfo.WebName = dtXs.Rows[xs]["Name"].ToString();
                            //pid
                            newsInfo.Pid = 0;
                            //part正负判断
                            newsInfo.Part = GetParts(newsInfo.Contexts);
                            //reposts
                            newsInfo.Reposts = 0;
                            //comments
                            newsInfo.Comments = 0;

                            //新建数据行
                            DataRow dr = dtnewsinfo.NewRow();
                            if (dvWeb.RowCount == 0)
                            {
                                dr[0] = 1;
                            }
                            else
                            {
                                dr[0] = int.Parse(dvWeb.Rows[dvWeb.RowCount - 1].Cells[0].Value.ToString()) + 1;
                            }
                            //dr[0] = newsInfo.Uid;
                            dr[1] = newsInfo.Title;
                            dr[2] = newsInfo.Contexts;
                            dr[3] = newsInfo.ReleaseDate;
                            dr[4] = newsInfo.InfoSource;
                            dr[5] = newsInfo.KeyWords;
                            dr[6] = newsInfo.ReleaseName;
                            dr[7] = newsInfo.CollectDate;
                            dr[8] = newsInfo.Snapshot;
                            dr[9] = newsInfo.WebName;
                            dr[10] = newsInfo.Pid;
                            dr[11] = newsInfo.Part;
                            dr[12] = newsInfo.Reposts;
                            dr[13] = newsInfo.Comments;

                            //把行加到DT中
                            dtnewsinfo.Rows.InsertAt(dr, 0);

                            //数据源刷新
                            if (dtnewsinfo.Rows.Count >= 500)
                            {
                                dtnewsinfo.Rows.RemoveAt(500);
                            }
                            dvWeb.Refresh();
                        }
                        catch (Exception ex)
                        {
                            StreamWriter sw = File.AppendText("log.txt");
                            sw.WriteLine(DateTime.Now.ToLongDateString());
                            sw.WriteLine("begin");
                            sw.WriteLine(ex.Message);
                            sw.WriteLine(sb.ToString());
                            sw.WriteLine("end");
                            sw.WriteLine("");

                            sw.Close();
                        }

                        ////总表刷新
                        //dt.Rows.Add(dr);
                        //dvAll.Refresh();

                        //得到插入语句
                        try
                        {
                            if (isThere)
                            {
                                continue;
                            }
                            else
                            {
                                sb.Append(ri.GetInsString(newsInfo) + ";");
                            }

                            //每10次执行一次插入数据库
                            if (sb.ToString().Length != 0)
                            {
                                if (i % 10 == 0)
                                {
                                    filterStr = sb.ToString();
                                    filterStr = filterStr.Replace("[ ", "[");
                                    filterStr = filterStr.Replace(" ]", "]");
                                    //执行插入
                                    cmd.ExecuteNonQuery(filterStr);
                                    //清除插入字段串
                                    sb.Clear();
                                    filterStr = "";
                                }
                            }
                        }
                        catch (Exception ex)
                        {
                            StreamWriter sw = File.AppendText("log.txt");
                            sw.WriteLine(DateTime.Now.ToLongDateString());
                            sw.WriteLine("begin");
                            sw.WriteLine(ex.Message);
                            sw.WriteLine(sb.ToString());
                            sw.WriteLine("end");
                            sw.WriteLine("");

                            sw.Close();
                        }
                    }
                }
                #endregion
            }
            #endregion

            try
            {
                if (sb.ToString().Length != 0)
                {
                    filterStr = sb.ToString();
                    filterStr = filterStr.Replace("[ ", "[");
                    filterStr = filterStr.Replace(" ]", "]");
                    //执行插入
                    cmd.ExecuteNonQuery(filterStr);
                    //清除插入字段串
                    sb.Clear();
                    filterStr = "";
                }
            }
            catch (Exception ex)
            {
                StreamWriter sw = File.AppendText("log.txt");
                sw.WriteLine(DateTime.Now.ToLongDateString());
                sw.WriteLine("begin");
                sw.WriteLine(ex.Message);
                sw.WriteLine(sb.ToString());
                sw.WriteLine("end");
                sw.WriteLine("");

                sw.Close();
            }

            //执行完毕后,重新获取一次数据库的数据
            dtWebNewsInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "0 AND webName<>'百度'");
            //MessageBox.Show("ok");
            lbWeb.Text = "一轮搜索完毕!";
            lbWeb.ForeColor = Color.Red;
        }
Esempio n. 18
0
        /// <summary>
        /// 解析搜狗的网页搜索,与博客的代码一致
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <param name="kid"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseSogouWeb(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                //<div id="rb_6" class="rb">
                //    <h3 class="pt">  <!--awbg6-->
                //        <a name="dttl" target="_blank" href="http://www.blogbus.com/everclean-logs/262169317.html" id="uigs__6"><!--awbg6--><em><!--red_beg-->雾霾<!--red_end--></em> - 倍儿逗象小树一样茁壮成长 - 博客大巴</a>
                //    </h3>
                //    <div class="ft" id="cacheresult_summary_6">
                //        <!--summary_beg-->版权声明 :转载时请以超链接形式标明文章原始出处和作者信息及 本声明 一周内能有一天看到太阳,就是好的了。 很难想象如果环境一直这样,是不是就如恐怖片一样。移居,说...<!--summary_end-->
                //    </div>
                //    <div class="fb">
                //        <cite id="cacheresult_info_6">博客大巴 - www.blogbus.com/eve...&nbsp;-&nbsp;2014-2-28</cite>
                //        <a target="_blank" style="color: #666666;" href="/websnapshot?ie=utf8&amp;url=http%3A%2F%2Fwww.blogbus.com%2Feverclean-logs%2F262169317.html&amp;did=53c29d6a07020108-a3fbb6d637fe07d5-af73e37b602371d2e22f42f4362e846e&amp;k=8f41c9f2f14fdb8e687b2d9ff06e513d&amp;encodedQuery=%E9%9B%BE%E9%9C%BE&amp;query=%E9%9B%BE%E9%9C%BE&amp;&amp;w=01020400&amp;m=0&amp;st=0" id="sogou_snapshot_6">快照</a>
                //    </div>
                //</div>

                //<div class="vrwrap">
                //    <h3 class="vrTitle">
                //        <a target="_blank" href="http://www.360doc.com/content/13/1225/10/11736860_339938566.shtml" id="sogou_vr_30010028_7"><em><!--red_beg-->雾霾<!--red_end--></em>的成因、危害及对策</a>
                //    </h3>
                //    <div class="strBox">
                //        <div class="str_div" id="sogou_vr_30010028_str_div_7">
                //            <a target="_blank" class="str_img size_120_90" id="sogou_vr_30010028_pic_7" href="http://www.360doc.com/content/13/1225/10/11736860_339938566.shtml"><img alt="" id="sogou_vr_30010028_pic_img_7" src="http://img03.sogoucdn.com/net/a/04/link?&amp;url=http%3A%2F%2Fimage67.360doc.com%2FDownloadImg%2F2013%2F12%2F2510%2F37746619_1.gif&amp;appid=100520083&amp;referer=http://www.360doc.com/content/13/1225/10/11736860_339938566.shtml" onerror="this.parentNode.parentNode.style.display=&quot;none&quot;;this.onerror = null;" style="left: -2.292px;"></a>
                //        </div>
                //        <div class="str_info_div">
                //            <ul class="str-list-v4">
                //                <li>来自:<a target="_blank" id="sogou_vr_30010028_value_stc_1_7" href="http://www.360doc.com/userhome/11736860">jywlkljh</a><strong>类别:</strong><a target="_blank" id="sogou_vr_30010028_value_stc_2_7" href="http://www.360doc.com/userhome.aspx?userid=11736860&amp;cid=259">环境保护</a><strong>日期:</strong><span>2013-12-25</span>
                //                </li>
                //                <li class="str-text-info">
                //                    <span><em><!--red_beg-->雾霾<!--red_end--></em>是雾和霾的组合词.因为空气质量的恶化,<em><!--red_beg-->阴霾<!--red_end--></em>天气现象出现增多,危害加重.中国不少地区把<em><!--red_beg-->阴霾<!--red_end--></em>天气现象并入雾一起作为灾害性天气预警预报.统称为"<em><!--red_beg-->雾霾<!--red_end--></em>天气". 雾与... </span>
                //                </li>
                //            </ul>
                //            <div class="fb">
                //                <cite id="cacheresult_info_7">360doc个人图书馆 - www.360doc.com - 2013-12-25</cite>&nbsp;-&nbsp;<a target="_blank" style="color: #666666;" href="/websnapshot?ie=utf8&amp;url=http%3A%2F%2Fwww.360doc.com%2Fcontent%2F13%2F1225%2F10%2F11736860_339938566.shtml&amp;did=cbcd72704ffe6ca0-cf2cae934446a0eb-6fc08790e542c27802ad1b08c02dffd6&amp;k=d6447289cb4f5a76dd743d1bfb7758d5&amp;encodedQuery=%E9%9B%BE%E9%9C%BE&amp;query=%E9%9B%BE%E9%9C%BE&amp;&amp;w=01020400&amp;m=0&amp;st=1" id="sogou_snapshot_7">快照</a>&nbsp;-&nbsp;<a name="sogou_preview_links" style="color: #666666;" href="javascript:void(null);" id="sogou_preview_7" onclick="sogou_preview(this,'7');return false;" sogou_preview_title="<em><!--red_beg-->雾霾<!--red_end--></em>的成因、危害及对策" sogou_preview_link="http://www.360doc.com/content/13/1225/10/11736860_339938566.shtml" url="/websnapshot?ie=utf8&amp;preview=1&amp;url=http%3A%2F%2Fwww.360doc.com%2Fcontent%2F13%2F1225%2F10%2F11736860_339938566.shtml&amp;did=cbcd72704ffe6ca0-cf2cae934446a0eb-6fc08790e542c27802ad1b08c02dffd6&amp;k=d6447289cb4f5a76dd743d1bfb7758d5&amp;encodedQuery=%E9%9B%BE%E9%9C%BE&amp;query=%E9%9B%BE%E9%9C%BE&amp;&amp;title=%E9%9B%BE%E9%9C%BE%E7%9A%84%E6%88%90%E5%9B%A0%E3%80%81%E5%8D%B1%E5%AE%B3%E5%8F%8A%E5%AF%B9%E7%AD%96&amp;st=1">预览</a>
                //            </div>
                //        </div>
                //    </div>
                //</div>
                #endregion
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "results", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        foreach (MIL.Html.HtmlNode blog in (n as MIL.Html.HtmlElement).Nodes)
                        {
                            if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Name == "div")
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();

                                if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Attributes["class"].Value == "rb")
                                {
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "pt");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        //title中包含<em>节点,需要进行处理
                                        string title = "";
                                        MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        mri.Title = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection contextNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "ft");
                                    if (contextNodes != null && contextNodes.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = (contextNodes[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }

                                    #endregion
                                    #region 来源,发表时间 快照
                                    MIL.Html.HtmlNodeCollection authorNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "fb");
                                    if (authorNodes != null && authorNodes.Count > 0)
                                    {
                                        foreach (MIL.Html.HtmlNode child in (authorNodes[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "cite")
                                            {
                                                //<cite id="cacheresult_info_0">新浪博客 - blog.sina.com.cn/s/...&nbsp;-&nbsp;2013-1-28</cite>
                                                string txt = (child as MIL.Html.HtmlElement).Text;
                                                //解析出来源与时间
                                                txt = GetSogouAuthorAndDate(txt);
                                                if (txt.IndexOf(',') >= 0)
                                                {
                                                    mri.ReleaseName = txt.Substring(0, txt.IndexOf(','));
                                                    mri.ReleaseDate = txt.Substring(txt.IndexOf(',') + 1);
                                                }
                                            }
                                            else if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "a" && (child as MIL.Html.HtmlElement).Attributes["id"].Value.Contains("snapshot"))
                                            {
                                                #region 快照
                                                string snapShot = (child as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Snapshot = "http://www.sogou.com" + snapShot;
                                                #endregion
                                            }
                                        }

                                    }
                                    #endregion
                                }
                                else if (blog.IsElement() && (blog as MIL.Html.HtmlElement).Attributes["class"].Value == "vrwrap")
                                {
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "vrTitle");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        //title中包含<em>节点,需要进行处理
                                        string title = "";
                                        MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        mri.Title = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection contextNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "str-text-info");
                                    if (contextNodes != null && contextNodes.Count > 0)
                                    {
                                        string context = "";
                                        MIL.Html.HtmlElement contextElement = ((contextNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                            {
                                                if (c.IsText())
                                                {
                                                    context += (c as MIL.Html.HtmlText).Text;
                                                }
                                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    context += (c as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        mri.Contexts = context;
                                    }

                                    #endregion
                                    #region 来源,发表时间 快照
                                    MIL.Html.HtmlNodeCollection authorNodes = (blog as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "fb");
                                    if (authorNodes != null && authorNodes.Count > 0)
                                    {
                                        foreach (MIL.Html.HtmlNode child in (authorNodes[0] as MIL.Html.HtmlElement).Nodes)
                                        {
                                            if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "cite")
                                            {
                                                //<cite id="cacheresult_info_1">搜狐博客 - beijinghubeigirl.blog.... - 2014-1-17</cite>
                                                string txt = (child as MIL.Html.HtmlElement).Text;
                                                //解析出来源与时间
                                                txt = GetSogouAuthorAndDate(txt);
                                                if (txt.IndexOf(',') >= 0)
                                                {
                                                    mri.ReleaseName = txt.Substring(0, txt.IndexOf(','));
                                                    mri.ReleaseDate = txt.Substring(txt.IndexOf(',') + 1);
                                                }
                                            }
                                            else if (child.IsElement() && (child as MIL.Html.HtmlElement).Name == "a" && (child as MIL.Html.HtmlElement).Attributes["id"].Value.Contains("snapshot"))
                                            {
                                                #region 快照
                                                string snapShot = (child as MIL.Html.HtmlElement).Attributes["href"].Value;
                                                mri.Snapshot = "http://www.sogou.com" + snapShot;
                                                #endregion
                                            }
                                        }
                                    }
                                    #endregion
                                }

                                #region 其他杂项
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                if (!string.IsNullOrEmpty(mri.ReleaseName))
                                {
                                    mri.WebName = mri.ReleaseName;
                                }
                                else
                                {
                                    mri.WebName = "全网";
                                }
                                mri.Pid = 0;                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.SogouWeb;
                                mri.Reposts = 0;
                                #endregion
                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion
                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion
                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析搜狗新闻搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 19
0
        private void GetBaiduInfo()
        {
            lbAll.Text = "";
            lbAll.Visible = true;

            MySqlCmd cmd = new MySqlCmd();
            ;
            //得到关键字列表
            DataTable dtkey = new DataTable();
            dtkey = cmd.GetTabel("select * from Keywords");

            dtParts = cmd.GetTabel("SELECT * FROM partword");

            //链接的正则
            string aa = "http://.[^\"]+";
            string[] sDate;

            sb = new StringBuilder();
            sb.Append("");

            //TbReleaseInfo ri = new TbReleaseInfo();

            //按关键字循环
            for (int kw = 0; kw < dtkey.Rows.Count; kw++)
            {
                lbAll.Text = "正在搜索关键字为<" + dtkey.Rows[kw]["KeyWord"].ToString().Trim() + ">的数据.";
                lbAll.ForeColor = Color.DarkBlue;
                //取得关键字
                string keys = dtkey.Rows[kw]["KeyWord"].ToString().Trim();
                //组成查询字串
                string url = "http://www.baidu.com/s?wd=\"" + keys + "\"&rn=50";

                //得到结果放在数组内
                List<string> lis = new List<string>();
                lis = HtmlUtil.GetElementsByClassList(getHtml(url, "utf-8"), "result");

                //如果没取到,就结束本次循环
                if (lis == null) return;
                //webBrowser1.Navigate(url);

                //循环时判断是否要验证
                bool isThere = false;

                for (int i = 0; i < lis.Count; i++)
                {
                    if (Program.ProClose == true) break;

                    ModelReleaseInfo mri = new ModelReleaseInfo();

                    //发布日期的赋值
                    sDate = HtmlUtil.GetElementsByTagAndClass(lis[i], "span", "g");
                    if (sDate.Length <= 0) continue;

                    mri.ReleaseDate = HtmlUtil.NoHTML(sDate[0]);
                    mri.ReleaseDate = mri.ReleaseDate.Substring(mri.ReleaseDate.Length - 10, 10);

                    //判断日期
                    DateTime ddt;
                    if (DateTime.TryParse(mri.ReleaseDate, out ddt))
                    {
                    }
                    else
                    {
                        //百度的快照日期有时会是9位或8位,如果是这种情况,那么按规则去掉
                        mri.ReleaseDate = mri.ReleaseDate.Substring(1, 9);
                        if (DateTime.TryParse(mri.ReleaseDate, out ddt))
                        {
                        }
                        else
                        {
                            mri.ReleaseDate = mri.ReleaseDate.Substring(1, 8);
                        }
                    }
                    //处理日期
                    try
                    {
                        mri.ReleaseDate = DateTime.Parse(mri.ReleaseDate).ToString("yyyy-MM-dd HH:mm:ss");
                    }
                    catch (Exception ex)
                    {
                        StreamWriter sw = File.AppendText("log.txt");
                        sw.WriteLine(DateTime.Now.ToLongDateString());
                        sw.WriteLine("begin");
                        sw.WriteLine(ex.Message);
                        sw.WriteLine(sb.ToString());
                        sw.WriteLine("end");
                        sw.WriteLine("");

                        sw.Close();

                    }

                    //只拿取三天的内的数据
                    try
                    {
                        if (DateTime.Parse(mri.ReleaseDate) < DateTime.Now.AddDays(-3)) continue;
                    }
                    catch (Exception ex) { continue; }
                    try
                    {
                        //得到标题
                        mri.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(lis[i], "h3")[0]);
                        string[] temp = HtmlUtil.GetElementsByClass(lis[i], "c-abstract");

                        //如果未取到内容部分,就跳出
                        if (temp.Length == 0)
                            continue;

                        mri.Contexts = HtmlUtil.NoHTML(temp[0]);
                        mri.InfoSource = HtmlUtil.GetListByHtml("", HtmlUtil.GetElementsByTagName(lis[i], "a")[0], aa)[0];

                        //去掉重复
                        if (isThere)
                        {
                            continue;
                        }
                        else
                        {
                            if (UrlThereare(mri.Title, this.dtqueryinfo, dtWebQueryInfo, false) != 0)
                            { isThere = true; continue; }
                        }

                        mri.KeyWords = dtkey.Rows[kw]["KeyWord"].ToString().Trim();
                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                        mri.Snapshot = "";
                        mri.ReleaseName = "";

                        mri.WebName = "百度";
                        mri.Pid = 0;
                        mri.Part = GetParts(mri.Contexts);
                        mri.Comments = 0;
                        mri.Reposts = 0;

                        DataRow dr = dtqueryinfo.NewRow();
                        if (dvAll.RowCount == 0)
                        {
                            dr[0] = 1;
                        }
                        else
                        {
                            dr[0] = int.Parse(dvAll.Rows[dvAll.RowCount - 1].Cells[0].Value.ToString()) + 1;
                        }
                        dr[1] = mri.Title;
                        dr[2] = mri.Contexts;
                        dr[3] = mri.ReleaseDate;
                        dr[4] = mri.InfoSource;
                        dr[5] = mri.KeyWords;
                        dr[6] = mri.ReleaseName;
                        dr[7] = mri.CollectDate;
                        dr[8] = mri.Snapshot;
                        dr[9] = mri.WebName;
                        dr[10] = mri.Pid;
                        dr[11] = mri.Part;
                        dr[12] = mri.Reposts;
                        dr[13] = mri.Comments;

                        dtqueryinfo.Rows.InsertAt(dr, 0);

                        if (dtqueryinfo.Rows.Count >= 500)
                        {
                            dtqueryinfo.Rows.RemoveAt(500);
                        }
                        dvAll.Refresh();
                    }
                    catch (Exception ex)
                    {
                        StreamWriter sw = File.AppendText("log.txt");
                        sw.WriteLine(DateTime.Now.ToLongDateString());
                        sw.WriteLine("begin");
                        sw.WriteLine(ex.Message);
                        sw.WriteLine(sb.ToString());
                        sw.WriteLine("end");
                        sw.WriteLine("");

                        sw.Close();
                    }

                    try
                    {
                        //得到插入语句
                        if (isThere)
                        {
                            continue;
                        }
                        else
                        {
                            sb.Append(tri.GetInsString(mri) + ";");
                        }

                        //每10次执行一次插入数据库
                        if (sb.ToString().Length != 0)
                        {
                            if (i % 10 == 0)
                            {
                                //执行插入
                                cmd.ExecuteNonQuery(sb.ToString());
                                //清除插入字段串
                                sb.Clear();
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        StreamWriter sw = File.AppendText("log.txt");
                        sw.WriteLine(DateTime.Now.ToLongDateString());
                        sw.WriteLine("begin");
                        sw.WriteLine(ex.Message);
                        sw.WriteLine(sb.ToString());
                        sw.WriteLine("end");
                        sw.WriteLine("");

                        sw.Close();
                    }

                }
            }
            try
            {
                if (sb.ToString().Length != 0)
                {
                    //执行插入
                    cmd.ExecuteNonQuery(sb.ToString());
                    //清除插入字段串
                    sb.Clear();
                }
            }
            catch (Exception ex)
            {
                StreamWriter sw = File.AppendText("log.txt");
                sw.WriteLine(DateTime.Now.ToLongDateString());
                sw.WriteLine("begin");
                sw.WriteLine(ex.Message);
                sw.WriteLine(sb.ToString());
                sw.WriteLine("end");
                sw.WriteLine("");

                sw.Close();
            }

            //执行完毕后,重新获取一次数据库的数据
            dtWebQueryInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "0 AND webName='百度'");
            //MessageBox.Show("ok");
            lbAll.Text = "一轮搜索完毕!";
            lbAll.ForeColor = Color.Red;
        }
Esempio n. 20
0
        /// <summary>
        /// 中搜
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <param name="kid"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseZhongsouNews(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "content-net-ul content-infor-ul", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        MIL.Html.HtmlNodeCollection resultNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "clearfix");
                        if (resultNodes != null && resultNodes.Count > 0)
                        {
                            foreach (MIL.Html.HtmlNode result in resultNodes)
                            {
                                if ((result is MIL.Html.HtmlElement) && (result as MIL.Html.HtmlElement).Nodes != null)
                                {
                                    ModelReleaseInfo mri = new ModelReleaseInfo();
                                    #region 标题、超链、来源、发表时间
                                    MIL.Html.HtmlNodeCollection topNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "h3-wrap");
                                    if (topNodes != null && topNodes.Count > 0)
                                    {
                                        MIL.Html.HtmlNodeCollection titleNodes = (topNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "h3-zx");
                                        if (titleNodes != null && titleNodes.Count > 0)
                                        {
                                            //title中包含<strong>节点,需要进行处理
                                            if (titleNodes[0].IsElement())
                                            {
                                                MIL.Html.HtmlNodeCollection titleNode2 = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                                if (titleNode2 != null && titleNode2.Count > 0 && titleNode2[0].IsElement())
                                                {
                                                    string title = "";
                                                    foreach (MIL.Html.HtmlNode t in (titleNode2[0] as MIL.Html.HtmlElement).Nodes)
                                                    {
                                                        if (t.IsText())
                                                        {
                                                            title += (t as MIL.Html.HtmlText).Text;
                                                        }
                                                        else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                        {
                                                            title += (t as MIL.Html.HtmlElement).Text;
                                                        }
                                                    }
                                                    string href = (titleNode2[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                                                    mri.Title = title;
                                                    mri.InfoSource = href;
                                                }
                                            }
                                        }
                                        MIL.Html.HtmlNodeCollection publishNodes = (topNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("font");
                                        if (publishNodes != null && publishNodes.Count > 0)
                                        {
                                            if (publishNodes[0].IsElement())
                                            {
                                                MIL.Html.HtmlNodeCollection publish = (publishNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("nobr");
                                                if (publish != null && publish.Count > 0)
                                                {
                                                    string txt = "";
                                                    if (publish[0].IsElement())
                                                    {
                                                        txt = (publish[0] as MIL.Html.HtmlElement).Text;
                                                    }
                                                    else if (publish[0].IsText())
                                                    {
                                                        txt = (publish[0] as MIL.Html.HtmlText).Text;
                                                    }

                                                    byte[] space = new byte[] { 0xc2, 0xa0 };
                                                    string UTFSpace = Encoding.GetEncoding("UTF-8").GetString(space);
                                                    txt = txt.Replace(UTFSpace, " ").Trim();
                                                    if (txt.IndexOf(" ") > 0)
                                                    {
                                                        mri.ReleaseName = txt.Substring(0, txt.IndexOf(" ")).Trim();
                                                        txt = txt.Substring(txt.IndexOf(" ") + 1);
                                                    }
                                                    mri.ReleaseDate = FormateDate(txt);
                                                }
                                            }
                                        }
                                    }
                                    #endregion
                                    #region 内容简介
                                    MIL.Html.HtmlNodeCollection txtNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "img-info noimg-txt");
                                    if (txtNodes == null || txtNodes.Count == 0)
                                    {
                                        txtNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "img-info clearfix");
                                    }
                                    if (txtNodes != null && txtNodes.Count > 0)
                                    {
                                        if ((txtNodes[0] is MIL.Html.HtmlElement) && (txtNodes[0] as MIL.Html.HtmlElement).Nodes != null)
                                        {
                                            MIL.Html.HtmlNodeCollection contentNodes = (txtNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("p");
                                            if (contentNodes != null && contentNodes.Count > 0)
                                            {
                                                string context = "";
                                                if (contentNodes[0].IsElement())
                                                {
                                                    foreach (MIL.Html.HtmlNode c in (contentNodes[0] as MIL.Html.HtmlElement).Nodes)
                                                    {
                                                        if (c.IsText())
                                                        {
                                                            context += (c as MIL.Html.HtmlText).Text;
                                                        }
                                                        else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                        {
                                                            context += (c as MIL.Html.HtmlElement).Text;
                                                        }
                                                    }
                                                }
                                                mri.Contexts = context;
                                            }
                                        }
                                    }

                                    #endregion
                                    #region 快照
                                    mri.Snapshot = "";
                                    #endregion
                                    #region 其他杂项
                                    mri.KeyWords = keyword;
                                    mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                    mri.Kid = kid;
                                    mri.Sheng = "";
                                    mri.Shi = "";
                                    mri.Xian = "";
                                    //mri.WebName = "必应";
                                    if (!string.IsNullOrEmpty(mri.ReleaseName))
                                    {
                                        mri.WebName = mri.ReleaseName;
                                    }
                                    else
                                    {
                                        mri.WebName = "主流媒体";
                                    }
                                    mri.Pid = 4;
                                    //mri.Part = GetParts(mri.Contexts);
                                    mri.Comments = (int)WebSourceType.ZhongsouNews;
                                    mri.Reposts = 0;
                                    #endregion

                                    #region 2015.8.13 新增获取网址正文
                                    if (!string.IsNullOrEmpty(mri.InfoSource))
                                    {
                                        string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                        string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                        //分析关键字前100,后50个字符
                                        string formatContexts = GetContexts(noHtmlContexts, keyword);
                                        if (!string.IsNullOrEmpty(formatContexts))
                                        {
                                            mri.Contexts = formatContexts;
                                        }
                                    }
                                    #endregion
                                    #region 报告进度
                                    OnReportCactchProcess(mri);
                                    #endregion

                                    if (!string.IsNullOrEmpty(mri.Title))
                                    {
                                        webDatas.Add(mri);
                                    }
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析搜狗博客搜索页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 21
0
        public List<ModelReleaseInfo> ParseBingWeb(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("id", "b_results", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        MIL.Html.HtmlNodeCollection resultNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "b_algo");
                        if (resultNodes != null && resultNodes.Count > 0)
                        {
                            foreach (MIL.Html.HtmlNode result in resultNodes)
                            {
                                if ((result is MIL.Html.HtmlElement) && (result as MIL.Html.HtmlElement).Nodes != null)
                                {
                                    ModelReleaseInfo mri = new ModelReleaseInfo();
                                    #region 标题与超链
                                    MIL.Html.HtmlNodeCollection titleNodes = (result as MIL.Html.HtmlElement).Nodes.FindByName("h2");
                                    if (titleNodes != null && titleNodes.Count > 0)
                                    {
                                        //title中包含<strong>节点,需要进行处理
                                        string title = "";
                                        MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                                        if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                                        {
                                            foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "strong")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                        }
                                        string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                                        mri.Title = title;
                                        mri.InfoSource = href;
                                    }
                                    #endregion
                                    #region 来源,发表时间与内容简介
                                    MIL.Html.HtmlNodeCollection txtNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "b_caption");
                                    if (txtNodes != null && txtNodes.Count > 0)
                                    {
                                        if ((txtNodes[0] is MIL.Html.HtmlElement) && (txtNodes[0] as MIL.Html.HtmlElement).Nodes != null)
                                        {
                                            foreach (MIL.Html.HtmlNode t in (txtNodes[0] as MIL.Html.HtmlElement).Nodes)
                                            {
                                                if (t.IsElement() && (t as MIL.Html.HtmlElement).Name == "p")
                                                {
                                                    string context = "";
                                                    foreach (MIL.Html.HtmlNode c in (t as MIL.Html.HtmlElement).Nodes)
                                                    {
                                                        if (c.IsText())
                                                        {
                                                            context += (c as MIL.Html.HtmlText).Text;
                                                        }
                                                        else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "strong")
                                                        {
                                                            context += (c as MIL.Html.HtmlElement).Text;
                                                        }
                                                    }
                                                    mri.Contexts = context;
                                                }

                                                if (t.IsElement() && (t as MIL.Html.HtmlElement).Name == "div"
                                                    && (t as MIL.Html.HtmlElement).Attributes != null)
                                                {
                                                    MIL.Html.HtmlAttribute attr = (t as MIL.Html.HtmlElement).Attributes.FindByName("class");
                                                    if (attr != null && attr.Value == "b_attribution")
                                                    {
                                                        mri.ReleaseDate = FormateDate((t as MIL.Html.HtmlElement).Text);

                                                        if ((t as MIL.Html.HtmlElement).Nodes != null && (t as MIL.Html.HtmlElement).Nodes.Count > 0)
                                                        {
                                                            MIL.Html.HtmlNodeCollection releaseNodes = (t as MIL.Html.HtmlElement).Nodes.FindByName("cite");
                                                            if (releaseNodes != null && releaseNodes.Count > 0)
                                                            {
                                                                if (releaseNodes[0].IsElement())
                                                                {
                                                                    mri.ReleaseName = (releaseNodes[0] as MIL.Html.HtmlElement).Text;
                                                                }
                                                                else if (releaseNodes[0].IsText())
                                                                {
                                                                    mri.ReleaseName = (releaseNodes[0] as MIL.Html.HtmlText).Text;
                                                                }
                                                            }
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                    }

                                    #endregion
                                    #region 快照
                                    mri.Snapshot = "";
                                    #endregion
                                    #region 其他杂项
                                    mri.KeyWords = keyword;
                                    mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                    mri.Kid = kid;
                                    mri.Sheng = "";
                                    mri.Shi = "";
                                    mri.Xian = "";
                                    //mri.WebName = "必应";
                                    if (!string.IsNullOrEmpty(mri.ReleaseName))
                                    {
                                        mri.WebName = mri.ReleaseName;
                                    }
                                    else
                                    {
                                        mri.WebName = "全网";
                                    }
                                    mri.Pid = 0;
                                    //mri.Part = GetParts(mri.Contexts);
                                    mri.Comments = (int)WebSourceType.BingWeb;
                                    mri.Reposts = 0;
                                    #endregion
                                    #region 2015.8.13 新增获取网址正文
                                    if (!string.IsNullOrEmpty(mri.InfoSource))
                                    {
                                        string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                        string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                        //分析关键字前100,后50个字符
                                        string formatContexts = GetContexts(noHtmlContexts, keyword);
                                        if (!string.IsNullOrEmpty(formatContexts))
                                        {
                                            mri.Contexts = formatContexts;
                                        }
                                    }
                                    #endregion
                                    #region 报告进度
                                    OnReportCactchProcess(mri);
                                    #endregion
                                    if (!string.IsNullOrEmpty(mri.Title))
                                    {
                                        webDatas.Add(mri);
                                    }
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析必应新闻网页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 22
0
        public List<ModelReleaseInfo> ParseZhongsouWeibo(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "main_scenery_left", true);
                if (nodes != null && nodes.Count > 0 && nodes[0].IsElement())
                {
                    MIL.Html.HtmlNodeCollection resultNodes = (nodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "godreply_on");
                    if (resultNodes != null && resultNodes.Count > 0)
                    {
                        foreach (MIL.Html.HtmlNode result in resultNodes)
                        {
                            MIL.Html.HtmlNodeCollection weiboItems = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_item clearfix");
                            if (weiboItems != null && weiboItems.Count > 0 && weiboItems[0].IsElement())
                            {
                                MIL.Html.HtmlNodeCollection weiboRightNodes = (weiboItems[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_right");
                                if (weiboRightNodes != null && weiboRightNodes.Count > 0 && weiboRightNodes[0].IsElement())
                                {
                                    ModelReleaseInfo mri = new ModelReleaseInfo();
                                    #region 标题
                                    MIL.Html.HtmlNodeCollection weiboTitleNodes = (weiboRightNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_title");
                                    if (weiboTitleNodes != null && weiboTitleNodes.Count > 0)
                                    {
                                        MIL.Html.HtmlNodeCollection aNodes = (weiboRightNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                        if (aNodes != null && aNodes.Count > 0 && aNodes[0].IsElement())
                                        {
                                            mri.Title = (aNodes[0] as MIL.Html.HtmlElement).Text;
                                        }
                                    }
                                    #endregion
                                    #region 内容
                                    MIL.Html.HtmlNodeCollection weiboTxtNodes = (weiboRightNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_txt");
                                    if (weiboTxtNodes != null && weiboTxtNodes.Count > 0 && weiboTxtNodes[0].IsElement())
                                    {
                                        mri.Contexts = GetContent(weiboTxtNodes[0] as MIL.Html.HtmlElement);
                                    }
                                    //引用
                                    MIL.Html.HtmlNodeCollection weiboFbboxNodes = (weiboRightNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibofbbox");
                                    if (weiboFbboxNodes != null && weiboFbboxNodes.Count > 0 && weiboFbboxNodes[0].IsElement())
                                    {
                                        mri.Contexts += GetContent(weiboFbboxNodes[0] as MIL.Html.HtmlElement);
                                    }
                                    #endregion
                                    #region 发布时间及超链
                                    MIL.Html.HtmlNodeCollection publishNodes = (weiboRightNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_handle clearfix");
                                    if (publishNodes != null && publishNodes.Count > 0 && publishNodes[0].IsElement())
                                    {
                                        MIL.Html.HtmlNodeCollection dateNodes = (publishNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "weibo_time");
                                        if (dateNodes != null && dateNodes.Count > 0 && dateNodes[0].IsElement())
                                        {
                                            string date = (dateNodes[0] as MIL.Html.HtmlElement).Text;
                                            mri.ReleaseDate = FormateDate2(date);
                                        }
                                        MIL.Html.HtmlNodeCollection hrefNodes = (publishNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                        if (hrefNodes != null && hrefNodes.Count > 0 && hrefNodes[0].IsElement())
                                        {
                                            mri.InfoSource = (hrefNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                        }
                                    }
                                    #endregion
                                    #region 其他杂项
                                    mri.Snapshot = "";
                                    mri.KeyWords = keyword;
                                    mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                    mri.Kid = kid;
                                    mri.Sheng = "";
                                    mri.Shi = "";
                                    mri.Xian = "";
                                    mri.WebName = "微博";
                                    mri.Pid = 3;
                                    //mri.Part = GetParts(mri.Contexts);
                                    mri.Comments = (int)WebSourceType.ZhongsouWeibo;
                                    mri.Reposts = 0;
                                    #endregion
                                    #region 2015.8.13 新增获取网址正文
                                    if (!string.IsNullOrEmpty(mri.InfoSource))
                                    {
                                        string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                        string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                        //分析关键字前100,后50个字符
                                        string formatContexts = GetContexts(noHtmlContexts, keyword);
                                        if (!string.IsNullOrEmpty(formatContexts))
                                        {
                                            mri.Contexts = formatContexts;
                                        }
                                    }
                                    #endregion
                                    #region 报告进度
                                    OnReportCactchProcess(mri);
                                    #endregion
                                    if (!string.IsNullOrEmpty(mri.Title))
                                    {
                                        webDatas.Add(mri);
                                    }
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析中搜微博搜索时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }
            return webDatas;
        }
Esempio n. 23
0
        public List<ModelReleaseInfo> ParseGeneralWeb(string html, string url, DataTable dtkey, string sheng, string shi, string xian, string webName, string webInfo, int pid)
        {
            //string strURLformat = "https?://.[^\"]+";
            List<ModelReleaseInfo> mris = new List<ModelReleaseInfo>();
            for (int i = 0; i < dtkey.Rows.Count; i++)
            {
                string keywords = dtkey.Rows[i]["keyword"].ToString();
                int kid = 0;
                int.TryParse(dtkey.Rows[i]["kid"].ToString().Trim(), out kid);

                string title = HtmlUtil.NoHTML(html);
                string context = HtmlUtil.NoHTML(webInfo);
                if (!string.IsNullOrEmpty(keywords))
                {
                    bool isFundTitle = true;
                    bool isFundContext = true;
                    string[] keyw = keywords.Split(' ');
                    if (keyw != null && keyw.Count() > 0)
                    {
                        foreach (string key in keyw)
                        {
                            if (title.IndexOf(key) < 0)
                            {
                                isFundTitle = false;
                            }
                            if (context.IndexOf(key) < 0)
                            {
                                isFundContext = false;
                            }
                        }
                    }
                    if (isFundTitle || isFundContext)
                    {
                        //如果标题或者内容有一个完全匹配关键字则添加该条数据
                        ModelReleaseInfo mri = new ModelReleaseInfo();
                        mri.Title = title;
                        mri.KeyWords = keywords;
                        mri.Contexts = context;
                        mri.InfoSource = url;

                        //发布人和发布日期暂时无法取到,手工赋值
                        mri.ReleaseDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                        mri.ReleaseName = "";

                        //收集日期
                        mri.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

                        //网页快照,这里为用户指定生成,如果未选择生成,那么为空
                        mri.Snapshot = "";
                        mri.Sheng = sheng == null ? "" : sheng;
                        mri.Shi = shi == null ? "" : shi;
                        mri.Xian = xian == null ? "" : xian;
                        //网站名
                        mri.WebName = webName == null ? "" : webName;
                        //pid
                        mri.Pid = pid;
                        mri.Kid = kid;
                        //part正负判断
                        mri.Part = 0; //CrawlHtml.GetParts(mri.Contexts);
                        //reposts
                        mri.Reposts = 0;
                        //comments
                        mri.Comments = 0;

                        #region 报告进度
                        OnReportCactchProcess(mri);
                        #endregion

                        if (!string.IsNullOrEmpty(mri.Title))
                        {
                            mris.Add(mri);
                        }
                    }
                }
            }
            return mris;
        }
Esempio n. 24
0
        public List<ModelReleaseInfo> ParseBaiduNews(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                //<li class="result" id="1">
                //    <h3 class="c-title">
                //        <a href="http://news.sina.com.cn/c/2015-03-10/023931587467.shtml"  data-click="{      'f0':'77A717EA',      'f1':'9F63F1E4',      'f2':'4CA6DE6E',      'f3':'54E5243F',      't':'1425975464'      }" target="_blank">澳门中联办主任:<em>反腐</em>后很多官员不敢再来赌博</a>
                //    </h3>
                //    <div class="c-summary c-row">
                //        <p class="c-author">新浪新闻&nbsp;&nbsp;13小时前</p> 由于国家加大<em>反腐</em>力度,很多人不敢再到澳门参与赌博,李刚表示,这也是<em>博彩</em>业连续9个月下跌的原因之一。 【谈“占中”】 澳门不支持类似行为 去年发生在香港的“...
                //        <span class="c-info">
                //            <a href="/ns?word=%E5%8F%8D%E8%85%90%20%E5%8D%9A%E5%BD%A9+cont:2431691772&amp;same=13&amp;cl=1&amp;tn=news&amp;rn=30&amp;fm=sd" class="c-more_link" data-click="{'fm':'sd'}">13条相同新闻</a>
                //            &nbsp;&nbsp;-&nbsp;&nbsp;<a href="http://cache.baidu.com/c?m=9f65cb4a8c8507ed4fece763104a8023584380147d8c8c4668d4e419ce3b4c413037bfa6763f1006d0c26b6777ad484bea8635702a0120b690c98b4dd7be942c2a9f27433146c01e4cc75cf28b102ad650954d99a90e97b0e741e7b9d2a2c82453dd200e6df0f09c2b73&amp;p=97759a43d08903eb11acc7710f48&amp;newp=8b2a9707829b0aff57eb92291b5992694f08d7267dc8914212d2950ac73c10&amp;user=baidu&amp;fm=sc&amp;query=%B7%B4%B8%AF+%B2%A9%B2%CA&amp;qid=a46011fc00005d39&amp;p1=1" data-click="{'fm':'sc'}" target="_blank" class="c-cache">百度快照</a>
                //        </span>
                //    </div>
                //</li>

                //<li class="result" id="8"><h3 class="c-title"><a href="http://forex.hexun.com/2015-03-06/173805302.html"><em>两会任性翻译</em>姐:英语不是白学的!</a></h3>
                //    <div class="c-summary c-row c-gap-top-small">
                //        <div class="c-span6">
                //            <a class="c_photo" href="http://forex.hexun.com/2015-03-06/173805302.html" ><img class="c-img c-img6" src="http://t12.baidu.com/it/u=3225043934,4141352763&amp;fm=55&amp;s=9F0B834F0A727D966080583C03005068&amp;w=121&amp;h=81&amp;img.JPEG" alt=""></a>
                //        </div>
                //        <div class="c-span18 c-span-last">
                //            <p class="c-author">和讯外汇&nbsp;&nbsp;2015年03月06日 11:00</p> 女翻译当时楞了,回头与吕新华沟通,确认后再翻译,由此<em>两会任性女翻译走红</em>。<em>两会任性女翻译走红</em>后,其身份受到了网友关注,有媒体爆料,两会任性女翻译老公是习大大和奥...  <span class="c-info"><a href="/ns?word=%E4%B8%A4%E4%BC%9A%E4%BB%BB%E6%80%A7%E5%A5%B3%E7%BF%BB%E8%AF%91%E8%B5%B0%E7%BA%A2+cont:1156374414&amp;same=2&amp;cl=1&amp;tn=news&amp;rn=30&amp;fm=sd" class="c-more_link" data-click="{'fm':'sd'}">2条相同新闻</a>&nbsp;&nbsp;-&nbsp;&nbsp;<a href="http://cache.baidu.com/c?m=9d78d513d9d430ac4f9de2697d66c0111a4381132ba6da020ea3843e91732d47506793ac56250777a4d27d1716df4c4b99862104371457c78cc9f85dabbe855e5b9f5747676bf755559347a091006383379129f4b24dbafaa77884aea589881e149644050dd1add4470016c968e71447e1a78e48635d14a7ee3564f55b70289d2357b630a3a66d30&amp;p=b449d016d9c157ff57e69268454a&amp;newp=9277c64ad4891afb00bd9b750b0892695c02dc3051d4d616358fc710&amp;user=baidu&amp;fm=sc&amp;query=%C1%BD%BB%E1%C8%CE%D0%D4%C5%AE%B7%AD%D2%EB%D7%DF%BA%EC&amp;qid=e48e7f4100000633&amp;p1=8" data-click="{'fm':'sc'}" target="_blank" class="c-cache">百度快照</a></span></div></div></li>
                #endregion
                #region 解析网站源码
                //html = html.Replace("<em>", "").Replace("</em>", "");
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "result", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        ModelReleaseInfo mri = new ModelReleaseInfo();
                        #region 标题与超链
                        MIL.Html.HtmlNodeCollection titleNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "c-title");
                        if (titleNodes != null && titleNodes.Count > 0)
                        {
                            //title中包含<em>节点,需要进行处理
                            string title = "";
                            MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                            if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                {
                                    if (t.IsText())
                                    {
                                        title += (t as MIL.Html.HtmlText).Text;
                                    }
                                    else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        title += (t as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            //string title = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Text;
                            string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                            mri.Title = title;
                            mri.InfoSource = href;
                        }
                        #endregion
                        #region 来源,发表时间与内容简介
                        MIL.Html.HtmlNodeCollection authorNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "c-author");
                        if (authorNodes != null && authorNodes.Count > 0)
                        {
                            string author = (authorNodes[0] as MIL.Html.HtmlElement).Text;
                            //string context = ((authorNodes[0] as MIL.Html.HtmlElement).Parent as MIL.Html.HtmlElement).Text;
                            string context = "";
                            MIL.Html.HtmlElement contextElement = ((authorNodes[0] as MIL.Html.HtmlElement).Parent as MIL.Html.HtmlElement);
                            if (contextElement.Nodes != null && contextElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode c in contextElement.Nodes)
                                {
                                    if (c.IsText())
                                    {
                                        context += (c as MIL.Html.HtmlText).Text;
                                    }
                                    else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        context += (c as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            mri.Contexts = context;

                            byte[] space = new byte[] { 0xc2, 0xa0 };
                            string UTFSpace = Encoding.GetEncoding("UTF-8").GetString(space);

                            if (author.IndexOf(UTFSpace) >= 0)
                            {
                                string source = author.Substring(0, author.IndexOf(UTFSpace)).Trim();
                                mri.ReleaseName = source;
                                string date = author.Substring(author.IndexOf(UTFSpace) + 1).Trim();
                                if (date.IndexOf('年') >= 0)
                                {
                                    //南方网  2015年03月09日 11:51
                                    date = date.Replace('年', '-').Replace('月', '-').Replace("日", "");
                                }
                                else
                                {
                                    //国土资源部  5小时前
                                    date = DateTime.Now.ToString("yyyy-MM-dd HH:mm:dd");
                                }
                                mri.ReleaseDate = date;

                            }
                            else
                            {
                                if (author.IndexOf(' ') >= 0)
                                {
                                    string source = author.Substring(0, author.IndexOf(' '));
                                    mri.ReleaseName = source;
                                    string date = author.Substring(author.IndexOf(' ') + 1).Trim();
                                    if (date.IndexOf('年') >= 0)
                                    {
                                        //南方网  2015年03月09日 11:51
                                        date = date.Replace('年', '-').Replace('月', '-').Replace("日", "");
                                    }
                                    else
                                    {
                                        //国土资源部  5小时前
                                        date = DateTime.Now.ToString("yyyy-MM-dd HH:mm:dd");
                                    }
                                    mri.ReleaseDate = date;
                                }
                            }
                        }
                        #endregion
                        #region 快照
                        MIL.Html.HtmlNodeCollection cacheNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "c-cache");
                        if (cacheNodes != null && cacheNodes.Count > 0)
                        {
                            string cache = (cacheNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                            mri.Snapshot = cache;
                        }
                        #endregion
                        #region 其他杂项
                        mri.KeyWords = keyword;
                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                        mri.Kid = kid;
                        mri.Sheng = "";
                        mri.Shi = "";
                        mri.Xian = "";
                        //mri.WebName = "百度";
                        if (!string.IsNullOrEmpty(mri.ReleaseName))
                        {
                            mri.WebName = mri.ReleaseName;
                        }
                        else
                        {
                            mri.WebName = "主流媒体";
                        }
                        mri.Pid = 4;
                        //mri.Part = GetParts(mri.Contexts);
                        mri.Comments = (int)WebSourceType.BaiduNews;
                        mri.Reposts = 0;
                        #endregion
                        #region 2015.8.13 新增获取网址正文
                        if (!string.IsNullOrEmpty(mri.InfoSource))
                        {
                            string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                            string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                            //分析关键字前100,后50个字符
                            string formatContexts = GetContexts(noHtmlContexts, keyword);
                            if (!string.IsNullOrEmpty(formatContexts))
                            {
                                mri.Contexts = formatContexts;
                            }
                        }
                        #endregion
                        #region 报告进度
                        OnReportCactchProcess(mri);
                        #endregion

                        if (!string.IsNullOrEmpty(mri.Title))
                        {
                            webDatas.Add(mri);
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析百度新闻网页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 25
0
        public List<ModelReleaseInfo> ParseHaosouNews(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 解析网站源码
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "result", true);
                if (nodes != null && nodes.Count > 0 && nodes[0].IsElement())
                {
                    MIL.Html.HtmlNodeCollection resultNodes = (nodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-list", true);
                    if (resultNodes == null || resultNodes.Count == 0)
                    {
                        resultNodes = (nodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "res-list hasimg", true);
                    }
                    if (resultNodes != null && resultNodes.Count > 0)
                    {
                        foreach (MIL.Html.HtmlNode result in resultNodes)
                        {
                            //<div class="res-rich so-rich-news clearfix">
                            if ((result is MIL.Html.HtmlElement) && (result as MIL.Html.HtmlElement).Nodes != null)
                            {
                                ModelReleaseInfo mri = new ModelReleaseInfo();
                                #region 标题、超链
                                MIL.Html.HtmlNodeCollection titleNodes = (result as MIL.Html.HtmlElement).Nodes.FindByName("h3");
                                if (titleNodes != null && titleNodes.Count > 0)
                                {
                                    if (titleNodes[0].IsElement())
                                    {
                                        MIL.Html.HtmlNodeCollection aNodes = (titleNodes[0] as MIL.Html.HtmlElement).Nodes.FindByName("a");
                                        if (aNodes != null && aNodes.Count > 0 && aNodes[0].IsElement())
                                        {
                                            string title = "";
                                            foreach (MIL.Html.HtmlNode t in (aNodes[0] as MIL.Html.HtmlElement).Nodes)
                                            {
                                                if (t.IsText())
                                                {
                                                    title += (t as MIL.Html.HtmlText).Text;
                                                }
                                                else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                                {
                                                    title += (t as MIL.Html.HtmlElement).Text;
                                                }
                                            }
                                            string href = (aNodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                                            mri.Title = title;
                                            mri.InfoSource = href;
                                        }
                                    }
                                }
                                #endregion
                                #region 内容简介
                                MIL.Html.HtmlNodeCollection contentNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "content", true);
                                if (contentNodes != null && contentNodes.Count > 0)
                                {
                                    string text = "";
                                    foreach (MIL.Html.HtmlNode c in (contentNodes[0] as MIL.Html.HtmlElement).Nodes)
                                    {
                                        if (c.IsText())
                                        {
                                            text += (c as MIL.Html.HtmlText).Text;
                                        }
                                        else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                        {
                                            text += (c as MIL.Html.HtmlElement).Text;
                                        }
                                    }
                                    mri.Contexts = text;
                                }
                                #endregion
                                #region 发布人,时间
                                MIL.Html.HtmlNodeCollection newsinfoNodes = (result as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "newsinfo");
                                if (newsinfoNodes != null && newsinfoNodes.Count > 0)
                                {
                                    MIL.Html.HtmlNodeCollection sitenameNodes = (newsinfoNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "sitename");
                                    if (sitenameNodes != null && sitenameNodes.Count > 0)
                                    {
                                        if (sitenameNodes[0].IsElement())
                                        {
                                            mri.ReleaseName = (sitenameNodes[0] as MIL.Html.HtmlElement).Text;
                                        }
                                    }

                                    MIL.Html.HtmlNodeCollection posttimeNodes = (newsinfoNodes[0] as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "posttime");
                                    if (posttimeNodes != null && posttimeNodes.Count > 0)
                                    {
                                        if (posttimeNodes[0].IsElement())
                                        {
                                            if ((posttimeNodes[0] as MIL.Html.HtmlElement).Attributes != null && (posttimeNodes[0] as MIL.Html.HtmlElement).Attributes["title"] != null)
                                            {
                                                mri.ReleaseDate = (posttimeNodes[0] as MIL.Html.HtmlElement).Attributes["title"].Value;
                                            }
                                        }
                                    }
                                }
                                #endregion
                                #region 其他杂项
                                mri.KeyWords = keyword;
                                mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                                mri.Kid = kid;
                                mri.Sheng = "";
                                mri.Shi = "";
                                mri.Xian = "";
                                if (!string.IsNullOrEmpty(mri.ReleaseName))
                                {
                                    mri.WebName = mri.ReleaseName;
                                }
                                else
                                {
                                    mri.WebName = "主流媒体";
                                }
                                mri.Pid = 4;
                                //mri.Part = GetParts(mri.Contexts);
                                mri.Comments = (int)WebSourceType.HaosouNews;
                                mri.Reposts = 0;
                                #endregion

                                #region 2015.8.13 新增获取网址正文
                                if (!string.IsNullOrEmpty(mri.InfoSource))
                                {
                                    string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                                    string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                                    //分析关键字前100,后50个字符
                                    string formatContexts = GetContexts(noHtmlContexts, keyword);
                                    if (!string.IsNullOrEmpty(formatContexts))
                                    {
                                        mri.Contexts = formatContexts;
                                    }
                                }
                                #endregion
                                #region 报告进度
                                OnReportCactchProcess(mri);
                                #endregion

                                if (!string.IsNullOrEmpty(mri.Title))
                                {
                                    webDatas.Add(mri);
                                }
                            }
                        }
                    }

                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析好搜新闻搜索时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 26
0
        /// <summary>
        /// 搜索百度网页
        /// </summary>
        /// <param name="html"></param>
        /// <param name="keyword"></param>
        /// <param name="kid"></param>
        /// <returns></returns>
        public List<ModelReleaseInfo> ParseBaiduWeb(string html, string keyword, int kid)
        {
            List<ModelReleaseInfo> webDatas = new List<ModelReleaseInfo>();
            try
            {
                #region 网页源码样例
                #endregion
                #region 解析网站源码
                //html = html.Replace("<em>", "").Replace("</em>", "");
                MIL.Html.HtmlDocument doc = MIL.Html.HtmlDocument.Create(html);
                MIL.Html.HtmlNodeCollection nodes = doc.Nodes.FindByAttributeNameValue("class", "result c-container ", true);
                foreach (MIL.Html.HtmlNode n in nodes)
                {
                    if ((n is MIL.Html.HtmlElement) && (n as MIL.Html.HtmlElement).Nodes != null)
                    {
                        ModelReleaseInfo mri = new ModelReleaseInfo();
                        #region 标题与超链
                        MIL.Html.HtmlNodeCollection titleNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "t");
                        if (titleNodes != null && titleNodes.Count > 0)
                        {
                            //title中包含<em>节点,需要进行处理
                            string title = "";
                            MIL.Html.HtmlElement titleElement = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement);
                            if (titleElement.Nodes != null && titleElement.Nodes.Count > 0)
                            {
                                foreach (MIL.Html.HtmlNode t in titleElement.Nodes)
                                {
                                    if (t.IsText())
                                    {
                                        title += (t as MIL.Html.HtmlText).Text;
                                    }
                                    else if (t.IsElement() && (t as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                    {
                                        title += (t as MIL.Html.HtmlElement).Text;
                                    }
                                }
                            }
                            //string title = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Text;
                            string href = ((titleNodes[0] as MIL.Html.HtmlElement).Nodes[0] as MIL.Html.HtmlElement).Attributes["href"].Value;

                            mri.Title = title;
                            mri.InfoSource = href;
                            //百度跳转超链,需要二次解析
                            if (href.StartsWith("http://www.baidu.com/link?url="))
                            {
                                HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(href);
                                req.AllowAutoRedirect = false;
                                HttpWebResponse response = (HttpWebResponse)req.GetResponse();
                                string loc = response.Headers["location"];
                                if (!string.IsNullOrEmpty(loc))
                                {
                                    mri.InfoSource = loc;
                                    if (loc.Trim().StartsWith("http://zhidao.baidu.com/link?url"))
                                    {
                                        string zhidaoHtml = HtmlUtil.HttpGet(href, Encoding.Default);
                                        MIL.Html.HtmlDocument doc2 = MIL.Html.HtmlDocument.Create(zhidaoHtml);
                                        MIL.Html.HtmlNodeCollection nodes2 = doc2.Nodes.FindByAttributeNameValue("rel", "canonical", true);
                                        if (nodes2 != null && nodes2.Count > 0)
                                        {
                                            if (nodes2[0].IsElement() && (nodes2[0] as MIL.Html.HtmlElement).Attributes != null && (nodes2[0] as MIL.Html.HtmlElement).Attributes["href"] != null)
                                            {
                                                mri.InfoSource = (nodes2[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                            }
                                        }

                                    }
                                }
                            }
                            else if (href.Trim().StartsWith("http://zhidao.baidu.com/link?url"))
                            {
                                string zhidaoHtml = HtmlUtil.HttpGet(href, Encoding.Default);
                                MIL.Html.HtmlDocument doc2 = MIL.Html.HtmlDocument.Create(zhidaoHtml);
                                MIL.Html.HtmlNodeCollection nodes2 = doc2.Nodes.FindByAttributeNameValue("rel", "canonical", true);
                                if (nodes2 != null && nodes2.Count > 0)
                                {
                                    if (nodes2[0].IsElement() && (nodes2[0] as MIL.Html.HtmlElement).Attributes != null && (nodes2[0] as MIL.Html.HtmlElement).Attributes["href"] != null)
                                    {
                                        mri.InfoSource = (nodes2[0] as MIL.Html.HtmlElement).Attributes["href"].Value;
                                    }
                                }

                            }
                        }
                        #endregion
                        #region 发表时间与内容简介
                        MIL.Html.HtmlNodeCollection contextNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "c-abstract");
                        if (contextNodes != null && contextNodes.Count > 0)
                        {
                            #region 简介
                            string context = "";
                            string date = "";// DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                            foreach (MIL.Html.HtmlNode c in (contextNodes[0] as MIL.Html.HtmlElement).Nodes)
                            {
                                if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "span")
                                {
                                    date = (c as MIL.Html.HtmlElement).Text;
                                }
                                else if (c.IsText())
                                {
                                    context += (c as MIL.Html.HtmlText).Text;
                                }
                                else if (c.IsElement() && (c as MIL.Html.HtmlElement).Name.ToLower() == "em")
                                {
                                    context += (c as MIL.Html.HtmlElement).Text;
                                }
                            }
                            mri.Contexts = context;
                            #endregion
                            #region 处理时间
                            if (string.IsNullOrEmpty(date))
                            {
                                date = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                            }
                            else
                            {
                                //<span class=" newTimeFactor_before_abs m">2009年9月6日&nbsp;-&nbsp;</span>
                                byte[] space = new byte[] { 0xc2, 0xa0 };
                                string UTFSpace = Encoding.GetEncoding("UTF-8").GetString(space);
                                date = date.Replace(UTFSpace, "").Trim();
                                if (date.EndsWith("-")) date = date.Substring(0, date.Length - 1);

                                if (date.EndsWith("前"))
                                {
                                    if (date.Contains("天前"))
                                    {
                                        int offset = int.Parse(date.Substring(0, date.IndexOf("天前")).Trim());
                                        date = DateTime.Now.AddDays(offset * -1).ToString("yyyy-MM-dd");
                                    }
                                    else if (date.Contains("小时前"))
                                    {
                                        int offset = int.Parse(date.Substring(0, date.IndexOf("小时前")).Trim());
                                        date = DateTime.Now.AddHours(offset * -1).ToString("yyyy-MM-dd");
                                    }
                                    else if (date.Contains("时前"))
                                    {
                                        int offset = int.Parse(date.Substring(0, date.IndexOf("时前")).Trim());
                                        date = DateTime.Now.AddHours(offset * -1).ToString("yyyy-MM-dd");
                                    }
                                    else if (date.Contains("分") && date.Contains("前"))
                                    {
                                        int offset = int.Parse(date.Substring(0, date.IndexOf("分")).Trim());
                                        date = DateTime.Now.AddMinutes(offset * -1).ToString("yyyy-MM-dd");
                                    }
                                }
                                else
                                {
                                    if (date.IndexOf('年') >= 0)
                                    {
                                        //南方网  2015年03月09日 11:51
                                        date = date.Replace('年', '-').Replace('月', '-').Replace("日", "");
                                    }
                                    else
                                    {
                                        if (date.IndexOf('月') >= 0)
                                        {
                                            date = DateTime.Now.Year + "-" + date.Replace('月', '-').Replace("日", "");
                                        }
                                    }
                                }
                                mri.ReleaseDate = date;

                            }
                            #endregion
                        }
                        #endregion
                        #region 快照 来源
                        MIL.Html.HtmlNodeCollection footNodes = (n as MIL.Html.HtmlElement).Nodes.FindByAttributeNameValue("class", "f13");
                        if (footNodes != null && footNodes.Count > 0)
                        {
                            if (footNodes[0].IsElement())
                            {
                                foreach (MIL.Html.HtmlNode foot in (footNodes[0] as MIL.Html.HtmlElement).Nodes)
                                {
                                    if (foot.IsElement() && (foot as MIL.Html.HtmlElement).Name == "span"
                                        && (foot as MIL.Html.HtmlElement).Attributes != null
                                        && (foot as MIL.Html.HtmlElement).Attributes["class"] != null
                                        && (foot as MIL.Html.HtmlElement).Attributes["class"].Value == "g")
                                    {
                                        //<span class="g">www.douban.com/note/43...&nbsp;</span>
                                        mri.ReleaseName = (foot as MIL.Html.HtmlElement).Text;
                                    }
                                    else if (foot.IsElement() && (foot as MIL.Html.HtmlElement).Name == "a"
                                        && (foot as MIL.Html.HtmlElement).Attributes != null
                                        && (foot as MIL.Html.HtmlElement).Attributes["class"] != null
                                        && (foot as MIL.Html.HtmlElement).Attributes["class"].Value == "m"
                                        && (foot as MIL.Html.HtmlElement).Text.Contains("百度快照"))
                                    {
                                        mri.Snapshot = (foot as MIL.Html.HtmlElement).Attributes["href"].Value;
                                    }
                                }
                            }
                        }
                        #endregion
                        #region 其他杂项
                        mri.KeyWords = keyword;
                        mri.CollectDate = string.Format(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"));
                        mri.Kid = kid;
                        mri.Sheng = "";
                        mri.Shi = "";
                        mri.Xian = "";
                        //mri.WebName = "百度";
                        if (!string.IsNullOrEmpty(mri.ReleaseName))
                        {
                            mri.WebName = mri.ReleaseName;
                        }
                        else
                        {
                            mri.WebName = "全网";
                        }
                        mri.Pid = 0;
                        //mri.Part = GetParts(mri.Contexts);
                        mri.Comments = (int)WebSourceType.BaiduWeb;
                        mri.Reposts = 0;
                        #endregion
                        #region 2015.8.13 新增获取网址正文
                        if (!string.IsNullOrEmpty(mri.InfoSource))
                        {
                            string strContexts = HtmlUtil.getHtml(mri.InfoSource, "");
                            string noHtmlContexts = HtmlUtil.NoHTML(strContexts);
                            //分析关键字前100,后50个字符
                            string formatContexts = GetContexts(noHtmlContexts, keyword);
                            if (!string.IsNullOrEmpty(formatContexts))
                            {
                                mri.Contexts = formatContexts;
                            }
                        }
                        #endregion
                        #region 报告进度
                        OnReportCactchProcess(mri);
                        #endregion
                        if (!string.IsNullOrEmpty(mri.Title))
                        {
                            webDatas.Add(mri);
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog("解析百度新闻网页时报错:" + ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return webDatas;
        }
Esempio n. 27
0
        public String GetInsString(ModelReleaseInfo obj)
        {
            StringBuilder insertSql = new StringBuilder();
            string[] keywords = obj.KeyWords.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries);
            foreach (string k in keywords)
            {
                string sql = @"INSERT INTO ReleaseInfo(Title,Contexts,ReleaseDate,InfoSource,KeyWords,ReleaseName,CollectDate,Snapshot,webName,pid,part,reposts,comments,kid,sheng,shi,xian)
                            VALUES('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}','{8}','{9}','{10}','{11}','{12}','{13}','{14}','{15}','{16}'); ";

                obj.Title = filtRiskChar(obj.Title);
                obj.Contexts = filtRiskChar(obj.Contexts);
                insertSql.Append(string.Format(sql, obj.Title, obj.Contexts, obj.ReleaseDate, obj.InfoSource, k.Contains('-') ? k.Split(new char[] { '-' })[0] : k,
                    obj.ReleaseName, obj.CollectDate, obj.Snapshot, obj.WebName, obj.Pid, obj.Part, obj.Reposts, obj.Comments, k.Contains('-') ? k.Split(new char[] { '-' })[1] : k, obj.Sheng, obj.Shi, obj.Xian));
            }
            return insertSql.ToString();
        }