Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="ModelReleaseInfo"/> record for one crawled page and tags it with every
        /// keyword entry of <paramref name="dtkey"/> whose keywords all occur in the page text.
        /// The raw <paramref name="html"/> is searched first; only when it yields no entry is the
        /// cleaned body text (<c>Contexts</c>) searched instead.
        /// </summary>
        /// <param name="html">Text passed through <c>HtmlUtil.NoHTML</c> to produce the title; the raw value is the first keyword search target.</param>
        /// <param name="url">Stored as the record's <c>InfoSource</c>.</param>
        /// <param name="dtkey">
        /// Keyword table. Column 4 holds space-separated keywords; columns 1 and 6 identify the entry,
        /// and column 6 must parse as an integer. A null table is treated as having no rows.
        /// </param>
        /// <param name="sheng">Stored as <c>Sheng</c>; null becomes an empty string.</param>
        /// <param name="shi">Stored as <c>Shi</c>; null becomes an empty string.</param>
        /// <param name="xian">Stored as <c>Xian</c>; null becomes an empty string.</param>
        /// <param name="webName">Site name; null becomes an empty string.</param>
        /// <param name="webInfo">Text passed through <c>HtmlUtil.NoHTML</c> to produce <c>Contexts</c>.</param>
        /// <param name="pid">Stored as <c>Pid</c>.</param>
        /// <returns>
        /// The record. Exceptions are logged and swallowed, so on failure the record is returned with
        /// only the fields that had been assigned before the exception.
        /// </returns>
        public static ModelReleaseInfo CrawlHtmlSource(string html, string url, DataTable dtkey, string sheng, string shi, string xian, string webName, string webInfo, int pid)
        {
            // Create the data object.
            ModelReleaseInfo newsInfo = new ModelReleaseInfo();

            try
            {
                newsInfo.Title = HtmlUtil.NoHTML(html);

                // First pass: match keyword entries against the raw title text.
                List<string> matched = MatchKeywordEntries(dtkey, html);

                newsInfo.Contexts = HtmlUtil.NoHTML(webInfo);

                // Site link.
                newsInfo.InfoSource = url;

                // Second pass: fall back to the cleaned body text only when the title matched nothing.
                if (matched.Count == 0)
                {
                    matched = MatchKeywordEntries(dtkey, newsInfo.Contexts);
                }

                // Assign the keyword list in one step. KeyWords is left at its initial value when
                // nothing matched, which avoids calling Substring(1) on an empty string and never
                // leaves a half-built, comma-prefixed value behind.
                if (matched.Count > 0)
                {
                    newsInfo.KeyWords = string.Join(",", matched.ToArray());
                }

                // Collection timestamp (local time); invariant culture keeps the format machine-readable.
                newsInfo.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);

                // Publisher and publish date cannot be extracted yet, so they are left empty.
                newsInfo.ReleaseDate = "";
                newsInfo.ReleaseName = "";

                // Page snapshot is generated on user request elsewhere; empty here.
                newsInfo.Snapshot = "";
                newsInfo.Sheng = sheng ?? "";
                newsInfo.Shi = shi ?? "";
                newsInfo.Xian = xian ?? "";
                // Site name.
                newsInfo.WebName = webName ?? "";
                newsInfo.Pid = pid;
                // Positive/negative classification of the body text.
                newsInfo.Part = GetParts(newsInfo.Contexts);
                newsInfo.Reposts = 0;
                newsInfo.Comments = 0;
            }
            catch (Exception ex)
            {
                // Log type, message and stack trace; the stack trace alone hides what went wrong.
                Comm.WriteErrorLog(ex.ToString());
            }

            return newsInfo;
        }

        /// <summary>
        /// Returns one "column1-column6" token for each distinct keyword entry whose space-separated
        /// keywords (column 4) all occur, case-insensitively, in <paramref name="text"/>.
        /// Tokens are returned in row order.
        /// </summary>
        /// <param name="dtkey">Keyword table; null yields an empty result.</param>
        /// <param name="text">Text to search; null yields an empty result.</param>
        /// <returns>The matched tokens, possibly empty; never null.</returns>
        /// <exception cref="FormatException">Column 6 of a matching row is not an integer.</exception>
        private static List<string> MatchKeywordEntries(DataTable dtkey, string text)
        {
            List<string> tokens = new List<string>();
            if (dtkey == null || text == null)
            {
                return tokens;
            }

            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            // Lower-case the haystack once instead of once per keyword per row.
            string loweredText = text.ToLower();
            HashSet<string> seen = new HashSet<string>();

            foreach (DataRow row in dtkey.Rows)
            {
                string name = row[1].ToString();
                string idText = row[6].ToString();

                // An entry that already matched is not reported twice; one that failed earlier
                // may still match through a later row carrying the same identity.
                string identity = name + "-" + idText;
                if (seen.Contains(identity))
                {
                    continue;
                }

                // NOTE(review): an empty column 4 produces a single empty keyword, which matches
                // any text. This is kept from the original; confirm it is intended.
                bool allPresent = true;
                foreach (string keyword in row[4].ToString().Split(' '))
                {
                    if (!loweredText.Contains(keyword.ToLower()))
                    {
                        allPresent = false;
                        break;
                    }
                }

                if (allPresent)
                {
                    seen.Add(identity);

                    // Build the token from the two columns directly rather than re-splitting the
                    // identity on '-', so a hyphen inside column 1 cannot shift which part is
                    // parsed as the id. Parsing still normalises the id (e.g. "007" becomes "7").
                    int id = int.Parse(idText, invariant);
                    tokens.Add(name + "-" + id.ToString(invariant));
                }
            }

            return tokens;
        }