예제 #1
0
        /// <summary>
        /// Runs two identical consumer loops (via <c>Parallel.Invoke</c>) that drain
        /// <c>Queue</c> and write each dequeued batch of release records to the database.
        /// </summary>
        /// <remarks>
        /// The call blocks until both loops have exited, which happens once
        /// <c>Program.ProClose</c> is observed to be true. For every batch the loop
        /// filters records by keyword match, skips records for which
        /// <c>GetReleaseInfoCount</c> reports an existing row, concatenates the remaining
        /// INSERT statements and executes them in a single call. Any exception raised while
        /// processing a batch is logged and the batch is abandoned; the loop keeps running.
        /// </remarks>
        public void StartWrite()
        {
            Action actionG = () =>
            {
                List<ModelReleaseInfo> data;
                while (!Program.ProClose)
                {
                    if (!Queue.TryDequeue(out data))
                    {
                        // Nothing queued: wait a second before polling again.
                        System.Threading.Thread.Sleep(1000);
                        continue;
                    }

                    // Persist the dequeued batch.
                    try
                    {
                        StringBuilder sb = new StringBuilder();
                        TbReleaseInfo tri = new TbReleaseInfo();
                        MySqlCmd cmd = new MySqlCmd();
                        foreach (var mri in data)
                        {
                            // 2016.11.16: exact-match check. Drop the record when neither the
                            // title nor the content contains every keyword.
                            if (!ContainsAllKeywords(mri.KeyWords, mri.Title, mri.Contexts))
                            {
                                continue;
                            }

                            // Presumably a non-zero count means this source/keyword pair is
                            // already stored - verify against TbReleaseInfo.
                            if (tri.GetReleaseInfoCount(mri.InfoSource, mri.KeyWords) > 0)
                            {
                                continue;
                            }

                            string sql = tri.GetInsertStr(mri);
                            if (!sql.Trim().EndsWith(";"))
                            {
                                // Only terminate the statement. The previous
                                // "sql += sql + ';'" appended the statement to itself and
                                // therefore emitted every INSERT twice.
                                sql += ";";
                            }
                            sb.Append(sql);
                        }

                        if (sb.Length > 0)
                        {
                            // Execute all accumulated INSERT statements in one round trip.
                            cmd.ExecuteNonQuery(sb.ToString());
                        }

                        // Logged only when the batch was processed without an exception.
                        log.Info("数据层写入数据库成功");
                    }
                    catch (Exception ex)
                    {
                        Comm.WriteErrorLog(ex.Message);
                        Comm.WriteErrorLog(ex.StackTrace);
                    }
                }
            };

            Parallel.Invoke(actionG, actionG);
        }

        /// <summary>
        /// Returns true when <paramref name="keywords"/> is null or empty, or when every
        /// space-separated keyword occurs in <paramref name="title"/>, or every one occurs in
        /// <paramref name="context"/>. A null title or context is treated as empty text.
        /// </summary>
        private static bool ContainsAllKeywords(string keywords, string title, string context)
        {
            if (string.IsNullOrEmpty(keywords))
            {
                return true;
            }

            string safeTitle = title ?? string.Empty;
            string safeContext = context ?? string.Empty;
            bool allInTitle = true;
            bool allInContext = true;
            foreach (string key in keywords.Split(' '))
            {
                if (safeTitle.IndexOf(key) < 0)
                {
                    allInTitle = false;
                }
                if (safeContext.IndexOf(key) < 0)
                {
                    allInContext = false;
                }
            }

            return allInTitle || allInContext;
        }
예제 #2
0
파일: CrawlHtml.cs 프로젝트: wcgcw/Finder
        /// <summary>
        /// Builds a <c>ModelReleaseInfo</c> for one crawled page and tags it with the
        /// keyword events from <paramref name="dtkey"/> whose keywords all occur in the page.
        /// </summary>
        /// <param name="html">Page markup; stripped with <c>HtmlUtil.NoHTML</c> to form the title and searched first for keywords.</param>
        /// <param name="url">Stored as <c>InfoSource</c>.</param>
        /// <param name="dtkey">Keyword table. Column 4 is read as a space-separated keyword list; columns 1 and 6 are joined with '-' to form the event key (presumably event name and numeric id - confirm against the caller).</param>
        /// <param name="sheng">Stored as <c>Sheng</c>; null becomes an empty string.</param>
        /// <param name="shi">Stored as <c>Shi</c>; null becomes an empty string.</param>
        /// <param name="xian">Stored as <c>Xian</c>; null becomes an empty string.</param>
        /// <param name="webName">Stored as <c>WebName</c>; null becomes an empty string.</param>
        /// <param name="webInfo">Stripped with <c>HtmlUtil.NoHTML</c> to form <c>Contexts</c>; searched for keywords only when <paramref name="html"/> produced no match.</param>
        /// <param name="pid">Stored as <c>Pid</c>.</param>
        /// <returns>
        /// The populated record. If an exception occurs part-way through, it is logged and the
        /// partially populated record is returned.
        /// </returns>
        public static ModelReleaseInfo CrawlHtmlSource(string html, string url, DataTable dtkey, string sheng, string shi, string xian, string webName, string webInfo, int pid)
        {
            Dictionary<string, int> events = new Dictionary<string, int>();
            // Create the data object.
            ModelReleaseInfo newsInfo = new ModelReleaseInfo();

            try
            {
                newsInfo.Title = HtmlUtil.NoHTML(html);

                // First pass: match keywords against the raw markup.
                string matched = CollectMatchedKeywords(dtkey, html, events);
                if (matched.Length > 0)
                {
                    newsInfo.KeyWords += matched;
                }

                newsInfo.Contexts = HtmlUtil.NoHTML(webInfo);

                // Site link.
                newsInfo.InfoSource = url;

                // Second pass: nothing matched the markup, so try the stripped content.
                if (string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    matched = CollectMatchedKeywords(dtkey, newsInfo.Contexts, events);
                    if (matched.Length > 0)
                    {
                        newsInfo.KeyWords += matched;
                    }
                }

                // Strip the leading ','. Guard against empty as well as null so that
                // Substring(1) cannot throw and skip the assignments below.
                if (!string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    newsInfo.KeyWords = newsInfo.KeyWords.Substring(1);
                }

                // Collection date.
                newsInfo.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");

                // Publisher and publish date cannot be obtained yet; set to empty.
                newsInfo.ReleaseDate = "";
                newsInfo.ReleaseName = "";

                // Page snapshot: generated on user request, otherwise empty.
                newsInfo.Snapshot = "";
                newsInfo.Sheng    = sheng ?? "";
                newsInfo.Shi      = shi ?? "";
                newsInfo.Xian     = xian ?? "";
                // Site name.
                newsInfo.WebName = webName ?? "";
                newsInfo.Pid = pid;
                // Positive/negative classification of the content.
                newsInfo.Part = GetParts(newsInfo.Contexts);
                newsInfo.Reposts = 0;
                newsInfo.Comments = 0;
            }
            catch (Exception ex)
            {
                // Log the message as well as the stack trace; the trace alone does not
                // say what went wrong.
                Comm.WriteErrorLog(ex.Message);
                Comm.WriteErrorLog(ex.StackTrace);
            }

            return newsInfo;
        }

        /// <summary>
        /// Registers in <paramref name="events"/> every not-yet-registered event key of
        /// <paramref name="dtkey"/> whose keywords all occur (ignoring case) in
        /// <paramref name="text"/>, then returns ",name-id" for each entry of
        /// <paramref name="events"/>, concatenated. Null text is treated as empty.
        /// </summary>
        private static string CollectMatchedKeywords(DataTable dtkey, string text, Dictionary<string, int> events)
        {
            string haystack = text ?? string.Empty;

            for (int j = 0; j < dtkey.Rows.Count; j++)
            {
                DataRow row = dtkey.Rows[j];
                string[] keys = row[4].ToString().Split(new char[] { ' ' });
                string eventKey = row[1].ToString() + "-" + row[6].ToString();
                if (events.ContainsKey(eventKey))
                {
                    continue;
                }

                // Register tentatively, then withdraw on the first missing keyword.
                events.Add(eventKey, 1);
                foreach (string k in keys)
                {
                    // Case-insensitive search without lower-casing the whole text per keyword.
                    if (haystack.IndexOf(k, StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        events.Remove(eventKey);
                        break;
                    }
                }
            }

            string result = "";
            foreach (KeyValuePair<string, int> ev in events)
            {
                if (ev.Value == 1)
                {
                    // NOTE(review): splitting on '-' assumes column 1 itself contains no '-'
                    // and column 6 is a non-negative integer; otherwise int.Parse throws or
                    // the name is truncated - confirm against the keyword table.
                    string[] parts = ev.Key.Split(new char[] { '-' });
                    result += "," + parts[0] + "-" + int.Parse(parts[1]);
                }
            }

            return result;
        }