/// <summary>
/// Parses a result-list page and processes every document link found in it.
/// </summary>
/// <remarks>
/// Every "/view/{id}.html" link in the page is reduced to its id. Ids for which
/// <c>MSSQL.IsExistDoc</c> returns false are handed to <c>FInfoExtract.ExtractInfo</c>.
/// Processing is best-effort: a failure on one link is reported to the trace
/// listeners and the remaining links are still processed; nothing is thrown
/// to the caller.
/// </remarks>
/// <param name="html">HTML source of the result-list page. Null or empty input is ignored.</param>
private static void ExtractPage(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    try
    {
        // The dot before "html" is escaped so that only a literal ".html" terminates a link
        // (the unescaped form also stopped at text such as "/view/xhtml").
        // The static Regex.Matches overload lets the runtime cache the compiled pattern
        // instead of recompiling it on every call.
        MatchCollection matches = Regex.Matches(html, @"/view/.*?\.html", RegexOptions.Compiled);

        foreach (Match match in matches)
        {
            try
            {
                // Strip the fixed prefix and suffix; what remains is the document id.
                string href = match.Value.Replace("/view/", "").Replace(".html", "");

                DocInfo fi = new DocInfo();
                fi.DownAddress = href;

                if (!string.IsNullOrEmpty(fi.DownAddress) && !MSSQL.IsExistDoc(fi))
                {
                    FInfoExtract.ExtractInfo(fi);
                }
            }
            catch (Exception ex)
            {
                // Keep going with the next link, but leave a trace of what went wrong.
                System.Diagnostics.Trace.TraceError(
                    "ExtractPage: failed to process link '{0}': {1}", match.Value, ex);
            }
        }
    }
    catch (Exception ex)
    {
        System.Diagnostics.Trace.TraceError("ExtractPage: failed to parse result page: {0}", ex);
    }
}
/// <summary>
/// Checks whether a record for the given document is already stored in the database.
/// </summary>
/// <remarks>
/// Runs the stored procedure <c>procSelectInfo</c> with <c>@DownAddress</c> set to
/// <c>doc.DownAddress</c>. The document counts as existing when the scalar result is
/// not null. Any exception (connection failure, SQL error, null <paramref name="doc"/>)
/// is reported to the trace listeners and yields <c>false</c>.
/// </remarks>
/// <param name="doc">Document whose <c>DownAddress</c> is looked up.</param>
/// <returns>
/// <c>true</c> when the stored procedure returned a non-null scalar;
/// <c>false</c> when it returned null or the lookup failed.
/// </returns>
public static bool IsExistDoc(DocInfo doc)
{
    // Presumably refreshes strConn before it is used below - TODO confirm.
    GetConnStr();

    // NOTE(review): this static switch has nothing to do with the query itself;
    // it is kept because other code may rely on it being set here.
    Control.CheckForIllegalCrossThreadCalls = false;

    // "using" disposes the connection and the command on every exit path.
    using (SqlConnection conn = new SqlConnection(strConn))
    {
        try
        {
            conn.Open();

            using (SqlCommand cmd = new SqlCommand("procSelectInfo", conn))
            {
                cmd.CommandType = CommandType.StoredProcedure;
                cmd.Parameters.Add("@DownAddress", SqlDbType.NVarChar).Value = doc.DownAddress;

                return cmd.ExecuteScalar() != null;
            }
        }
        catch (Exception ex)
        {
            // A failed lookup is reported as "not existing", as before, but is no longer invisible.
            System.Diagnostics.Trace.TraceError("IsExistDoc: lookup failed: {0}", ex);
            return false;
        }
    }
}
/// <summary>
/// Downloads the detail page of a document, extracts its metadata into
/// <paramref name="fi"/> and writes the record through <c>MSSQL.AddExtractInfo</c>.
/// </summary>
/// <param name="fi">
/// Document record. <c>DownAddress</c> is read to build the page URL; <c>DocKeyWord</c>,
/// <c>DocName</c>, <c>param</c>, <c>ClassID</c>, <c>Money</c> and <c>DocType</c> are
/// filled in as extraction progresses. <c>DocIntro</c> and <c>DocSize</c> are not set here.
/// </param>
/// <returns>
/// <c>true</c> only when <c>MSSQL.AddExtractInfo</c> reported success; otherwise <c>false</c>.
/// </returns>
static public bool ExtractInfo(DocInfo fi)
{
    string pageUrl = baidu.wi.webUrl + "view/" + fi.DownAddress + ".html";
    string pageHtml = GetWebHtml(pageUrl);
    if (string.IsNullOrEmpty(pageHtml))
    {
        return false;
    }

    fi.DocKeyWord = ExtractKWord(pageHtml);

    // All remaining fields are extracted from the fragment returned by Seprate.
    string infoSection = Seprate(pageHtml);
    if (string.IsNullOrEmpty(infoSection))
    {
        return false;
    }

    fi.DocName = ExtractFileName(infoSection);
    if (string.IsNullOrEmpty(fi.DocName))
    {
        return false;
    }

    fi.param = ExtractParam(infoSection);
    if (!baidu.HT.ContainsKey(fi.param))
    {
        // No entry for this category parameter in baidu.HT: warn the operator and give up.
        MessageBox.Show("警告:标识为" + fi.DownAddress + "的文档类别参数" + fi.param + "无法查询到!请马上查看补充! " + string.Format("{0:G}", DateTime.Now));
        return false;
    }

    fi.ClassID = (string)baidu.HT[fi.param];
    fi.Money = ExtractMoney(infoSection);
    fi.DocType = ExtractType(infoSection);

    if (!MSSQL.AddExtractInfo(fi))
    {
        return false;
    }

    // Count only the documents that were actually written.
    baidu.filecount++;
    return true;
}
/// <summary>
/// Writes a document record to the database.
/// </summary>
/// <remarks>
/// Runs the stored procedure <c>procAddExtractInfo</c> with the fields of
/// <paramref name="doc"/> as parameters. Any exception (connection failure, SQL error,
/// null <paramref name="doc"/>) is reported to the trace listeners and yields <c>false</c>.
/// </remarks>
/// <param name="doc">Document whose fields are passed to the stored procedure.</param>
/// <returns>
/// <c>true</c> when the procedure reported at least one affected row;
/// <c>false</c> when it reported none or the write failed.
/// </returns>
public static bool AddExtractInfo(DocInfo doc)
{
    // Presumably refreshes strConn before it is used below - TODO confirm.
    GetConnStr();

    // NOTE(review): this static switch has nothing to do with the insert itself;
    // it is kept because other code may rely on it being set here.
    Control.CheckForIllegalCrossThreadCalls = false;

    // "using" disposes the connection and the command on every exit path.
    using (SqlConnection conn = new SqlConnection(strConn))
    {
        try
        {
            conn.Open();

            using (SqlCommand cmd = new SqlCommand("procAddExtractInfo", conn))
            {
                cmd.CommandType = CommandType.StoredProcedure;

                // NOTE(review): a parameter whose Value is null is not sent as SQL NULL;
                // confirm that the fields below are always populated or that the
                // procedure declares defaults for them.
                cmd.Parameters.Add("@ClassID", SqlDbType.NVarChar).Value = doc.ClassID;
                cmd.Parameters.Add("@DocName", SqlDbType.NVarChar).Value = doc.DocName;
                cmd.Parameters.Add("@DocType", SqlDbType.NVarChar).Value = doc.DocType;
                cmd.Parameters.Add("@DownAddress", SqlDbType.NVarChar).Value = doc.DownAddress;
                cmd.Parameters.Add("@DocIntro", SqlDbType.Text).Value = doc.DocIntro;
                cmd.Parameters.Add("@DocKeyWord", SqlDbType.NVarChar).Value = doc.DocKeyWord;
                cmd.Parameters.Add("@DocSize", SqlDbType.BigInt).Value = doc.DocSize;
                cmd.Parameters.Add("@Money", SqlDbType.Int).Value = doc.Money;

                return cmd.ExecuteNonQuery() > 0;
            }
        }
        catch (Exception ex)
        {
            // A failed write is reported as false, as before, but is no longer invisible.
            System.Diagnostics.Trace.TraceError("AddExtractInfo: database write failed: {0}", ex);
            return false;
        }
    }
}