/// <summary>
/// Downloads the attachment at <paramref name="infoUrl"/> into the current month's attachment
/// folder and, when the download is larger than 1024 bytes, saves a <c>BaseAttach</c> record
/// linked to <paramref name="info"/>. Smaller downloads are deleted again.
/// </summary>
/// <remarks>
/// Failures while downloading or saving the record are swallowed (best effort) and the
/// partially written file is removed. Failures while creating the folder or opening the
/// target file propagate to the caller.
/// </remarks>
/// <param name="infoUrl">URL of the attachment to download.</param>
/// <param name="strFileName">File name used on disk and in the attachment record.</param>
/// <param name="info">Punishment record the attachment belongs to; only its <c>Id</c> is read.</param>
private void AddBaseFile(string infoUrl, string strFileName, CorpPunish info)
{
    const int BufferSize = 4096;
    const long MinAttachLength = 1024;

    // Sample the clock once so year and month cannot come from different instants.
    DateTime now = DateTime.Now;
    string strFileUrl = ToolDb.DbServerPath + "SiteManage\\Files\\Corp_Attach\\";
    // Relative "<year><month>\" folder; it is persisted in the attachment record, so the format is kept as-is.
    string strFile = now.Year.ToString() + now.Month.ToString() + "\\";
    string fullPath = strFileUrl + strFile + strFileName;

    Directory.CreateDirectory(strFileUrl + strFile);

    // FileMode.Create truncates an existing file so a shorter re-download cannot leave stale
    // trailing bytes. The previous "resume" branch tested File.Exists on the directory path,
    // which is never true, so every download already started from byte 0.
    FileStream fs = new FileStream(fullPath, FileMode.Create, FileAccess.Write);
    try
    {
        long length = 0;
        long written = 0;

        using (fs)
        {
            System.Net.HttpWebRequest request = System.Net.WebRequest.Create(infoUrl) as System.Net.HttpWebRequest;
            if (request == null)
            {
                // Not an HTTP(S) URL; handled by the cleanup path below.
                throw new NotSupportedException("Only HTTP(S) attachment URLs are supported.");
            }

            using (System.Net.WebResponse response = request.GetResponse())
            using (Stream ns = response.GetResponseStream())
            {
                length = response.ContentLength;

                byte[] buffer = new byte[BufferSize];
                int read;
                while ((read = ns.Read(buffer, 0, buffer.Length)) > 0)
                {
                    fs.Write(buffer, 0, read);
                    written += read;
                }
            }
        }

        // ContentLength is -1 when the server does not announce a size; use what was actually received.
        if (length < 0)
        {
            length = written;
        }

        if (length > MinAttachLength)
        {
            BaseAttach baseInfo = ToolDb.GenBaseAttach(ToolDb.NewGuid, strFileName, info.Id, strFile + strFileName, length.ToString(), "");
            ToolDb.SaveEntity(baseInfo, string.Empty);
        }
        else
        {
            // Tiny responses are discarded rather than stored as attachments.
            File.Delete(fullPath);
        }
    }
    catch
    {
        // Best effort: the stream is already disposed by the using block; drop the partial file.
        File.Delete(fullPath);
    }
}
/// <summary>
/// Reads the punishment table inside the <c>div</c> with id "xzcf" of a company page and
/// saves one <c>CorpPunish</c> record per data row.
/// </summary>
/// <param name="info">Company the records belong to; its <c>Id</c> and <c>Url</c> are stored on each record.</param>
/// <param name="html">HTML of the company page.</param>
protected void AddCorpPunish(CorpInfo info, string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "xzcf")));
    if (nodeList == null || nodeList.Count == 0)
    {
        return;
    }

    // Header cells are turned into ordinary cells so they show up in TableRow.Columns.
    // Only the tag names are rewritten; a blanket Replace("th", "td") would also corrupt
    // any cell text or attribute that happens to contain "th".
    string normalizedHtml = System.Text.RegularExpressions.Regex.Replace(nodeList.ToHtml(), @"<(/?)th(?=[\s>/])", "<$1td");

    parser = new Parser(new Lexer(normalizedHtml));
    NodeList tableNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table"));
    if (tableNode == null || tableNode.Count == 0)
    {
        return;
    }

    TableTag table = tableNode[0] as TableTag;
    if (table == null)
    {
        return;
    }

    // Row 0 is skipped (treated as the header row).
    for (int i = 1; i < table.RowCount; i++)
    {
        TableRow tr = table.Rows[i];
        if (tr.ColumnCount == 0)
        {
            continue;
        }

        // The site renders this placeholder row when there are no results; nothing follows it.
        if (tr.Columns[0].ToPlainTextString().Contains("没有显示结果"))
        {
            break;
        }

        // Columns 1..4 are read below; skip rows that are too short instead of failing the whole table.
        if (tr.ColumnCount < 5)
        {
            continue;
        }

        string DocNo = tr.Columns[1].ToNodePlainString();
        string PunishType = tr.Columns[2].ToNodePlainString();
        string GrantUnit = tr.Columns[3].ToNodePlainString();
        string DocDate = tr.Columns[4].ToNodePlainString();

        // The punishment text is not available in this table, so it is stored empty.
        CorpPunish punish = ToolDb.GenCorpPunish(info.Id, DocNo, PunishType, GrantUnit, DocDate, string.Empty, info.Url, "0");
        ToolDb.SaveEntity(punish, string.Empty);
    }
}
/// <summary>
/// Saves one administrative-punishment record for every non-empty data row of
/// <paramref name="table"/> (row 0 is skipped).
/// </summary>
/// <param name="table">Table whose rows hold the punishment type (column 0), content (column 4), date (column 5) and unit (column 7).</param>
/// <param name="id">Value passed as the first argument of <c>ToolDb.GenCorpPunish</c> for each record.</param>
/// <param name="url">URL stored on each record.</param>
private void AddCorpPunish(TableTag table, string id, string url)
{
    int rowIndex = 0;
    while (++rowIndex < table.RowCount)
    {
        TableRow row = table.Rows[rowIndex];

        string punishType = row.Columns[0].ToNodePlainString();
        string punishText = row.Columns[4].ToPlainTextString().Replace(" ", "");
        string docDate = row.Columns[5].ToPlainTextString().GetDateRegex();
        string grantUnit = row.Columns[7].ToNodePlainString();

        // A row contributes a record as soon as any of the four fields carries a value.
        bool hasContent = !string.IsNullOrEmpty(punishType)
            || !string.IsNullOrEmpty(punishText)
            || !string.IsNullOrEmpty(docDate)
            || !string.IsNullOrEmpty(grantUnit);

        if (hasContent)
        {
            // The document number is not read from this table and is stored empty.
            CorpPunish record = ToolDb.GenCorpPunish(id, string.Empty, punishType, grantUnit, docDate, punishText, url, "0");
            ToolDb.SaveEntity(record, string.Empty);
        }
    }
}
/// <summary>
/// Crawls the paged punishment JSON feed: every entry of each page's "DataList" is turned into
/// a <c>CorpPunish</c> record, saved, and — when the save reports success — its document is
/// downloaded through <c>AddBaseFile</c>.
/// </summary>
/// <param name="crawlAll">Not used by this implementation.</param>
/// <returns>
/// Always an empty list; records are persisted directly via <c>ToolDb.SaveEntity</c> instead of
/// being returned.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const int PageSize = 10; // matches pageSize=10 in the page URL below

    IList list = new ArrayList();
    string htl;
    int sqlCount = 0;
    int pageInt = 1;

    try
    {
        htl = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    if (string.IsNullOrEmpty(htl))
    {
        return list;
    }

    // The total row count is taken from the text following the first "RowCount" in the
    // response (assumes it is the last property of the JSON object — TODO confirm).
    int index = htl.IndexOf("RowCount", StringComparison.Ordinal);
    if (index >= 0)
    {
        string pageStr = htl.Substring(index).Replace("RowCount", "").Replace("}", "").Replace(":", "").Replace("\"", "");
        long rowCount;
        if (long.TryParse(pageStr, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out rowCount) && rowCount >= 0)
        {
            // Integer ceiling division. The previous Convert.ToInt32(decimal) + 1 rounded to
            // nearest first and therefore requested one page too many for remainders >= 5.
            pageInt = (int)((rowCount + PageSize - 1) / PageSize);
        }
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                htl = ToolWeb.GetHtmlByUrl("http://www.szjs.gov.cn/build/build.ashx?_=1353579439242&menu=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A&pageSize=10&pageIndex=" + i.ToString() + "&fileOrg=&fileDate=&fileId=&unitName=&timp=", Encoding.UTF8);
            }
            catch (Exception)
            {
                // Skip this page; falling through would re-process the previous page's data.
                continue;
            }
        }

        Dictionary<string, object> smsTypeJson = serializer.DeserializeObject(htl) as Dictionary<string, object>;
        object dataList;
        if (smsTypeJson == null || !smsTypeJson.TryGetValue("DataList", out dataList))
        {
            continue;
        }

        object[] array = dataList as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object obj2 in array)
        {
            try
            {
                Dictionary<string, object> dicSmsType = (Dictionary<string, object>)obj2;

                string DocNo = Convert.ToString(dicSmsType["FileId"]);
                string PunishType = Convert.ToString(dicSmsType["PunTypeText"]);
                string GrantUnit = Convert.ToString(dicSmsType["UnitName"]);
                string DocDate = Convert.ToString(dicSmsType["ServiceDate"]);
                string file = Convert.ToString(dicSmsType["PunDoc"]);
                string InfoUrl = "http://www.szjs.gov.cn/PUNhtml/" + file;

                CorpPunish info = ToolDb.GenCorpPunish(string.Empty, DocNo, PunishType, GrantUnit, DocDate, string.Empty, InfoUrl, string.Empty, "1");

                // NOTE(review): "<=" lets MaxCount + 1 records through before stopping; kept as-is — confirm intent.
                if (sqlCount > this.MaxCount)
                {
                    return list;
                }

                if (ToolDb.SaveEntity(info, this.ExistCompareFields))
                {
                    AddBaseFile(InfoUrl, file, info);
                }
                sqlCount++;
            }
            catch (Exception)
            {
                // Best effort per record: a malformed entry or a failed save must not stop the crawl.
                continue;
            }
        }
    }

    return list;
}
/// <summary>
/// Queries the punishment service page by page for one company and saves a <c>CorpPunish</c>
/// record for every entry of each page's "records" array.
/// </summary>
/// <remarks>
/// A failed request is retried once after a 12-minute pause; when the retry fails as well the
/// method pauses a further 8 minutes and gives up on that page (the first page aborts the
/// whole call, later pages are skipped).
/// </remarks>
/// <param name="info">Company to query; <c>CorpCode</c> is posted as "orgCode", <c>Id</c> and <c>Url</c> are stored on each record.</param>
/// <param name="param">Value posted as the "param" form field.</param>
/// <param name="corpType">Value posted as the "corpType" form field.</param>
protected void AddCorpPunish(CorpInfo info, string param, string corpType)
{
    string url = "http://portal.szjs.gov.cn:8888/publicShow/queryPunish.html";
    string[] postParams = new string[] { "param", "corpType", "orgCode", "page" };
    JavaScriptSerializer java = new JavaScriptSerializer();

    string html;
    NameValueCollection nvc = ToolWeb.GetNameValueCollection(postParams, new string[] { param, corpType, info.CorpCode, "1" });
    if (!TryFetchPunishPage(url, nvc, out html))
    {
        return;
    }

    Dictionary<string, object> jsonResults = string.IsNullOrEmpty(html)
        ? null
        : java.DeserializeObject(html) as Dictionary<string, object>;

    // Default to a single page when the page count is missing or not numeric.
    int pageInt = 1;
    object totalPage;
    if (jsonResults != null && jsonResults.TryGetValue("totalPage", out totalPage) && totalPage != null)
    {
        try
        {
            pageInt = Convert.ToInt32(totalPage);
        }
        catch (FormatException)
        {
        }
        catch (InvalidCastException)
        {
        }
        catch (OverflowException)
        {
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            nvc = ToolWeb.GetNameValueCollection(postParams, new string[] { param, corpType, info.CorpCode, i.ToString() });
            if (!TryFetchPunishPage(url, nvc, out html))
            {
                continue;
            }

            jsonResults = string.IsNullOrEmpty(html)
                ? null
                : java.DeserializeObject(html) as Dictionary<string, object>;
        }

        // An empty response or one without a "records" array carries nothing to save.
        object recordsValue;
        if (jsonResults == null || !jsonResults.TryGetValue("records", out recordsValue))
        {
            continue;
        }

        object[] dicRecords = recordsValue as object[];
        if (dicRecords == null)
        {
            continue;
        }

        foreach (object dicRecord in dicRecords)
        {
            Dictionary<string, object> dic = dicRecord as Dictionary<string, object>;
            if (dic == null)
            {
                continue;
            }

            string DocNo = Convert.ToString(dic["file_id"]);
            string PunishType = Convert.ToString(dic["pun_type_text"]);
            string GrantUnit = Convert.ToString(dic["file_org"]);
            string DocDate = Convert.ToString(dic["file_date"]);

            // The punishment text is not part of this response and is stored empty.
            CorpPunish punish = ToolDb.GenCorpPunish(info.Id, DocNo, PunishType, GrantUnit, DocDate, string.Empty, info.Url, "0");
            ToolDb.SaveEntity(punish, string.Empty);
        }
    }
}

/// <summary>
/// Posts one page request, retrying once after a 12-minute pause; after a second failure it
/// pauses 8 more minutes and reports failure.
/// </summary>
/// <param name="url">Service URL to post to.</param>
/// <param name="nvc">Form fields of the request.</param>
/// <param name="html">Response body when the method returns <c>true</c>; otherwise an empty string.</param>
/// <returns><c>true</c> when either attempt succeeded.</returns>
private bool TryFetchPunishPage(string url, NameValueCollection nvc, out string html)
{
    const int RetryDelayMs = 12 * 60 * 1000;
    const int GiveUpDelayMs = 8 * 60 * 1000;

    html = string.Empty;
    try
    {
        html = ToolWeb.GetHtmlByUrl(url, nvc, Encoding.UTF8);
        return true;
    }
    catch
    {
        // Long pause before the single retry (presumably to back off from the remote site — confirm).
        Thread.Sleep(RetryDelayMs);
    }

    try
    {
        html = ToolWeb.GetHtmlByUrl(url, nvc, Encoding.UTF8);
        return true;
    }
    catch
    {
        Thread.Sleep(GiveUpDelayMs);
        return false;
    }
}
/// <summary>
/// Walks every result page of the site, reads the rows of the table with id "bean" and turns
/// each data row into a <c>CorpPunish</c> record that is appended to the returned list.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as the list holds <c>MaxCount</c> records.
/// </param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList results = new List<CorpPunish>();
    int throttleCounter = 1;
    int totalPages = 1;
    string pageHtml = string.Empty;

    try
    {
        pageHtml = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return results;
    }

    // The page count is read from the "page=" query value of the link with id "lx";
    // any failure leaves the default of one page.
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "lx")));
    bool pagerFound = pagerLinks != null && pagerLinks.Count > 0;
    if (pagerFound)
    {
        try
        {
            string pageText = pagerLinks.GetATagHref().GetRegexBegEnd("page=", "&");
            totalPages = int.Parse(pageText);
        }
        catch
        {
        }
    }

    int pageNo = 0;
    while (++pageNo <= totalPages)
    {
        if (pageNo > 1)
        {
            try
            {
                pageHtml = ToolWeb.GetHtmlByUrl(this.SiteUrl + "&page=" + pageNo.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bean")));
        if (tables == null || tables.Count <= 0)
        {
            continue;
        }

        TableTag grid = tables[0] as TableTag;
        for (int rowNo = 1; rowNo < grid.RowCount; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];

            string docNo = row.Columns[1].ToNodePlainString();
            string grantName = row.Columns[2].ToNodePlainString();
            string docDate = row.Columns[3].ToPlainTextString().GetDateRegex();
            string punishType = row.Columns[5].ToNodePlainString();
            string infoUrl = row.Columns[1].GetATagHref();

            // Granting unit and punishment text are not present in this listing and stay empty.
            CorpPunish record = ToolDb.GenCorpPunish(string.Empty, docNo, punishType, string.Empty, docDate, string.Empty, infoUrl, grantName, "1");
            results.Add(record);

            bool limitReached = !crawlAll && results.Count >= this.MaxCount;
            if (limitReached)
            {
                return results;
            }

            // Pause for 480000 ms each time the counter reaches 50, i.e. after every 49 records.
            throttleCounter++;
            if (throttleCounter >= 50)
            {
                throttleCounter = 1;
                Thread.Sleep(480000);
            }
        }
    }

    return results;
}
/// <summary>
/// Crawls the paged punishment listing (table id "dgConstBid"), opens each row's detail page
/// to obtain the punishment text, and returns one <c>CorpPunish</c> record per row.
/// </summary>
/// <param name="crawlAll">Not used by this implementation.</param>
/// <returns>The collected records; empty when the first page cannot be fetched.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl = string.Empty;
    string cookiestr = string.Empty;
    int page = 1;

    try
    {
        htl = ToolWeb.GetHtmlByUrl(ToolWeb.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(htl));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "list_page")));
    if (nodeList != null && nodeList.Count > 0)
    {
        // Keep the single-page default when the pager text has no "<digits>页" or it does not
        // parse; an unguarded int.Parse here used to abort the whole crawl.
        Regex regexPage = new Regex(@"\d+页");
        Match pageMatch = regexPage.Match(nodeList.AsString());
        int parsedPage;
        if (pageMatch.Success && int.TryParse(pageMatch.Value.Trim(new char[] { '共', '页' }), out parsedPage))
        {
            page = parsedPage;
        }
    }

    // Built once instead of once per row.
    Regex regGrantUnit = new Regex(@"(工程位置|被处罚单位)(:|:)[^\r\n]+\r\n");
    Regex regGrantName = new Regex(@"(工程位置|企业负责人)(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // The next page is requested as a form post-back based on the last fetched page.
            // NOTE(review): the original also read the event-validation value from the page
            // but never posted it; that unused read was removed — confirm it is not required.
            string viewState = ToolWeb.GetAspNetViewState(htl);
            NameValueCollection nvc = ToolWeb.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "DOC_ID", "CORP_NAME", "APPYEAR", "ucPageNumControl:gotopage", "ucPageNumControl:NEXTpage" },
                new string[] { string.Empty, string.Empty, viewState, string.Empty, string.Empty, "2012", (i - 2).ToString(), "下一页" });
            try
            {
                htl = ToolWeb.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default, ref cookiestr);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "dgConstBid")));
        if (tableNodeList == null || tableNodeList.Count == 0)
        {
            continue;
        }

        TableTag table = tableNodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (treated as the header row).
        for (int j = 1; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];

            // Columns 1, 2, 3 and 5 are read below; skip rows that are too short
            // instead of losing everything collected so far.
            if (tr.ColumnCount < 6)
            {
                continue;
            }

            string GrantName = string.Empty, PunishCtx = string.Empty, InfoUrl = string.Empty;
            string DocNo = tr.Columns[1].ToPlainTextString().Trim();
            string PunishType = tr.Columns[5].ToPlainTextString().Trim();
            string GrantUnit = tr.Columns[2].ToPlainTextString().Replace(" ", "").Trim();
            string DocDate = tr.Columns[3].ToPlainTextString().Trim();

            // Short values (5 characters or fewer) are stored as the name instead of the unit;
            // the missing one is filled in from the detail text further down.
            if (GrantUnit.Length <= 5)
            {
                GrantName = GrantUnit;
                GrantUnit = "";
            }
            else
            {
                GrantName = "";
            }

            // The detail link lives in column 1; rows without one cannot be resolved.
            NodeList linkNodes = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (linkNodes != null && linkNodes.Count > 0) ? linkNodes[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            InfoUrl = "http://61.144.226.2/PUNHTML/" + aTag.Link.Replace("GoDetail('", "").Replace("');", "").Trim();

            string htmldetail = string.Empty;
            try
            {
                htmldetail = ToolWeb.GetHtmlByUrl(ToolWeb.UrlEncode(InfoUrl), Encoding.GetEncoding("GB2312")).Replace("= 602;", "罚");
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new HasParentFilter(new TagNameFilter("div")));
            PunishCtx = dtnode.AsString().Replace("=\r\n", "").Replace(" ", "").Trim();
            PunishCtx = System.Web.HttpUtility.HtmlDecode(PunishCtx).Replace("</p>", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Trim();

            if (GrantUnit == "")
            {
                GrantUnit = regGrantUnit.Match(PunishCtx).Value.Replace("被处罚单位", "").Replace(":", "").Replace(":", "").Trim();
            }

            if (GrantName == "")
            {
                GrantName = regGrantName.Match(PunishCtx).Value.Replace("企业负责人", "").Replace(":", "").Replace(":", "").Trim();
            }

            CorpPunish info = ToolDb.GenCorpPunish(string.Empty, DocNo, PunishType, GrantUnit, DocDate, PunishCtx, InfoUrl, GrantName, "1");
            list.Add(info);
        }
    }

    return list;
}