/// <summary>
/// Picks the text encoding used to decode a downloaded page, based on the request configuration.
/// </summary>
/// <param name="requestConfig">Request configuration whose <c>Encoding</c> setting is inspected.</param>
/// <returns>
/// <see cref="Encoding.UTF8"/> for <c>EEncoding.UTF8</c>; the encoding looked up by the enum member's
/// name for <c>ASCII</c>, <c>Unicode</c> and <c>GBK</c>; <see cref="Encoding.Unicode"/> for
/// <c>Auto</c> and for any other value.
/// </returns>
private static Encoding GetEncoding(ConfigOfRequest requestConfig)
{
    var configured = requestConfig.Encoding;

    if (configured == EEncoding.UTF8)
    {
        return Encoding.UTF8;
    }

    bool resolvableByName = configured == EEncoding.ASCII
                            || configured == EEncoding.Unicode
                            || configured == EEncoding.GBK;
    if (resolvableByName)
    {
        // The enum member name doubles as the encoding name handed to the framework lookup.
        return Encoding.GetEncoding(configured.ToString());
    }

    // EEncoding.Auto lands here as well.
    // TODO: detect the encoding from the page content instead of falling back to UTF-16.
    return Encoding.Unicode;
}
/// <summary>
/// Creates an HTTP request for <paramref name="url"/> and copies the HTTP method and
/// User-Agent header from the request configuration onto it.
/// </summary>
/// <param name="url">Address to request; the created request is cast to <see cref="HttpWebRequest"/>.</param>
/// <param name="requestConfig">Configuration supplying <c>Method</c> and <c>UserAgent</c>.</param>
/// <returns>The configured request; it has not been sent yet.</returns>
private static HttpWebRequest GetHttpWebRequest(string url, ConfigOfRequest requestConfig)
{
    WebRequest created = WebRequest.Create(url);

    // The direct cast throws InvalidCastException when the URL scheme does not yield an HTTP request.
    var httpRequest = (HttpWebRequest)created;
    httpRequest.Method = requestConfig.Method;
    httpRequest.UserAgent = requestConfig.UserAgent;

    return httpRequest;
}
/// <summary>
/// Populates this instance from its XML representation: the <c>SpiderName</c> attribute, the first
/// descendant <c>ConfigOfRequest</c> element, and every descendant <c>IGrabConfig</c> element.
/// </summary>
/// <param name="element">XML element to read from.</param>
/// <exception cref="System.NullReferenceException">The <c>SpiderName</c> attribute is missing.</exception>
/// <exception cref="System.InvalidOperationException">No <c>ConfigOfRequest</c> descendant exists.</exception>
public override void LoadXElement(XElement element)
{
    XAttribute nameAttribute = element.Attribute(nameof(SpiderName));
    SpiderName = nameAttribute.Value;

    XElement requestElement = element.Descendants(nameof(ConfigOfRequest)).First();
    RequestConfig = new ConfigOfRequest(requestElement);

    // Assign the empty list before building the entries: each factory call receives `this`.
    GrabConfigs = new List<IGrabConfig>();

    IEnumerable<XElement> grabElements = element.Descendants(nameof(IGrabConfig));
    foreach (XElement grabElement in grabElements)
    {
        GrabConfigs.Add(IGrabConfig.GetGrabConfig(grabElement, this));
    }
}
/// <summary>
/// Downloads the page(s) described by <paramref name="requestConfig"/>, passes each page body to
/// <c>GrabContentByGrabType</c>, and reports every outcome through <c>TriggerOnGrabFinish</c>.
/// </summary>
/// <param name="requestConfig">
/// Request settings. <c>URLStrategy.Default</c> fetches <c>URL</c> once.
/// <c>URLStrategy.IncreaseByValue</c> treats <c>URL</c> as a format string, substitutes a counter that
/// starts at <c>StartAt</c> and grows by <c>IncreaseBy</c>, and stops at the first page whose length is
/// not greater than <c>StopWhenLT</c> or at the first error.
/// </param>
/// <returns>
/// <c>true</c> when the spider is switched off, when the strategy is not handled, or when the single
/// page of the default strategy was grabbed successfully; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// Failures while creating or sending the request (for example a <see cref="WebException"/>) are
/// reported through <c>TriggerOnGrabFinish</c> and turned into a <c>false</c> result, the same way
/// read and parse failures are, instead of escaping to the caller.
/// </remarks>
public bool StartGrabbing(ConfigOfRequest requestConfig)
{
    bool isSuccess = true;
    if (!IsOn)
    {
        // A switched-off spider does nothing and reports success.
        return isSuccess;
    }

    // Grab content according to the configured URL strategy.
    switch (requestConfig.URLStrategy)
    {
        case URLStrategy.Default:
        {
            var orientURL = requestConfig.URL;
            try
            {
                string pageString = DownloadPageString(orientURL, requestConfig);
                Result result = GrabContentByGrabType(pageString);
                TriggerOnGrabFinish(orientURL, result.ResultCode == EResultCode.Success, result.Message);
                isSuccess = result.ResultCode == EResultCode.Success;
            }
            catch (Exception ex)
            {
                TriggerOnGrabFinish(orientURL, false, "抓取出现异常:" + ex.ToString());
                isSuccess = false;
            }

            break;
        }

        case URLStrategy.IncreaseByValue:
        {
            int stopBy = requestConfig.StopWhenLT;
            int increaseValue = requestConfig.StartAt;

            while (true)
            {
                string increaseURL = string.Format(requestConfig.URL, increaseValue);
                try
                {
                    string pageString = DownloadPageString(increaseURL, requestConfig);
                    if (pageString.Length <= stopBy)
                    {
                        // NOTE(review): this is the only non-error way out of the loop and it marks
                        // the run as failed, so this strategy never returns true. Kept as-is because
                        // callers may rely on it; confirm whether that is intended.
                        TriggerOnGrabFinish(increaseURL, false, "抓取达成终止条件而终止:页面数据长度(" + pageString.Length + ")未达到设定标准(" + stopBy + ")");
                        isSuccess = false;
                        break;
                    }

                    Result result = GrabContentByGrabType(pageString, SpiderConfig.SpiderName + increaseValue);
                    TriggerOnGrabFinish(increaseURL, result.ResultCode == EResultCode.Success, result.Message);
                    isSuccess = isSuccess && result.ResultCode == EResultCode.Success;
                }
                catch (Exception ex)
                {
                    TriggerOnGrabFinish(increaseURL, false, "抓取出现异常:" + ex.ToString());
                    isSuccess = false;
                    break;
                }

                increaseValue += requestConfig.IncreaseBy;
            }

            break;
        }

        default:
            break;
    }

    return isSuccess;
}

/// <summary>
/// Sends a request for <paramref name="url"/> and returns the whole response body as text, decoded
/// with the encoding selected by <c>GetEncoding</c>. The response and the reader are both disposed.
/// </summary>
/// <param name="url">Address to download.</param>
/// <param name="requestConfig">Configuration supplying the request settings and the encoding.</param>
/// <returns>The complete response body.</returns>
private static string DownloadPageString(string url, ConfigOfRequest requestConfig)
{
    HttpWebRequest request = GetHttpWebRequest(url, requestConfig);
    using (WebResponse response = request.GetResponse())
    {
        Encoding encoding = GetEncoding(requestConfig);
        using (var reader = new StreamReader(response.GetResponseStream(), encoding))
        {
            return reader.ReadToEnd();
        }
    }
}