/// <summary>
/// Expands a <c>dict:NAME</c> placeholder in <paramref name="url"/> into one URL per
/// entry of the dictionary category <c>NAME</c> loaded from <c>dict.xml</c>.
/// </summary>
/// <remarks>
/// The text before the placeholder is <c>url.Substring(0, m.Index - 1)</c> and the text after
/// it starts at <c>m.Index + m.Length + 1</c>, i.e. one extra character is dropped on each side
/// of the match (presumably the enclosing braces — confirm against RegexString.Regex04).
/// Every generated URL is passed through <see cref="GetMatchUrl"/> so that any remaining
/// placeholders are expanded as well. For a non-empty dictionary every URL except the last is
/// handed to <paramref name="enqueueUrl"/>; the last one is only returned through
/// <paramref name="url"/>. When the category is missing or has no items, the placeholder is
/// replaced by an empty string and the resulting URL is enqueued.
/// </remarks>
/// <param name="m">Match locating the <c>dict:NAME</c> placeholder inside <paramref name="url"/>.</param>
/// <param name="url">URL template on input; the last expanded URL on output.</param>
/// <param name="nextPageText">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="levelUrlList">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="startPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="endPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="enqueueUrl">Callback receiving each intermediate expanded URL.</param>
private void GetUrlDict(Match m, ref string url, string nextPageText, List<string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
{
    // Strip the leading "dict:" prefix to get the category name.
    string str = m.Value.Substring(5);
    if (string.IsNullOrEmpty(str))
    {
        return;
    }

    string errMsg = string.Empty;
    DictList dictList = XmlHelper.LoadFromXml<DictList>(Program.GetConfigPath(@"dict.xml"), ref errMsg);

    // FirstOrDefault yields null when no category matches; the list itself may also be
    // missing. Both cases are treated like an empty dictionary instead of crashing.
    Dict dictEntity = dictList == null
        ? null
        : dictList.Where(q => q.CategoryName.Value == str).FirstOrDefault();

    // Split the template once, before url is overwritten by any expansion below.
    string tempUrl0 = url.Substring(0, m.Index - 1);
    int lastIndex = m.Index + m.Length + 1;
    string tempUrl1 = url.Length > lastIndex ? url.Substring(lastIndex) : string.Empty;

    if (dictEntity == null || dictEntity.DictItemList == null || dictEntity.DictItemList.Count == 0)
    {
        // No entries: drop the placeholder entirely.
        string tempUrl = tempUrl0 + tempUrl1;
        url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
        if (_GatherUrlItemSingleFlag)
        {
            return;
        }
        enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
        return;
    }

    int index = 1;
    foreach (var item in dictEntity.DictItemList)
    {
        string tempUrl = tempUrl0 + item.DictName.Value + tempUrl1;
        url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
        if (_GatherUrlItemSingleFlag)
        {
            return;
        }
        // The final URL is not enqueued here; it is handed back to the caller via url.
        if (index == dictEntity.DictItemList.Count)
        {
            break;
        }
        enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
        index++;
    }
}
/// <summary>
/// Expands a numeric paging placeholder of the form <c>first,last,step</c> in
/// <paramref name="url"/> into one URL per page number.
/// </summary>
/// <remarks>
/// The matched text is split on commas and must yield exactly three values, otherwise nothing
/// happens. One character on each side of the match is dropped from the template (presumably
/// the enclosing braces — confirm against RegexString.Regex01). Each generated URL is passed
/// through <see cref="GetMatchUrl"/>; the last value assigned is returned via
/// <paramref name="url"/>. Ascending ranges enqueue every generated URL; descending ranges
/// skip enqueuing the URL whose number equals the lower bound.
/// </remarks>
/// <param name="m">Match locating the <c>first,last,step</c> text inside <paramref name="url"/>.</param>
/// <param name="url">URL template on input; the last expanded URL on output.</param>
/// <param name="nextPageText">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="levelUrlList">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="startPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="endPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="enqueueUrl">Callback receiving expanded URLs.</param>
private void GetUrlNumber(Match m, ref string url, string nextPageText, List<string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
{
    // Read the numeric paging data found inside the braces.
    string[] parts = m.Value.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
    if (parts.Length != 3)
    {
        return;
    }

    int[] range = Array.ConvertAll<string, int>(parts, q => DMSFrame.TryParse.StrToInt(q));
    int first = range[0];
    int last = range[1];
    int step = range[2];

    // Template text on either side of the placeholder.
    string prefix = url.Substring(0, m.Index - 1);
    int suffixStart = m.Index + m.Length + 1;
    string suffix = string.Empty;
    if (url.Length > suffixStart)
    {
        suffix = url.Substring(suffixStart);
    }

    bool ascending = first < last && step <= last && step > 0;
    bool descending = first >= last && first > step && step < 0;

    if (ascending)
    {
        int page = first;
        while (page <= last)
        {
            url = GetMatchUrl(prefix + page + suffix, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            if (_GatherUrlItemSingleFlag)
            {
                break;
            }
            enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
            page += step;
        }
    }
    else if (descending)
    {
        int page = first;
        while (page >= last)
        {
            url = GetMatchUrl(prefix + page + suffix, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            // Stop on single-item mode, or once the lower bound has been produced
            // (that final URL is returned through url rather than enqueued).
            if (_GatherUrlItemSingleFlag || page == last)
            {
                break;
            }
            enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
            page += step;
        }
    }
}
/// <summary>
/// Expands a letter-range placeholder of the form <c>x-y</c> in <paramref name="url"/> into
/// one URL per character from <c>x</c> to <c>y</c> (ascending or descending).
/// </summary>
/// <remarks>
/// The matched text is split on <c>'-'</c> and must yield exactly two parts, otherwise nothing
/// happens. One character on each side of the match is dropped from the template (presumably
/// the enclosing braces — confirm against RegexString.Regex02/Regex03). The first URL uses the
/// whole first part; subsequent URLs use single characters stepping towards the first
/// character of the second part. Each generated URL is passed through
/// <see cref="GetMatchUrl"/>. Every URL except the final one is handed to
/// <paramref name="enqueueUrl"/>; the final one is only returned through <paramref name="url"/>.
/// A range whose endpoints are the same character produces exactly one URL.
/// </remarks>
/// <param name="m">Match locating the <c>x-y</c> text inside <paramref name="url"/>.</param>
/// <param name="url">URL template on input; the last expanded URL on output.</param>
/// <param name="nextPageText">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="levelUrlList">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="startPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="endPos">Forwarded unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
/// <param name="enqueueUrl">Callback receiving each intermediate expanded URL.</param>
private void GetUrlLetter(Match m, ref string url, string nextPageText, List<string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
{
    // Read the letter range found inside the braces.
    string[] str = m.Value.Split(new char[] { '-' }, StringSplitOptions.RemoveEmptyEntries);
    if (str.Length != 2)
    {
        return;
    }

    string tempCode = str[0];
    string tempUrl0 = url.Substring(0, m.Index - 1);
    int lastIndex = m.Index + m.Length + 1;
    string tempUrl1 = url.Length > lastIndex ? url.Substring(lastIndex) : string.Empty;

    // RemoveEmptyEntries guarantees both parts have at least one character.
    char start = tempCode[0];
    char end = str[1][0];

    string tempUrl = tempUrl0 + tempCode + tempUrl1;
    url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
    if (_GatherUrlItemSingleFlag)
    {
        return;
    }

    // Single-letter range (e.g. "a-a"): the first URL is also the final one. Without this
    // guard the loop below would step away from 'end' and never hit its exit condition,
    // enqueuing bogus URLs until Convert.ToChar overflows.
    if (start == end)
    {
        return;
    }

    enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);

    int value = start < end ? 1 : -1;
    while (true)
    {
        char cb = Convert.ToChar(start + value);
        tempUrl = tempUrl0 + cb + tempUrl1;
        url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
        if (_GatherUrlItemSingleFlag)
        {
            break;
        }
        // The final URL is not enqueued here; it is handed back to the caller via url.
        if (cb == end)
        {
            break;
        }
        enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
        start = cb;
    }
}
/// <summary>
/// Runs the four placeholder expansions over <paramref name="url"/> in a fixed order and
/// returns the resulting URL.
/// </summary>
/// <remarks>
/// The patterns are tried in sequence: <c>RegexString.Regex01</c> is handled by
/// <see cref="GetUrlNumber"/>, <c>RegexString.Regex02</c> and <c>RegexString.Regex03</c> by
/// <see cref="GetUrlLetter"/>, and <c>RegexString.Regex04</c> by <see cref="GetUrlDict"/>.
/// Each handler may rewrite <paramref name="url"/>, and every later pattern is matched
/// against the rewritten value. All matching is case-insensitive.
/// </remarks>
/// <param name="url">URL template to expand.</param>
/// <param name="nextPageText">Forwarded unchanged to the handlers.</param>
/// <param name="levelUrlList">Forwarded unchanged to the handlers.</param>
/// <param name="startPos">Forwarded unchanged to the handlers.</param>
/// <param name="endPos">Forwarded unchanged to the handlers.</param>
/// <param name="enqueueUrl">Callback the handlers use to hand off expanded URLs.</param>
/// <returns>The URL after all applicable handlers have run.</returns>
public string GetMatchUrl(string url, string nextPageText, List<string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // 1. Numeric paging placeholder.
    Match numberMatch = Regex.Match(url, RegexString.Regex01, options);
    if (numberMatch.Success)
    {
        GetUrlNumber(numberMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
    }

    // 2. First letter-range pattern.
    Match letterMatch = Regex.Match(url, RegexString.Regex02, options);
    if (letterMatch.Success)
    {
        GetUrlLetter(letterMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
    }

    // 3. Second letter-range pattern, matched against the possibly rewritten url.
    Match secondLetterMatch = Regex.Match(url, RegexString.Regex03, options);
    if (secondLetterMatch.Success)
    {
        GetUrlLetter(secondLetterMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
    }

    // 4. Dictionary placeholder.
    Match dictMatch = Regex.Match(url, RegexString.Regex04, options);
    if (dictMatch.Success)
    {
        GetUrlDict(dictMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
    }

    return url;
}