示例#1
0
        /// <summary>
        /// Expands a <c>dict:CATEGORY</c> placeholder in <paramref name="url"/> into one URL per
        /// dictionary item of that category, as loaded from <c>dict.xml</c>.
        /// </summary>
        /// <remarks>
        /// The placeholder is replaced together with one character on each side of the match
        /// (presumably the enclosing braces - confirm against RegexString.Regex04).
        /// Every generated URL is passed through <see cref="GetMatchUrl"/> so that any remaining
        /// placeholders are expanded as well. All generated URLs except the last one are handed to
        /// <paramref name="enqueueUrl"/>; the last one is only left in <paramref name="url"/>.
        /// When the category is unknown or has no items, the placeholder is replaced by an empty
        /// string and the resulting URL is both enqueued and left in <paramref name="url"/>.
        /// Processing stops without enqueuing as soon as <c>_GatherUrlItemSingleFlag</c> is set.
        /// </remarks>
        /// <param name="m">Match whose value starts with the 5-character <c>dict:</c> prefix.</param>
        /// <param name="url">URL containing the placeholder; receives the last expanded URL.</param>
        /// <param name="nextPageText">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="levelUrlList">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="startPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="endPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="enqueueUrl">Callback invoked for each generated URL that is not the final one.</param>
        private void GetUrlDict(Match m, ref string url, string nextPageText, List <string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
        {
            // Strip the leading "dict:" to obtain the category name.
            string str = m.Value.Substring(5);
            if (string.IsNullOrEmpty(str))
            {
                return;
            }

            string   errMsg   = string.Empty;
            DictList dictList = XmlHelper.LoadFromXml <DictList>(Program.GetConfigPath(@"dict.xml"), ref errMsg);

            // Neither the loaded list nor the category lookup is guaranteed to yield an object;
            // a missing category is handled like a category without items instead of crashing.
            Dict dictEntity = dictList == null ? null : dictList.FirstOrDefault(q => q.CategoryName.Value == str);

            // Slice prefix and suffix once, from the ORIGINAL url. The ref parameter is overwritten
            // on every iteration below, so slicing it again later would use the wrong string.
            string tempUrl0  = url.Substring(0, m.Index - 1);
            int    lastIndex = m.Index + m.Length + 1;
            string tempUrl1  = url.Length > lastIndex ? url.Substring(lastIndex) : string.Empty;

            if (dictEntity == null || dictEntity.DictItemList == null || dictEntity.DictItemList.Count == 0)
            {
                // No items: drop the placeholder entirely.
                url = GetMatchUrl(tempUrl0 + tempUrl1, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
                if (_GatherUrlItemSingleFlag)
                {
                    return;
                }
                enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
                return;
            }

            int index = 1;
            foreach (var item in dictEntity.DictItemList)
            {
                string tempCode = item.DictName.Value;
                string tempUrl  = tempUrl0 + tempCode + tempUrl1;
                url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
                if (_GatherUrlItemSingleFlag)
                {
                    return;
                }
                // The final URL is not enqueued here; it is handed back through the ref parameter.
                if (index == dictEntity.DictItemList.Count)
                {
                    break;
                }
                enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
                index++;
            }
        }
示例#2
0
        /// <summary>
        /// Expands a numeric paging placeholder of the form <c>first,last,step</c> in
        /// <paramref name="url"/> into one URL per page number.
        /// </summary>
        /// <remarks>
        /// The match value is split on ','; unless exactly three non-empty parts result, nothing happens.
        /// The placeholder is replaced together with one character on each side of the match.
        /// Each generated URL is run through <see cref="GetMatchUrl"/> and stored in
        /// <paramref name="url"/>; expansion stops as soon as <c>_GatherUrlItemSingleFlag</c> is set.
        /// NOTE(review): the ascending branch enqueues every URL including the last one, whereas the
        /// descending branch skips enqueuing when the counter equals <c>last</c> - confirm whether
        /// this asymmetry is intended.
        /// </remarks>
        /// <param name="m">Match holding the comma-separated numbers.</param>
        /// <param name="url">URL containing the placeholder; receives the most recently expanded URL.</param>
        /// <param name="nextPageText">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="levelUrlList">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="startPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="endPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="enqueueUrl">Callback invoked for generated URLs.</param>
        private void GetUrlNumber(Match m, ref string url, string nextPageText, List <string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
        {
            string[] parts = m.Value.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            if (parts.Length != 3)
            {
                return;
            }

            int first = DMSFrame.TryParse.StrToInt(parts[0]);
            int last  = DMSFrame.TryParse.StrToInt(parts[1]);
            int step  = DMSFrame.TryParse.StrToInt(parts[2]);

            // Text before and after the placeholder, each skipping one extra character around the match.
            string prefix      = url.Substring(0, m.Index - 1);
            int    suffixStart = m.Index + m.Length + 1;
            string suffix      = url.Length > suffixStart ? url.Substring(suffixStart) : string.Empty;

            bool ascending  = step > 0 && first < last && step <= last;
            bool descending = step < 0 && first >= last && first > step;

            if (ascending)
            {
                int page = first;
                while (page <= last)
                {
                    string candidate = prefix + page + suffix;
                    url = GetMatchUrl(candidate, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
                    if (_GatherUrlItemSingleFlag)
                    {
                        break;
                    }
                    enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
                    page += step;
                }
            }
            else if (descending)
            {
                int page = first;
                while (page >= last)
                {
                    string candidate = prefix + page + suffix;
                    url = GetMatchUrl(candidate, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
                    if (_GatherUrlItemSingleFlag || page == last)
                    {
                        // Either single-item mode is active or the lower bound was reached exactly;
                        // in both cases the current URL is not enqueued.
                        break;
                    }
                    enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
                    page += step;
                }
            }
        }
示例#3
0
        /// <summary>
        /// Expands a letter-range placeholder of the form <c>X-Y</c> in <paramref name="url"/> into
        /// one URL per character from X to Y, walking upwards or downwards as needed.
        /// </summary>
        /// <remarks>
        /// The match value is split on '-'; unless exactly two non-empty parts result, nothing happens.
        /// The placeholder is replaced together with one character on each side of the match.
        /// The first URL uses the whole first part and is enqueued. Every further URL uses a single
        /// character; all of them except the one for the end character are enqueued, and the last
        /// generated URL is left in <paramref name="url"/>.
        /// Each URL is run through <see cref="GetMatchUrl"/>; expansion stops as soon as
        /// <c>_GatherUrlItemSingleFlag</c> is set.
        /// When both endpoints start with the same character, only the first URL is produced.
        /// </remarks>
        /// <param name="m">Match holding the two range endpoints separated by '-'.</param>
        /// <param name="url">URL containing the placeholder; receives the most recently expanded URL.</param>
        /// <param name="nextPageText">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="levelUrlList">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="startPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="endPos">Passed through unchanged to <see cref="GetMatchUrl"/> and <paramref name="enqueueUrl"/>.</param>
        /// <param name="enqueueUrl">Callback invoked for generated URLs.</param>
        private void GetUrlLetter(Match m, ref string url, string nextPageText, List <string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
        {
            string[] str = m.Value.Split(new char[] { '-' }, StringSplitOptions.RemoveEmptyEntries);
            if (str.Length != 2)
            {
                return;
            }

            string tempCode  = str[0];
            string tempUrl0  = url.Substring(0, m.Index - 1);
            int    lastIndex = m.Index + m.Length + 1;
            string tempUrl1  = url.Length > lastIndex ? url.Substring(lastIndex) : string.Empty;

            // First URL: substitute the start of the range.
            string tempUrl = tempUrl0 + tempCode + tempUrl1;
            url = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            if (_GatherUrlItemSingleFlag)
            {
                return;
            }
            enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);

            char current = tempCode[0];
            char end     = str[1][0];
            int  step    = current < end ? 1 : -1;

            // Looping only while the end has not been reached covers the degenerate range where
            // both endpoints are equal; stepping away from the end there would never terminate
            // normally and would eventually overflow the char range.
            while (current != end)
            {
                current = Convert.ToChar(current + step);
                tempUrl = tempUrl0 + current + tempUrl1;
                url     = GetMatchUrl(tempUrl, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
                if (_GatherUrlItemSingleFlag)
                {
                    break;
                }
                // The URL for the end character is not enqueued; it is handed back through the ref parameter.
                if (current == end)
                {
                    break;
                }
                enqueueUrl(url, nextPageText, levelUrlList, startPos, endPos);
            }
        }
示例#4
0
        /// <summary>
        /// Expands the placeholders found in <paramref name="url"/> by matching it, in order,
        /// against <c>RegexString.Regex01</c> through <c>RegexString.Regex04</c> and dispatching
        /// each successful match to the corresponding expansion method.
        /// </summary>
        /// <remarks>
        /// The expansion methods receive <paramref name="url"/> by reference, so each later pattern
        /// is matched against whatever URL the previous step left behind.
        /// </remarks>
        /// <param name="url">URL that may contain placeholders.</param>
        /// <param name="nextPageText">Passed through unchanged to the expansion methods.</param>
        /// <param name="levelUrlList">Passed through unchanged to the expansion methods.</param>
        /// <param name="startPos">Passed through unchanged to the expansion methods.</param>
        /// <param name="endPos">Passed through unchanged to the expansion methods.</param>
        /// <param name="enqueueUrl">Callback handed to the expansion methods.</param>
        /// <returns>The URL as left by the last expansion step, or the input URL if nothing matched.</returns>
        public string GetMatchUrl(string url, string nextPageText, List <string> levelUrlList, string startPos, string endPos, OnEnqueueUrls enqueueUrl)
        {
            const RegexOptions matchOptions = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Regex01 -> numeric expansion.
            Match numberMatch = Regex.Match(url, RegexString.Regex01, matchOptions);
            if (numberMatch.Success)
            {
                GetUrlNumber(numberMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            }

            // Regex02 and Regex03 -> letter-range expansion.
            Match firstLetterMatch = Regex.Match(url, RegexString.Regex02, matchOptions);
            if (firstLetterMatch.Success)
            {
                GetUrlLetter(firstLetterMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            }

            Match secondLetterMatch = Regex.Match(url, RegexString.Regex03, matchOptions);
            if (secondLetterMatch.Success)
            {
                GetUrlLetter(secondLetterMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            }

            // Regex04 -> dictionary expansion.
            Match dictMatch = Regex.Match(url, RegexString.Regex04, matchOptions);
            if (dictMatch.Success)
            {
                GetUrlDict(dictMatch, ref url, nextPageText, levelUrlList, startPos, endPos, enqueueUrl);
            }

            return url;
        }