Example #1
0
 /// <summary>
 /// Extracts the field described by <paramref name="pFeildRule"/> from the source HTML
 /// held in <c>mOriHtml</c>.
 /// </summary>
 /// <remarks>
 /// The search window starts at the first occurrence of <c>GroupTag</c> (or at index 0 when no
 /// group tag is configured) and ends at the first occurrence of <c>GroupEndTag</c> found after
 /// that point (or at the end of the document). Group tag positions, including "not found",
 /// are cached in <c>mGroupTagDic</c>.
 /// For a repeatable rule every match is appended to the result and a "\n" is appended before
 /// each further attempt, so a result with at least one match ends with a trailing "\n".
 /// </remarks>
 /// <param name="pFeildRule">Rule holding the group tags, field tags and clean-up options.</param>
 /// <returns>
 /// The extracted text, or an empty string when the group tag does not occur in the document
 /// or nothing matched.
 /// </returns>
 public string Exe(RegularFeildRule pFeildRule)
 {
     int sStart = 0;
     if (!string.IsNullOrEmpty(pFeildRule.GroupTag))
     {
         // Look the group tag up only once per distinct tag; a miss (-1) is cached as well.
         if (!mGroupTagDic.TryGetValue(pFeildRule.GroupTag, out sStart))
         {
             sStart = mOriHtml.IndexOf(pFeildRule.GroupTag, 0, StringComparison.OrdinalIgnoreCase);
             mGroupTagDic[pFeildRule.GroupTag] = sStart;
         }
     }
     if (sStart < 0)
     {
         return "";    // group tag not found: the whole group yields no data
     }

     int sEnd = mOriHtml.Length;
     if (!string.IsNullOrEmpty(pFeildRule.GroupEndTag))
     {
         // NOTE(review): the offset uses GroupEndTag.Length (kept from the original code);
         // skipping GroupTag.Length may have been intended - confirm before changing.
         int sSearchFrom = sStart + pFeildRule.GroupEndTag.Length;

         // IndexOf throws ArgumentOutOfRangeException when the start index lies beyond the end
         // of the string; in that case the end tag cannot occur, so keep the document end.
         if (sSearchFrom <= mOriHtml.Length)
         {
             int sTmpEnd = mOriHtml.IndexOf(pFeildRule.GroupEndTag, sSearchFrom, StringComparison.OrdinalIgnoreCase);
             if (sTmpEnd > 0)
             {
                 sEnd = sTmpEnd;
             }
         }
     }

     StringBuilder sSb = new StringBuilder();
     if (pFeildRule.Repeatable)
     {
         // Each successful extraction returns the position to resume from; -1 ends the loop.
         sStart = Exe(sStart, pFeildRule, sSb, sEnd);
         while (sStart > 0)
         {
             sSb.Append("\n");
             sStart = Exe(sStart, pFeildRule, sSb, sEnd);
         }
     }
     else
     {
         Exe(sStart, pFeildRule, sSb, sEnd);
     }
     return sSb.ToString();
 }
Example #2
0
        public string StartTag; // Start marker ("inclusive" per the original note - TODO confirm what is included)

        #endregion Fields

        #region Methods

        /// <summary>
        /// Creates a copy of this rule with every field carried over.
        /// </summary>
        /// <returns>
        /// A new <see cref="RegularFeildRule"/> whose <c>ReplacePairs</c> array is a separate
        /// array instance (or null when this rule has none), so replacing elements in one
        /// rule's array does not affect the other.
        /// </returns>
        internal RegularFeildRule Clone()
        {
            RegularFeildRule sMetaFeild = new RegularFeildRule();
            sMetaFeild.GroupTag = GroupTag;
            sMetaFeild.GroupEndTag = GroupEndTag;
            sMetaFeild.Repeatable = Repeatable;
            sMetaFeild.ClearHtml = ClearHtml;
            sMetaFeild.EndTag = EndTag;
            sMetaFeild.RemoveWhiteSpace = RemoveWhiteSpace;
            // Calling Clone() on an unset (null) array would throw NullReferenceException,
            // so a rule without replace pairs is copied as null.
            sMetaFeild.ReplacePairs = ReplacePairs == null ? null : (string[])ReplacePairs.Clone();
            sMetaFeild.StartTag = StartTag;
            return sMetaFeild;
        }
Example #3
0
        public bool RemoveWhiteSpace;          // Remove whitespace


        /// <summary>
        /// Returns a new rule populated with the values of this one.
        /// </summary>
        /// <returns>
        /// The copied <see cref="RegularFeildRule"/>. Its <c>ReplacePairs</c> is a cloned array,
        /// or null when this rule's <c>ReplacePairs</c> is null.
        /// </returns>
        internal RegularFeildRule Clone()
        {
            RegularFeildRule sMetaFeild = new RegularFeildRule();

            sMetaFeild.GroupTag         = GroupTag;
            sMetaFeild.GroupEndTag      = GroupEndTag;
            sMetaFeild.Repeatable       = Repeatable;
            sMetaFeild.ClearHtml        = ClearHtml;
            sMetaFeild.EndTag           = EndTag;
            sMetaFeild.RemoveWhiteSpace = RemoveWhiteSpace;
            // A null ReplacePairs must not be dereferenced: copy it as null instead of
            // letting ReplacePairs.Clone() raise NullReferenceException.
            sMetaFeild.ReplacePairs     = (ReplacePairs == null) ? null : (string[])ReplacePairs.Clone();
            sMetaFeild.StartTag         = StartTag;
            return(sMetaFeild);
        }
Example #4
0
        /// <summary>
        /// Runs the field extraction for <paramref name="pFeildRule"/> against <c>mOriHtml</c>.
        /// </summary>
        /// <remarks>
        /// When the rule has a <c>GroupTag</c>, extraction starts at its first occurrence; the
        /// position (or -1 when absent) is remembered in <c>mGroupTagDic</c>. When the rule has a
        /// <c>GroupEndTag</c>, extraction stops at its first occurrence after the start; otherwise
        /// it runs to the end of the document. Repeatable rules collect every match, appending
        /// "\n" before each additional attempt (so the last successful match is followed by "\n").
        /// </remarks>
        /// <param name="pFeildRule">The rule describing group tags, field tags and post-processing.</param>
        /// <returns>The extracted text; an empty string when the group tag is missing or nothing matched.</returns>
        public string Exe(RegularFeildRule pFeildRule)
        {
            int sStart = 0;

            if (!string.IsNullOrEmpty(pFeildRule.GroupTag))
            {
                // Cached lookup: the document is scanned for each distinct group tag only once.
                if (mGroupTagDic.TryGetValue(pFeildRule.GroupTag, out sStart) == false)
                {
                    sStart = mOriHtml.IndexOf(pFeildRule.GroupTag, 0, StringComparison.OrdinalIgnoreCase);
                    mGroupTagDic[pFeildRule.GroupTag] = sStart;
                }
            }
            if (sStart < 0)
            {
                // Group tag not found: this group contributes no data.
                return("");
            }
            int sEnd = mOriHtml.Length;

            if (!string.IsNullOrEmpty(pFeildRule.GroupEndTag))
            {
                // NOTE(review): offset is GroupEndTag.Length as in the original code; GroupTag.Length
                // may have been the intent - verify before altering.
                int sFrom = sStart + pFeildRule.GroupEndTag.Length;

                // A start index beyond the document end makes IndexOf throw
                // ArgumentOutOfRangeException; the end tag cannot be present there, so skip the search.
                if (sFrom <= mOriHtml.Length)
                {
                    int sTmpEnd = mOriHtml.IndexOf(pFeildRule.GroupEndTag, sFrom, StringComparison.OrdinalIgnoreCase);
                    if (sTmpEnd > 0)
                    {
                        sEnd = sTmpEnd;
                    }
                }
            }
            StringBuilder sSb = new StringBuilder();

            if (pFeildRule.Repeatable)
            {
                // The inner Exe returns the index to continue from, or -1 once no further match exists.
                sStart = Exe(sStart, pFeildRule, sSb, sEnd);
                while (sStart > 0)
                {
                    sSb.Append("\n");
                    sStart = Exe(sStart, pFeildRule, sSb, sEnd);
                }
            }
            else
            {
                Exe(sStart, pFeildRule, sSb, sEnd);
            }
            return(sSb.ToString());
        }
Example #5
0
        /// <summary>
        /// Extracts one occurrence of the field between the rule's start and end tags, searching
        /// <c>mOriHtml</c> from <paramref name="pStart"/> up to (not including)
        /// <paramref name="pEnd"/>, and appends the cleaned-up text to <paramref name="pSb"/>.
        /// </summary>
        /// <remarks>
        /// Both tags are first expanded through <c>ExeStringList</c> and the resulting segments are
        /// matched in order (presumably the literal pieces of a wildcard pattern - confirm against
        /// <c>ExeStringList</c>). Post-processing order: optional script/HTML stripping, replace
        /// pairs ("search\treplacement", replacement defaults to empty), HTML decoding, then either
        /// whitespace removal or trimming.
        /// </remarks>
        /// <param name="pStart">Index in the source HTML where the search begins.</param>
        /// <param name="pFeildRule">Rule supplying the tags and clean-up options.</param>
        /// <param name="pSb">Builder that receives the extracted text.</param>
        /// <param name="pEnd">Exclusive upper bound of the search range.</param>
        /// <returns>
        /// The end-tag position plus <c>EndTag.Length</c> (the position to resume from), or -1 when
        /// no non-empty match exists in the range.
        /// </returns>
        public int Exe(int pStart, RegularFeildRule pFeildRule, StringBuilder pSb, int pEnd)
        {
            // An inverted range cannot contain a match. Without this guard IndexOf would be
            // called with a negative count and throw ArgumentOutOfRangeException; the resume
            // position returned below adds the full EndTag.Length and may overshoot pEnd.
            if (pStart > pEnd)
            {
                return -1;
            }

            int x = pStart;   // start of the content (just past the start tag)
            int y = pEnd;     // end of the content (position of the end tag)

            if (!string.IsNullOrEmpty(pFeildRule.StartTag))
            {
                List<string> sStartParts = ExeStringList(pFeildRule.StartTag);
                int xIndex = 0;   // position just past the most recent match of the first segment
                for (int i = 0; i < sStartParts.Count; i++)
                {
                    x = mOriHtml.IndexOf(sStartParts[i], x, pEnd - x, StringComparison.OrdinalIgnoreCase);
                    if (x != -1)
                    {
                        x += sStartParts[i].Length;
                    }

                    if (i == 0)
                    {
                        if (x == -1)
                        {
                            break;          // first segment absent: no match at all
                        }
                        xIndex = x;
                    }
                    else if (x == -1)
                    {
                        // A later segment is missing: restart from the first segment,
                        // searching after its previous match (i becomes 0 after i++).
                        i = -1;
                        x = xIndex;
                    }
                }
            }

            if (x >= pStart)
            {
                if (!string.IsNullOrEmpty(pFeildRule.EndTag))
                {
                    List<string> sEndParts = ExeStringList(pFeildRule.EndTag);
                    int yIndex = 0;   // position of the most recent match of the first segment
                    y = x;
                    for (int i = 0; i < sEndParts.Count; i++)
                    {
                        if (i != 0)
                        {
                            y += sEndParts[i - 1].Length;   // continue after the previous segment
                        }
                        y = mOriHtml.IndexOf(sEndParts[i], y, pEnd - y, StringComparison.OrdinalIgnoreCase);

                        if (i == 0)
                        {
                            if (y == -1)
                            {
                                break;      // first segment absent: no end tag in range
                            }
                            yIndex = y;
                        }
                        else if (y == -1)
                        {
                            // Restart with the first segment, just past its previous match.
                            i = -1;
                            y = yIndex + sEndParts[0].Length;
                        }
                    }
                }
            }

            if (x >= pStart && y > x)
            {
                string s = mOriHtml.Substring(x, y - x);  // raw content between the tags
                if (pFeildRule.ClearHtml)
                {
                    s = mRegexScript.Replace(s, "");       // strip scripts
                    s = mRegexHtml.Replace(s, "");         // strip HTML tags
                }
                if (pFeildRule.ReplacePairs != null && pFeildRule.ReplacePairs.Length > 0)
                {
                    for (int i = 0; i < pFeildRule.ReplacePairs.Length; i++)
                    {
                        string sPair = pFeildRule.ReplacePairs[i];
                        if (string.IsNullOrEmpty(sPair))
                        {
                            continue;       // nothing to search for
                        }
                        string[] sSpan = sPair.Split('\t');
                        if (sSpan[0].Length == 0)
                        {
                            continue;       // string.Replace throws ArgumentException on an empty search text
                        }
                        s = s.Replace(sSpan[0], sSpan.Length > 1 ? sSpan[1] : "");
                    }
                }
                s = WebUtility.HtmlDecode(s);
                if (pFeildRule.RemoveWhiteSpace)
                {
                    for (int i = 0; i < s.Length; i++)
                    {
                        char c = s[i];
                        if (c == 12288)
                        {
                            c = (char)32;               // U+3000 ideographic space -> ASCII space
                        }
                        else if (c > 65280 && c < 65375)
                        {
                            c = (char)(c - 65248);      // U+FF01..U+FF5E full-width forms -> ASCII
                        }
                        if (!char.IsWhiteSpace(c))
                        {
                            pSb.Append(c);
                        }
                    }
                }
                else
                {
                    pSb.Append(s.Trim());
                }
                // NOTE(review): the resume position uses the length of the whole EndTag string,
                // not of the segment that was matched last - confirm this is intended.
                return y + (pFeildRule.EndTag == null ? 0 : pFeildRule.EndTag.Length);
            }
            return -1;   // no match
        }
Example #6
0
        /// <summary>
        /// Finds the next field occurrence inside <c>mOriHtml</c> within
        /// [<paramref name="pStart"/>, <paramref name="pEnd"/>) and appends its processed text to
        /// <paramref name="pSb"/>.
        /// </summary>
        /// <remarks>
        /// <c>StartTag</c> and <c>EndTag</c> are each turned into a list of segments by
        /// <c>ExeStringList</c> (presumably splitting on wildcards - verify in that helper); the
        /// segments must be found one after another inside the range. The text between the tags
        /// is optionally stripped of scripts and HTML, run through the rule's replace pairs
        /// (tab-separated "search\treplacement"), HTML-decoded, and finally either has its
        /// whitespace removed or is trimmed.
        /// </remarks>
        /// <param name="pStart">Index at which the search starts.</param>
        /// <param name="pFeildRule">Extraction rule (tags and clean-up flags).</param>
        /// <param name="pSb">Receives the extracted text.</param>
        /// <param name="pEnd">Index at which the search stops (exclusive).</param>
        /// <returns>
        /// The index from which a following search should continue (end-tag position plus
        /// <c>EndTag.Length</c>), or -1 when nothing matched.
        /// </returns>
        public int Exe(int pStart, RegularFeildRule pFeildRule, StringBuilder pSb, int pEnd)
        {
            // The returned resume index adds the whole EndTag.Length and can therefore lie
            // beyond pEnd. Such an inverted range would make IndexOf throw
            // ArgumentOutOfRangeException (negative count), so report "no match" instead.
            if (pStart > pEnd)
            {
                return(-1);
            }

            int x = pStart;
            int y = pEnd;

            if (!string.IsNullOrEmpty(pFeildRule.StartTag))
            {
                List <string> sSegments = ExeStringList(pFeildRule.StartTag);
                int           xIndex    = 0;     // end of the latest match of segment 0
                for (int i = 0; i < sSegments.Count; i++)
                {
                    x = mOriHtml.IndexOf(sSegments[i], x, pEnd - x, StringComparison.OrdinalIgnoreCase);
                    if (x != -1)
                    {
                        x += sSegments[i].Length;
                    }
                    if (i == 0)
                    {
                        if (x == -1)
                        {
                            break;               // segment 0 not found: give up
                        }
                        xIndex = x;
                    }
                    else if (x == -1)
                    {
                        // Segment i not found: retry from segment 0 after its last match
                        // (the loop increment turns -1 into 0).
                        i = -1;
                        x = xIndex;
                    }
                }
            }
            if (x >= pStart)
            {
                if (!string.IsNullOrEmpty(pFeildRule.EndTag))
                {
                    List <string> sSegments = ExeStringList(pFeildRule.EndTag);
                    int           yIndex    = 0; // start of the latest match of segment 0
                    y = x;
                    for (int i = 0; i < sSegments.Count; i++)
                    {
                        if (i != 0)
                        {
                            y += sSegments[i - 1].Length;   // step over the previous segment
                        }
                        y = mOriHtml.IndexOf(sSegments[i], y, pEnd - y, StringComparison.OrdinalIgnoreCase);

                        if (i == 0)
                        {
                            if (y == -1)
                            {
                                break;           // segment 0 not found: no end tag
                            }
                            yIndex = y;
                        }
                        else if (y == -1)
                        {
                            // Retry from segment 0, starting just after its last match.
                            i = -1;
                            y = yIndex + sSegments[0].Length;
                        }
                    }
                }
            }

            if (x >= pStart && y > x)
            {
                string s = mOriHtml.Substring(x, y - x);  // raw text between the tags
                if (pFeildRule.ClearHtml)
                {
                    s = mRegexScript.Replace(s, "");       // remove scripts
                    s = mRegexHtml.Replace(s, "");         // remove HTML tags
                }
                if (pFeildRule.ReplacePairs != null && pFeildRule.ReplacePairs.Length > 0)
                {
                    for (int i = 0; i < pFeildRule.ReplacePairs.Length; i++)
                    {
                        string sEntry = pFeildRule.ReplacePairs[i];
                        if (string.IsNullOrEmpty(sEntry))
                        {
                            continue;            // null/empty entry: nothing to replace
                        }
                        string[] sSpan = sEntry.Split('\t');
                        if (sSpan[0].Length == 0)
                        {
                            continue;            // empty search text would make string.Replace throw
                        }
                        if (sSpan.Length > 1)
                        {
                            s = s.Replace(sSpan[0], sSpan[1]);
                        }
                        else
                        {
                            s = s.Replace(sSpan[0], "");
                        }
                    }
                }
                s = WebUtility.HtmlDecode(s);
                if (pFeildRule.RemoveWhiteSpace)
                {
                    for (int i = 0; i < s.Length; i++)
                    {
                        char c = s[i];
                        if (c == 12288)
                        {
                            c = (char)32;                // ideographic space (U+3000) -> space
                        }
                        else if (c > 65280 && c < 65375)
                        {
                            c = (char)(c - 65248);       // full-width U+FF01..U+FF5E -> ASCII counterpart
                        }
                        if (!char.IsWhiteSpace(c))
                        {
                            pSb.Append(c);
                        }
                    }
                }
                else
                {
                    pSb.Append(s.Trim());
                }
                // NOTE(review): advances by the full EndTag string length rather than the length
                // of the last matched segment - confirm against ExeStringList's splitting rules.
                return(y + (pFeildRule.EndTag == null ? 0 : pFeildRule.EndTag.Length));
            }
            return(-1);   // no match
        }