/// <summary>
/// Runs the field extraction described by <paramref name="pFeildRule"/> against the source HTML.
/// </summary>
/// <param name="pFeildRule">
/// Rule supplying the optional group start/end tags, the repeat flag and the settings
/// consumed by the four-argument <c>Exe</c> overload.
/// </param>
/// <returns>
/// The extracted text, or an empty string when the group tag does not occur in the HTML.
/// For repeatable rules a "\n" is appended after every successful match, so a non-empty
/// result ends with "\n".
/// </returns>
public string Exe(RegularFeildRule pFeildRule)
{
    int sStart = 0;
    if (!string.IsNullOrEmpty(pFeildRule.GroupTag))
    {
        // The position of each group tag is looked up once and cached (including -1 for "not found").
        if (!mGroupTagDic.TryGetValue(pFeildRule.GroupTag, out sStart))
        {
            sStart = mOriHtml.IndexOf(pFeildRule.GroupTag, 0, StringComparison.OrdinalIgnoreCase);
            mGroupTagDic[pFeildRule.GroupTag] = sStart;
        }
    }

    if (sStart < 0)
    {
        // Group tag not found: the whole group yields no data.
        return "";
    }

    int sEnd = mOriHtml.Length;
    if (!string.IsNullOrEmpty(pFeildRule.GroupEndTag))
    {
        // NOTE(review): the search offset uses GroupEndTag.Length (not GroupTag.Length);
        // kept as in the original - confirm this is intended.
        int sSearchFrom = sStart + pFeildRule.GroupEndTag.Length;

        // String.IndexOf throws ArgumentOutOfRangeException when the start index lies past
        // the end of the string, so only search when the offset is still inside the HTML.
        if (sSearchFrom <= mOriHtml.Length)
        {
            int sTmpEnd = mOriHtml.IndexOf(pFeildRule.GroupEndTag, sSearchFrom, StringComparison.OrdinalIgnoreCase);
            if (sTmpEnd > 0)
            {
                sEnd = sTmpEnd;
            }
        }
    }

    StringBuilder sSb = new StringBuilder();
    if (pFeildRule.Repeatable)
    {
        // Keep extracting from where the previous match ended until no further match is found.
        sStart = Exe(sStart, pFeildRule, sSb, sEnd);
        while (sStart > 0)
        {
            sSb.Append("\n");
            sStart = Exe(sStart, pFeildRule, sSb, sEnd);
        }
    }
    else
    {
        Exe(sStart, pFeildRule, sSb, sEnd);
    }

    return sSb.ToString();
}
public string StartTag; // Start marker (original note: "inclusive")

#endregion Fields

#region Methods

/// <summary>
/// Creates a new <c>RegularFeildRule</c> carrying the same settings as this one.
/// </summary>
/// <returns>
/// A new rule whose fields are copied from this instance; the <c>ReplacePairs</c> array is
/// duplicated so the copy does not share the array with the source.
/// </returns>
internal RegularFeildRule Clone()
{
    RegularFeildRule sMetaFeild = new RegularFeildRule();
    sMetaFeild.GroupTag = GroupTag;
    sMetaFeild.GroupEndTag = GroupEndTag;
    sMetaFeild.Repeatable = Repeatable;
    sMetaFeild.ClearHtml = ClearHtml;
    sMetaFeild.EndTag = EndTag;
    sMetaFeild.RemoveWhiteSpace = RemoveWhiteSpace;
    // ReplacePairs may be null (a rule without replacements); calling Clone() on it
    // unconditionally would throw NullReferenceException.
    sMetaFeild.ReplacePairs = ReplacePairs == null ? null : (string[])ReplacePairs.Clone();
    sMetaFeild.StartTag = StartTag;
    return sMetaFeild;
}
public bool RemoveWhiteSpace; // Remove whitespace

/// <summary>
/// Creates a new <c>RegularFeildRule</c> carrying the same settings as this one.
/// </summary>
/// <returns>
/// A new rule whose fields are copied from this instance; the <c>ReplacePairs</c> array is
/// duplicated so the copy does not share the array with the source.
/// </returns>
internal RegularFeildRule Clone()
{
    RegularFeildRule sMetaFeild = new RegularFeildRule();
    sMetaFeild.GroupTag = GroupTag;
    sMetaFeild.GroupEndTag = GroupEndTag;
    sMetaFeild.Repeatable = Repeatable;
    sMetaFeild.ClearHtml = ClearHtml;
    sMetaFeild.EndTag = EndTag;
    sMetaFeild.RemoveWhiteSpace = RemoveWhiteSpace;
    // ReplacePairs may be null (a rule without replacements); calling Clone() on it
    // unconditionally would throw NullReferenceException.
    sMetaFeild.ReplacePairs = ReplacePairs == null ? null : (string[])ReplacePairs.Clone();
    sMetaFeild.StartTag = StartTag;
    return sMetaFeild;
}
/// <summary>
/// Runs the field extraction described by <paramref name="pFeildRule"/> against the source HTML.
/// </summary>
/// <param name="pFeildRule">
/// Rule supplying the optional group start/end tags, the repeat flag and the settings
/// consumed by the four-argument <c>Exe</c> overload.
/// </param>
/// <returns>
/// The extracted text, or an empty string when the group tag does not occur in the HTML.
/// For repeatable rules a "\n" is appended after every successful match, so a non-empty
/// result ends with "\n".
/// </returns>
public string Exe(RegularFeildRule pFeildRule)
{
    int sStart = 0;
    if (!string.IsNullOrEmpty(pFeildRule.GroupTag))
    {
        // The position of each group tag is looked up once and cached (including -1 for "not found").
        if (!mGroupTagDic.TryGetValue(pFeildRule.GroupTag, out sStart))
        {
            sStart = mOriHtml.IndexOf(pFeildRule.GroupTag, 0, StringComparison.OrdinalIgnoreCase);
            mGroupTagDic[pFeildRule.GroupTag] = sStart;
        }
    }

    if (sStart < 0)
    {
        // Group tag not found: the whole group yields no data.
        return "";
    }

    int sEnd = mOriHtml.Length;
    if (!string.IsNullOrEmpty(pFeildRule.GroupEndTag))
    {
        // NOTE(review): the search offset uses GroupEndTag.Length (not GroupTag.Length);
        // kept as in the original - confirm this is intended.
        int sSearchFrom = sStart + pFeildRule.GroupEndTag.Length;

        // String.IndexOf throws ArgumentOutOfRangeException when the start index lies past
        // the end of the string, so only search when the offset is still inside the HTML.
        if (sSearchFrom <= mOriHtml.Length)
        {
            int sTmpEnd = mOriHtml.IndexOf(pFeildRule.GroupEndTag, sSearchFrom, StringComparison.OrdinalIgnoreCase);
            if (sTmpEnd > 0)
            {
                sEnd = sTmpEnd;
            }
        }
    }

    StringBuilder sSb = new StringBuilder();
    if (pFeildRule.Repeatable)
    {
        // Keep extracting from where the previous match ended until no further match is found.
        sStart = Exe(sStart, pFeildRule, sSb, sEnd);
        while (sStart > 0)
        {
            sSb.Append("\n");
            sStart = Exe(sStart, pFeildRule, sSb, sEnd);
        }
    }
    else
    {
        Exe(sStart, pFeildRule, sSb, sEnd);
    }

    return sSb.ToString();
}
/// <summary>
/// Extracts a single field value from the source HTML inside the window
/// [<paramref name="pStart"/>, <paramref name="pEnd"/>), cleans it according to
/// <paramref name="pFeildRule"/> and appends it to <paramref name="pSb"/>.
/// </summary>
/// <param name="pStart">Index in the source HTML at which the search begins.</param>
/// <param name="pFeildRule">Rule providing StartTag/EndTag and the clean-up options.</param>
/// <param name="pSb">Builder that receives the extracted text.</param>
/// <param name="pEnd">Exclusive upper bound of the search window.</param>
/// <returns>
/// The position of the end match plus the length of <c>EndTag</c> (the point from which a
/// repeated extraction continues), or -1 when nothing was extracted.
/// </returns>
public int Exe(int pStart, RegularFeildRule pFeildRule, StringBuilder pSb, int pEnd)
{
    // An invalid window would make IndexOf/Substring throw ArgumentOutOfRangeException.
    // This can happen on repeated extraction because the value returned below
    // (y + EndTag.Length) may lie past pEnd when the end tag consists of several pieces.
    if (pStart < 0 || pStart > pEnd || pEnd > mOriHtml.Length)
    {
        return -1;
    }

    int x = pStart;
    int y = pEnd;

    // Wildcard-aware tag search. The earlier implementation (a single IndexOf per tag)
    // was replaced by matching the pieces returned by ExeStringList one after another.
    if (!string.IsNullOrEmpty(pFeildRule.StartTag))
    {
        List<string> sStartParts = ExeStringList(pFeildRule.StartTag);
        int xIndex = 0;
        for (int i = 0; i < sStartParts.Count; i++)
        {
            x = mOriHtml.IndexOf(sStartParts[i], x, pEnd - x, StringComparison.OrdinalIgnoreCase);
            if (x != -1)
            {
                // Continue right after the piece that was just matched.
                x += sStartParts[i].Length;
            }

            if (i == 0)
            {
                if (x == -1)
                {
                    break; // first piece absent: no start tag in this window
                }
                xIndex = x;
            }
            else if (x == -1)
            {
                // A later piece is missing: restart with the first piece, searching from
                // just after its previous occurrence (the loop increment resets i to 0).
                i = -1;
                x = xIndex;
            }
        }
    }

    if (x >= pStart && !string.IsNullOrEmpty(pFeildRule.EndTag))
    {
        List<string> sEndParts = ExeStringList(pFeildRule.EndTag);
        int yIndex = 0;
        y = x;
        for (int i = 0; i < sEndParts.Count; i++)
        {
            if (i != 0)
            {
                // Skip over the previously matched piece before looking for the next one.
                y += sEndParts[i - 1].Length;
            }

            y = mOriHtml.IndexOf(sEndParts[i], y, pEnd - y, StringComparison.OrdinalIgnoreCase);

            if (i == 0)
            {
                if (y == -1)
                {
                    break; // first piece absent: no end tag in this window
                }
                yIndex = y;
            }
            else if (y == -1)
            {
                // Same restart strategy as for the start tag.
                i = -1;
                y = yIndex + sEndParts[0].Length;
            }
        }
    }

    if (x >= pStart && y > x)
    {
        string s = mOriHtml.Substring(x, y - x); // raw text between the tags

        if (pFeildRule.ClearHtml)
        {
            s = mRegexScript.Replace(s, ""); // strip scripts
            s = mRegexHtml.Replace(s, "");   // strip HTML tags
        }

        if (pFeildRule.ReplacePairs != null)
        {
            for (int i = 0; i < pFeildRule.ReplacePairs.Length; i++)
            {
                string sPair = pFeildRule.ReplacePairs[i];
                if (string.IsNullOrEmpty(sPair))
                {
                    continue; // nothing to search for
                }

                // Each entry is "search<TAB>replacement"; without a TAB the search text is removed.
                string[] sSpan = sPair.Split('\t');
                if (sSpan[0].Length == 0)
                {
                    continue; // string.Replace throws ArgumentException for an empty search text
                }

                s = s.Replace(sSpan[0], sSpan.Length > 1 ? sSpan[1] : "");
            }
        }

        s = WebUtility.HtmlDecode(s);

        if (pFeildRule.RemoveWhiteSpace)
        {
            for (int i = 0; i < s.Length; i++)
            {
                char c = s[i];
                if (c == '\u3000')
                {
                    // Ideographic space (12288) becomes an ordinary space, which is dropped below.
                    c = ' ';
                }
                else if (c >= '\uFF01' && c <= '\uFF5E')
                {
                    // Code points 65281..65374 are shifted down by 65248 (0xFEE0).
                    c = (char)(c - 0xFEE0);
                }

                if (!char.IsWhiteSpace(c))
                {
                    pSb.Append(c);
                }
            }
        }
        else
        {
            pSb.Append(s.Trim());
        }

        return y + (pFeildRule.EndTag == null ? 0 : pFeildRule.EndTag.Length);
    }

    return -1; // no match in this window
}
/// <summary>
/// Extracts a single field value from the source HTML inside the window
/// [<paramref name="pStart"/>, <paramref name="pEnd"/>), cleans it according to
/// <paramref name="pFeildRule"/> and appends it to <paramref name="pSb"/>.
/// </summary>
/// <param name="pStart">Index in the source HTML at which the search begins.</param>
/// <param name="pFeildRule">Rule providing StartTag/EndTag and the clean-up options.</param>
/// <param name="pSb">Builder that receives the extracted text.</param>
/// <param name="pEnd">Exclusive upper bound of the search window.</param>
/// <returns>
/// The position of the end match plus the length of <c>EndTag</c> (the point from which a
/// repeated extraction continues), or -1 when nothing was extracted.
/// </returns>
public int Exe(int pStart, RegularFeildRule pFeildRule, StringBuilder pSb, int pEnd)
{
    // An invalid window would make IndexOf/Substring throw ArgumentOutOfRangeException.
    // This can happen on repeated extraction because the value returned below
    // (y + EndTag.Length) may lie past pEnd when the end tag consists of several pieces.
    if (pStart < 0 || pStart > pEnd || pEnd > mOriHtml.Length)
    {
        return -1;
    }

    int x = pStart;
    int y = pEnd;

    // Wildcard-aware tag search. The earlier implementation (a single IndexOf per tag)
    // was replaced by matching the pieces returned by ExeStringList one after another.
    if (!string.IsNullOrEmpty(pFeildRule.StartTag))
    {
        List<string> sStartParts = ExeStringList(pFeildRule.StartTag);
        int xIndex = 0;
        for (int i = 0; i < sStartParts.Count; i++)
        {
            x = mOriHtml.IndexOf(sStartParts[i], x, pEnd - x, StringComparison.OrdinalIgnoreCase);
            if (x != -1)
            {
                // Continue right after the piece that was just matched.
                x += sStartParts[i].Length;
            }

            if (i == 0)
            {
                if (x == -1)
                {
                    break; // first piece absent: no start tag in this window
                }
                xIndex = x;
            }
            else if (x == -1)
            {
                // A later piece is missing: restart with the first piece, searching from
                // just after its previous occurrence (the loop increment resets i to 0).
                i = -1;
                x = xIndex;
            }
        }
    }

    if (x >= pStart && !string.IsNullOrEmpty(pFeildRule.EndTag))
    {
        List<string> sEndParts = ExeStringList(pFeildRule.EndTag);
        int yIndex = 0;
        y = x;
        for (int i = 0; i < sEndParts.Count; i++)
        {
            if (i != 0)
            {
                // Skip over the previously matched piece before looking for the next one.
                y += sEndParts[i - 1].Length;
            }

            y = mOriHtml.IndexOf(sEndParts[i], y, pEnd - y, StringComparison.OrdinalIgnoreCase);

            if (i == 0)
            {
                if (y == -1)
                {
                    break; // first piece absent: no end tag in this window
                }
                yIndex = y;
            }
            else if (y == -1)
            {
                // Same restart strategy as for the start tag.
                i = -1;
                y = yIndex + sEndParts[0].Length;
            }
        }
    }

    if (x >= pStart && y > x)
    {
        string s = mOriHtml.Substring(x, y - x); // raw text between the tags

        if (pFeildRule.ClearHtml)
        {
            s = mRegexScript.Replace(s, ""); // strip scripts
            s = mRegexHtml.Replace(s, "");   // strip HTML tags
        }

        if (pFeildRule.ReplacePairs != null)
        {
            for (int i = 0; i < pFeildRule.ReplacePairs.Length; i++)
            {
                string sPair = pFeildRule.ReplacePairs[i];
                if (string.IsNullOrEmpty(sPair))
                {
                    continue; // nothing to search for
                }

                // Each entry is "search<TAB>replacement"; without a TAB the search text is removed.
                string[] sSpan = sPair.Split('\t');
                if (sSpan[0].Length == 0)
                {
                    continue; // string.Replace throws ArgumentException for an empty search text
                }

                s = s.Replace(sSpan[0], sSpan.Length > 1 ? sSpan[1] : "");
            }
        }

        s = WebUtility.HtmlDecode(s);

        if (pFeildRule.RemoveWhiteSpace)
        {
            for (int i = 0; i < s.Length; i++)
            {
                char c = s[i];
                if (c == '\u3000')
                {
                    // Ideographic space (12288) becomes an ordinary space, which is dropped below.
                    c = ' ';
                }
                else if (c >= '\uFF01' && c <= '\uFF5E')
                {
                    // Code points 65281..65374 are shifted down by 65248 (0xFEE0).
                    c = (char)(c - 0xFEE0);
                }

                if (!char.IsWhiteSpace(c))
                {
                    pSb.Append(c);
                }
            }
        }
        else
        {
            pSb.Append(s.Trim());
        }

        return y + (pFeildRule.EndTag == null ? 0 : pFeildRule.EndTag.Length);
    }

    return -1; // no match in this window
}