/// <summary>
/// Creates a copy of this field definition.
/// </summary>
/// <returns>
/// A new <see cref="RegularMetaFeild"/> carrying the same <c>BindType</c> and <c>Name</c>.
/// Its <c>Rule</c> is assigned the result of <c>Rule.Clone()</c> only when this
/// instance's <c>Rule</c> is not null; otherwise it keeps whatever a freshly
/// constructed instance has.
/// </returns>
internal RegularMetaFeild Clone()
{
    RegularMetaFeild copy = new RegularMetaFeild
    {
        BindType = this.BindType,
        Name = this.Name
    };

    // Nothing more to copy when no rule is attached.
    if (Rule == null)
    {
        return copy;
    }

    copy.Rule = Rule.Clone();
    return copy;
}
/// <summary>
/// Extracts records from one crawled page: the page text is first rewritten with
/// the configured clean rules, then every entry of <c>DrillRegularRules</c> that the
/// page transactor reports as executable (<c>CanExe</c>) is applied.
/// </summary>
/// <param name="pCrawlOriData">
/// The crawled page. <c>Data.ToString()</c> supplies the text to process and
/// <c>Url</c> is passed to link extraction and used for the "来源链接" field.
/// </param>
/// <returns>
/// One <see cref="DrillResult"/> per executable rule, in rule order. A result is
/// added even when the rule produced no records.
/// </returns>
private List<DrillResult> DrillRegularResult(CrawlOriData pCrawlOriData)
{
    // Run the clean-up replacements before any rule sees the HTML.
    StringBuilder sHtmlSb = new StringBuilder(pCrawlOriData.Data.ToString());
    ApplyCleanRules(sHtmlSb);

    List<DrillResult> Records = new List<DrillResult>();

    // One transactor instance wraps the cleaned page for all rules.
    RegScriptTransactor sRegScriptTransactor = new RegScriptTransactor(sHtmlSb.ToString());

    foreach (var sRule in DrillRegularRules)
    {
        if (!sRegScriptTransactor.CanExe(sRule))
        {
            continue;
        }

        DrillResult sDrillResult = new DrillResult();

        if (sRule.DrillType == 0)
        {
            // Simple extraction: every URL found becomes one record, labelled
            // by the rule's feature type (0 selects the first label, anything else the second).
            string sName = sRule.FeatureType == 0
                ? LineFeatureType.链接.ToString()
                : LineFeatureType.图片.ToString();

            string[] sRdData = sRegScriptTransactor.GetUrls(sRule, pCrawlOriData.Url);
            if (sRdData != null)
            {
                foreach (string sUrl in sRdData)
                {
                    sDrillResult.Records.Add(new DrillCRecord(Plot.Name, sName, sUrl));
                }
            }
        }
        else
        {
            // Custom extraction: each record-region fragment yields one record.
            string[] sRegionHtmls = sRegScriptTransactor.GetRecordHtmls(sRule);
            if (sRegionHtmls != null)
            {
                foreach (string sRegionHtml in sRegionHtmls)
                {
                    RegScriptTransactor sRegionTransactor = new RegScriptTransactor(sRegionHtml);
                    DrillCRecord sDrillCRecord = new DrillCRecord();
                    SRecord sCRecord = new SRecord();
                    sDrillCRecord.Record = sCRecord;

                    for (int k = 0; k < sRule.Feilds.Count; k++)
                    {
                        RegularMetaFeild sRegularMetaFeild = sRule.Feilds[k];
                        if (sRegularMetaFeild.Name == "来源链接")
                        {
                            // The source-link field is always the page's own URL.
                            sCRecord.Url = pCrawlOriData.Url;
                        }
                        else if (sRegularMetaFeild.Rule != null)
                        {
                            // NOTE(review): every field with a rule writes its value to Url,
                            // so the last matching field wins. Per-field storage looks like
                            // it was intended at some point - confirm before changing.
                            sCRecord.Url = sRegionTransactor.Exe(sRegularMetaFeild.Rule);

                            // NOTE: for fields with BindType > 0 the value used to be split
                            // into de-duplicated URLs for download registration, but the
                            // registration calls were disabled, so that work had no effect
                            // and is no longer performed here.
                        }
                    }

                    sDrillResult.Records.Add(sDrillCRecord);
                }
            }
        }

        Records.Add(sDrillResult);
    }

    return Records;
}

/// <summary>
/// Applies the line-based <c>CleanRule</c> replacements to the page text in place.
/// Each non-empty line is either <c>text</c> (remove every occurrence) or
/// <c>text|replacement</c>; lines with more than one '|' are ignored.
/// </summary>
/// <param name="pHtml">The page text to rewrite.</param>
private void ApplyCleanRules(StringBuilder pHtml)
{
    if (string.IsNullOrEmpty(CleanRule))
    {
        return;
    }

    string[] sCleanRules = CleanRule.Split(new string[] { "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries);
    foreach (string sLine in sCleanRules)
    {
        string[] sParts = sLine.Split('|');

        // A line starting with '|' has an empty search text; StringBuilder.Replace
        // throws ArgumentException for that, so such lines are skipped instead of
        // aborting the whole extraction.
        if (sParts.Length > 2 || sParts[0].Length == 0)
        {
            continue;
        }

        string sReplacement = sParts.Length == 2 ? sParts[1] : "";
        pHtml.Replace(sParts[0], sReplacement);
    }
}
/// <summary>
/// Save-button handler: validates the region name, copies the form inputs into
/// <c>mDrillRule</c>, attaches the rule to <c>mRegionTreeNode</c>, and rebuilds
/// <c>mWaterLine.DrillRegularRules</c> from the tagged top-level nodes of
/// <c>TViewRegion</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSave_Click(object sender, EventArgs e)
{
    // Write the trimmed name back to the text box, then validate it.
    TxtName.Text = TxtName.Text.Trim();
    string regionName = TxtName.Text;
    if (regionName.Length == 0)
    {
        MessageBox.Show("记录区名称不能为空!");
        return;
    }

    // ---- Extraction type ----
    mDrillRule.ConditionType = (byte)CmbConditionType.SelectedIndex;
    mDrillRule.ConditionTag = TxtConditionTag.Text.Trim();
    mDrillRule.StartTag = TxtStartTag.Text.Trim().ToLower();
    mDrillRule.EndTag = TxtEndTag.Text.Trim().ToLower();

    // The first tab selects drill type 0; any other tab selects drill type 2.
    if (tabControl3.SelectedIndex != 0)
    {
        mDrillRule.DrillType = 2;
    }
    else
    {
        mDrillRule.DrillType = 0;
    }

    // Pick the database definition: fixed one for drill type 0, otherwise the
    // one stored on TxtMeta.Tag, falling back to a default when none is stored.
    CDbDefine dbDefine;
    if (mDrillRule.DrillType == 0)
    {
        dbDefine = SysDbDefines.下载链接;
    }
    else if (TxtMeta.Tag != null)
    {
        dbDefine = (CDbDefine)TxtMeta.Tag;
    }
    else
    {
        dbDefine = SysDbDefines.实时资讯数据库;
    }
    mDrillRule.MetaModalID = dbDefine.DbModelID;

    // Seed the rule with the definition's predefined fields only when it has none yet.
    if (mDrillRule.Feilds.Count == 0)
    {
        foreach (var definedField in dbDefine.Fields)
        {
            RegularMetaFeild metaField = new RegularMetaFeild();
            metaField.Name = definedField.Name;
            mDrillRule.Feilds.Add(metaField);
        }
    }

    // ---- Download links ----
    mDrillRule.FeatureType = CmbLinkType.SelectedIndex;
    mDrillRule.Feature = TxtUrlFeatrue.Text;

    // ---- Custom advanced extraction ----
    mDrillRule.Splitter = TxtSpliter.Text.Trim();
    mDrillRule.Name = regionName;
    mRegionTreeNode.Text = regionName;

    // Attach the rule to its tree node and switch the node icons to index 1.
    mRegionTreeNode.Tag = mDrillRule;
    mRegionTreeNode.ImageIndex = 1;
    mRegionTreeNode.SelectedImageIndex = 1;

    // Rebuild the rule list from every top-level node that carries a rule.
    mWaterLine.DrillRegularRules = new List<DrillRegularRule>();
    for (int nodeIndex = 0; nodeIndex < TViewRegion.Nodes.Count; nodeIndex++)
    {
        object nodeTag = TViewRegion.Nodes[nodeIndex].Tag;
        if (nodeTag == null)
        {
            continue;
        }

        mWaterLine.DrillRegularRules.Add((DrillRegularRule)nodeTag);
    }
}
/// <summary>
/// Produces a new field definition populated from this one.
/// </summary>
/// <returns>
/// A new <see cref="RegularMetaFeild"/> whose <c>BindType</c> and <c>Name</c> equal
/// this instance's values. When this instance has a non-null <c>Rule</c>, the new
/// instance's <c>Rule</c> is set to <c>Rule.Clone()</c>; otherwise <c>Rule</c> is
/// left as constructed.
/// </returns>
internal RegularMetaFeild Clone()
{
    RegularMetaFeild duplicate = new RegularMetaFeild
    {
        BindType = this.BindType,
        Name = this.Name
    };

    bool hasRule = Rule != null;
    if (hasRule)
    {
        duplicate.Rule = Rule.Clone();
    }

    return duplicate;
}