Exemple #1
0
        /// <summary>
        /// Cleans the crawled HTML with the configured clean rules, then applies every rule in
        /// <c>DrillRegularRules</c> that <c>CanExe</c> accepts and collects one
        /// <c>DrillResult</c> per applied rule.
        /// </summary>
        /// <param name="pCrawlOriData">
        /// Crawled page. <c>Data</c> is read (via <c>ToString()</c>) as the HTML text; <c>Url</c> is passed
        /// to URL extraction and stored on records for the field named "来源链接" (source link).
        /// </param>
        /// <returns>One <c>DrillResult</c> for each rule that was executed, in rule order.</returns>
        private List<DrillResult> DrillRegularResult(CrawlOriData pCrawlOriData)
        {
            StringBuilder sHtmlSb = new StringBuilder(pCrawlOriData.Data.ToString());
            // Apply the clean rules: one rule per line, written as "old" (delete) or "old|new" (replace).
            if (CleanRule != null && CleanRule.Length > 0)
            {
                string[] sCleanRules = CleanRule.Split(new string[] { "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries);
                for (int i = 0; i < sCleanRules.Length; i++)
                {
                    string[] sCleanRule = sCleanRules[i].Split('|');
                    // StringBuilder.Replace throws ArgumentException for an empty search string,
                    // which is what a rule line starting with '|' (e.g. "|x") produces. Skip such rules.
                    if (sCleanRule[0].Length == 0)
                    {
                        continue;
                    }
                    if (sCleanRule.Length == 1)
                    {
                        sHtmlSb.Replace(sCleanRule[0], "");
                    }
                    else if (sCleanRule.Length == 2)
                    {
                        sHtmlSb.Replace(sCleanRule[0], sCleanRule[1]);
                    }
                    // Lines containing more than one '|' are ignored.
                }
            }

            List<DrillResult> Records = new List<DrillResult>();
            // One transactor over the whole cleaned page, shared by every rule.
            RegScriptTransactor sRegScriptTransactor = new RegScriptTransactor(sHtmlSb.ToString());
            // Apply each rule to the page; every executed rule contributes exactly one DrillResult.
            DrillRegularRules.ForEach(t =>
            {
                if (!sRegScriptTransactor.CanExe(t))
                {
                    return;
                }

                DrillResult sDrillResult = new DrillResult();
                if (t.DrillType == 0)
                {
                    // URL extraction: every URL returned for the rule becomes one record.
                    string sName = t.FeatureType == 0 ? LineFeatureType.链接.ToString() : LineFeatureType.图片.ToString();
                    string[] sRdData = sRegScriptTransactor.GetUrls(t, pCrawlOriData.Url);
                    if (sRdData != null)
                    {
                        for (int j = 0; j < sRdData.Length; j++)
                        {
                            sDrillResult.Records.Add(new DrillCRecord(Plot.Name, sName, sRdData[j]));
                        }
                    }
                }
                else
                {
                    // Custom extraction: one record per record-region HTML fragment.
                    string[] sRegionHtmls = sRegScriptTransactor.GetRecordHtmls(t);
                    if (sRegionHtmls != null)
                    {
                        // The field list is null-checked elsewhere, so treat a null list as "no fields"
                        // instead of failing with a NullReferenceException.
                        int sFeildCount = t.Feilds == null ? 0 : t.Feilds.Count;
                        for (int j = 0; j < sRegionHtmls.Length; j++)
                        {
                            RegScriptTransactor sRegionTransactor = new RegScriptTransactor(sRegionHtmls[j]);
                            DrillCRecord sDrillCRecord = new DrillCRecord();
                            SRecord sCRecord = new SRecord();
                            sDrillCRecord.Record = sCRecord;
                            for (int k = 0; k < sFeildCount; k++)
                            {
                                RegularMetaFeild sRegularMetaFeild = t.Feilds[k];
                                if (sRegularMetaFeild.Name == "来源链接")
                                {
                                    sCRecord.Url = pCrawlOriData.Url;
                                }
                                else if (sRegularMetaFeild.Rule != null)
                                {
                                    // NOTE(review): every rule-bearing field writes its value to Url, so the
                                    // last such field wins (and can overwrite the source link set above).
                                    // This looks like a stand-in for a per-field store - confirm the intent.
                                    sCRecord.Url = sRegionTransactor.Exe(sRegularMetaFeild.Rule);
                                    // Download registration for fields with BindType > 0 was already
                                    // disabled here (its AddDownload calls were commented out), so the
                                    // URL splitting/de-duplication that fed it had no effect and was removed.
                                }
                            }
                            sDrillResult.Records.Add(sDrillCRecord);
                        }
                    }
                }
                Records.Add(sDrillResult);
            });
            return Records;
        }
Exemple #2
0
 /// <summary>
 ///  Shows the record at the given index: its HTML fragment and, for every field
 ///  that has a rule, the field name followed by the value the rule extracts.
 /// </summary>
 /// <param name="pIndex">
 ///  Index into the cached record HTML fragments. A negative value resets the
 ///  current-index box to "0"; an index past the end only clears the output boxes.
 /// </param>
 private void ShowRecord(int pIndex)
 {
     TxtResult.Clear();
     TxtRecordHtml.Clear();

     // Negative index: nothing to show, just reset the position display.
     if (pIndex < 0)
     {
         TxtCurrent.Text = "0";
         return;
     }

     // No fragments loaded, or index past the end: leave the boxes cleared.
     if (mRecordHtmls == null || mRecordHtmls.Length <= pIndex)
     {
         return;
     }

     TxtCurrent.Text = pIndex.ToString();
     TxtRecordHtml.Text = mRecordHtmls[pIndex];
     RegScriptTransactor sFragmentTransactor = new RegScriptTransactor(TxtRecordHtml.Text);
     if (mDrillRule.Feilds == null)
     {
         return;
     }

     for (int sFeildIndex = 0; sFeildIndex < mDrillRule.Feilds.Count; sFeildIndex++)
     {
         RegularMetaFeild sCurrentFeild = mDrillRule.Feilds[sFeildIndex];
         if (sCurrentFeild.Rule == null)
         {
             continue;
         }

         try
         {
             // Emit "<name> : <extracted value>" followed by a line break.
             TxtResult.AppendText(sCurrentFeild.Name);
             TxtResult.AppendText(" : ");
             TxtResult.AppendText(sFragmentTransactor.Exe(sCurrentFeild.Rule));
             TxtResult.AppendText("\n");
         }
         catch (Exception sError)
         {
             // A failing field is reported to the user; the remaining fields are still processed.
             MessageBox.Show(sError.Message);
         }
     }
 }
Exemple #3
0
        /// <summary>
        /// Cleans the crawled HTML with the configured clean rules, then runs every rule in
        /// <c>DrillRegularRules</c> that <c>CanExe</c> accepts against the cleaned page and
        /// returns one <c>DrillResult</c> per executed rule.
        /// </summary>
        /// <param name="pCrawlOriData">
        /// Crawled page. <c>Data</c> is read (via <c>ToString()</c>) as the HTML text; <c>Url</c> is handed
        /// to URL extraction and stored on records for the field named "来源链接" (source link).
        /// </param>
        /// <returns>The results of all executed rules, in rule order.</returns>
        private List<DrillResult> DrillRegularResult(CrawlOriData pCrawlOriData)
        {
            StringBuilder sHtmlSb = new StringBuilder(pCrawlOriData.Data.ToString());

            // Clean-up pass: each non-empty line of CleanRule is "old" (delete) or "old|new" (replace).
            if (CleanRule != null && CleanRule.Length > 0)
            {
                string[] sCleanRules = CleanRule.Split(new string[] { "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries);
                for (int i = 0; i < sCleanRules.Length; i++)
                {
                    string[] sCleanRule = sCleanRules[i].Split('|');
                    string sSearch = sCleanRule[0];
                    // A line such as "|x" yields an empty search string, for which
                    // StringBuilder.Replace throws ArgumentException. Ignore such lines.
                    if (sSearch.Length == 0)
                    {
                        continue;
                    }

                    if (sCleanRule.Length == 1)
                    {
                        sHtmlSb.Replace(sSearch, "");
                    }
                    else if (sCleanRule.Length == 2)
                    {
                        sHtmlSb.Replace(sSearch, sCleanRule[1]);
                    }
                    // Lines with more than one '|' are ignored.
                }
            }

            List<DrillResult> Records = new List<DrillResult>();
            // A single transactor over the cleaned page serves all rules.
            RegScriptTransactor sRegScriptTransactor = new RegScriptTransactor(sHtmlSb.ToString());

            // Apply each rule to the page; every executed rule contributes exactly one DrillResult.
            DrillRegularRules.ForEach(t =>
            {
                if (!sRegScriptTransactor.CanExe(t))
                {
                    return;
                }

                DrillResult sDrillResult = new DrillResult();
                if (t.DrillType == 0)
                {
                    // URL extraction: one record per URL returned for the rule.
                    string sName = t.FeatureType == 0 ? LineFeatureType.链接.ToString() : LineFeatureType.图片.ToString();
                    string[] sRdData = sRegScriptTransactor.GetUrls(t, pCrawlOriData.Url);
                    if (sRdData != null)
                    {
                        for (int j = 0; j < sRdData.Length; j++)
                        {
                            sDrillResult.Records.Add(new DrillCRecord(Plot.Name, sName, sRdData[j]));
                        }
                    }
                }
                else
                {
                    // Custom extraction: split the page into record regions and build one record per region.
                    string[] sRegionHtmls = sRegScriptTransactor.GetRecordHtmls(t);
                    if (sRegionHtmls != null)
                    {
                        // Guard against a rule without a field list (it is null-checked elsewhere too);
                        // such a rule still yields one empty record per region.
                        int sFeildCount = t.Feilds == null ? 0 : t.Feilds.Count;
                        for (int j = 0; j < sRegionHtmls.Length; j++)
                        {
                            RegScriptTransactor sRegionTransactor = new RegScriptTransactor(sRegionHtmls[j]);
                            DrillCRecord sDrillCRecord = new DrillCRecord();
                            SRecord sCRecord = new SRecord();
                            sDrillCRecord.Record = sCRecord;
                            for (int k = 0; k < sFeildCount; k++)
                            {
                                RegularMetaFeild sRegularMetaFeild = t.Feilds[k];
                                if (sRegularMetaFeild.Name == "来源链接")
                                {
                                    sCRecord.Url = pCrawlOriData.Url;
                                }
                                else if (sRegularMetaFeild.Rule != null)
                                {
                                    // NOTE(review): each rule-bearing field overwrites Url, so only the last
                                    // one survives (and it can replace the source link set above). This is
                                    // presumably a placeholder for per-field storage - confirm the intent.
                                    sCRecord.Url = sRegionTransactor.Exe(sRegularMetaFeild.Rule);
                                    // The download registration that used to follow for BindType > 0 fields
                                    // was already disabled (AddDownload calls commented out); its URL
                                    // splitting/de-duplication loop had no effect and was dropped.
                                }
                            }
                            sDrillResult.Records.Add(sDrillCRecord);
                        }
                    }
                }
                Records.Add(sDrillResult);
            });
            return Records;
        }
Exemple #4
0
 /// <summary>
 /// Builds a temporary URL-extraction rule from the form inputs, runs it against the
 /// current page HTML and lists the extracted URLs, numbered from 1, in the test result box.
 /// </summary>
 /// <param name="sender">Event source (not used).</param>
 /// <param name="e">Event data (not used).</param>
 private void btnTest_Click(object sender, EventArgs e)
 {
     RegScriptTransactor sRegScriptTransactor = new RegScriptTransactor(mTree.Html);

     // Assemble the rule from the current form values; DrillType 0 selects URL extraction.
     DrillRegularRule sDrillRule = new DrillRegularRule();
     sDrillRule.StartTag = TxtStartTag.Text.Trim();
     sDrillRule.EndTag = TxtEndTag.Text.Trim();
     sDrillRule.DrillType = 0;
     sDrillRule.MetaModalID = SysDbDefines.下载链接.DbModelID;
     sDrillRule.FeatureType = CmbLinkType.SelectedIndex;
     sDrillRule.Feature = TxtUrlFeatrue.Text;

     string[] sUrls = sRegScriptTransactor.GetUrls(sDrillRule, mTree.URL);
     List<string> sList = new List<string>();
     // GetUrls presumably can return null when nothing matches (other callers null-check
     // its result); show an empty list in that case instead of throwing.
     if (sUrls != null)
     {
         for (int i = 0; i < sUrls.Length; i++)
         {
             sList.Add((i + 1).ToString() + "." + sUrls[i]);
         }
     }
     TxtTestResult0.Lines = sList.ToArray();
 }