/// <summary>
/// Cleans a scraped record via <c>DataHandler</c> and appends it to the table as a new row.
/// Records whose cleaned company name is empty are silently skipped.
/// </summary>
/// <param name="dt">Target table; a row is created from it and filled by column name.</param>
/// <param name="item">Raw record to clean and append.</param>
public static void Add2Dt(DataTable dt, Data item)
{
    // Normalize the raw field values before storing anything.
    Data cleaned = DataHandler(item);

    // A record without a company name is not worth keeping.
    if (cleaned.CompanyName.Trim().Length == 0)
    {
        return;
    }

    DataRow row = dt.NewRow();
    row["CompanyName"] = cleaned.CompanyName;
    row["Phase"] = cleaned.Phase;
    row["Contacts"] = cleaned.Contacts;
    row["Post"] = cleaned.Post;
    row["Telephone"] = cleaned.Telephone;
    row["MobilePhone"] = cleaned.MobilePhone;
    row["Fax"] = cleaned.Fax;
    row["Address"] = cleaned.Address;
    row["Country_Region"] = cleaned.Country_Region;
    row["Province"] = cleaned.Province;
    row["City"] = cleaned.City;
    row["Facebook"] = cleaned.Facebook;
    row["URL"] = cleaned.URL;
    dt.Rows.Add(row);
}
/// <summary>
/// Builds a cleaned copy of a scraped record.
/// Plain fields are trimmed, the phase text is reduced to a comma-separated list of
/// phase numbers, and "label: value" fields are reduced to their value part.
/// Null fields are treated as empty strings instead of throwing.
/// </summary>
/// <param name="item">Raw record; it is not modified.</param>
/// <returns>A new <c>Data</c> instance holding the cleaned values.</returns>
public static Data DataHandler(Data item)
{
    Data item_new = new Data();
    item_new.CompanyName = (item.CompanyName ?? "").Trim();
    item_new.Phase = ExtractPhaseNumbers(item.Phase);
    item_new.Contacts = (item.Contacts ?? "").Trim();
    item_new.Post = (item.Post ?? "").Trim();

    // These fields arrive as "Label: value"; keep only the value part.
    item_new.Telephone = ValueAfterLabel(item.Telephone);
    item_new.MobilePhone = ValueAfterLabel(item.MobilePhone);
    item_new.Fax = ValueAfterLabel(item.Fax);
    item_new.Address = ValueAfterLabel(item.Address);
    item_new.Country_Region = ValueAfterLabel(item.Country_Region);
    item_new.Province = ValueAfterLabel(item.Province);
    item_new.City = ValueAfterLabel(item.City);
    item_new.Facebook = ValueAfterLabel(item.Facebook);

    item_new.URL = (item.URL ?? "").Trim();
    return item_new;
}

/// <summary>Matches "Phase" followed by one or more digits; group 1 is the number.</summary>
private static readonly System.Text.RegularExpressions.Regex PhasePattern =
    new System.Text.RegularExpressions.Regex("Phase([0-9]+)");

/// <summary>
/// Extracts the distinct phase numbers from text such as "Phase1 Phase2" and joins them
/// with commas. The most recently encountered number comes first (e.g. "2,1").
/// </summary>
private static string ExtractPhaseNumbers(string rawPhase)
{
    System.Collections.Generic.List<string> numbers = new System.Collections.Generic.List<string>();
    foreach (System.Text.RegularExpressions.Match match in PhasePattern.Matches((rawPhase ?? "").Trim()))
    {
        string number = match.Groups[1].Value;

        // Compare whole numbers, not substrings, so "1" is not mistaken for part of "12".
        if (!numbers.Contains(number))
        {
            // Prepend to keep the established last-match-first ordering.
            numbers.Insert(0, number);
        }
    }
    return string.Join(",", numbers.ToArray());
}

/// <summary>
/// Returns the trimmed text after the first ':' of a "label: value" string.
/// Later colons are kept, so values such as "https://example.com" survive intact.
/// When there is no colon the whole trimmed text is returned; null yields "".
/// </summary>
private static string ValueAfterLabel(string text)
{
    if (text == null)
    {
        return "";
    }

    string trimmed = text.Trim();
    int colonIndex = trimmed.IndexOf(':');
    if (colonIndex < 0)
    {
        return trimmed;
    }
    return trimmed.Substring(colonIndex + 1).Trim();
}
/// <summary>
/// Loads the page at the given address and reads a single company record from it,
/// using the expressions defined on <c>XPath</c>.
/// </summary>
/// <param name="url">Address of the page to scrape; also stored in the record's URL field.</param>
/// <returns>A <c>Data</c> record filled with the values returned by the extraction helpers.</returns>
public static Data GetDataByUrl(string url)
{
    Data record = new Data();
    HtmlAgilityPack.HtmlDocument page = GetHTML(url);

    record.CompanyName = GetContentByXpath(page, XPath.COMPANYNAME);

    // Progress trace: which page was fetched and which company it yielded.
    Console.WriteLine("url==>{0},company==>{1}", url, record.CompanyName);

    record.Phase = GetContentByXpath(page, XPath.PHASE);
    record.Contacts = GetContentByXpath(page, XPath.CONTACTS);
    record.Post = GetContentByXpath(page, XPath.POST);

    // The remaining fields are all selected by one expression and read by position (0-7).
    HtmlAgilityPack.HtmlNodeCollection detailNodes = page.DocumentNode.SelectNodes(XPath.TELEPHONE);
    record.Telephone = GetContentByCollection(detailNodes, 0);
    record.MobilePhone = GetContentByCollection(detailNodes, 1);
    record.Fax = GetContentByCollection(detailNodes, 2);
    record.Address = GetContentByCollection(detailNodes, 3);
    record.Country_Region = GetContentByCollection(detailNodes, 4);
    record.Province = GetContentByCollection(detailNodes, 5);
    record.City = GetContentByCollection(detailNodes, 6);
    record.Facebook = GetContentByCollection(detailNodes, 7);

    record.URL = url;
    return record;
}