Ejemplo n.º 1
0
        /// <summary>
        /// Cleans <paramref name="item"/> with <c>DataHandler</c> and appends it to
        /// <paramref name="dt"/> as a new row. Records whose cleaned company name is
        /// empty are skipped.
        /// </summary>
        /// <param name="dt">Table that receives the row; it must already contain the columns written below.</param>
        /// <param name="item">Raw record to clean and store.</param>
        public static void Add2Dt(DataTable dt, Data item)
        {
            // Clean the raw record first; everything below works on the cleaned copy.
            Data cleaned = DataHandler(item);

            // A record without a company name is not stored.
            if (cleaned.CompanyName.Trim().Length == 0)
            {
                return;
            }

            DataRow row = dt.NewRow();

            // Company and contact person.
            row["CompanyName"] = cleaned.CompanyName;
            row["Phase"] = cleaned.Phase;
            row["Contacts"] = cleaned.Contacts;
            row["Post"] = cleaned.Post;

            // Phone numbers.
            row["Telephone"] = cleaned.Telephone;
            row["MobilePhone"] = cleaned.MobilePhone;
            row["Fax"] = cleaned.Fax;

            // Location.
            row["Address"] = cleaned.Address;
            row["Country_Region"] = cleaned.Country_Region;
            row["Province"] = cleaned.Province;
            row["City"] = cleaned.City;

            // Web presence.
            row["Facebook"] = cleaned.Facebook;
            row["URL"] = cleaned.URL;

            dt.Rows.Add(row);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Cleans a raw scraped record and returns the result as a new <c>Data</c> instance;
        /// the input object is not modified.
        /// </summary>
        /// <remarks>
        /// <para>CompanyName, Contacts, Post and URL are trimmed.</para>
        /// <para>
        /// Phase is reduced to the distinct single digits that directly follow the word "Phase",
        /// comma-separated, with the last distinct match first (e.g. "Phase1 Phase2" becomes "2,1").
        /// </para>
        /// <para>
        /// Telephone, MobilePhone, Fax, Address, Country_Region, Province, City and Facebook are
        /// treated as "label: value" text: everything after the FIRST ':' is kept and trimmed, so
        /// values that themselves contain ':' (URLs, some addresses) are no longer truncated.
        /// If there is no ':' the trimmed text is kept as-is instead of throwing.
        /// </para>
        /// <para>A null field is treated as an empty string.</para>
        /// </remarks>
        /// <param name="item">The raw record to clean.</param>
        /// <returns>A new <c>Data</c> holding the cleaned values.</returns>
        public static Data DataHandler(Data item)
        {
            Data item_new = new Data();
            item_new.CompanyName = (item.CompanyName ?? "").Trim();

            // Phase: collect every "Phase<digit>" occurrence (exactly one digit after the word).
            string phase = "";
            System.Text.RegularExpressions.MatchCollection mc =
                System.Text.RegularExpressions.Regex.Matches((item.Phase ?? "").Trim(), "Phase[0-9]");
            foreach (System.Text.RegularExpressions.Match match in mc)
            {
                // Skip values already collected; new values are prepended, so the
                // resulting list is in reverse order of first appearance.
                if (phase.IndexOf(match.Value, System.StringComparison.Ordinal) == -1)
                {
                    phase = match.Value + "," + phase;
                }
            }

            // Drop the trailing separator left by the loop, then keep only the digits.
            if (phase.Length != 0)
            {
                phase = phase.Substring(0, phase.Length - 1);
            }
            item_new.Phase = phase.Replace("Phase", "");

            item_new.Contacts = (item.Contacts ?? "").Trim();
            item_new.Post = (item.Post ?? "").Trim();

            // These fields arrive as "label: value"; keep only the value part.
            item_new.Telephone = ValueAfterLabel(item.Telephone);
            item_new.MobilePhone = ValueAfterLabel(item.MobilePhone);
            item_new.Fax = ValueAfterLabel(item.Fax);
            item_new.Address = ValueAfterLabel(item.Address);
            item_new.Country_Region = ValueAfterLabel(item.Country_Region);
            item_new.Province = ValueAfterLabel(item.Province);
            item_new.City = ValueAfterLabel(item.City);
            item_new.Facebook = ValueAfterLabel(item.Facebook);

            item_new.URL = (item.URL ?? "").Trim();

            return item_new;
        }

        /// <summary>
        /// Returns the trimmed text that follows the first ':' in <paramref name="labeled"/>.
        /// When there is no ':' the whole text is returned trimmed; null yields an empty string.
        /// </summary>
        private static string ValueAfterLabel(string labeled)
        {
            if (labeled == null)
            {
                return "";
            }

            int colon = labeled.IndexOf(':');
            string value = colon < 0 ? labeled : labeled.Substring(colon + 1);
            return value.Trim();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Loads the HTML document for <paramref name="url"/> through <c>GetHTML</c> and fills a
        /// single <c>Data</c> record from it, using the XPath expressions defined on <c>XPath</c>.
        /// </summary>
        /// <param name="url">Address of the page to collect; it is also stored in the record's URL field.</param>
        /// <returns>The record populated from the page. The values are raw (not yet cleaned).</returns>
        public static Data GetDataByUrl(string url)
        {
            Data record = new Data();
            HtmlAgilityPack.HtmlDocument page = GetHTML(url);

            record.CompanyName = GetContentByXpath(page, XPath.COMPANYNAME);

            // Progress output: which page was read and which company it yielded.
            Console.WriteLine("url==>{0},company==>{1}", url, record.CompanyName);

            record.Phase = GetContentByXpath(page, XPath.PHASE);
            record.Contacts = GetContentByXpath(page, XPath.CONTACTS);
            record.Post = GetContentByXpath(page, XPath.POST);

            // One XPath query returns all detail nodes; each field is picked by its position.
            // NOTE(review): HtmlAgilityPack's SelectNodes returns null when nothing matches —
            // presumably GetContentByCollection copes with that; confirm.
            HtmlAgilityPack.HtmlNodeCollection detailNodes = page.DocumentNode.SelectNodes(XPath.TELEPHONE);
            record.Telephone = GetContentByCollection(detailNodes, 0);
            record.MobilePhone = GetContentByCollection(detailNodes, 1);
            record.Fax = GetContentByCollection(detailNodes, 2);
            record.Address = GetContentByCollection(detailNodes, 3);
            record.Country_Region = GetContentByCollection(detailNodes, 4);
            record.Province = GetContentByCollection(detailNodes, 5);
            record.City = GetContentByCollection(detailNodes, 6);
            record.Facebook = GetContentByCollection(detailNodes, 7);

            record.URL = url;
            return record;
        }