Ejemplo n.º 1
0
        /// <summary>
        /// Extracts the product description text from a parsed product page.
        /// </summary>
        /// <remarks>
        /// The element with id "productDescription" is used when present. Otherwise the node two
        /// siblings after the element with id "pd-available" is read, its inner text is handed to
        /// <c>GetJavaIFrame</c>, the result is URL-decoded and parsed as HTML, and
        /// "productDescription" is looked up in that embedded document. The fallback is best-effort:
        /// any failure results in an empty description rather than an exception.
        /// </remarks>
        /// <param name="Html">Parsed product page.</param>
        /// <returns>The description's inner text, or an empty string when no description is found.</returns>
        private string GetDescription(HtmlAgilityPack.HtmlDocument Html)
        {
            var description = Html.GetElementbyId("productDescription");

            if (description == null)
            {
                try
                {
                    // Explicit null checks instead of relying on a NullReferenceException when the
                    // anchor element or one of its siblings is missing.
                    var anchor = Html.GetElementbyId("pd-available");
                    var scriptNode = anchor == null || anchor.NextSibling == null
                        ? null
                        : anchor.NextSibling.NextSibling;

                    if (scriptNode != null)
                    {
                        var frame = GetJavaIFrame(scriptNode.InnerText);
                        var htmlcode = WebUtility.UrlDecode(frame);

                        var doc = new HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(htmlcode);

                        // Only assign on success: the intermediate script node must never be
                        // returned as if it were the description.
                        description = doc.GetElementbyId("productDescription");
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort fallback: keep going with no description, but leave a trace.
                    System.Diagnostics.Debug.WriteLine("GetDescription fallback failed: " + ex.Message);
                }
            }

            return description == null ? "" : description.InnerText;
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Checks that every element id listed in <c>elementXPath</c> exists in the document and sits
 /// at the XPath recorded for it.
 /// </summary>
 /// <param name="document">Document to inspect.</param>
 /// <param name="status">Not read by this method.</param>
 /// <returns>
 /// <c>null</c> when every element is present at its expected XPath; otherwise the accumulated
 /// error messages, one per offending element.
 /// </returns>
 public string Validate(HtmlAgilityPack.HtmlDocument document, HtmlStaticizeContext status)
 {
     var problems = new StringBuilder();

     foreach (var elementId in this.elementXPath.Keys)
     {
         string expectedXPath = elementXPath[elementId];
         var node = document.GetElementbyId(elementId);

         if (node == null)
         {
             // Element is missing from the document altogether.
             problems.AppendFormat("\r\n元素 \"{0}\" 在文档中不存在。", elementId);
         }
         else if (node.XPath != expectedXPath)
         {
             // Element exists but at a different location than recorded.
             problems.AppendFormat("\r\n元素 \"{0}\" XPath 不匹配,应为\"{1}\",但实际为\"{2}\"。\r\n行号:{3}\r\n源HTML:\r\n{4}\r\n", elementId, expectedXPath, node.XPath, node.Line.ToString(), node.OuterHtml);
         }
     }

     if (problems.Length == 0)
     {
         return null;
     }

     return problems.ToString();
 }
Ejemplo n.º 3
0
		/// <summary>
		/// Downloads one content page, saves the inner HTML of its "read_tpc" element to
		/// "&lt;page name&gt;.txt", and downloads the img elements returned by
		/// <c>contentNode.Elements("img")</c> as "&lt;page name&gt;-N.jpg".
		/// </summary>
		/// <remarks>
		/// The page is skipped when its .txt file already exists. Failures are reported on the
		/// console and never thrown to the caller. All streams and file handles are released even
		/// when a download or write fails.
		/// </remarks>
		/// <param name="httpClient">Client used for the page and image requests.</param>
		/// <param name="folderpath">Folder that receives the .txt and .jpg files.</param>
		/// <param name="htmlDoc">Document instance the downloaded page is loaded into.</param>
		/// <param name="lstImgurl">Receives the src value of every image that is attempted.</param>
		/// <param name="link">Page link; appended to <c>baseurl</c> to build the request URL.</param>
		private static void parsecontent(HttpClient httpClient, string folderpath, HtmlAgilityPack.HtmlDocument htmlDoc, List<string> lstImgurl, string link)
		{
			// Page name = last path segment up to its first '.'; a segment without any '.' is
			// used as-is instead of making Substring throw on a -1 index.
			var name = link.Substring(link.LastIndexOf('/') + 1);
			var dotIndex = name.IndexOf('.');
			if (dotIndex >= 0)
			{
				name = name.Substring(0, dotIndex);
			}

			string urlPageName = name;
			string resname = Path.Combine(folderpath, urlPageName);
			string contenturl = baseurl + link;

			string txtfullname = resname + ".txt";
			if (File.Exists(txtfullname))
			{
				Console.WriteLine("page {0} has download =>{1}", link, urlPageName);
				return;
			}

			try
			{
				// GetAwaiter().GetResult() rethrows the original exception instead of an
				// AggregateException, so the message logged below is the real cause.
				using (Stream pageStream = httpClient.GetStreamAsync(contenturl).GetAwaiter().GetResult())
				{
					htmlDoc.Load(pageStream, Encoding.UTF8);
				}

				Console.WriteLine("load html " + contenturl);
			}
			catch (Exception ex)
			{
				Console.WriteLine("load html error: " + ex.Message);
				return;
			}

			var contentNode = htmlDoc.GetElementbyId("read_tpc");
			if (contentNode == null)
			{
				// Nothing to save: the page has no content element.
				Console.WriteLine("load html error: element read_tpc not found in " + contenturl);
				return;
			}

			try
			{
				// WriteAllBytes creates the file, writes it and closes the handle even on failure;
				// encoding by hand keeps the output free of a UTF-8 byte order mark.
				File.WriteAllBytes(txtfullname, Encoding.UTF8.GetBytes(contentNode.InnerHtml));
			}
			catch (Exception ex)
			{
				Console.WriteLine("get torrent failed! " + ex.Message);
			}

			int imgIndex = 0;
			foreach (var imgNode in contentNode.Elements("img"))
			{
				string imgurl = null;
				try
				{
					var srcAttribute = imgNode.Attributes["src"];
					if (srcAttribute == null)
					{
						Console.WriteLine("get img {0} failed! {1} ", imgurl, "img element has no src attribute");
						continue;
					}

					imgurl = srcAttribute.Value;
					var downImgname = resname + "-" + (++imgIndex) + ".jpg";
					lstImgurl.Add(imgurl);

					long savedLength;
					// The download stream is obtained before the file is created, so a failed
					// request does not leave an empty file behind. File.Create truncates any
					// previous file of the same name.
					using (Stream imgStream = httpClient.GetStreamAsync(imgurl).GetAwaiter().GetResult())
					using (FileStream fsimg = File.Create(downImgname))
					{
						imgStream.CopyTo(fsimg);
						savedLength = fsimg.Length;
					}

					// Files under 100 bytes are not reported as saved.
					if (savedLength < 100)
					{
						continue;
					}

					Console.WriteLine("save img => " + downImgname);
				}
				catch (Exception ex)
				{
					// Report the URL that failed, not the whole URL list.
					Console.WriteLine("get img {0} failed! {1} ", imgurl, ex.Message);
				}
			}
		}
Ejemplo n.º 4
0
        /// <summary>
        /// Extracts the product title from a parsed product page.
        /// </summary>
        /// <remarks>
        /// The element with id "productTitle" is preferred; "btAsinTitle" is the fallback.
        /// </remarks>
        /// <param name="Html">Parsed product page.</param>
        /// <returns>
        /// The trimmed inner text of the title element, or an empty string when neither element
        /// exists (instead of failing with a NullReferenceException).
        /// </returns>
        private string GetTitle(HtmlAgilityPack.HtmlDocument Html)
        {
            var title = Html.GetElementbyId("productTitle") ?? Html.GetElementbyId("btAsinTitle");

            if (title == null)
            {
                return "";
            }

            return title.InnerText.Trim();
        }
        /// <summary>
        /// Populates this item from a parsed full item details page.
        /// </summary>
        /// <remarks>
        /// Reads the first <c>time</c> element, the first <c>h2</c> element and the element with id
        /// "userbody". Each piece is optional: a missing <c>h2</c> leaves <c>Title</c> untouched, a
        /// missing <c>time</c> leaves <c>DatePosted</c> untouched, and a missing "userbody" leaves
        /// <c>Images</c>, <c>DescriptionHTML</c>, <c>PhoneNumber</c> and the dodginess score untouched.
        /// </remarks>
        /// <param name="htmlDoc">Parsed item details page.</param>
        /// <param name="url">URL of the page; stored in <c>LinkURL</c>.</param>
        public void updateCraigslistInfoFromFullItemDetailsPage(HtmlAgilityPack.HtmlDocument htmlDoc, string url)
        {
            HtmlAgilityPack.HtmlNode time = htmlDoc.DocumentNode.SelectSingleNode("//time");
            HtmlAgilityPack.HtmlNode title = htmlDoc.DocumentNode.SelectSingleNode("//h2");
            HtmlAgilityPack.HtmlNode bodyElement = htmlDoc.GetElementbyId("userbody");

            // Read the body text once; it is needed for both the image list and the description.
            string bodyText = bodyElement == null ? null : bodyElement.InnerText;

            //
            // Example HTML for these pages can be seen at URLs like:
            // view-source:http://vancouver.en.craigslist.ca/van/bik/3436265260.html  - no images
            // view-source:http://vancouver.en.craigslist.ca/rds/bik/3451242524.html  - got images
            //

            //
            // Brittle string parsing: the image URLs are taken from the "imgList = [...]" script
            // text inside the body rather than from the <img> tags.
            // TODO: revert to parsing the <img> tags, as that is the better approach.
            //
            if (bodyText != null)
            {
                // Ordinal search, matching what string.Contains would have found.
                int listStart = bodyText.IndexOf("imgList =", StringComparison.Ordinal);
                if (listStart >= 0)
                {
                    string scriptText = bodyText.Substring(listStart).Replace("\"", "").Replace("\n", String.Empty).Replace("\r", String.Empty);

                    // Only slice when a '[' is followed by a ']'; otherwise leave Images alone
                    // instead of throwing on a negative length.
                    int open = scriptText.IndexOf('[');
                    int close = open < 0 ? -1 : scriptText.IndexOf(']', open + 1);
                    if (close > open)
                    {
                        this.Images = scriptText.Substring(open + 1, close - open - 1)
                            .Split(',')
                            .Select(entry => entry.Trim())
                            .Where(entry => entry.Length > 0)
                            .ToList();
                    }
                }
            }

            //
            // title is on the format
            //    <h2 class="postingtitle">2008 Kona Dawg - $1200 (Delta, BC)</h2>
            //
            if (title != null)
            {
                this.Title = title.InnerText;
            }
            this.LinkURL = url;

            if (time != null)
            {
                this.DatePosted = time.InnerText;
            }

            if (bodyElement != null)
            {
                this.DescriptionHTML = bodyText.Replace("\t", String.Empty).Replace("\n", String.Empty);
                this.PhoneNumber = this.extractPhoneNumber(bodyElement);
                this.calculateDodginessScore(bodyElement);
            }
        }