/// <summary>
/// Extracts the raw data from craigslist by hitting
/// http://vancouver.en.craigslist.ca/search/sss?query=bike+kona+dawg&srchType=A&format=rss
/// then breaks down the individual links and hits those concurrently, parsing out the relevant details.
/// I cannot find a reliable paging parameter for this RSS feed. The start-at parameter is &s=75,
/// but you only get 25 items per page and the RSS feed gives no clue how many matching items
/// there are in total. The non-RSS version of the page has a page count / total matching items count.
///
/// See also another piece of software that does this stuff too:
/// http://www.zentastic.com/blog/zencash/
/// </summary>
/// <param name="cityName"></param>
/// <param name="stolenDate"></param>
/// <param name="pageNumber"></param>
/// <param name="itemDescription"></param>
/// <returns></returns>
public async Task<CraigslistInfo[]> GetCraigslistInfoByRSS(string cityName, string stolenDate, int pageNumber, string itemDescription)
{
    // Create an XML document object and load the RSS feed into it.
    XmlDocument xmlDoc = new XmlDocument();
    xmlDoc.Load(String.Format("http://{0}.en.craigslist.ca/search/sss?query={1}&srchType=A&format=rss&s={2}",
                              cityName, itemDescription, pageNumber * RSS_RESULTS_PAGE_FEED_PAGE_SIZE));

    //
    // Get the item elements.
    // Again a bit of fragile code here (hard-coded child indexes into the feed), but whaddaya gonna do!
    //
    XmlNodeList items = xmlDoc.ChildNodes[1].ChildNodes[0].ChildNodes[13].ChildNodes[0].ChildNodes;

    if (items.Count > 0)
    {
        if (items.Count < RSS_RESULTS_PAGE_FEED_PAGE_SIZE)
        {
            //
            // This multitasking stuff is a bit of a bastard.
            // Used this link http://stackoverflow.com/questions/10806951/how-to-limit-the-amount-of-concurrent-async-i-o-operations and also
            // see http://msdn.microsoft.com/en-gb/library/vstudio/hh300224.aspx
            // and especially http://msdn.microsoft.com/en-us/library/hh556530.aspx
            //
            return await GetAllUrlsAsync(items);
        }
        else
        {
            //
            // Break all the REST rules - which say the error message should be in the statusText header
            // see: http://stackoverflow.com/questions/1077340/best-way-to-return-error-messages-on-rest-services
            // Instead do it wrong and return an error disguised as the object.
            //
            CraigslistInfo cockupTooManyResults = new CraigslistInfo();
            cockupTooManyResults.DescriptionHTML = "Error";
            cockupTooManyResults.Title = String.Format("More than {0} results returned, please be more selective", RSS_RESULTS_PAGE_FEED_PAGE_SIZE);
            cockupTooManyResults.DodgyScore = 0;

            CraigslistInfo[] results = new CraigslistInfo[1];
            results[0] = cockupTooManyResults;
            return results;
        }
    }
    else
    {
        //
        // Break all the REST rules - which say the error message should be in the statusText header
        // see: http://stackoverflow.com/questions/1077340/best-way-to-return-error-messages-on-rest-services
        // Instead do it wrong and return an error disguised as the object.
        // Oh yeah, and breach the Don't Repeat Yourself principle too!
        //
        CraigslistInfo cockupNoResults = new CraigslistInfo();
        cockupNoResults.DescriptionHTML = "Error";
        cockupNoResults.Title = "No results returned, alter your search terms";
        cockupNoResults.DodgyScore = 0;

        CraigslistInfo[] results = new CraigslistInfo[1];
        results[0] = cockupNoResults;
        return results;
    }
}
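//
// A minimal sketch (not the original implementation) of one way GetAllUrlsAsync could be written,
// following the StackOverflow / MSDN links referenced above: a SemaphoreSlim caps the number of
// detail pages fetched concurrently, and each page is handed to ProcessURL. The "Sketch" name,
// the concurrency limit of 5 and the assumption that each RSS item exposes its detail URL in a
// <link> child element are all illustrative, not taken from the original code.
//
private static async Task<CraigslistInfo[]> GetAllUrlsAsyncSketch(XmlNodeList items)
{
    const int maxConcurrentRequests = 5; // assumed limit - tune as needed

    using (SemaphoreSlim throttle = new SemaphoreSlim(maxConcurrentRequests))
    using (HttpClient client = new HttpClient())
    {
        List<Task<CraigslistInfo>> tasks = new List<Task<CraigslistInfo>>();

        foreach (XmlNode item in items)
        {
            XmlElement linkElement = item["link"]; // assumption: the detail URL lives in <link>
            if (linkElement == null)
            {
                continue;
            }
            string url = linkElement.InnerText;

            // Wait for a free slot before firing off the next request
            await throttle.WaitAsync();
            tasks.Add(Task.Run(async () =>
            {
                try
                {
                    return await ProcessURL(url, client);
                }
                finally
                {
                    throttle.Release();
                }
            }));
        }

        return await Task.WhenAll(tasks);
    }
}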
/// <summary>
/// Parses data from an HTML node for a listing item to populate most of the data into a CraigslistInfo object
/// </summary>
/// <param name="item"></param>
/// <returns></returns>
public static CraigslistInfo transformSummaryHTMLNodeIntoCraigslistInfo(HtmlAgilityPack.HtmlNode item)
{
    CraigslistInfo craigslistInfo = new CraigslistInfo();

    //
    // The node is a block of HTML that looks like this:
    //
    // <p class="row" data-latitude="" data-longitude="">
    //   <span class="ih" id="images:3Ee3K43m25Ne5K95Hecbmc8b000ac1a0e1376.jpg"> </span>
    //   <span class="itemdate"> Dec 11</span>
    //   <span class="itemsep"> - </span>
    //   <a href="http://vancouver.en.craigslist.ca/pml/ctd/3427888027.html">2003 HYUNDAI TIBURON GT</a>
    //   <span class="itemsep"> - </span>
    //   <span class="itemph"></span>
    //   <span class="itempp"> $6595</span>
    //   <span class="itempn"><font size="-1"> (MISSION)</font></span>
    //   <span class="itemcg" title="ctd"> <small class="gc"><a href="/ctd/">cars & trucks - by dealer</a></small></span>
    //   <span class="itempx"> <span class="p"> pic</span></span>
    //   <br class="c">
    // </p>
    //

    //
    // First of all parse the date, e.g. " Dec 11".
    // Note: the XPath queries below are prefixed with "." so they are evaluated relative to this item
    // node - a bare "//" query searches the whole document and always returns the first row's values,
    // which is why the old version threw on every row. The listing carries no year, so the current
    // year is assumed, and the parse uses the invariant culture so "Dec" etc. match regardless of the
    // server's locale.
    // .:TODO:. check how DatePostedParsed is used downstream - the old output was breaking the page in jscript.
    //
    string[] formats = { "MMM d", "MMM dd" };
    string dateText = item.SelectSingleNode(".//span[@class='itemdate']").InnerText.Trim();
    try
    {
        craigslistInfo.DatePostedReal = DateTime.ParseExact(dateText, formats,
                                                            System.Globalization.CultureInfo.InvariantCulture,
                                                            System.Globalization.DateTimeStyles.AssumeUniversal);
    }
    catch (FormatException)
    {
        craigslistInfo.DatePostedReal = DateTime.MinValue;
        Console.WriteLine("Unable to convert '{0}' to a date.", dateText);
    }

    //
    // Next the price, sale area, link, title and category
    //
    craigslistInfo.Price = item.SelectSingleNode(".//span[@class='itempp']").InnerText.Trim();
    craigslistInfo.SaleArea = item.SelectSingleNode(".//font").InnerText;
    craigslistInfo.LinkURL = item.ChildNodes[7].Attributes["href"].Value;
    craigslistInfo.ShortTitle = item.ChildNodes[7].InnerText;
    craigslistInfo.Category = item.ChildNodes[17].SelectSingleNode(".//a").InnerText;
    craigslistInfo.DatePostedParsed = craigslistInfo.DatePostedReal.ToString("dd MMM yyyy HH:mm:ss");

    return craigslistInfo;
}
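//
// Illustrative sketch (not part of the original code) of why the node-relative ".//" prefix used above
// matters: in the HtmlAgilityPack an XPath query starting with "//" is evaluated from the document root
// even when called on an individual node, so every row would pick up the first row's values.
// The markup and method name below are made up purely for demonstration.
//
public static void RelativeXPathExample()
{
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml("<div>" +
                 "  <p class='row'><span class='itempp'> $100</span></p>" +
                 "  <p class='row'><span class='itempp'> $200</span></p>" +
                 "</div>");

    HtmlAgilityPack.HtmlNode secondRow = doc.DocumentNode.SelectNodes("//p[@class='row']")[1];

    // Document-wide search: returns " $100", the FIRST row's price, despite being called on secondRow
    string wrong = secondRow.SelectSingleNode("//span[@class='itempp']").InnerText;

    // Node-relative search: returns " $200" as intended
    string right = secondRow.SelectSingleNode(".//span[@class='itempp']").InnerText;

    Console.WriteLine("document-wide: {0}  node-relative: {1}", wrong, right);
}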
/// <summary>
/// The actions from the foreach loop are moved to this async method.
/// </summary>
/// <param name="url"></param>
/// <param name="client"></param>
/// <returns></returns>
public static async Task<CraigslistInfo> ProcessURL(string url, HttpClient client)
{
    //
    // The detail page is not available in RSS, so we have to parse the HTML for it (much harder than the
    // nice XML you get from an RSS feed). To make this easier, use a special gizmo called the HtmlAgilityPack
    // that parses the HTML into a document tree (and fixes errors etc. in the HTML) - this allows us to
    // traverse the page safely.
    //
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();

    //
    // There are various options, set as needed.
    // Here I just want any errors tidied up to make a valid document tree so we can parse with confidence.
    //
    htmlDoc.OptionFixNestedTags = true;
    htmlDoc.Load(await client.GetStreamAsync(url), true);

    //
    // Now lots of magic to parse the data from all the HTML
    //
    CraigslistInfo currentListing = new CraigslistInfo();
    currentListing.updateCraigslistInfoFromFullItemDetailsPage(htmlDoc, url);
    return currentListing;
}
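//
// Illustrative usage only (the method name is hypothetical and the URL is just the example value
// from the sample markup above): a single detail page can be fetched through ProcessURL with a
// shared HttpClient instance.
//
public static async Task<CraigslistInfo> ExampleProcessSingleURL()
{
    using (HttpClient client = new HttpClient())
    {
        return await ProcessURL("http://vancouver.en.craigslist.ca/pml/ctd/3427888027.html", client);
    }
}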
/// <summary>
/// Extracts the raw data from craigslist by hitting
/// http://vancouver.en.craigslist.ca/search/sss?query=bike+kona+dawg&srchType=A
/// then breaks down the individual links and hits those concurrently, parsing out the relevant details.
/// We use the HTML page, not the RSS feed, since the RSS feed does not do paging correctly,
/// which makes things crap. The start-at parameter is &s=75.
///
/// See also another piece of software that does this stuff too:
/// http://www.zentastic.com/blog/zencash/
/// </summary>
/// <param name="cityName"></param>
/// <param name="stolenDate"></param>
/// <param name="pageNumber"></param>
/// <param name="itemDescription"></param>
/// <returns></returns>
public async Task<CraigslistInfo[]> GetCraigslistInfo(string cityName, string stolenDate, int pageNumber, string itemDescription)
{
    List<CraigslistInfo> listings = new List<CraigslistInfo>();

    //
    // The search result page is available as RSS, but that feed does not page properly, so use the HTML
    // version instead (much harder to parse than the nice XML you get from an RSS feed).
    // To make this easier, use a special gizmo called the HtmlAgilityPack that parses the HTML into a
    // document tree (and fixes errors etc. in the HTML) - this allows us to traverse the page structure safely.
    //
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();

    //
    // There are various options, set as needed.
    // Here I just want any errors tidied up to make a valid document tree so we can parse with confidence.
    // Note: we get this first URL synchronously, but call up the fine detail for each item within it
    // by hitting its detail URL asynchronously (and concurrently) for speed.
    //
    htmlDoc.OptionFixNestedTags = true;

    System.Net.WebRequest webReq = System.Net.WebRequest.Create(
        String.Format("http://{0}.en.craigslist.ca/search/sss?query={1}&srchType=A&s={2}",
                      cityName, itemDescription, pageNumber * HTML_RESULTS_PAGE_FEED_PAGE_SIZE));

    using (System.Net.WebResponse webRes = webReq.GetResponse())
    using (System.IO.Stream myStream = webRes.GetResponseStream())
    {
        if (myStream != null)
        {
            htmlDoc.Load(myStream);
        }
    }

    //
    // Use XPath queries (see http://www.w3schools.com/xpath/xpath_syntax.asp)
    // to parse the DOM the HtmlAgilityPack has made us.
    //
    //HtmlAgilityPack.HtmlNode pageCountNode = htmlDoc.DocumentNode.SelectSingleNode("//h4/b[2]");
    //string searchMatchCount = pageCountNode.InnerText.Substring(pageCountNode.InnerText.IndexOf("Found: "), pageCountNode.InnerText.IndexOf(" Displaying:") - pageCountNode.InnerText.IndexOf("Found: "));

    //
    // Get the individual items for sale
    //
    HtmlAgilityPack.HtmlNodeCollection itemNodes = htmlDoc.DocumentNode.SelectNodes("//p[@class='row']");

    if (itemNodes != null)
    {
        if (itemNodes.Count < MAX_RESULTS_WARN)
        {
            foreach (HtmlAgilityPack.HtmlNode item in itemNodes)
            {
                CraigslistInfo craigslistInfo = CraigslistInfo.transformSummaryHTMLNodeIntoCraigslistInfo(item);
                listings.Add(craigslistInfo);
            }
        }
        else
        {
            //
            // Break all the REST rules - which say the error message should be in the statusText header
            // see: http://stackoverflow.com/questions/1077340/best-way-to-return-error-messages-on-rest-services
            // Instead do it wrong and return an error disguised as the object.
            //
            CraigslistInfo cockupTooManyResults = new CraigslistInfo();
            cockupTooManyResults.DescriptionHTML = "Error";
            cockupTooManyResults.Title = String.Format("More than {0} results returned, please be more selective", MAX_RESULTS_WARN);
            cockupTooManyResults.DodgyScore = 0;

            CraigslistInfo[] results = new CraigslistInfo[1];
            results[0] = cockupTooManyResults;
            return results;
        }
    }
    else
    {
        //
        // Break all the REST rules - which say the error message should be in the statusText header
        // see: http://stackoverflow.com/questions/1077340/best-way-to-return-error-messages-on-rest-services
        // Instead do it wrong and return an error disguised as the object.
        // Oh yeah, and breach the Don't Repeat Yourself principle too!
        //
        CraigslistInfo cockupNoResults = new CraigslistInfo();
        cockupNoResults.DescriptionHTML = "";
        cockupNoResults.Title = "No results returned, alter your search terms";
        cockupNoResults.DodgyScore = 0;

        CraigslistInfo[] results = new CraigslistInfo[1];
        results[0] = cockupNoResults;
        return results;
    }

    return await UpdateAllListItemsWithDetailsFromTheirDetailPageAsync(listings);
}
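//
// A minimal sketch (not the original implementation) of how UpdateAllListItemsWithDetailsFromTheirDetailPageAsync
// could be written: fetch each listing's detail page concurrently, throttled the same way as the RSS path above,
// and let the existing updateCraigslistInfoFromFullItemDetailsPage method fill in the remaining fields.
// The "Sketch" name and the concurrency limit of 5 are illustrative assumptions.
//
private static async Task<CraigslistInfo[]> UpdateAllListItemsWithDetailsFromTheirDetailPageAsyncSketch(List<CraigslistInfo> listings)
{
    using (SemaphoreSlim throttle = new SemaphoreSlim(5))
    using (HttpClient client = new HttpClient())
    {
        List<Task> tasks = listings.Select(async listing =>
        {
            // Wait for a free slot so we never have more than the allowed number of requests in flight
            await throttle.WaitAsync();
            try
            {
                HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
                htmlDoc.OptionFixNestedTags = true;
                htmlDoc.Load(await client.GetStreamAsync(listing.LinkURL), true);
                listing.updateCraigslistInfoFromFullItemDetailsPage(htmlDoc, listing.LinkURL);
            }
            finally
            {
                throttle.Release();
            }
        }).ToList();

        await Task.WhenAll(tasks);
        return listings.ToArray();
    }
}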