/// <summary>
/// Program entry point. Builds the hard-coded list of example queries, checks the SMTP
/// setup, restores previously explored state and then hands control to the main loop.
/// </summary>
/// <param name="args">Command-line arguments (not read by this method).</param>
static void Main(string[] args)
{
    // Example query: apartments with 1 bedroom and with pictures.
    Query oneBedroomQuery = new Query
    {
        query = "http://chicago.craigslist.org/search/chc/apa?maxAsk=1500&bedrooms=1&hasPic=1",

        // Latitude/longitude rectangle that bounds the search area.
        topLatN = 41.920114,
        bottomLatN = 41.8948,
        rightLonE = -87.6000,
        leftLonE = -87.6446,

        // Recipient of the matches for that search.
        emailRecipient = "*****@*****.**",

        // Address used to deliver a text message (feature offered by T-Mobile to their users).
        textRecipient = "*****@*****.**",

        // Unique ID for this example query.
        Id = new Guid("1d9beafd-4290-465f-adc8-2a2d83b43f33")
    };

    // Another query for someone else; created with enabled = false.
    Query twoBedroomQuery = new Query
    {
        query = "http://chicago.craigslist.org/search/apa?maxAsk=2800&bedrooms=2&hasPic=1",
        topLatN = 41.9125,
        bottomLatN = 41.8948,
        rightLonE = -87.6000,
        leftLonE = -87.6446,
        emailRecipient = "*****@*****.**",
        Id = new Guid("abb15e58-49ce-4df3-aef9-4218a636cc2d"),
        enabled = false
    };

    // Another query for someone else; created with enabled = false.
    Query roomQuery = new Query
    {
        query = "http://chicago.craigslist.org/search/chc/roo?maxAsk=800&hasPic=1",
        topLatN = 41.926331,
        bottomLatN = 41.891632,
        rightLonE = -87.6000,
        leftLonE = -87.719356,
        emailRecipient = "*****@*****.**",
        Id = new Guid("170E2366-0739-48F7-A314-92F79B48E1E6".ToLower()),
        enabled = false
    };

    List<Query> queries = new List<Query>();
    queries.Add(oneBedroomQuery);
    queries.Add(twoBedroomQuery);
    queries.Add(roomQuery);

    SMTPTools.TrySMTP();
    LoadExploredFromAppData(queries);
    RunMainLoop(queries);
}
/// <summary>
/// Executes one query on craigslist.
/// Downloads the search page at <c>q.query</c>, walks every post link matched by
/// <c>regexPost</c>, and for each link not yet recorded in <c>q.exploredPosts</c> downloads the
/// post, checks its coordinates against the query rectangle, compares its pictures with the
/// hashes in <c>q.exploredImages</c> and, when the post qualifies, sends the notifications.
/// </summary>
/// <param name="q">Query to run; its counters, <c>exploredPosts</c> and <c>exploredImages</c> are updated in place.</param>
static void ExecuteQuery(Query q)
{
    const string siteRoot = "http://chicago.craigslist.org";

    LogFile.Write("Quering:" + q.query);

    // The original concatenated the download onto "", so a null download already behaved
    // like an empty page; keep that behaviour explicitly instead of a dead null check.
    string searchResults = DownloadPage(q.query) ?? string.Empty;

    int countNew = 0;
    int countTotal = 0;
    HashSet<string> exploredOnPage = new HashSet<string>();

    for (Match m = regexPost.Match(searchResults); m.Success; m = m.NextMatch())
    {
        string link = m.Groups[0].Value;
        string postUrl = siteRoot + link;
        LogFile.Write("Found Link: " + postUrl);

        // The same link can be matched several times on one result page; handle it once.
        if (!exploredOnPage.Add(link))
        {
            continue;
        }

        // Stop scanning at the first link already seen by a previous run of this query.
        // NOTE(review): presumably relies on results being listed newest-first - confirm.
        if (q.exploredPosts.Contains(link))
        {
            LogFile.Write("Link already explored during a previous query");
            break;
        }

        q.newLinks++;
        countNew++;
        q.exploredPosts.Add(link);
        Console.WriteLine(countNew + ") Found new URL for " + q.emailRecipient + ", parsing for location...");

        ExamineNewPost(q, postUrl);
        countTotal++;
    }

    Console.WriteLine(DateTime.Now.ToString() + " Total Links: " + countTotal + " New:" + countNew);
}

/// <summary>
/// Downloads one post and classifies it as: no coordinates, outside the query rectangle,
/// duplicate (by pictures) or new match. Updates the matching counter on <paramref name="q"/>
/// and sends notifications for a new match when <c>q.enabled</c> is set.
/// </summary>
private static void ExamineNewPost(Query q, string postUrl)
{
    // NOTE(review): DownloadPage is defined elsewhere; a null result is treated as an empty
    // page here so that Regex.Match does not throw ArgumentNullException.
    string postPage = DownloadPage(postUrl) ?? string.Empty;

    Match mc = regexCoord.Match(postPage);
    if (!mc.Success)
    {
        LogFile.Write("No coordinates found");
        Console.WriteLine("No coordinates found");
        q.noLocation++;
        return;
    }

    // Coordinates come from page markup, so parse them culture-independently; the current
    // culture may use ',' as the decimal separator.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;
    double lat = Convert.ToDouble(mc.Groups[1].Value, invariant);
    double lon = Convert.ToDouble(mc.Groups[2].Value, invariant);

    bool insideArea = lat > q.bottomLatN && lat < q.topLatN
                   && lon > q.leftLonE && lon < q.rightLonE;
    if (!insideArea)
    {
        LogFile.Write("Located outside of desired area");
        Console.WriteLine("Located outside of desired area");
        q.outsideArea++;
        return;
    }

    int totalPics;
    int matchedPics;
    CountPreviouslySeenPictures(q, postPage, out totalPics, out matchedPics);

    // Duplicate when more than 50% of the pictures were already seen. Done in integer
    // arithmetic: the original integer division only reached 1 when every picture matched,
    // and divided by zero when the post had no picture.
    bool mostlySeenBefore = totalPics != 0 && matchedPics * 2 > totalPics;
    if (mostlySeenBefore)
    {
        LogFile.Write("Found identical images, probably duplicated post");
        Console.WriteLine("Math has identical images, probably a duplicated post: " + postUrl);
        q.duplicateImage++;
        return;
    }

    if (q.enabled)
    {
        SendMatchNotifications(q, postUrl, postPage);
    }

    LogFile.Write("There is a new match:" + postUrl);
    Console.WriteLine("There is a new match:");
    Console.WriteLine(postUrl);
    q.validatedAndSent++;
}

/// <summary>
/// Downloads every distinct small picture referenced by the post, hashes it, and counts how
/// many hashes are already present in <c>q.exploredImages</c>. Unknown hashes are added to
/// <c>q.exploredImages</c> and appended to the persisted <c>exploredImages</c> setting.
/// </summary>
private static void CountPreviouslySeenPictures(Query q, string postPage, out int totalPics, out int matchedPics)
{
    const int imgSize = 16;

    totalPics = 0;
    matchedPics = 0;
    HashSet<string> seenInThisPost = new HashSet<string>();

    for (Match mi = regexSmallPicture.Match(postPage); mi.Success; mi = mi.NextMatch())
    {
        string imageUrl = mi.Groups[0].Value;
        string localFilename = imageUrl.Replace("http://images.craigslist.org/", "");
        if (!seenInThisPost.Add(localFilename))
        {
            continue;
        }

        totalPics++;

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(imageUrl, localFilename);
        }

        BitArray fingerprint;
        // NOTE(review): FromFile is a helper defined elsewhere; it is assumed to return a new
        // image scaled to the requested size that this method owns and may dispose.
        using (Bitmap img = (Bitmap)FromFile(localFilename, new Size(imgSize, imgSize)))
        {
            fingerprint = ComputeBrightnessFingerprint(img, imgSize * imgSize);
        }

        byte[] packed = new byte[fingerprint.Length / 8];
        fingerprint.CopyTo(packed, 0);
        string hash = string.Join(string.Empty, Array.ConvertAll(packed, b => b.ToString("X2")));

        int minDistance = int.MaxValue;
        foreach (String hexaImage in q.exploredImages)
        {
            int dist = HammingDistanceToHexHash(fingerprint, hexaImage);
            if (dist >= 0 && dist < minDistance)
            {
                minDistance = dist;
            }
        }

        if (minDistance != 0)
        {
            q.exploredImages.Add(hash);
            CraigslistWatcher2.Settings.Default.exploredImages += hash + ",";
            CraigslistWatcher2.Settings.Default.Save();
        }
        else
        {
            LogFile.Write("Picture match previous post:" + localFilename);
            matchedPics++;
        }

        // The downloaded file is left on disk, as in the original (its File.Delete call was
        // commented out).
    }
}

/// <summary>
/// Builds the bit fingerprint of an image: one bit per pixel, set when the pixel's grey
/// level (mean of R, G and B) is above the average grey level of all visited pixels.
/// </summary>
/// <param name="img">Image to hash; visited column by column.</param>
/// <param name="bitCount">Length of the fingerprint; pixels beyond the image keep a zero grey level.</param>
private static BitArray ComputeBrightnessFingerprint(Bitmap img, int bitCount)
{
    byte[] grey = new byte[bitCount];
    int idx = 0;
    int sum = 0;

    for (int x = 0; x < img.Width; x++)
    {
        for (int y = 0; y < img.Height; y++)
        {
            Color c = img.GetPixel(x, y);
            grey[idx] = (byte)((c.R + c.G + c.B) / 3);
            sum += grey[idx];
            idx++;
        }
    }

    int avg = sum / idx;

    bool[] aboveAverage = new bool[grey.Length];
    for (int i = 0; i < grey.Length; i++)
    {
        aboveAverage[i] = grey[i] > avg;
    }

    return new BitArray(aboveAverage);
}

/// <summary>
/// Returns the number of differing bits between <paramref name="fingerprint"/> and a stored
/// hexadecimal hash, or -1 when the stored hash is empty or encodes a different number of
/// bits (such entries cannot be compared and must not count as a distance of zero).
/// </summary>
private static int HammingDistanceToHexHash(BitArray fingerprint, string hexHash)
{
    // Each hex character encodes 4 bits.
    if (String.IsNullOrEmpty(hexHash) || hexHash.Length * 4 != fingerprint.Length)
    {
        return -1;
    }

    byte[] bytes = new byte[hexHash.Length / 2];
    for (int i = 0; i < bytes.Length; i++)
    {
        bytes[i] = Convert.ToByte(hexHash.Substring(i * 2, 2), 16);
    }

    BitArray stored = new BitArray(bytes);
    int dist = 0;
    for (int i = 0; i < stored.Length; i++)
    {
        if (stored[i] != fingerprint[i])
        {
            dist++;
        }
    }

    return dist;
}

/// <summary>
/// Sends the HTML e-mail for a new match to <c>q.emailRecipient</c> and, when
/// <c>q.textRecipient</c> is set, a plain-text message containing only the post URL.
/// </summary>
private static void SendMatchNotifications(Query q, string postUrl, string postPage)
{
    string title = "";
    Match mt = regexTitle.Match(postPage);
    if (mt.Success)
    {
        title = mt.Groups[1].Value;
    }

    // Collect each large picture URL once, in order of first appearance.
    List<string> largeImages = new List<string>();
    for (Match li = regexLargePicture.Match(postPage); li.Success; li = li.NextMatch())
    {
        if (!largeImages.Contains(li.Groups[0].Value))
        {
            largeImages.Add(li.Groups[0].Value);
        }
    }

    string body = "I found a new match in the desired area."
                + "See <a href=\"" + postUrl + "\">" + postUrl + "</a><br>"
                + "<br><br><br>";
    foreach (string src in largeImages)
    {
        body += "<img src=\"" + src + "\" alt=\"craigslist image\" style=\"width:300px\"><br>";
    }
    body += "queryId=" + q.Id;

    SMTPTools.SendMail(q.emailRecipient, "MATCH! " + title, body, true);
    LogFile.Write("Mail sent");

    if (!String.IsNullOrEmpty(q.textRecipient))
    {
        SMTPTools.SendMail(q.textRecipient, "MATCH! " + title, postUrl, false);
        LogFile.Write("Text message sent");
    }
}