/// <summary>
/// Scrapes the obituaries published on a funeral home's website and stores them
/// through <see cref="insertObitIntoDatabase"/>.
/// </summary>
/// <remarks>
/// Two sources are read in turn: the JSON tribute feed at
/// <c>/dynamic/tributes-getcurrent.json</c>, then the paged HTML listing at
/// <c>/obituaries/ObitSearchList/{page}</c>.
/// </remarks>
/// <param name="funeralHome">
/// Funeral home to scrape; its <c>Website</c> is used as the base URL and its
/// <c>Id</c> is stamped on every obituary.
/// </param>
/// <returns>
/// Number of obituaries for which <see cref="insertObitIntoDatabase"/> returned
/// <c>true</c> (inserted or updated).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="funeralHome"/> is null.</exception>
public int ScrapeObituaries(CWFuneralHome funeralHome)
{
    if (funeralHome == null)
    {
        throw new ArgumentNullException("funeralHome");
    }

    int recordCounter = 0;
    recordCounter += ScrapeTributeFeed(funeralHome);
    recordCounter += ScrapeObituaryListPages(funeralHome);
    return recordCounter;
}

/// <summary>
/// Reads the JSON tribute feed and stores one obituary per returned tribute.
/// </summary>
/// <returns>Number of tributes that were inserted or updated.</returns>
private int ScrapeTributeFeed(CWFuneralHome funeralHome)
{
    int recordCounter = 0;

    // The original author left an unfinished note that this feed "only returns ..."
    // something; presumably it is incomplete, which is why the HTML listing is
    // scraped as well. TODO confirm.
    var client = new RestClient(funeralHome.Website + "/dynamic/tributes-getcurrent.json");

    // NOTE(review): "resource/{id}" looks like a leftover from the RestSharp sample
    // code and {id} is never bound. Left untouched because the endpoint's behaviour
    // cannot be verified from here.
    var request = new RestRequest("resource/{id}", Method.GET);

    // Force JSON deserialization regardless of the content type the server reports.
    request.OnBeforeDeserialization = resp => { resp.ContentType = "application/json"; };

    IRestResponse<FuneralOneResponse> response = client.Execute<FuneralOneResponse>(request);
    if (response.Data == null || response.Data.tributes == null)
    {
        return recordCounter;
    }

    foreach (var obit in response.Data.tributes)
    {
        CWObituary obituary = new CWObituary
        {
            FirstName = obit.fn,
            LastName = obit.ln,
            MiddleName = obit.mn,
            FullName = obit.dn,
            CWFuneralHomeId = funeralHome.Id,
            ObituaryText = obit.obit,
            ObitURL = funeralHome.Website + "/" + obit.u,
            PictureURL = obit.tu,
            Age = 0
        };

        DateTime birthday;
        DateTime deathday;
        bool hasBirthday = DateTime.TryParse(obit.bd, out birthday);
        bool hasDeathday = DateTime.TryParse(obit.dd, out deathday);

        if (hasBirthday)
        {
            obituary.BirthDate = birthday;
        }
        else
        {
            obituary.BirthDate = null;
        }

        if (hasDeathday)
        {
            obituary.DeathDate = deathday;
        }
        else
        {
            obituary.DeathDate = null;
        }

        // Age stays 0 unless both dates are known and in a sensible order.
        if (hasBirthday && hasDeathday && deathday >= birthday)
        {
            obituary.Age = CalculateAgeInYears(birthday, deathday);
        }

        if (insertObitIntoDatabase(obituary))
        {
            recordCounter++;
        }
    }

    return recordCounter;
}

/// <summary>
/// Walks the paged HTML obituary listing and stores one obituary per list item.
/// </summary>
/// <returns>Number of list entries that were inserted or updated.</returns>
private int ScrapeObituaryListPages(CWFuneralHome funeralHome)
{
    // Upper bound on page numbers so a site that never signals the end cannot loop forever.
    const int pageLimit = 6000;

    int recordCounter = 0;
    var web = new HtmlWeb();

    for (int page = 1; page < pageLimit; page++)
    {
        string pageUrl = funeralHome.Website + "/obituaries/ObitSearchList/" + page.ToString();
        var document = web.Load(pageUrl);

        // An element whose class contains 'no-matches' marks the end of the listing.
        var outOfObits = document.DocumentNode.SelectNodes("//*[contains(@class,'no-matches')]");
        if (outOfObits != null && outOfObits.Count > 0)
        {
            break;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches.
        // A page without any <li> has nothing to scrape, so stop paging.
        var listItems = document.DocumentNode.SelectNodes("//li");
        if (listItems == null)
        {
            break;
        }

        foreach (HtmlNode li in listItems)
        {
            // The first <li class="pager"> ends the entries on this page.
            var classAttribute = li.Attributes["class"];
            if (classAttribute != null && classAttribute.Value == "pager")
            {
                break;
            }

            // A fresh entity per entry: reusing one instance would keep mutating a
            // record the database context is already tracking.
            CWObituary cwObit = BuildObituaryFromListItem(li, funeralHome);
            if (cwObit != null && insertObitIntoDatabase(cwObit))
            {
                recordCounter++;
            }
        }
    }

    return recordCounter;
}

/// <summary>
/// Builds an obituary from one listing <c>&lt;li&gt;</c>, or returns null when the
/// item does not have the expected <c>&lt;h3&gt;name&lt;span&gt;date&lt;/span&gt;&lt;/h3&gt;</c> shape.
/// </summary>
private static CWObituary BuildObituaryFromListItem(HtmlNode li, CWFuneralHome funeralHome)
{
    var heading = li.Descendants("h3").FirstOrDefault();
    if (heading == null)
    {
        return null;
    }

    // The name is the text of the heading up to its first child tag.
    string headingHtml = heading.InnerHtml;
    int markupStart = headingHtml.IndexOf('<');
    if (markupStart < 0)
    {
        return null;
    }

    // Split on any whitespace and drop empty tokens (double spaces, trailing blanks).
    string[] names = headingHtml.Substring(0, markupStart)
        .Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
    if (names.Length == 0)
    {
        return null;
    }

    int lastIndex = names.Length - 1;
    string firstName = names[0];
    string lastName = names[lastIndex].Replace(",", "");

    // A quoted second word is usually a nickname; the middle name then follows it.
    // InnerHtml may carry the quote literally or as the &quot; entity.
    int middleIndex = 1;
    if (names.Length > 1 && (names[1].Contains("\"") || names[1].Contains("&quot;")))
    {
        middleIndex = 2;
    }

    // Only a word strictly between the first and last name counts as a middle name.
    string middleName = middleIndex < lastIndex ? names[middleIndex] : "";

    string href = "";
    var anchor = li.Descendants("a").FirstOrDefault();
    if (anchor != null && anchor.Attributes["href"] != null)
    {
        href = funeralHome.Website + anchor.Attributes["href"].Value;
    }

    string imageSrc = "";
    var image = li.Descendants("img").FirstOrDefault();
    if (image != null && image.Attributes["src"] != null)
    {
        imageSrc = image.Attributes["src"].Value;
    }

    CWObituary cwObit = new CWObituary();
    cwObit.FirstName = firstName;
    cwObit.MiddleName = middleName;
    cwObit.LastName = lastName;
    cwObit.ObitURL = href;
    cwObit.CWFuneralHomeId = funeralHome.Id;

    // The "http:" prefix assumes protocol-relative image URLs ("//host/path") — TODO confirm.
    // Without an image the URL is left unset instead of storing a bare "http:".
    if (imageSrc.Length > 0)
    {
        cwObit.PictureURL = "http:" + imageSrc;
    }

    // The death date is the text of the first <span> inside the heading. A missing
    // or unparsable date leaves DeathDate unset, as the tribute feed path does.
    var dateSpan = heading.Descendants("span").FirstOrDefault();
    DateTime dateTimeDeath;
    if (dateSpan != null && DateTime.TryParse(dateSpan.InnerText, out dateTimeDeath))
    {
        cwObit.DeathDate = dateTimeDeath;
    }

    return cwObit;
}

/// <summary>
/// Returns the number of whole calendar years between two dates.
/// Callers must pass <paramref name="deathDate"/> on or after <paramref name="birthDate"/>.
/// </summary>
private static int CalculateAgeInYears(DateTime birthDate, DateTime deathDate)
{
    int years = deathDate.Year - birthDate.Year;

    // Step back one year if the birthday had not yet been reached in the final year.
    if (deathDate < birthDate.AddYears(years))
    {
        years--;
    }

    return years;
}
/// <summary>
/// Inserts a scraped obituary, or merges it into the stored record for the same
/// funeral home, first name, last name and death date.
/// </summary>
/// <remarks>
/// When a matching record exists, only fields for which the scraped obituary carries
/// a value (non-null, non-empty string, age above zero) and that differ from the
/// stored value are copied over. A sparser scrape therefore cannot erase details
/// stored by a richer one.
/// </remarks>
/// <param name="scrappedObit">Obituary produced by a scraper.</param>
/// <returns>
/// <c>true</c> when a record was inserted or an existing record was changed;
/// <c>false</c> when the stored record needed no change.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="scrappedObit"/> is null.</exception>
public bool insertObitIntoDatabase(CWObituary scrappedObit)
{
    if (scrappedObit == null)
    {
        throw new ArgumentNullException("scrappedObit");
    }

    // The funeral-home comparison must be against the stored row (f); comparing the
    // scraped record with itself would match people from any funeral home.
    CWObituary previousObit = db.CWObituary.FirstOrDefault(f =>
        f.CWFuneralHomeId == scrappedObit.CWFuneralHomeId
        && f.FirstName == scrappedObit.FirstName
        && f.LastName == scrappedObit.LastName
        && f.DeathDate == scrappedObit.DeathDate);

    if (previousObit == null)
    {
        db.CWObituary.Add(scrappedObit);
        db.SaveChanges();
        return true;
    }

    bool newInfoAvailable = false;

    if (HasNewValue(scrappedObit.MiddleName, previousObit.MiddleName))
    {
        previousObit.MiddleName = scrappedObit.MiddleName;
        newInfoAvailable = true;
    }

    // The scrapers use an age of 0 to mean "unknown", so it never replaces a stored age.
    if (scrappedObit.Age > 0 && scrappedObit.Age != previousObit.Age)
    {
        previousObit.Age = scrappedObit.Age;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.BirthDate, previousObit.BirthDate))
    {
        previousObit.BirthDate = scrappedObit.BirthDate;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.FullName, previousObit.FullName))
    {
        previousObit.FullName = scrappedObit.FullName;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.ObituaryText, previousObit.ObituaryText))
    {
        previousObit.ObituaryText = scrappedObit.ObituaryText;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.ObitURL, previousObit.ObitURL))
    {
        previousObit.ObitURL = scrappedObit.ObitURL;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.PictureURL, previousObit.PictureURL))
    {
        previousObit.PictureURL = scrappedObit.PictureURL;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.ServiceDate, previousObit.ServiceDate))
    {
        previousObit.ServiceDate = scrappedObit.ServiceDate;
        newInfoAvailable = true;
    }

    if (HasNewValue(scrappedObit.ServiceDay, previousObit.ServiceDay))
    {
        previousObit.ServiceDay = scrappedObit.ServiceDay;
        newInfoAvailable = true;
    }

    if (!newInfoAvailable)
    {
        return false;
    }

    db.Entry(previousObit).State = System.Data.Entity.EntityState.Modified;
    db.SaveChanges();
    return true;
}

/// <summary>
/// True when the scraped text is non-empty and differs from the stored text.
/// </summary>
private static bool HasNewValue(string scraped, string existing)
{
    return !string.IsNullOrEmpty(scraped) && scraped != existing;
}

/// <summary>
/// True when the scraped value is not null and differs from the stored value.
/// For non-nullable value types the null check is always satisfied, so any
/// difference counts as new information.
/// </summary>
private static bool HasNewValue<T>(T scraped, T existing)
{
    return scraped != null && !object.Equals(scraped, existing);
}