/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "consultant-header" / "consultant-tabs" layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation($"Extracting infomation from {driver.Url}");

    using (ScrapedItem db = new ScrapedItem())
    {
        string companyName = this.GetTextByXPath("//*[@id='consultant-header']/h1");
        string officeAddress = this.GetTextByXPath("//*[@id='consultant-tabs']/div[2]/table/tbody/tr[1]/td[1]/p");
        string emailAddress = this.GetTextByXPath("//*[@id='consultant-tabs']//table//tr//p[starts-with(text(), 'Email:')]//a");
        string websiteAddress = this.GetTextByXPath("//*[@id='consultant-tabs']//table//tr//p[starts-with(text(), 'Website:')]//a");
        string aboutUs = this.GroupAllTextByXPath("//*[@id='consultant-tabs']/following-sibling::p", "\r\n\r\n");

        // One paragraph carries several labelled lines; each value below is obtained by
        // picking the line that contains the label and then removing the label text.
        string contactBlock = this.GetTextByXPath("//*[@id='consultant-tabs']//table/tbody/tr[1]/td[2]/p");

        var scraped = new BuildUppProfile
        {
            NameOfCompany = companyName,
            OfficeAddress = officeAddress,
            EmailAddress = emailAddress,
            WebsiteAddress = websiteAddress,
            AboutUs = aboutUs,
            PhoneNumber = this.GetLineContaining(contactBlock, "Telephone: ").Replace("Telephone: ", ""),
            FaxNumber = this.GetLineContaining(contactBlock, "Fax: ").Replace("Fax: ", ""),
            MobileNumber = this.GetLineContaining(contactBlock, "Mobile: ").Replace("Mobile: ", ""),
            ContactName = this.GetLineContaining(contactBlock, "Contact: ").Replace("Contact: ", ""),
            PageURL = driver.Url,
            WebsiteScraped = WebsiteName
        };

        db.BuildUppProfiles.Add(scraped);
        db.SaveChanges();

        Trace.TraceInformation($"Completed scraping infomation from {driver.Url}");
    }
}
/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "mainContainer" / "companyDetails" layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation($"Extracting infomation from {driver.Url}");

    using (ScrapedItem db = new ScrapedItem())
    {
        // Each detail is located by its label span; the value is taken from the
        // table cell that follows the label's parent.
        var scraped = new BuildUppProfile
        {
            NameOfCompany = this.GetTextByXPath("//*[@id='mainContainer']/h2"),
            OfficeAddress = this.GetTextByXPath("//*[@id='companyDetails']//span[text()='Address:']/../following-sibling::td"),
            PhoneNumber = this.GetTextByXPath("//*[@id='companyDetails']//span[text()='Tel:']/../following-sibling::td"),
            FaxNumber = this.GetTextByXPath("//*[@id='companyDetails']//span[text()='Fax:']/../following-sibling::td"),
            EmailAddress = this.GetTextByXPath("//*[@id='companyDetails']//span[text()='Email:']/../following-sibling::td"),
            WebsiteAddress = this.GetTextByXPath("//*[@id='companyDetails']//span[text()='Web:']/../following-sibling::td"),
            Category = this.GroupAllTextByXPath("//*[@id='companyDetails']//span[text()='See also:']/../following-sibling::td/a", ", "),
            PageURL = driver.Url,
            WebsiteScraped = WebsiteName
        };

        db.BuildUppProfiles.Add(scraped);
        db.SaveChanges();

        Trace.TraceInformation($"Completed scraping infomation from {driver.Url}");
    }
}
/// <summary>
/// Reads one entry of the "member-listing" list on the page currently loaded in the
/// driver, stores it in a new <see cref="BuildUppProfile"/> and saves it through a
/// <see cref="ScrapedItem"/> context. The entry read is number
/// <c>CurrentResultOnPage + 1</c> in the list.
/// </summary>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation($"Extracting infomation from {driver.Url}");

    using (ScrapedItem db = new ScrapedItem())
    {
        // Every lookup is relative to the same list item.
        // NOTE(review): the +1 presumably converts a zero-based counter to XPath's
        // one-based positions - confirm against where CurrentResultOnPage is advanced.
        string listItem = string.Format("//*[@id='member-listing']/li[{0}]", this.CurrentResultOnPage + 1);

        string nameXPath = listItem + "/div[1]/h3";
        string emailLinkXPath = listItem + "/div[3]/ul/li/a[contains(.,'Email us')]";
        string phoneXPath = listItem + "/div[3]/p/span/strong";
        string addressXPath = listItem + "/div[2]/p[@class='location']";
        string servicesXPath = listItem + "/div[1]/p[2]/strong";
        string websiteLinkXPath = listItem + "/div[3]/ul/li/a[contains(.,'Visit Website')]";

        var scraped = new BuildUppProfile
        {
            NameOfCompany = this.GetTextByXPath(nameXPath),
            // The e-mail address is taken from the link target, minus the mailto: scheme.
            EmailAddress = this.GetAttributeValueByXPath(emailLinkXPath, "href")?.Replace("mailto:", ""),
            PhoneNumber = this.GetTextByXPath(phoneXPath),
            OfficeAddress = this.GetTextByXPath(addressXPath),
            ServicesProvided = this.GetTextByXPath(servicesXPath),
            WebsiteAddress = this.GetAttributeValueByXPath(websiteLinkXPath, "href"),
            PageURL = driver.Url,
            WebsiteScraped = WebsiteName
        };

        db.BuildUppProfiles.Add(scraped);
        db.SaveChanges();

        Trace.TraceInformation($"Completed scraping infomation from {driver.Url}");
    }
}
/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "njh_container" / "OrderingForm" layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
/// <remarks>
/// The office address is assembled from the address, county, postcode and country cells,
/// skipping any that are null or empty and joining the rest with ", ".
/// </remarks>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation("Extracting infomation from " + driver.Url);

    using (var ctx = new ScrapedItem())
    {
        var profile = new BuildUppProfile();

        profile.NameOfCompany = this.GetTextByXPath("//*[@id='njh_container']/div[3]/div/div/div[1]/h2");

        string addressLine = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Address: ']/following-sibling::td");
        string county = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='County: ']/following-sibling::td");
        string postcode = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Postcode: ']/following-sibling::td");
        string country = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Country: ']/following-sibling::td");

        // Each part is added as itself. The earlier code appended the address line a
        // second time in place of the country, which lost the country and could
        // dereference a null address line.
        List<string> address = new List<string>();
        foreach (string part in new[] { addressLine, county, postcode, country })
        {
            if (!string.IsNullOrEmpty(part))
            {
                // Strip surrounding whitespace and stray commas so the join below
                // does not produce doubled separators.
                address.Add(part.Trim().Trim(','));
            }
        }

        profile.OfficeAddress = string.Join(", ", address);

        profile.PhoneNumber = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Telephone: ']/following-sibling::td");
        profile.BusinessType = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Type: ']/following-sibling::td");
        profile.AreasServed = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Search District: ']/following-sibling::td");
        profile.EmailAddress = this.GetTextByXPath("//*[@id='OrderingForm']/table//span[@class='email_l']");
        profile.WebsiteAddress = this.GetTextByXPath("//*[@id='OrderingForm']/table//span[@class='website_l']");
        profile.AboutUs = this.GroupAllTextByXPath("//*[@id='OrderingForm']/table/tbody//div[@class='description']/following-sibling::*");

        profile.PageURL = driver.Url;
        profile.WebsiteScraped = WebsiteName;

        ctx.BuildUppProfiles.Add(profile);
        ctx.SaveChanges();

        Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
    }
}
/// <summary>
/// Writes the profiles scraped from <c>WebsiteName</c> to a CSV file, excluding rows
/// flagged as deleted or as do-not-export.
/// </summary>
/// <param name="columns">Names of the profile properties to write, in column order.</param>
/// <param name="fileName">Path of the CSV file to create.</param>
/// <param name="updateExportedStatus">
/// When true, every written profile gets <c>HasBeenExportedFlag = true</c> and the change is saved.
/// </param>
/// <param name="headings">
/// Header texts for the first line; when null the values of <paramref name="columns"/> are used.
/// </param>
protected void ExportTable(List<string> columns, string fileName, bool updateExportedStatus = true, List<string> headings = null)
{
    using (var ctx = new ScrapedItem())
    {
        Trace.TraceInformation("Exporting results to CSV file " + fileName);

        // Materialise the query once. The rows are both written and counted below;
        // leaving it as a deferred query would run it a second time for the count,
        // so the logged number could differ from what was actually written.
        var exportResults = ctx.BuildUppProfiles
            .Where(p => p.IsDeletedFlag != true && p.DoNotExportFlag != true && p.WebsiteScraped == WebsiteName)
            .ToList();

        // Write it to the CSV file specified for only specific columns
        using (StreamWriter outfile = new StreamWriter(fileName))
        {
            headings = headings ?? columns;

            // NOTE(review): headings are written unquoted, unlike the data cells below.
            outfile.WriteLine(string.Join(",", headings));

            foreach (var profile in exportResults)
            {
                List<string> rowValues = new List<string>();

                // Look up each requested column on the profile and wrap the value in
                // double quotes. NOTE(review): ReplaceSafe presumably neutralises
                // characters that would break the quoting - confirm in its definition.
                foreach (var column in columns)
                {
                    rowValues.Add("\"" + this.ReplaceSafe(Convert.ToString(this.GetPropertyValue(profile, column))) + "\"");
                }

                outfile.WriteLine(string.Join(",", rowValues));

                if (updateExportedStatus)
                {
                    profile.HasBeenExportedFlag = true;
                }
            }
        }

        // Persist the exported flags set in the loop above
        if (updateExportedStatus)
        {
            ctx.SaveChanges();
        }

        Trace.TraceInformation("CSV file exported with " + exportResults.Count + " results to " + fileName);
    }
}
/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "member-info" table layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation($"Extracting infomation from {driver.Url}");

    using (ScrapedItem db = new ScrapedItem())
    {
        var scraped = new BuildUppProfile();

        scraped.NameOfCompany = this.GetTextByXPath("/html/body/div/section[1]/div/div/div/div/h2");
        scraped.EmailAddress = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Email']/following-sibling::td/a");
        scraped.PhoneNumber = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Telephone']/following-sibling::td");
        scraped.WebsiteAddress = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Website']/following-sibling::td/a");

        // The address is read from rows 5, 7, 9, 11, 13 and 15 of the table;
        // rows whose text is null or only whitespace are left out.
        var addressParts = new List<string>();
        foreach (int row in new[] { 5, 7, 9, 11, 13, 15 })
        {
            string rowText = this.GroupAllTextByXPath("//*[@id='member-info']/tbody/tr[" + row + "]/td");
            if (string.IsNullOrEmpty(rowText?.Trim()))
            {
                continue;
            }

            addressParts.Add(rowText);
        }

        scraped.OfficeAddress = string.Join(", ", addressParts);
        scraped.PageURL = driver.Url;
        scraped.WebsiteScraped = WebsiteName;

        db.BuildUppProfiles.Add(scraped);
        db.SaveChanges();

        Trace.TraceInformation($"Completed scraping infomation from {driver.Url}");
    }
}
/// <summary>
/// Stores the text of the page's <c>h1</c> as the company name, together with the page
/// URL and <c>WebsiteName</c>, in a new <see cref="BuildUppProfile"/> and saves it
/// through a <see cref="ScrapedItem"/> context.
/// </summary>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation($"Extracting infomation from {driver.Url}");

    using (ScrapedItem db = new ScrapedItem())
    {
        var scraped = new BuildUppProfile
        {
            NameOfCompany = this.GetTextByXPath("//h1"),
            PageURL = driver.Url,
            WebsiteScraped = WebsiteName
        };

        db.BuildUppProfiles.Add(scraped);
        db.SaveChanges();

        Trace.TraceInformation($"Completed scraping infomation from {driver.Url}");
    }
}
/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "page_container" layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
/// <remarks>
/// Recording the page URL and the website name is best-effort: a failure there is
/// logged as a warning and the profile is still saved.
/// </remarks>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation("Extracting infomation from " + driver.Url);

    using (var ctx = new ScrapedItem())
    {
        var profile = new BuildUppProfile();

        profile.NameOfCompany = this.GetTextByXPath("//*[@id='page_container']/header/div[3]/div/h1/span");
        profile.WebsiteAddress = this.GetAttributeValueByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[1]/a", "href");
        profile.OfficeAddress = this.GetTextByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[2]");
        profile.AreasServed = this.GroupAllTextByXPath("//*[@id='page_container']/section/div/div/div[3]/div[2]/ul/li", ", ");
        profile.AboutUs = this.GroupAllTextByXPath("//*[@id='page_container']/section/div/div/div[3]/div[1]/p");

        // The contact name is whatever remains of the heading text once the company
        // name (read from the span inside the same heading) has been removed.
        profile.ContactName = this.GetTextByXPath("//*[@id='page_container']/header/div[3]/div/h1");
        if (!string.IsNullOrEmpty(profile.NameOfCompany))
        {
            // Null-conditional: the heading lookup is not guaranteed here to return a string.
            profile.ContactName = profile.ContactName?.Replace(profile.NameOfCompany, "").Trim();
        }

        string infoBox = this.GetTextByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[1]");

        // NOTE(review): the line is searched by "Tel:" but the text removed is "Tel: "
        // (with a space), so a line without the space keeps its label - confirm intent.
        profile.PhoneNumber = this.GetLineContaining(infoBox, "Tel:")?.Replace("Tel: ", "");

        try
        {
            profile.PageURL = driver.Url;
        }
        catch (Exception e)
        {
            // Keep going without the URL, but leave a trace instead of hiding the failure.
            Trace.TraceWarning("Could not record the page URL: {0}", e);
        }

        try
        {
            profile.WebsiteScraped = WebsiteName;
        }
        catch (Exception e)
        {
            Trace.TraceWarning("Could not record the website name: {0}", e);
        }

        ctx.BuildUppProfiles.Add(profile);
        ctx.SaveChanges();

        Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
    }
}
/// <summary>
/// Loop over all profiles in the database and crawl for email addresses
/// </summary>
/// <remarks>
/// Processes up to 100 profiles per call: those with an id of at least <c>idToStart</c>,
/// a website address, no e-mail address, and not yet marked as checked. For each one
/// the website is crawled via <c>GetEmailFromWebsite</c>; every profile sharing the
/// company name is then marked as checked and, only if an address was found, given
/// that address.
/// </remarks>
public void LoopOverResults()
{
    List<BuildUppProfile> profilesList;

    using (var ctx = new ScrapedItem())
    {
        profilesList = ctx.BuildUppProfiles
            .Where(m => m.Id >= this.idToStart && m.WebsiteAddress != null && m.EmailAddress == null && m.HasEmailBeenChecked != true)
            .Take(100)
            .ToList();
    }

    foreach (var profile in profilesList)
    {
        Trace.TraceInformation("About to attempt to get URL for {0}", profile.NameOfCompany);

        string email = "";
        try
        {
            email = this.GetEmailFromWebsite(profile.WebsiteAddress);
        }
        catch (Exception e)
        {
            // A failing website must not stop the batch; the profile is still marked as checked below.
            Trace.TraceInformation("Unexpected Exception thrown: {0}", e.ToString());
        }

        bool emailFound = !string.IsNullOrEmpty(email);
        string nameOfCompany = profile.NameOfCompany;

        using (var ctx = new ScrapedItem())
        {
            // Update all entries with that company name
            foreach (var profileToUpdate in ctx.BuildUppProfiles.Where(m => m.NameOfCompany == nameOfCompany))
            {
                // Only write an address that was actually found. Writing the empty
                // result of a failed crawl would erase addresses already stored on
                // other profiles with the same company name.
                if (emailFound)
                {
                    profileToUpdate.EmailAddress = email;
                }

                // Found or not, flag the profile so it is not crawled again
                profileToUpdate.HasEmailBeenChecked = true;
            }

            ctx.SaveChanges();
        }
    }
}
/// <summary>
/// Reads the company details from the page currently loaded in the driver
/// (the "views-field" layout), stores them in a new
/// <see cref="BuildUppProfile"/> and saves it through a <see cref="ScrapedItem"/> context.
/// </summary>
/// <remarks>
/// The office address is assembled from the street, supplemental address, city,
/// province, postcode and country fields, skipping any that are null or empty and
/// joining the rest with ", ".
/// </remarks>
protected override void ExtractItemFromPage()
{
    Trace.TraceInformation("Extracting infomation from " + driver.Url);

    using (var ctx = new ScrapedItem())
    {
        var profile = new BuildUppProfile();

        profile.NameOfCompany = this.GetTextByXPath("//*[@id='block-system-main']/div/div/table/tbody/tr/td/div[1]/h1");
        profile.ContactName = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Contact Name: ']/following-sibling::span");
        profile.PhoneNumber = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Telephone: ']/following-sibling::span");
        profile.EmailAddress = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Email Address: ']/following-sibling::span");
        profile.WebsiteAddress = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Website: ']/following-sibling::span");

        // OfficeAddress is set once, from the address fields further down. An earlier
        // lookup that filled it from a 'Contact Name: ' XPath was always overwritten
        // and has been removed.

        // The element text includes its own label, so the label is removed from the value.
        string aboutUs = this.GetTextByXPath("//span[contains(@class, 'field-content')]/strong[text()='Company Information:']/../..");
        profile.AboutUs = string.IsNullOrEmpty(aboutUs)
            ? null
            : aboutUs.Replace("Company Information:", "").Trim();

        profile.ServicesProvided = this.GroupAllTextByXPath("//*[@id='buyersGuideCompany_productsAndServices']/li");

        string streetAddress = this.GetTextByXPath("//span[contains(@class, 'views-field-street-address')]/span");
        string addressLine1 = this.GetTextByXPath("//span[contains(@class, 'views-field-supplemental-address-1')]/span");
        string city = this.GetTextByXPath("//span[contains(@class, 'views-field-city')]/span");
        string province = this.GetTextByXPath("//span[contains(@class, 'views-field-state-province')]/span");
        string postcode = this.GetTextByXPath("//span[contains(@class, 'views-field-postal-code')]/span");
        string country = this.GetTextByXPath("//span[contains(@class, 'views-field-country')]/span");

        List<string> address = new List<string>();
        foreach (string part in new[] { streetAddress, addressLine1, city, province, postcode, country })
        {
            if (!string.IsNullOrEmpty(part))
            {
                // Strip surrounding whitespace and stray commas so the join below
                // does not produce doubled separators.
                address.Add(part.Trim().Trim(','));
            }
        }

        profile.OfficeAddress = string.Join(", ", address);

        profile.PageURL = driver.Url;
        profile.WebsiteScraped = WebsiteName;

        ctx.BuildUppProfiles.Add(profile);
        ctx.SaveChanges();

        Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
    }
}
/// <summary>
/// Flags stored profiles that should not be used: all but the first of each group of
/// identical profiles get <c>IsDeletedFlag</c>; profiles missing a company name, e-mail
/// address or office address get <c>DoNotExportFlag</c>; and all but the first
/// non-deleted profile sharing a company name get <c>DoNotExportFlag</c>.
/// All changes are saved together at the end.
/// </summary>
/// <param name="RemoveExactDuplicates">
/// NOTE(review): not read anywhere in this method - the duplicate pass always runs.
/// </param>
protected void StandardClean(bool RemoveExactDuplicates)
{
    Trace.TraceInformation("Cleaning results data");
    Trace.TraceInformation("Remove duplicate profiles");

    using (var db = new ScrapedItem())
    {
        // Step 1: groups of profiles with the same name, address, phone and e-mail.
        // NOTE(review): the WebsiteScraped key component is this.WebsiteName, which is
        // the same value for every row, so it does not separate rows by website.
        var exactDuplicateGroups = db.BuildUppProfiles
            .GroupBy(r => new
            {
                NameOfCompany = r.NameOfCompany,
                OfficeAddress = r.OfficeAddress,
                PhoneNumber = r.PhoneNumber,
                EmailAddress = r.EmailAddress,
                WebsiteScraped = this.WebsiteName
            })
            .Where(g => g.Count() > 1);

        foreach (var duplicates in exactDuplicateGroups)
        {
            // Keep the first member of each group, flag the rest as deleted
            foreach (var extra in duplicates.Skip(1))
            {
                extra.IsDeletedFlag = true;
            }
        }

        // Step 2: profiles that lack any of the basic fields are excluded from export
        var incompleteProfiles = db.BuildUppProfiles.Where(p =>
            string.IsNullOrEmpty(p.NameOfCompany)
            || string.IsNullOrEmpty(p.EmailAddress)
            || string.IsNullOrEmpty(p.OfficeAddress));

        foreach (var incomplete in incompleteProfiles)
        {
            incomplete.DoNotExportFlag = true;
        }

        Trace.TraceInformation("do not export flag set for multiple company name listings");

        // Step 3: company names that occur more than once among non-deleted profiles
        var sameNameGroups = db.BuildUppProfiles
            .Where(r => r.IsDeletedFlag != true)
            .GroupBy(r => new { NameOfCompany = r.NameOfCompany })
            .Where(g => g.Count() > 1);

        foreach (var sameName in sameNameGroups)
        {
            // The deleted flag is checked again on each object: step 1 set it on the
            // objects but nothing has been saved yet (presumably the query above
            // therefore cannot see those changes - confirm).
            foreach (var extra in sameName.Skip(1).Where(rec => rec.IsDeletedFlag != true))
            {
                extra.DoNotExportFlag = true;
            }
        }

        db.SaveChanges();
    }
}