Exemple #1
0
        /// <summary>
        /// Reads the consultant profile shown on the page currently loaded in the driver
        /// and saves it as a new <c>BuildUppProfile</c> record.
        /// </summary>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);

            using (var db = new ScrapedItem())
            {
                // Fields that map one-to-one onto a page element.
                var scraped = new BuildUppProfile
                {
                    NameOfCompany  = this.GetTextByXPath("//*[@id='consultant-header']/h1"),
                    OfficeAddress  = this.GetTextByXPath("//*[@id='consultant-tabs']/div[2]/table/tbody/tr[1]/td[1]/p"),
                    EmailAddress   = this.GetTextByXPath("//*[@id='consultant-tabs']//table//tr//p[starts-with(text(), 'Email:')]//a"),
                    WebsiteAddress = this.GetTextByXPath("//*[@id='consultant-tabs']//table//tr//p[starts-with(text(), 'Website:')]//a"),
                    AboutUs        = this.GroupAllTextByXPath("//*[@id='consultant-tabs']/following-sibling::p", "\r\n\r\n")
                };

                // The remaining contact details share one paragraph; each value is taken from
                // the line that carries its label, with the label text removed afterwards.
                string contactBlock = this.GetTextByXPath("//*[@id='consultant-tabs']//table/tbody/tr[1]/td[2]/p");

                scraped.PhoneNumber  = this.GetLineContaining(contactBlock, "Telephone: ").Replace("Telephone: ", "");
                scraped.FaxNumber    = this.GetLineContaining(contactBlock, "Fax: ").Replace("Fax: ", "");
                scraped.MobileNumber = this.GetLineContaining(contactBlock, "Mobile: ").Replace("Mobile: ", "");
                scraped.ContactName  = this.GetLineContaining(contactBlock, "Contact: ").Replace("Contact: ", "");

                // Provenance of the record.
                scraped.PageURL        = driver.Url;
                scraped.WebsiteScraped = WebsiteName;

                db.BuildUppProfiles.Add(scraped);
                db.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
Exemple #2
0
        /// <summary>
        /// Reads the company details table on the page currently loaded in the driver
        /// and saves the values as a new <c>BuildUppProfile</c> record.
        /// </summary>
        protected override void ExtractItemFromPage()
        {
            // Each labelled value sits in the cell that follows the cell holding its label span;
            // {0} is replaced by the label text.
            const string labelledCell = "//*[@id='companyDetails']//span[text()='{0}']/../following-sibling::td";

            Trace.TraceInformation("Extracting infomation from " + driver.Url);

            using (var db = new ScrapedItem())
            {
                var scraped = new BuildUppProfile
                {
                    NameOfCompany  = this.GetTextByXPath("//*[@id='mainContainer']/h2"),
                    OfficeAddress  = this.GetTextByXPath(string.Format(labelledCell, "Address:")),
                    PhoneNumber    = this.GetTextByXPath(string.Format(labelledCell, "Tel:")),
                    FaxNumber      = this.GetTextByXPath(string.Format(labelledCell, "Fax:")),
                    EmailAddress   = this.GetTextByXPath(string.Format(labelledCell, "Email:")),
                    WebsiteAddress = this.GetTextByXPath(string.Format(labelledCell, "Web:")),
                    // Categories are the links inside the "See also:" cell, joined with ", ".
                    Category       = this.GroupAllTextByXPath(string.Format(labelledCell, "See also:") + "/a", ", "),
                    PageURL        = driver.Url,
                    WebsiteScraped = WebsiteName
                };

                db.BuildUppProfiles.Add(scraped);
                db.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
Exemple #3
0
        /// <summary>
        /// Reads one entry of the member listing on the page currently loaded in the driver
        /// and saves it as a new <c>BuildUppProfile</c> record. The entry is selected by
        /// <c>CurrentResultOnPage</c>.
        /// </summary>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);

            using (var db = new ScrapedItem())
            {
                var scraped = new BuildUppProfile();

                // All lookups are relative to the list item for the current result.
                // The +1 presumably converts a zero-based index into XPath's 1-based position.
                string listItem = string.Format("//*[@id='member-listing']/li[{0}]", this.CurrentResultOnPage + 1);

                scraped.NameOfCompany = this.GetTextByXPath(listItem + "/div[1]/h3");

                // The email is taken from the link target, so the "mailto:" scheme is stripped.
                scraped.EmailAddress = this
                    .GetAttributeValueByXPath(listItem + "/div[3]/ul/li/a[contains(.,'Email us')]", "href")
                    ?.Replace("mailto:", "");

                scraped.PhoneNumber      = this.GetTextByXPath(listItem + "/div[3]/p/span/strong");
                scraped.OfficeAddress    = this.GetTextByXPath(listItem + "/div[2]/p[@class='location']");
                scraped.ServicesProvided = this.GetTextByXPath(listItem + "/div[1]/p[2]/strong");
                scraped.WebsiteAddress   = this.GetAttributeValueByXPath(listItem + "/div[3]/ul/li/a[contains(.,'Visit Website')]", "href");

                scraped.PageURL        = driver.Url;
                scraped.WebsiteScraped = WebsiteName;

                db.BuildUppProfiles.Add(scraped);
                db.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
        /// <summary>
        /// Reads the company details from the ordering form on the page currently loaded in
        /// the driver and saves them as a new <c>BuildUppProfile</c> record.
        /// </summary>
        /// <remarks>
        /// The office address is assembled from the address, county, postcode and country
        /// cells (in that order), skipping any that are missing or empty.
        /// </remarks>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);
            using (var ctx = new ScrapedItem())
            {
                var profile = new BuildUppProfile();

                profile.NameOfCompany = this.GetTextByXPath("//*[@id='njh_container']/div[3]/div/div/div[1]/h2");

                string addressLine = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Address: ']/following-sibling::td");
                string county      = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='County: ']/following-sibling::td");
                string postcode    = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Postcode: ']/following-sibling::td");
                string country     = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Country: ']/following-sibling::td");

                // Each part is added under its own value. Previously the country slot re-added
                // the address line, which duplicated the street, dropped the country and threw
                // when the address line was missing but the country was present.
                var address = new List <string>();
                foreach (string part in new[] { addressLine, county, postcode, country })
                {
                    if (!string.IsNullOrEmpty(part))
                    {
                        // Strip surrounding whitespace, then stray commas at either end, so the join below stays clean.
                        address.Add(part.Trim().Trim(','));
                    }
                }

                profile.OfficeAddress = string.Join(", ", address);

                profile.PhoneNumber  = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Telephone: ']/following-sibling::td");
                profile.BusinessType = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Type: ']/following-sibling::td");
                profile.AreasServed  = this.GetTextByXPath("//*[@id='OrderingForm']/table//tr//td//table//td[text()='Search District: ']/following-sibling::td");

                profile.EmailAddress   = this.GetTextByXPath("//*[@id='OrderingForm']/table//span[@class='email_l']");
                profile.WebsiteAddress = this.GetTextByXPath("//*[@id='OrderingForm']/table//span[@class='website_l']");

                // The description text is everything that follows the "description" marker div.
                profile.AboutUs = this.GroupAllTextByXPath("//*[@id='OrderingForm']/table/tbody//div[@class='description']/following-sibling::*");

                profile.PageURL        = driver.Url;
                profile.WebsiteScraped = WebsiteName;

                ctx.BuildUppProfiles.Add(profile);
                ctx.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
Exemple #5
0
        /// <summary>
        /// Writes the exportable profiles of the current website to a CSV file.
        /// </summary>
        /// <param name="columns">Names of the profile properties to write, in column order.</param>
        /// <param name="fileName">Path of the CSV file to create or overwrite.</param>
        /// <param name="updateExportedStatus">
        /// When true, every exported profile gets <c>HasBeenExportedFlag</c> set and the change is saved.
        /// </param>
        /// <param name="headings">
        /// Header row captions; when null the property names in <paramref name="columns"/> are used.
        /// </param>
        /// <remarks>
        /// Only profiles that are not deleted, not marked do-not-export and were scraped from
        /// <c>WebsiteName</c> are exported. Each value is wrapped in double quotes after being
        /// passed through <c>ReplaceSafe</c>; the header row is written unquoted.
        /// </remarks>
        protected void ExportTable(List <string> columns, string fileName, bool updateExportedStatus = true, List <string> headings = null)
        {
            using (var ctx = new ScrapedItem())
            {
                Trace.TraceInformation("Exporting results to CSV file " + fileName);

                // Run the query exactly once. Leaving it deferred made the final Count() hit the
                // database a second time, so the logged number could differ from the rows written.
                var exportResults = ctx.BuildUppProfiles
                                    .Where(p => p.IsDeletedFlag != true && p.DoNotExportFlag != true && p.WebsiteScraped == WebsiteName)
                                    .ToList();

                // Write it to the CSV file specified for only specific columns
                using (StreamWriter outfile = new StreamWriter(fileName))
                {
                    outfile.WriteLine(string.Join(",", headings ?? columns));

                    foreach (var profile in exportResults)
                    {
                        var rowValues = new List <string>();

                        // Collect the quoted value of every requested column for this profile
                        foreach (var column in columns)
                        {
                            rowValues.Add(
                                "\"" + this.ReplaceSafe(Convert.ToString(this.GetPropertyValue(profile, column))) + "\""
                                );
                        }

                        outfile.WriteLine(string.Join(",", rowValues));

                        if (updateExportedStatus)
                        {
                            profile.HasBeenExportedFlag = true;
                        }
                    }
                }

                // Persist the exported flags set above
                if (updateExportedStatus)
                {
                    ctx.SaveChanges();
                }

                // Log the number of results exported
                Trace.TraceInformation("CSV file exported with " + exportResults.Count + " results to " + fileName);
            }
        }
Exemple #6
0
        /// <summary>
        /// Reads the member information table on the page currently loaded in the driver
        /// and saves it as a new <c>BuildUppProfile</c> record.
        /// </summary>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);

            using (var db = new ScrapedItem())
            {
                var scraped = new BuildUppProfile
                {
                    NameOfCompany  = this.GetTextByXPath("/html/body/div/section[1]/div/div/div/div/h2"),
                    EmailAddress   = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Email']/following-sibling::td/a"),
                    PhoneNumber    = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Telephone']/following-sibling::td"),
                    WebsiteAddress = this.GetTextByXPath("//*[@id='member-info']/tbody/tr/th[text()='Website']/following-sibling::td/a")
                };

                // The address is read from every second table row, 5 through 15.
                // Rows that are missing or contain only whitespace are left out.
                var addressParts = new List <string>();
                for (int row = 5; row <= 15; row += 2)
                {
                    string cellText = this.GroupAllTextByXPath("//*[@id='member-info']/tbody/tr[" + row + "]/td");
                    if (cellText != null && cellText.Trim().Length > 0)
                    {
                        addressParts.Add(cellText);
                    }
                }

                scraped.OfficeAddress  = string.Join(", ", addressParts);
                scraped.PageURL        = driver.Url;
                scraped.WebsiteScraped = WebsiteName;

                db.BuildUppProfiles.Add(scraped);
                db.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
        /// <summary>
        /// Saves a minimal <c>BuildUppProfile</c> record for the page currently loaded in the
        /// driver: the text of its first-level heading plus the page URL and website name.
        /// </summary>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);

            using (var db = new ScrapedItem())
            {
                var scraped = new BuildUppProfile
                {
                    NameOfCompany  = this.GetTextByXPath("//h1"),
                    PageURL        = driver.Url,
                    WebsiteScraped = WebsiteName
                };

                db.BuildUppProfiles.Add(scraped);
                db.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
Exemple #8
0
        /// <summary>
        /// Reads the profile shown on the page currently loaded in the driver and saves it
        /// as a new <c>BuildUppProfile</c> record.
        /// </summary>
        /// <remarks>
        /// The page heading holds the contact name with the company name nested inside it,
        /// so the company name is removed from the heading text to obtain the contact name.
        /// Failing to read the page URL or website name does not stop the record from being
        /// saved; the failure is logged as a warning instead.
        /// </remarks>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);
            using (var ctx = new ScrapedItem())
            {
                var profile = new BuildUppProfile();

                profile.NameOfCompany  = this.GetTextByXPath("//*[@id='page_container']/header/div[3]/div/h1/span");
                profile.WebsiteAddress = this.GetAttributeValueByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[1]/a", "href");
                profile.OfficeAddress  = this.GetTextByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[2]");
                profile.AreasServed    = this.GroupAllTextByXPath("//*[@id='page_container']/section/div/div/div[3]/div[2]/ul/li", ", ");
                profile.AboutUs        = this.GroupAllTextByXPath("//*[@id='page_container']/section/div/div/div[3]/div[1]/p");
                profile.ContactName    = this.GetTextByXPath("//*[@id='page_container']/header/div[3]/div/h1");

                // Guard against a missing heading as well: calling Replace on a null contact
                // name would throw and lose the whole record.
                if (!string.IsNullOrEmpty(profile.NameOfCompany) && profile.ContactName != null)
                {
                    profile.ContactName = profile.ContactName.Replace(profile.NameOfCompany, "").Trim();
                }

                // The phone number is the "Tel:" line of the info paragraph, minus the label.
                string infoBox = this.GetTextByXPath("//*[@id='page_container']/section/div/div/div[2]/div[2]/div/div/p[1]");
                profile.PhoneNumber = this.GetLineContaining(infoBox, "Tel:").Replace("Tel: ", "");

                // Provenance is best-effort, but a failure is now logged instead of silently dropped.
                try
                {
                    profile.PageURL = driver.Url;
                }
                catch (Exception e)
                {
                    Trace.TraceWarning("Unable to read the page URL: {0}", e);
                }

                try
                {
                    profile.WebsiteScraped = WebsiteName;
                }
                catch (Exception e)
                {
                    Trace.TraceWarning("Unable to read the website name: {0}", e);
                }

                ctx.BuildUppProfiles.Add(profile);
                ctx.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
        /// <summary>
        /// Crawls company websites for email addresses and records the outcome.
        /// </summary>
        /// <remarks>
        /// Processes at most 100 profiles per call: those with an Id of at least
        /// <c>idToStart</c>, a website address, no email address and no "email checked" flag.
        /// For each one the website is crawled; every profile sharing the same company name
        /// is then marked as checked and, when an address was found, given that address.
        /// A failed or empty crawl never overwrites an address that is already stored.
        /// </remarks>
        public void LoopOverResults()
        {
            List <BuildUppProfile> profilesList;

            // Load the batch up front so no context stays open during the slow crawl.
            using (var ctx = new ScrapedItem())
            {
                profilesList = ctx.BuildUppProfiles
                               .Where(m => m.Id >= this.idToStart && m.WebsiteAddress != null && m.EmailAddress == null && m.HasEmailBeenChecked != true)
                               .Take(100)
                               .ToList();
            }

            foreach (var profile in profilesList)
            {
                Trace.TraceInformation("About to attempt to get URL for {0}", profile.NameOfCompany);

                string email = null;
                try
                {
                    email = this.GetEmailFromWebsite(profile.WebsiteAddress);
                }
                catch (Exception e)
                {
                    // One unreachable website must not abort the whole batch.
                    Trace.TraceError("Unexpected Exception thrown: {0}", e.ToString());
                }

                bool   emailFound    = !string.IsNullOrWhiteSpace(email);
                string nameOfCompany = profile.NameOfCompany;

                using (var ctx = new ScrapedItem())
                {
                    // Update all entries with that company name
                    foreach (var profileToUpdate in ctx.BuildUppProfiles.Where(m => m.NameOfCompany == nameOfCompany))
                    {
                        // Only store a real result. Writing the empty placeholder here used to
                        // erase valid addresses held by other entries of the same company.
                        if (emailFound)
                        {
                            profileToUpdate.EmailAddress = email;
                        }

                        // Found or not, flag the entry so it is not crawled again
                        profileToUpdate.HasEmailBeenChecked = true;
                    }
                    ctx.SaveChanges();
                }
            }
        }
Exemple #10
0
        /// <summary>
        /// Reads the company profile shown on the page currently loaded in the driver and
        /// saves it as a new <c>BuildUppProfile</c> record.
        /// </summary>
        /// <remarks>
        /// The office address is assembled from the street, supplemental line, city,
        /// province, postcode and country fields (in that order), skipping any that are
        /// missing or empty.
        /// </remarks>
        protected override void ExtractItemFromPage()
        {
            Trace.TraceInformation("Extracting infomation from " + driver.Url);
            using (var ctx = new ScrapedItem())
            {
                var profile = new BuildUppProfile();

                profile.NameOfCompany  = this.GetTextByXPath("//*[@id='block-system-main']/div/div/table/tbody/tr/td/div[1]/h1");
                profile.ContactName    = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Contact Name: ']/following-sibling::span");
                profile.PhoneNumber    = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Telephone: ']/following-sibling::span");
                profile.EmailAddress   = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Email Address: ']/following-sibling::span");
                profile.WebsiteAddress = this.GetTextByXPath("//div[contains(@class,'views-field')]/strong[text()='Website: ']/following-sibling::span");

                // The earlier OfficeAddress lookup (keyed on the "Contact Name: " label) was
                // removed: its result was always overwritten by the assembled address below.

                // The container text includes its own caption, which is stripped afterwards.
                string aboutUs = this.GetTextByXPath("//span[contains(@class, 'field-content')]/strong[text()='Company Information:']/../..");
                profile.AboutUs = string.IsNullOrEmpty(aboutUs) ? null : aboutUs.Replace("Company Information:", "").Trim();

                profile.ServicesProvided = this.GroupAllTextByXPath("//*[@id='buyersGuideCompany_productsAndServices']/li");

                string streetAddress = this.GetTextByXPath("//span[contains(@class, 'views-field-street-address')]/span");
                string addressLine1  = this.GetTextByXPath("//span[contains(@class, 'views-field-supplemental-address-1')]/span");
                string city          = this.GetTextByXPath("//span[contains(@class, 'views-field-city')]/span");
                string province      = this.GetTextByXPath("//span[contains(@class, 'views-field-state-province')]/span");
                string postcode      = this.GetTextByXPath("//span[contains(@class, 'views-field-postal-code')]/span");
                string country       = this.GetTextByXPath("//span[contains(@class, 'views-field-country')]/span");

                var address = new List <string>();
                foreach (string part in new[] { streetAddress, addressLine1, city, province, postcode, country })
                {
                    if (!string.IsNullOrEmpty(part))
                    {
                        // Strip surrounding whitespace, then stray commas at either end, so the join below stays clean.
                        address.Add(part.Trim().Trim(','));
                    }
                }

                profile.OfficeAddress = string.Join(", ", address);

                profile.PageURL        = driver.Url;
                profile.WebsiteScraped = WebsiteName;

                ctx.BuildUppProfiles.Add(profile);
                ctx.SaveChanges();

                Trace.TraceInformation("Completed scraping infomation from " + driver.Url);
            }
        }
Exemple #11
0
        /// <summary>
        /// Flags stored profiles so that later exports skip duplicates and incomplete rows.
        /// </summary>
        /// <remarks>
        /// Three passes run against <c>BuildUppProfiles</c> and are saved together at the end:
        /// 1. Within each group of profiles sharing company name, office address, phone number
        ///    and email address, every profile but the first gets <c>IsDeletedFlag</c>.
        /// 2. Profiles with no company name, no email address or no office address get
        ///    <c>DoNotExportFlag</c>.
        /// 3. Among profiles not flagged as deleted in the database, every profile but the first
        ///    of each company name gets <c>DoNotExportFlag</c>.
        /// No ordering is applied to the groups, so which profile counts as "first" is whatever
        /// order the query returns.
        /// </remarks>
        /// <param name="RemoveExactDuplicates">
        /// NOTE(review): never read in this method - duplicates are always flagged. Confirm
        /// whether pass 1 was meant to be conditional on this value.
        /// </param>
        protected void StandardClean(bool RemoveExactDuplicates)
        {
            Trace.TraceInformation("Cleaning results data");
            Trace.TraceInformation("Remove duplicate profiles");
            using (var ctx = new ScrapedItem())
            {
                // Pass 1: group rows with identical key fields and flag all but the first as deleted.
                // NOTE(review): the WebsiteScraped key below is this.WebsiteName, the same value for
                // every row, so it does not separate rows by website - confirm whether
                // r.WebsiteScraped was intended.
                var duplicateProfiles = from r in ctx.BuildUppProfiles
                                        group r by new
                {
                    NameOfCompany  = r.NameOfCompany,
                    OfficeAddress  = r.OfficeAddress,
                    PhoneNumber    = r.PhoneNumber,
                    EmailAddress   = r.EmailAddress,
                    WebsiteScraped = this.WebsiteName
                }
                into g
                where g.Count() > 1
                select g;

                foreach (var g in duplicateProfiles)
                {
                    // Keep the first row of the group, flag the rest.
                    var removeProfiles = g.Skip(1);
                    foreach (var record in removeProfiles)
                    {
                        record.IsDeletedFlag = true;
                    }
                }

                // Pass 2: add do-not-export for profiles that do not meet the basic requirement
                // (company name, email address and office address must all be present).
                // NOTE(review): this query is not restricted to the current website - confirm that is intended.
                var invalidProfiles = ctx.BuildUppProfiles.Where(p =>
                                                                 String.IsNullOrEmpty(p.NameOfCompany) ||
                                                                 String.IsNullOrEmpty(p.EmailAddress) ||
                                                                 String.IsNullOrEmpty(p.OfficeAddress)
                                                                 );

                foreach (var profile in invalidProfiles)
                {
                    profile.DoNotExportFlag = true;
                }



                Trace.TraceInformation("do not export flag set for multiple company name listings");
                // Pass 3: add the do-not-export flag for repeated company names.
                // The where clause runs against the database, where the flags set in pass 1 are not
                // yet saved; the in-memory check inside the loop below is what skips those rows.
                var multipleAddressProfiles = from r in ctx.BuildUppProfiles
                                              where r.IsDeletedFlag != true
                                              group r by new
                {
                    NameOfCompany = r.NameOfCompany,
                }
                into g
                where g.Count() > 1
                select g;

                foreach (var g in multipleAddressProfiles)
                {
                    // Keep the first listing of the company, suppress the others from export.
                    var removeProfiles = g.Skip(1);
                    foreach (var record in removeProfiles)
                    {
                        if (record.IsDeletedFlag != true)
                        {
                            record.DoNotExportFlag = true;
                        }
                    }
                }


                // Persist all flags set by the three passes in one save.
                ctx.SaveChanges();
            }
        }