/// <summary>
/// Program entry point: loads the city, province and street-suffix lookup sets, obtains the
/// database connection string, asks the user for an input file name and runs the import.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
public static void Main(string[] args)
{
    // Lookup sets handed to the parser for every address.
    HashSet<string> cities = FileIO.LoadCities();
    HashSet<string> provinces = FileIO.LoadProvinces();
    HashSet<string> suffixes = FileIO.LoadSuffixes();

    // Connection string that is passed along to every database call.
    string conString = DBIO.ConnectToDB();

    Console.Write("Please type the filename of the input data file: ");
    string inputfilename = Console.ReadLine();

    // Column headings for the tabulated output printed while parsing.
    Console.WriteLine(
        "{0, -15} {1, -25} {2, -40} {3, -10} {4, -15}",
        "Street #",
        "Street Name",
        "City",
        "Province",
        "Postal Code");

    FileIO.LoadXLS(inputfilename, cities, provinces, suffixes, conString);

    Console.WriteLine("All done. Press any key to finish...");
    Console.ReadKey(true);
}
/// <summary>
/// Loads the XLS file with the provided file name, and for every non-empty cell in the used
/// range of its first worksheet stores the cell value with DBIO.WriteRaw and passes it to
/// ParseAddress.
/// ***FILE MUST BE IN ROOT FOLDER OF PROJECT***
/// Code adapted from https://coderwall.com/p/app3ya/read-excel-file-in-c
/// </summary>
/// <param name="filename">File name of the workbook; it is appended to a fixed relative prefix and expanded with Path.GetFullPath.</param>
/// <param name="cities">City lookup set, forwarded to ParseAddress.</param>
/// <param name="provinces">Province lookup set, forwarded to ParseAddress.</param>
/// <param name="suffixes">Street-suffix lookup set, forwarded to ParseAddress.</param>
/// <param name="conString">Connection string forwarded to DBIO.WriteRaw and ParseAddress.</param>
/// <remarks>
/// Exceptions are not propagated: they are written to the console and the method returns.
/// The Excel COM objects are released in a finally block so that a failure part-way through
/// (for example a workbook that cannot be opened) does not leave an Excel process running.
/// </remarks>
public static void LoadXLS(string filename, HashSet<string> cities, HashSet<string> provinces, HashSet<string> suffixes, string conString)
{
    // Declared outside the try so the finally block can release whatever was created.
    Excel.Application xlApp = null;
    Excel.Workbook xlWorkbook = null;
    Excel._Worksheet xlWorksheet = null;
    Excel.Range xlRange = null;

    try
    {
        // Creating Excel objects
        xlApp = new Excel.Application();

        // NOTE(review): the '@' is inside the string literal, so the first path segment is the
        // literal name "@.." rather than a parent-directory step. Left unchanged because the
        // resolved location (and therefore where input files must live) depends on it - confirm intent.
        filename = Path.GetFullPath("@..\\..\\..\\..\\..\\..\\" + filename);

        xlWorkbook = xlApp.Workbooks.Open(filename);
        xlWorksheet = xlWorkbook.Sheets[1];
        xlRange = xlWorksheet.UsedRange;

        int rowCount = xlRange.Rows.Count;
        int colCount = xlRange.Columns.Count;

        // Excel ranges are 1-based: visit every cell of the used range row by row.
        for (int i = 1; i <= rowCount; i++)
        {
            for (int j = 1; j <= colCount; j++)
            {
                if (xlRange.Cells[i, j] != null && xlRange.Cells[i, j].Value != null)
                {
                    string raw = xlRange.Cells[i, j].Value;
                    DBIO.WriteRaw(conString, raw);
                    ParseAddress(raw, cities, provinces, suffixes, conString);
                }
            }
        }
    }
    catch (Exception e)
    {
        Console.WriteLine(e.ToString());
    }
    finally
    {
        // Cleanup runs on success and on failure. It is wrapped in its own try/catch so that a
        // failing Close/Quit is reported the same way as any other error instead of escaping.
        try
        {
            GC.Collect();
            GC.WaitForPendingFinalizers();

            // rule of thumb for releasing com objects:
            //   never use two dots, all COM objects must be referenced and released individually
            //   ex: [something].[something].[something] is bad
            // release com objects to fully kill excel process from running in the background
            if (xlRange != null)
            {
                Marshal.ReleaseComObject(xlRange);
            }

            if (xlWorksheet != null)
            {
                Marshal.ReleaseComObject(xlWorksheet);
            }

            // close and release
            if (xlWorkbook != null)
            {
                xlWorkbook.Close();
                Marshal.ReleaseComObject(xlWorkbook);
            }

            // quit and release
            if (xlApp != null)
            {
                xlApp.Quit();
                Marshal.ReleaseComObject(xlApp);
            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e.ToString());
        }
    }
}
/// <summary>
/// Parses a raw address string into street number, street name, city, province and postal
/// code, printing each successfully parsed address as one tabulated console row.
/// </summary>
/// <param name="raw">Raw address text; listed punctuation is replaced by spaces before tokenising.</param>
/// <param name="cities">Known city names; candidates are compared after Utils.RemoveDiacritics, ignoring case.</param>
/// <param name="provinces">Known province tokens, compared ignoring case.</param>
/// <param name="suffixes">Known street suffixes, compared ignoring case.</param>
/// <param name="conString">Connection string forwarded to DBIO.WriteError / DBIO.WriteComplete.</param>
/// <remarks>
/// The (punctuation-stripped) string is written with DBIO.WriteError when fewer than two
/// postal-code/province tokens are found, when a Case 2 city lookup finds nothing, or when
/// Case 2 finds no numeric token at all.
/// NOTE(review): DBIO.WriteComplete is only called from one of the success branches; the other
/// branches only print. Left as found - confirm whether they should persist as well.
/// </remarks>
public static void ParseAddress(string raw, HashSet<string> cities, HashSet<string> provinces, HashSet<string> suffixes, string conString)
{
    // Column layout shared by every printed result row.
    const string RowFormat = "{0, -15} {1, -25} {2, -40} {3, -10} {4, -15}";

    string province = "";
    string postalcode = "";
    string potentialcity;

    // Once an address has been assembled these stop all further searching.
    bool cityfound = false;
    bool numfound = false;

    int suffixindex;
    int numindex;
    int cityindex;
    int pointer;

    // replace anything that isn't alphanumeric or a symbol that could be used with " "
    raw = Regex.Replace(raw, @"[!@#$%^&*()_+=\[{\]};:<>|/?,\\""]", " ");

    // split raw string into parts about the spaces
    List<string> converted = new List<string>(Regex.Split(raw, @"\s+"));

    // Counts how many postal-code / province tokens were pulled out of the list.
    int counter = 0;

    // Iterate over a snapshot because matching tokens are removed from the live list.
    foreach (var s in converted.ToList())
    {
        if (Regex.IsMatch(s, @"\w\d\w\s*\d\w\d"))
        {
            // token has the postal code shape, e.g. A1A1A1
            postalcode = s;
            converted.Remove(s);
            counter++;
        }
        else if (provinces.Contains(s, StringComparer.OrdinalIgnoreCase))
        {
            province = s;
            converted.Remove(s);
            counter++;
        }
        else if (string.IsNullOrWhiteSpace(s))
        {
            // drop empty entries produced by the split
            converted.Remove(s);
        }
    }

    // province and postal code could not be extracted, save in "error" database
    if (counter < 2)
    {
        DBIO.WriteError(conString, raw);
        return;
    }

    // What is left should be street number, street name and city name in some order.
    foreach (var x in converted)
    {
        // first find street suffix
        if (!suffixes.Contains(x, StringComparer.OrdinalIgnoreCase))
        {
            continue;
        }

        suffixindex = converted.IndexOf(x);

        if (suffixindex == converted.Count - 1)
        {
            // CASE 1: suffix is at the end of the string, so street name is at least 1 element before it
            potentialcity = "";
            pointer = suffixindex - 1;

            // Walk right to left looking for the street number. The guard is "> 0" (not "!= 0")
            // so that a suffix at index 0, which starts pointer at -1, cannot index before the list.
            while (!numfound && !cityfound && pointer > 0)
            {
                pointer--;

                if (!Utils.IsNumeric(converted[pointer]))
                {
                    continue;
                }

                numindex = pointer;

                if (numindex == 1)
                {
                    // only 1 element before numindex, check that against cities
                    potentialcity = String.Join(" ", converted.GetRange(0, numindex).ToArray());

                    if (cities.Contains(Utils.RemoveDiacritics(potentialcity), StringComparer.OrdinalIgnoreCase))
                    {
                        // first element is the city, so numindex..suffixindex is street number and name
                        string city = potentialcity;
                        string streetnum = converted[numindex];
                        string streetname = String.Join(" ", converted.GetRange(numindex + 1, suffixindex - numindex));
                        numfound = true;
                        cityfound = true;
                        Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                    }
                }
                else if (numindex == 0)
                {
                    // street number is first element, so city must be between 1 and at most suffixindex - 1
                    List<string> substrings = Utils.AllSubstrings(converted.GetRange(1, suffixindex - 1).ToArray());
                    potentialcity = LastCityMatch(substrings, cities, potentialcity);

                    string city = potentialcity;
                    // number of whitespace-separated words in the city (Split(null) splits on whitespace)
                    cityindex = city.Split(null).Length;
                    string streetnum = converted[numindex];
                    string streetname = String.Join(" ", converted.GetRange(cityindex + 1, suffixindex - cityindex));
                    Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                    cityfound = true;
                    numfound = true;
                }
                else
                {
                    // multiple elements before numindex, need to find cities
                    List<string> substrings = Utils.AllSubstrings(converted.GetRange(0, numindex).ToArray());
                    potentialcity = LastCityMatch(substrings, cities, potentialcity);

                    // Accept only a city that spans everything from 0 to numindex - 1, so that
                    // numindex..suffixindex is the street number and name.
                    if (potentialcity.Split(null).Length == numindex)
                    {
                        string city = potentialcity;
                        cityfound = true;
                        string streetnum = converted[numindex];
                        numfound = true;
                        string streetname = String.Join(" ", converted.GetRange(numindex + 1, suffixindex - numindex));
                        Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                    }
                }
            }
        }
        else
        {
            // CASE 2: suffix is in the middle of the string
            potentialcity = "";
            pointer = 0;

            // Scan left to right for the street number. Bounded by the list length so that an
            // address without any numeric token ends the scan instead of indexing past the end.
            while (!numfound && !cityfound && pointer < converted.Count)
            {
                if (Utils.IsNumeric(converted[pointer]))
                {
                    numindex = pointer;

                    if (numindex == 0)
                    {
                        // street number first: street name is 1..suffixindex, city is after the suffix
                        List<string> substrings = Utils.AllSubstrings(converted.GetRange(suffixindex + 1, converted.Count - suffixindex - 1).ToArray());
                        potentialcity = LastCityMatch(substrings, cities, potentialcity);

                        if (potentialcity == "")
                        {
                            // city not found
                            DBIO.WriteError(conString, raw);
                            break;
                        }

                        string city = potentialcity;
                        string streetnum = converted[numindex];
                        string streetname = String.Join(" ", converted.GetRange(1, suffixindex));
                        cityfound = true;
                        numfound = true;
                        Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                        DBIO.WriteComplete(conString, streetnum, streetname, city, province, postalcode);
                    }
                    else if (numindex == converted.Count - 1)
                    {
                        // street number is the last element
                        if (suffixindex == numindex - 1)
                        {
                            // suffix is 2nd last, so city and street name both precede it.
                            // NOTE(review): in this branch Count - suffixindex - 1 is always 1, so
                            // only the first token is searched for a city - confirm this is intended.
                            List<string> substrings = Utils.AllSubstrings(converted.GetRange(0, converted.Count - suffixindex - 1).ToArray());
                            potentialcity = LastCityMatch(substrings, cities, potentialcity);

                            if (potentialcity == "")
                            {
                                // city not found
                                DBIO.WriteError(conString, raw);
                                break;
                            }

                            string city = potentialcity;
                            cityindex = city.Split(null).Length;
                            string streetnum = converted[numindex];
                            string streetname = String.Join(" ", converted.GetRange(cityindex, suffixindex - cityindex + 1));
                            cityfound = true;
                            numfound = true;
                            Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                        }
                        else
                        {
                            // suffix in the middle: street name is 0..suffixindex, city follows the suffix
                            List<string> substrings = Utils.AllSubstrings(converted.GetRange(suffixindex + 1, converted.Count - suffixindex - 1).ToArray());
                            potentialcity = LastCityMatch(substrings, cities, potentialcity);

                            if (potentialcity == "")
                            {
                                // city not found
                                DBIO.WriteError(conString, raw);
                                break;
                            }

                            string city = potentialcity;
                            string streetnum = converted[numindex];
                            string streetname = String.Join(" ", converted.GetRange(0, suffixindex + 1));
                            cityfound = true;
                            numfound = true;
                            Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                        }
                    }
                    else
                    {
                        // street number and suffix both in the middle: city is searched after numindex,
                        // street name is taken as 0..suffixindex
                        List<string> substrings = Utils.AllSubstrings(converted.GetRange(numindex + 1, converted.Count - 1 - numindex).ToArray());
                        potentialcity = LastCityMatch(substrings, cities, potentialcity);

                        if (potentialcity == "")
                        {
                            // city not found
                            DBIO.WriteError(conString, raw);
                            break;
                        }

                        string city = potentialcity;
                        cityfound = true;
                        string streetnum = converted[numindex];
                        numfound = true;
                        string streetname = String.Join(" ", converted.GetRange(0, suffixindex + 1));
                        Console.WriteLine(RowFormat, streetnum, streetname, city, province, postalcode);
                    }
                }

                pointer++;
            }

            // The scan reached the end without seeing a numeric token: there is no street number,
            // so record the address as unparseable like the other Case 2 failure paths do.
            // (After a break pointer is still inside the list, so this never double-reports.)
            if (!numfound && !cityfound && pointer >= converted.Count)
            {
                DBIO.WriteError(conString, raw);
            }
        }
    }
}

/// <summary>
/// Returns the last entry of <paramref name="candidates"/> that is found in
/// <paramref name="cities"/> (after Utils.RemoveDiacritics, ignoring case), or
/// <paramref name="fallback"/> when no entry matches.
/// </summary>
/// <remarks>
/// The callers rely on the last match being the best (longest) one; that presumably follows
/// from the order in which Utils.AllSubstrings yields its results - verify against that helper.
/// </remarks>
private static string LastCityMatch(List<string> candidates, HashSet<string> cities, string fallback)
{
    string match = fallback;

    foreach (var candidate in candidates)
    {
        // This is the LINQ Contains overload taking a comparer (a linear scan); kept because
        // the comparer the set itself was built with is not visible here.
        if (cities.Contains(Utils.RemoveDiacritics(candidate), StringComparer.OrdinalIgnoreCase))
        {
            match = candidate;
        }
    }

    return match;
}