/// <summary>
/// Scans the comma-delimited address file at <c>addrPath</c> and collects the rows that belong to
/// <paramref name="user_county"/> and whose address tokens fuzzy-match every token of the user's search.
/// </summary>
/// <param name="user_county">
/// County to restrict the search to; compared with each row's county column using an ordinal,
/// case-insensitive comparison.
/// </param>
/// <param name="user_search_normalized">
/// Normalized tokens of the user's search. A row is kept only when every token matches at least one
/// token of the row's address. An empty array therefore matches every row that passes the county and
/// number filters.
/// </param>
/// <returns>
/// The matching rows that carry a building number or a sub-building name; when no matching row carries
/// either, all matching rows. An empty list when nothing matches or the address file has no header row.
/// </returns>
public List<FullAddressFields> getBestAddresses(string user_county, string[] user_search_normalized)
{
    // The parser owns the underlying file handle, so it must be disposed on every exit path.
    using (TextFieldParser parser = new TextFieldParser(addrPath))
    {
        parser.TextFieldType = FieldType.Delimited;
        parser.SetDelimiters(",");

        // ReadFields() returns null when the file has no rows at all: nothing to search.
        string[] fields = parser.ReadFields();
        if (fields == null)
        {
            return new List<FullAddressFields>();
        }

        // Column positions resolved from the header row; -1 means "column not present".
        int county_index = -1;
        int locality_index = -1;
        int secLocality_index = -1;
        int thorofare_index = -1;
        int address_index = -1;
        int building_id_index = -1;
        int address_reference_index = -1;
        int building_number_index = -1;
        int building_group_name_index = -1;
        int building_name_index = -1;
        int sub_building_name_index = -1;
        int department_index = -1;
        int organisation_name_index = -1;

        for (int col_index = 0; col_index < fields.Length; col_index++)
        {
            string value = fields.GetValueAt<string>(col_index);

            if (string.Equals(county, value, StringComparison.OrdinalIgnoreCase))
            {
                county_index = col_index;
            }
            else if (string.Equals(locality, value, StringComparison.OrdinalIgnoreCase))
            {
                locality_index = col_index;
            }
            else if (string.Equals(secLocality, value, StringComparison.OrdinalIgnoreCase))
            {
                secLocality_index = col_index;
            }
            else if (string.Equals(thorofare, value, StringComparison.OrdinalIgnoreCase))
            {
                thorofare_index = col_index;
            }
            else if (string.Equals(address, value, StringComparison.OrdinalIgnoreCase))
            {
                address_index = col_index;
            }
            else if (string.Equals(building, value, StringComparison.OrdinalIgnoreCase))
            {
                building_id_index = col_index;
            }
            else if (string.Equals(address_reference, value, StringComparison.OrdinalIgnoreCase))
            {
                address_reference_index = col_index;
            }
            else if (string.Equals(building_number, value, StringComparison.OrdinalIgnoreCase))
            {
                building_number_index = col_index;
            }
            else if (string.Equals(building_group_name, value, StringComparison.OrdinalIgnoreCase))
            {
                building_group_name_index = col_index;
            }
            else if (string.Equals(building_name, value, StringComparison.OrdinalIgnoreCase))
            {
                building_name_index = col_index;
            }
            else if (string.Equals(sub_building_name, value, StringComparison.OrdinalIgnoreCase))
            {
                sub_building_name_index = col_index;
            }
            else if (string.Equals(department, value, StringComparison.OrdinalIgnoreCase))
            {
                department_index = col_index;
            }
            else if (string.Equals(organisation_name, value, StringComparison.OrdinalIgnoreCase))
            {
                organisation_name_index = col_index;
            }
        }

        HashSet<FullAddressFields> top_addresses = new HashSet<FullAddressFields>();

        // One distance calculator serves every row; it used to be re-created per row.
        var dist = new F23.StringSimilarity.Damerau();

        while (!parser.EndOfData)
        {
            fields = parser.ReadFields();

            // Materialize the row into the full address record.
            FullAddressFields fulladdr = new FullAddressFields();
            fulladdr.locality = fields.GetValueAt<string>(locality_index);
            fulladdr.secondary_locality = fields.GetValueAt<string>(secLocality_index);
            fulladdr.thorofare = fields.GetValueAt<string>(thorofare_index);
            fulladdr.building_number = fields.GetValueAt<string>(building_number_index);
            fulladdr.county = fields.GetValueAt<string>(county_index);
            fulladdr.address = fields.GetValueAt<string>(address_index);
            fulladdr.building = fields.GetValueAt<int>(building_id_index);
            fulladdr.address_reference = fields.GetValueAt<long>(address_reference_index);
            fulladdr.building_group_name = fields.GetValueAt<string>(building_group_name_index);
            fulladdr.building_name = fields.GetValueAt<string>(building_name_index);
            fulladdr.sub_building_name = fields.GetValueAt<string>(sub_building_name_index);
            fulladdr.department = fields.GetValueAt<string>(department_index);
            fulladdr.organisation_name = fields.GetValueAt<string>(organisation_name_index);

            if (!string.Equals(user_county, fulladdr.county, StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // Remove the text of the last comma-separated segment from the address before tokenizing.
            // string.Replace throws ArgumentException for an empty search string, which happens when
            // the address ends with a comma; in that case there is nothing to strip.
            string last_segment = fulladdr.address.Split(",").Last();
            string address_split = last_segment.Length == 0
                ? fulladdr.address
                : fulladdr.address.Replace(last_segment, "");
            string[] address_split_normalized = ner.normalize(address_split);

            // A row that has a building number is only eligible when that number is in ner.numbers.
            // NOTE(review): ner.numbers presumably holds the numbers recognized in the user's search - confirm.
            if (!fulladdr.building_number.IsEmpty())
            {
                if (!ner.numbers.Contains<string>(fulladdr.building_number.ToLower().Trim()))
                {
                    continue;
                }
            }

            // Same rule for the sub-building, after dropping the "apartment"/"unit"/"flat" keyword.
            if (!fulladdr.sub_building_name.IsEmpty())
            {
                string sub_building_number = Regex.Replace(fulladdr.sub_building_name, @"(apartment)|(unit)|(flat)", "", RegexOptions.IgnoreCase).ToLower().Trim();
                if (!ner.numbers.Contains<string>(sub_building_number))
                {
                    continue;
                }
            }

            // Count the search tokens that have at least one sufficiently similar address token.
            int tokens_match = 0;
            foreach (string user_search_token in user_search_normalized)
            {
                foreach (string address_token in address_split_normalized)
                {
                    double score = Fuzz.WeightedRatio(user_search_token, address_token, PreprocessMode.Full);
                    double score2 = dist.Distance(user_search_token, address_token);

                    // Either metric accepting the pair is enough; MIN_SCORE is scaled by 100 to be
                    // comparable with the ratio returned by Fuzz.WeightedRatio.
                    if (score >= MIN_SCORE * 100 || score2 <= MAX_DIST)
                    {
                        tokens_match += 1;
                        break;
                    }
                }
            }

            // Keep the row only when every search token found a partner.
            if (user_search_normalized.Length == tokens_match)
            {
                top_addresses.Add(fulladdr);
            }
        }

        // Prefer candidates that identify a concrete building or sub-building.
        List<FullAddressFields> best_addresses = top_addresses
            .Where(addr => !addr.building_number.IsEmpty() || !addr.sub_building_name.IsEmpty())
            .ToList();

        if (best_addresses.Count == 0)
        {
            return top_addresses.ToList<FullAddressFields>();
        }

        return best_addresses;
    }
}
/// <summary>
/// Program entry point. Loads the locality, secondary-locality and thorofare databases, generates the
/// NER training data and model when the training file cannot be opened, and then, for every
/// <c>*.csv</c> file in the partial-addresses directory, indexes the file in Elasticsearch, searches
/// the full-address file for each row and writes the outcome back to the row's Elasticsearch document.
/// Per-file match statistics are printed to the console.
/// </summary>
static void Main()
{
    Paths paths = new Paths();

    // init locality database
    DataBase locality = new DataBase(paths.path.localitiesPath, "county_id", "locality_id", "name");
    // init secondary locality database
    DataBase secLocality = new DataBase(paths.path.secLocalityPath, "county_id", "locality_id", "name");
    // init thorofare database
    DataBase thorofare = new DataBase(paths.path.thorofarePath, "county_id", "thorfare_id", "thorfare_name");

    // train data
    NERTrain train = new NERTrain(ref locality, ref secLocality, ref thorofare);
    try
    {
        // Probe only: the stream is disposed immediately so the training file is not kept open
        // for the lifetime of the process.
        using (File.OpenRead(paths.path.trainPath))
        {
        }
    }
    catch (IOException)
    {
        // The training file could not be opened: (re)build the training data and the model.
        Console.WriteLine("Generating training data...");
        train.createTrainingData(paths.path.fullAddressesPath, paths.path.trainPath);
        Console.WriteLine("Creating model...");
        train.createModelFromTrainingData(paths.path.trainPath, paths.path.modelPath, paths.path.properties);
    }

    /*
     * var t = new F23.StringSimilarity.Damerau();
     * Console.WriteLine(t.Distance("", "12windmillpark"));
     * Console.WriteLine(Fuzz.PartialRatio("woodgreen", "wood"));
     * return;
     */

    // use model

    /*
     * Manual single-address harness, kept for debugging:
     *
     * //string user_search_test = "6Woodlands Avenue, Dromahair";
     * string user_search_test = "";
     * string county_test = "leitrim";
     * NERAddress address_test = new NERAddress();
     * string[] user_search_test_normalized = address_test.normalize(user_search_test);
     *
     * // print input address
     * Console.WriteLine("Input address: " + user_search_test);
     * // print normalized address
     * Console.WriteLine("Normalized address[tokens]: " + string.Join("|", user_search_test_normalized));
     * train.useModel(paths.path.modelPath, user_search_test.ToLower(), ref address_test);
     *
     * FullAddress addr_test = new FullAddress(paths.path.fullAddressesPath, ref address_test);
     * List<FullAddressFields> best_addr_test = addr_test.getBestAddresses(county_test, user_search_test_normalized);
     * foreach (FullAddressFields best_addr_i in best_addr_test)
     * {
     *     Console.WriteLine(best_addr_i.address, Console.ForegroundColor = ConsoleColor.Green);
     *     Console.ResetColor();
     * }
     *
     * return;
     */

    foreach (string partial_file in Directory.EnumerateFiles(paths.path.partialAddresses, "*.csv"))
    {
        // Dispose the parser (and its file handle) before moving on to the next file.
        using (TextFieldParser parser = new TextFieldParser(partial_file))
        {
            parser.TextFieldType = FieldType.Delimited;
            parser.SetDelimiters(",");

            // ReadFields() returns null for a file without any row; such a file has nothing to process.
            string[] fields = parser.ReadFields();
            if (fields == null)
            {
                continue;
            }

            // Default column positions, overridden when the header names the columns explicitly.
            int address_index = 1;
            int county_index = 3;
            for (int field_index = 0; field_index < fields.Length; field_index++)
            {
                if (string.Equals(fields[field_index], "address", StringComparison.OrdinalIgnoreCase))
                {
                    address_index = field_index;
                }
                else if (string.Equals(fields[field_index], "county", StringComparison.OrdinalIgnoreCase))
                {
                    county_index = field_index;
                }
            }

            Stats statistics = new Stats();

            // Index the file once, and only when it has data rows. This used to run for every row,
            // re-sending the entire file each time.
            // NOTE(review): assumes SendJsonToElastic (re)indexes every row of the file, so repeating
            // it per row would discard the results already stored for earlier rows - confirm.
            Elastic obj = null;
            if (!parser.EndOfData)
            {
                obj = new Elastic(paths.path.indexPath);
                var json = obj.ConvertCsvFileToJsonBulkList(partial_file);
                // docs indexed starting from id 1
                obj.SendJsonToElastic(json);
            }

            int doc_id = 0;
            while (!parser.EndOfData)
            {
                doc_id += 1;
                Console.Write("\n");

                fields = parser.ReadFields();
                string user_search = fields.GetValueAt<string>(address_index);
                string county = fields.GetValueAt<string>(county_index);

                // print input address
                Console.WriteLine("Input address: " + user_search);

                // set processing flag in elastic
                obj.updateDocument(doc_id.ToString(), @"{""doc"": {""flag"": ""processing""}}");
                Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_id.ToString()));

                // begin searching
                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("Searching...");
                Console.ResetColor();

                // init model
                NERAddress address = new NERAddress();
                // normalize data
                string[] user_search_normalized = address.normalize(user_search);
                // print what model found
                train.useModel(paths.path.modelPath, user_search.ToLower(), ref address);
                // print normalized address
                Console.WriteLine("Normalized address[tokens]: " + string.Join("|", user_search_normalized));

                // init FullAddress
                FullAddress addr = new FullAddress(paths.path.fullAddressesPath, ref address);

                // set flag to processed in elastic
                obj.updateDocument(doc_id.ToString(), @"{""doc"": {""flag"": ""processed""}}");

                // for maybe matches index nested objects
                List<FullAddressFields> best_addr = addr.getBestAddresses(county, user_search_normalized);
                if (best_addr.Count == 0)
                {
                    Console.WriteLine("No matches found");
                    statistics.no_matches += 1;
                }
                else if (best_addr.Count == 1)
                {
                    // Single candidate: merge its fields straight into the document.
                    obj.updateDocument(doc_id.ToString(), @"{""doc"":" + JsonConvert.SerializeObject(best_addr[0]) + "}");
                    statistics.exact_matches += 1;
                }
                else
                {
                    // Several candidates: store them as a nested "matches" array.
                    List<string> json_objects = new List<string>();
                    foreach (FullAddressFields best_addr_i in best_addr)
                    {
                        json_objects.Add(JsonConvert.SerializeObject(best_addr_i));
                    }

                    string nested_json = "[" + String.Join(",", json_objects) + "]";
                    obj.updateDocument(doc_id.ToString(), @"{""doc"": {""matches"": " + nested_json + "}}");
                    statistics.maybe_matches += 1;
                }

                Console.WriteLine("Document in elasticsearch: " + obj.QueryElasticById(doc_id.ToString()));

                // Output only addresses. The address is written as plain text rather than passed as
                // a composite format string, so a '{' or '}' in the data cannot throw FormatException.
                foreach (FullAddressFields best_addr_i in best_addr)
                {
                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine(best_addr_i.address);
                    Console.ResetColor();
                }
            }

            // print statistics
            Console.WriteLine("Exact matches found: " + statistics.exact_matches);
            Console.WriteLine("One to many matches found: " + statistics.maybe_matches);
            Console.WriteLine("No matches found: " + statistics.no_matches);
        }
    }
}