/// <summary>
/// Reads the tab-separated file at <c>_input_CSV_path</c>, groups its rows by block number,
/// ranks the blocks through a <c>TextRanker</c>, and collects the offsets of the text and
/// phone-number fields of every ranked block.
/// </summary>
/// <remarks>
/// <para>
/// Input format, as consumed by this method: rows starting with <c>#</c> are skipped; column 0
/// is parsed as the integer block number; columns 1 through 5 are re-joined (each followed by a
/// tab) and stored under that block in <c>_block_CSV_line_mapping</c>. Reading stops at end of
/// file or at the first empty line.
/// </para>
/// <para>
/// Which <c>TextRanker</c> overload is used is selected at compile time by the
/// <c>_BASE_LINE</c>, <c>_FILE_SYSTEM_INFO</c> and <c>_HYBRID</c> symbols.
/// </para>
/// <para>
/// After ranking, each stored line is split again: element 0 is the field name and element 1 is
/// parsed as the offset. Non-zero offsets whose field name starts with <c>Text_</c> or
/// <c>PhoneNumber_</c> are appended to <c>_block_offset_mapping_initial_sorted</c>.
/// NOTE(review): that dictionary is not created here, so it is presumably initialized before
/// this method runs - confirm against the rest of the class.
/// </para>
/// <para>
/// A row with too few columns or a non-numeric block/offset makes the method throw; the input
/// reader is still disposed in that case.
/// </para>
/// </remarks>
private void Load_Block_Offset_Mapping()
{
    _block_CSV_line_mapping = new Dictionary<int, List<string>>();
    string[] dels2 = { "\t" };

    // The using block guarantees the file handle is released even when a malformed row
    // makes int.Parse or the column indexing below throw.
    using (StreamReader reader = new StreamReader(_input_CSV_path))
    {
        string line = reader.ReadLine();
        while (line != null && line != "")
        {
            if (!line.StartsWith("#", StringComparison.Ordinal))
            {
                string[] parts = line.Split(dels2, StringSplitOptions.RemoveEmptyEntries);
                int block = int.Parse(parts[0]);

                // Keep columns 1..5 only, each terminated by a tab.
                string field_line = string.Empty;
                for (int ind = 1; ind < 6; ind++)
                {
                    field_line += parts[ind] + "\t";
                }

                List<string> block_lines;
                if (!_block_CSV_line_mapping.TryGetValue(block, out block_lines))
                {
                    block_lines = new List<string>();
                    _block_CSV_line_mapping.Add(block, block_lines);
                }
                block_lines.Add(field_line);
            }
            line = reader.ReadLine();
        }
    }

    // sort blocks using unsupervised/initial sorting algorithm
    _sorted_blocks = new Dictionary<int, List<string>>();
#if _BASE_LINE
    _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path);
    _baseline = _bse._baseline;
    _baseline.Rank_Blocks();
    _sorted_blocks = _baseline._sorted_blocks;
    _block_scores = _baseline._block_scores;
#else
#if _FILE_SYSTEM_INFO
    _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _offset_map_path);
    _fso = _bse._fso;
#else
#if _HYBRID
    _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _ground_truth_token_file_path, _input_CSV_path, _offset_map_path);
    _fso = _bse._fso;
    _investigator_input = _bse._investigator_input;
#else
    _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _ground_truth_token_file_path, _input_CSV_path);
    _investigator_input = _bse._investigator_input;
#endif
#endif
    _bse.Rank_Blocks();
    _bad_block_features = new Dictionary<string, double>(_bse._bad_block_features);
    _sorted_blocks = _bse._sorted_blocks;
    _block_scores = _bse._block_scores;
#endif
    //Write_Initial_Sorting_To_File(sorted_blocks);

    foreach (KeyValuePair<int, List<string>> pair in _sorted_blocks)
    {
        int curr_block = pair.Key;
        List<string> block_lines = pair.Value;
        for (int i = 0; i < block_lines.Count; i++)
        {
            // Split once per line: element 0 is the field name, element 1 the offset.
            string[] columns = block_lines[i].Split(dels2, StringSplitOptions.RemoveEmptyEntries);
            long Offset = long.Parse(columns[1]);
            string field = columns[0];

            bool is_tracked_field = field.StartsWith("Text_", StringComparison.Ordinal)
                || field.StartsWith("PhoneNumber_", StringComparison.Ordinal);
            if (Offset != 0 && is_tracked_field)
            {
                List<long> Offset_list;
                if (!_block_offset_mapping_initial_sorted.TryGetValue(curr_block, out Offset_list))
                {
                    Offset_list = new List<long>();
                    _block_offset_mapping_initial_sorted.Add(curr_block, Offset_list);
                }
                Offset_list.Add(Offset);
            }
        }
    }
}
/// <summary>
/// HYBRID constructor: builds a ranker that uses both a <c>FileSystemInfo</c> (created from
/// <paramref name="offset_map_path"/>) and an <c>InvestigatorInput</c> (created from
/// <paramref name="inference_res_csv"/> and <paramref name="ground_truth_token_file"/>).
/// </summary>
/// <param name="block_CSV_line_mapping">Lines grouped by block number; stored as the blocks to rank and passed to <c>Get_filename_block_map</c>.</param>
/// <param name="dict_path">Stored in <c>_dict_path</c> before <c>Read_dictionary</c> is called.</param>
/// <param name="pre_pop_path">Stored in <c>_pre_pop_path</c> before <c>Read_Prepop_image</c> is called.</param>
/// <param name="ground_truth_token_file">Stored and forwarded as the second argument of <c>InvestigatorInput</c>.</param>
/// <param name="inference_res_csv">Stored and forwarded as the first argument of <c>InvestigatorInput</c>.</param>
/// <param name="offset_map_path">Stored in <c>_chunk_offset_filename_map_path</c> and forwarded to <c>FileSystemInfo</c>.</param>
public TextRanker(
    Dictionary<int, List<string>> block_CSV_line_mapping,
    string dict_path,
    string pre_pop_path,
    string ground_truth_token_file,
    string inference_res_csv,
    string offset_map_path)
{
    // Paths and input blocks supplied by the caller.
    _dict_path = dict_path;
    _pre_pop_path = pre_pop_path;
    _blocks = block_CSV_line_mapping;

    // Empty containers; these exist before the two Read_* helpers run, as in the
    // original statement order.
    _dict_words = new Dictionary<string, int>();
    _bad_block_features = new Dictionary<string, double>();
    _sorted_blocks = new Dictionary<int, List<string>>();

    Read_dictionary();
    Read_Prepop_image();

    // Token tables and the per-block score store.
    _block_scores = new Dictionary<int, double>();
    _bad_token_indicators = new string[]
    {
        "+", "=", "`", "~", "<", ">", "{", "}", "*", "\\", "_", "^", "/", "|"
    };
    _chars_to_split_on = new string[]
    {
        "@", "!", "#", "$", "%", "&", "*", "(", ")", "-",
        "[", "]", ":", ";", "'", "?", ".", ",", " ", "\""
    };

    // File-system helper, fed with the blocks to rank.
    _chunk_offset_filename_map_path = offset_map_path;
    _fso = new FileSystemInfo(offset_map_path);
    _fso.Get_filename_block_map(_blocks);

    // Investigator input, created after the file-system helper as in the original order.
    _inference_res_csv = inference_res_csv;
    _ground_truth_token_file = ground_truth_token_file;
    _investigator_input = new InvestigatorInput(inference_res_csv, ground_truth_token_file);
}