コード例 #1
0
        /// <summary>
        /// Reads the tab-separated file at <c>_input_CSV_path</c>, groups its lines by block number into
        /// <c>_block_CSV_line_mapping</c>, ranks the blocks with a <c>TextRanker</c> (the constructor used
        /// is selected by the <c>_BASE_LINE</c> / <c>_FILE_SYSTEM_INFO</c> / <c>_HYBRID</c> compile-time
        /// symbols), and finally fills <c>_block_offset_mapping_initial_sorted</c> with the non-zero
        /// offsets of the "Text_" and "PhoneNumber_" fields found in <c>_sorted_blocks</c>.
        /// </summary>
        /// <remarks>
        /// Every non-comment input line must contain at least six non-empty tab-separated fields and
        /// start with an integer block number; malformed lines raise the parsing/indexing exception
        /// unchanged. Reading stops at end of file or at the first empty line.
        /// </remarks>
        private void Load_Block_Offset_Mapping()
        {
            _block_CSV_line_mapping = new Dictionary<int, List<string>>();
            string[] dels2 = { "\t" };

            // The using statement closes the reader even when a malformed line makes parsing throw.
            using (StreamReader reader = new StreamReader(_input_CSV_path))
            {
                string line = reader.ReadLine();
                while (line != null && line != "")
                {
                    // Lines starting with '#' are comments; the prefix is compared ordinally because
                    // this is machine-readable data, not user-facing text.
                    if (!line.StartsWith("#", StringComparison.Ordinal))
                    {
                        string[] parts = line.Split(dels2, StringSplitOptions.RemoveEmptyEntries);
                        int block = int.Parse(parts[0]);

                        // Keep fields 1..5, each followed by a tab (the trailing tab is intentional:
                        // the stored line is re-split on tabs with empty entries removed).
                        string field_line = string.Empty;
                        for (int ind = 1; ind < 6; ind++)
                        {
                            field_line += parts[ind] + "\t";
                        }

                        // Single lookup instead of ContainsKey followed by the indexer.
                        List<string> block_lines;
                        if (!_block_CSV_line_mapping.TryGetValue(block, out block_lines))
                        {
                            block_lines = new List<string>();
                            _block_CSV_line_mapping.Add(block, block_lines);
                        }
                        block_lines.Add(field_line);
                    }
                    line = reader.ReadLine();
                }
            }

            // sort blocks using unsupervised/initial sorting algorithm
            _sorted_blocks = new Dictionary<int, List<string>>();

            #if _BASE_LINE
                _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path);
                _baseline = _bse._baseline;
                _baseline.Rank_Blocks();
                _sorted_blocks = _baseline._sorted_blocks;
                _block_scores = _baseline._block_scores;
            #else
            #if _FILE_SYSTEM_INFO
                _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _offset_map_path);
                _fso = _bse._fso;

            #else
            #if _HYBRID
                _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _ground_truth_token_file_path, _input_CSV_path, _offset_map_path);
                _fso = _bse._fso;
                _investigator_input = _bse._investigator_input;
            #else
            _bse = new TextRanker(_block_CSV_line_mapping, _dictionary_path, _pre_pop_path, _ground_truth_token_file_path, _input_CSV_path);
            _investigator_input = _bse._investigator_input;
            #endif
            #endif
            _bse.Rank_Blocks();
            _bad_block_features = new Dictionary<string, double>(_bse._bad_block_features);

            _sorted_blocks = _bse._sorted_blocks;
            _block_scores = _bse._block_scores;
            #endif
            //Write_Initial_Sorting_To_File(sorted_blocks);

            foreach (KeyValuePair<int, List<string>> pair in _sorted_blocks)
            {
                int curr_block = pair.Key;
                List<string> block_lines = pair.Value;
                for (int i = 0; i < block_lines.Count; i++)
                {
                    // Split each stored line once: element 0 is the field name, element 1 its offset.
                    string[] line_parts = block_lines[i].Split(dels2, StringSplitOptions.RemoveEmptyEntries);
                    long Offset = long.Parse(line_parts[1]);
                    string field = line_parts[0];

                    // Only non-zero offsets of text and phone-number fields are recorded.
                    if (Offset != 0
                        && (field.StartsWith("Text_", StringComparison.Ordinal)
                            || field.StartsWith("PhoneNumber_", StringComparison.Ordinal)))
                    {
                        List<long> Offset_list;
                        if (!_block_offset_mapping_initial_sorted.TryGetValue(curr_block, out Offset_list))
                        {
                            Offset_list = new List<long>();
                            _block_offset_mapping_initial_sorted.Add(curr_block, Offset_list);
                        }
                        Offset_list.Add(Offset);
                    }
                }
            }
        }
コード例 #2
0
        /// <summary>
        /// HYBRID constructor: besides the dictionary and pre-population inputs, it also builds the
        /// file-system helper (<c>_fso</c>) from the offset map and the investigator input
        /// (<c>_investigator_input</c>) from the inference CSV and ground-truth token file.
        /// </summary>
        /// <param name="block_CSV_line_mapping">Block number mapped to that block's CSV lines; stored in <c>_blocks</c>.</param>
        /// <param name="dict_path">Path stored in <c>_dict_path</c> before <c>Read_dictionary</c> is called.</param>
        /// <param name="pre_pop_path">Path stored in <c>_pre_pop_path</c> before <c>Read_Prepop_image</c> is called.</param>
        /// <param name="ground_truth_token_file">Path passed on to the <c>InvestigatorInput</c> constructor.</param>
        /// <param name="inference_res_csv">Path passed on to the <c>InvestigatorInput</c> constructor.</param>
        /// <param name="offset_map_path">Path passed on to the <c>FileSystemInfo</c> constructor.</param>
        public TextRanker(Dictionary<int, List<string>> block_CSV_line_mapping, string dict_path, string pre_pop_path, string ground_truth_token_file, string inference_res_csv,
            string offset_map_path)
        {
            // Caller-supplied inputs that are in place before the loaders run.
            _blocks = block_CSV_line_mapping;
            _dict_path = dict_path;
            _pre_pop_path = pre_pop_path;

            // Empty result containers, also created before the loaders run.
            _sorted_blocks = new Dictionary<int, List<string>>();
            _bad_block_features = new Dictionary<string, double>();
            _dict_words = new Dictionary<string, int>();

            // Loaders are invoked in the same order as before, once the paths and containers exist.
            Read_dictionary();
            Read_Prepop_image();

            // Tokenisation tables and the score table are assigned only after the loaders have run.
            _chars_to_split_on = new[]
            {
                "@", "!", "#", "$", "%", "&", "*", "(", ")", "-",
                "[", "]", ":", ";", "'", "?", ".", ",", " ", "\""
            };
            _bad_token_indicators = new[]
            {
                "+", "=", "`", "~", "<", ">", "{", "}", "*", "\\", "_", "^", "/", "|"
            };
            _block_scores = new Dictionary<int, double>();

            // File-system helper: constructed from the offset map, then given the blocks.
            _chunk_offset_filename_map_path = offset_map_path;
            _fso = new FileSystemInfo(_chunk_offset_filename_map_path);
            _fso.Get_filename_block_map(_blocks);

            // Investigator input: built from the inference CSV and the ground-truth tokens.
            _inference_res_csv = inference_res_csv;
            _ground_truth_token_file = ground_truth_token_file;
            _investigator_input = new InvestigatorInput(_inference_res_csv, _ground_truth_token_file);
        }