/// <summary>
/// Builds an <see cref="UnknownDictionaryWriter"/> from the entries of a dictionary definition file.
/// </summary>
/// <remarks>
/// <c>NGRAM_DICTIONARY_ENTRY</c> is put into the writer first. Every line of the file is then
/// parsed, the parsed lines are sorted with <c>ComparerAnonymousHelper</c>, and finally they are
/// put into the writer in that sorted order.
/// </remarks>
/// <param name="filename">Path of the file to read.</param>
/// <param name="encoding">Encoding name, resolved through <see cref="Encoding.GetEncoding(string)"/>.</param>
/// <returns>The populated writer.</returns>
public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
{
    UnknownDictionaryWriter writer = new UnknownDictionaryWriter(5 * 1024 * 1024);
    List<string[]> entries = new List<string[]>();
    Encoding charset = Encoding.GetEncoding(encoding);

    using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        using (TextReader lineReader = new StreamReader(fileStream, charset))
        {
            // The n-gram entry goes in ahead of anything read from the file.
            writer.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

            for (string row = lineReader.ReadLine(); row != null; row = lineReader.ReadLine())
            {
                // unk.def only has 10 fields; appending empty reading and pronunciation
                // columns keeps the writer simple, even though the unknown dictionary
                // returns a hardcoded null for them. Entries are not validated here.
                entries.Add(CSVUtil.Parse(row + ",*,*"));
            }
        }
    }

    // NOTE(review): List<T>.Sort is documented as an unstable sort, so lines that compare
    // equal are not guaranteed to keep their file order - confirm that is acceptable.
    entries.Sort(new ComparerAnonymousHelper());

    for (int i = 0; i < entries.Count; i++)
    {
        writer.Put(entries[i]);
    }

    return writer;
}
/// <summary>
/// Exercises <c>UnknownDictionaryWriter.Put</c>: an entry with 12 comma-separated fields
/// (one fewer than the 13-field entries used below) is expected to be rejected with an
/// exception, and three 13-field entries are expected to be accepted once their
/// character categories have been registered.
/// </summary>
public void TestPut()
{
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

    // Record whether Put threw and report the failure *outside* the try/catch.
    // Calling fail() inside the try would let the catch-all handler below swallow
    // the failure if fail() signals it by throwing an Exception-derived type, in
    // which case this negative check could never fail.
    bool rejected = false;
    try
    {
        unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
    }
    catch (Exception)
    {
        rejected = true;
    }

    if (!rejected)
    {
        fail();
    }

    string entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
    string entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
    string entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";

    // Categories are registered before the entries that name them are put.
    unkDic.PutCharacterCategory(0, "ALPHA");
    unkDic.PutCharacterCategory(1, "HIRAGANA");
    unkDic.PutCharacterCategory(2, "KANJI");

    unkDic.Put(CSVUtil.Parse(entry1));
    unkDic.Put(CSVUtil.Parse(entry2));
    unkDic.Put(CSVUtil.Parse(entry3));
}