/// <summary>
/// Builds the token info dictionary, the unknown word dictionary and the
/// connection costs from <paramref name="inputDirname"/> and writes each of
/// them to <paramref name="outputDirname"/>, reporting progress on the console.
/// </summary>
/// <param name="format">Dictionary format handed to the token info builder.</param>
/// <param name="inputDirname">Directory the source definition files are read from.</param>
/// <param name="outputDirname">Directory each built artifact is written to.</param>
/// <param name="encoding">Encoding name handed to the token info and unknown word builders.</param>
/// <param name="normalizeEntry">Normalization flag handed to the token info builder.</param>
public static void Build(DictionaryFormat format, string inputDirname, string outputDirname, string encoding, bool normalizeEntry)
{
    // Stage 1: token info dictionary.
    Console.WriteLine("building tokeninfo dict...");
    new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry)
        .Build(inputDirname)
        .Write(outputDirname);
    Console.WriteLine("done");

    // Stage 2: unknown word dictionary.
    Console.WriteLine("building unknown word dict...");
    new UnknownDictionaryBuilder(encoding)
        .Build(inputDirname)
        .Write(outputDirname);
    Console.WriteLine("done");

    // Stage 3: connection costs, read from "matrix.def" inside the input directory.
    Console.WriteLine("building connection costs...");
    string matrixDefPath = inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def";
    ConnectionCostsBuilder.Build(matrixDefPath).Write(outputDirname);
    Console.WriteLine("done");
}
/// <summary>
/// Reads an unknown-word definition file and returns a writer populated with
/// its entries.
/// </summary>
/// <remarks>
/// The n-gram entry (<c>NGRAM_DICTIONARY_ENTRY</c>) is put first, right after
/// the file has been opened. The file's own entries are collected, sorted with
/// <c>ComparerAnonymousHelper</c> (ordering defined outside this method) and
/// only then put into the writer.
/// </remarks>
/// <param name="filename">Path of the definition file to read.</param>
/// <param name="encoding">Name of the text encoding used to decode the file.</param>
/// <returns>The populated <see cref="UnknownDictionaryWriter"/>.</returns>
public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
{
    UnknownDictionaryWriter writer = new UnknownDictionaryWriter(5 * 1024 * 1024);
    List<string[]> entries = new List<string[]>();
    Encoding fileEncoding = Encoding.GetEncoding(encoding);

    using (Stream stream = new FileStream(filename, FileMode.Open, FileAccess.Read))
    using (TextReader textReader = new StreamReader(stream, fileEncoding))
    {
        writer.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

        for (string row = textReader.ReadLine(); row != null; row = textReader.ReadLine())
        {
            // unk.def rows only carry 10 fields; two placeholder fields (reading and
            // pronunciation) are appended so the writer can treat every entry alike.
            // Entries are not validated here.
            entries.Add(CSVUtil.Parse(row + ",*,*"));
        }
    }

    entries.Sort(new ComparerAnonymousHelper());

    for (int i = 0; i < entries.Count; i++)
    {
        writer.Put(entries[i]);
    }

    return writer;
}
/// <summary>
/// Builds the unknown word dictionary from the definition files found in
/// <paramref name="dirname"/>.
/// </summary>
/// <param name="dirname">Directory containing "unk.def" and "char.def".</param>
/// <returns>
/// The writer produced from "unk.def", after the character definitions from
/// "char.def" have been read into it.
/// </returns>
public virtual UnknownDictionaryWriter Build(string dirname)
{
    string directoryPrefix = dirname + System.IO.Path.DirectorySeparatorChar;
    string unkDefPath = directoryPrefix + "unk.def";
    string charDefPath = directoryPrefix + "char.def";

    // There should be only one unk.def file.
    UnknownDictionaryWriter writer = ReadDictionaryFile(unkDefPath);
    ReadCharacterDefinition(charDefPath, writer);

    return writer;
}
/// <summary>
/// Checks that <c>Put</c> throws for the first entry on a fresh writer, and that
/// three entries can be put once their character categories have been registered.
/// </summary>
/// <remarks>
/// The failure signal is raised outside the <c>try</c>/<c>catch</c>. Everything
/// thrown in C# derives from <see cref="Exception"/>, so calling <c>fail()</c>
/// inside a <c>try</c> guarded by <c>catch (Exception)</c> would swallow the
/// failure and the test could never fail.
/// </remarks>
public void TestPut()
{
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

    // NOTE(review): this entry has one numeric field fewer than the entries below and is
    // put before any category is registered; either may be what triggers the expected
    // exception - confirm.
    bool putWasRejected = false;
    try
    {
        unkDic.Put(CSVUtil.Parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
    }
    catch (Exception)
    {
        putWasRejected = true;
    }
    if (!putWasRejected)
    {
        fail();
    }

    string entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
    string entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";
    string entry3 = "KANJI,1285,1285,11426,名詞,一般,*,*,*,*,*,*,*";

    unkDic.PutCharacterCategory(0, "ALPHA");
    unkDic.PutCharacterCategory(1, "HIRAGANA");
    unkDic.PutCharacterCategory(2, "KANJI");

    // These puts must complete without throwing.
    unkDic.Put(CSVUtil.Parse(entry1));
    unkDic.Put(CSVUtil.Parse(entry2));
    unkDic.Put(CSVUtil.Parse(entry3));
}
/// <summary>
/// Checks that <c>PutCharacterCategory</c> throws for the name "DUMMY_NAME" and
/// for the code point -1, and that five category registrations for code points
/// 0 through 4 complete without throwing.
/// </summary>
/// <remarks>
/// The failure signal is raised outside the <c>try</c>/<c>catch</c>. Everything
/// thrown in C# derives from <see cref="Exception"/>, so calling <c>fail()</c>
/// inside a <c>try</c> guarded by <c>catch (Exception)</c> would swallow the
/// failure and the test could never fail.
/// </remarks>
public void TestPutCharacterCategory()
{
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);

    // "DUMMY_NAME" is expected to be rejected.
    bool dummyNameRejected = false;
    try
    {
        unkDic.PutCharacterCategory(0, "DUMMY_NAME");
    }
    catch (Exception)
    {
        dummyNameRejected = true;
    }
    if (!dummyNameRejected)
    {
        fail();
    }

    // A negative code point is expected to be rejected.
    bool negativeCodePointRejected = false;
    try
    {
        unkDic.PutCharacterCategory(-1, "KATAKANA");
    }
    catch (Exception)
    {
        negativeCodePointRejected = true;
    }
    if (!negativeCodePointRejected)
    {
        fail();
    }

    // These registrations must complete without throwing.
    unkDic.PutCharacterCategory(0, "DEFAULT");
    unkDic.PutCharacterCategory(1, "GREEK");
    unkDic.PutCharacterCategory(2, "HIRAGANA");
    unkDic.PutCharacterCategory(3, "KATAKANA");
    unkDic.PutCharacterCategory(4, "KANJI");
}
/// <summary>
/// Reads a character definition file and feeds its category mappings and
/// invoke definitions into <paramref name="dictionary"/>.
/// </summary>
/// <remarks>
/// Each line is normalized first: leading whitespace is removed, a <c>#</c>
/// comment (with the whitespace before it) is removed, and every remaining run
/// of whitespace is collapsed to one space. Lines that end up empty are skipped.
/// <para>
/// A line starting with "0x" is a category mapping: a hexadecimal code point,
/// or an inclusive range written "from..to", followed by a space and the value
/// passed to <c>PutCharacterCategory</c> for each code point.
/// </para>
/// <para>
/// Any other line is an invoke definition with four space-separated fields:
/// character class name, invoke, group and length.
/// </para>
/// The file is decoded with the <c>encoding</c> member declared outside this method.
/// </remarks>
/// <param name="filename">Path of the character definition file.</param>
/// <param name="dictionary">Writer that receives the parsed definitions.</param>
public virtual void ReadCharacterDefinition(string filename, UnknownDictionaryWriter dictionary)
{
    using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
    using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Strip ALL leading whitespace. Removing only one character left a leading
            // space on indented or whitespace-only lines, which were then misread as
            // invoke definitions and failed to parse.
            line = Regex.Replace(line, "^\\s+", "");
            line = Regex.Replace(line, "\\s*#.*", "");
            line = Regex.Replace(line, "\\s+", " ");

            // Skip empty line or comment line
            if (line.Length == 0)
            {
                continue;
            }

            if (line.StartsWith("0x", StringComparison.Ordinal))
            {
                // Category mapping: split at the first space only, so the remainder of
                // the line is kept intact in values[1].
                string[] values = line.Split(new char[] { ' ' }, 2);

                if (!values[0].Contains(".."))
                {
                    // Convert.ToInt32 with base 16 accepts the "0x" prefix.
                    int cp = Convert.ToInt32(values[0], 16);
                    dictionary.PutCharacterCategory(cp, values[1]);
                }
                else
                {
                    string[] codePoints = Regex.Split(values[0], "\\.\\.").TrimEnd();
                    int cpFrom = Convert.ToInt32(codePoints[0], 16);
                    int cpTo = Convert.ToInt32(codePoints[1], 16);

                    // The range is inclusive on both ends.
                    for (int i = cpFrom; i <= cpTo; i++)
                    {
                        dictionary.PutCharacterCategory(i, values[1]);
                    }
                }
            }
            else
            {
                // Invoke definition. Consecutive spaces were merged above, so a plain
                // split on ' ' yields the individual fields.
                string[] values = line.Split(' ').TrimEnd();
                string characterClassName = values[0];
                int invoke = int.Parse(values[1], CultureInfo.InvariantCulture);
                int group = int.Parse(values[2], CultureInfo.InvariantCulture);
                int length = int.Parse(values[3], CultureInfo.InvariantCulture);
                dictionary.PutInvokeDefinition(characterClassName, invoke, group, length);
            }
        }
    }
}