/// <summary>
/// Loads the pre-built token-info FST from the embedded resource file and
/// wraps it in a <c>TokenInfoFST</c> stored on <c>this.fst</c>.
/// </summary>
private TokenInfoDictionary()
{
    FST<Int64> loadedFst;
    // The resource stream is only needed while deserializing the FST.
    using (Stream resourceStream = GetResource(FST_FILENAME_SUFFIX))
    {
        loadedFst = new FST<Int64>(new InputStreamDataInput(resourceStream), PositiveInt32Outputs.Singleton);
    }
    // TODO: some way to configure?
    this.fst = new TokenInfoFST(loadedFst, true);
}
/// <summary>
/// Creates a user dictionary by reading CSV entries from <paramref name="reader"/>.
/// Each entry is expected to contain: text, segmentation, readings, and part-of-speech.
/// Builds an FST mapping surface forms to ordinals, plus parallel arrays of
/// feature data and segmentation (word-id offset + per-segment lengths).
/// </summary>
/// <param name="reader">Source of user-dictionary lines; comments are stripped and blank lines skipped.</param>
/// <exception cref="Exception">Thrown (via <c>RuntimeException.Create</c>) when an entry's
/// segmentation count does not equal its reading count.</exception>
public UserDictionary(TextReader reader)
{
    string line = null;
    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    JCG.List<string[]> featureEntries = new JCG.List<string[]>(); // text, segmentation, readings, POS
    while ((line = reader.ReadLine()) != null)
    {
        // Remove comments
        // NOTE(review): assumes the specialChars regex matches trailing comment text — confirm against field definition
        line = specialChars.Replace(line, "");
        // Skip empty lines or comment lines
        if (line.Trim().Length == 0)
        {
            continue;
        }
        string[] values = CSVUtil.Parse(line);
        featureEntries.Add(values);
    }

    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if its needed/useful?

    // Sort by surface form; FST input must be added in sorted order below.
    featureEntries.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));

    JCG.List<string> data = new JCG.List<string>(featureEntries.Count);
    JCG.List<int[]> segmentations = new JCG.List<int[]>(featureEntries.Count);

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<Int64> fstBuilder = new Builder<Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
    Int32sRef scratch = new Int32sRef();
    long ord = 0; // FST output = entry ordinal in sorted order

    foreach (string[] values in featureEntries)
    {
        // NOTE(review): commentLine here appears to normalize runs of whitespace to a single
        // space before splitting — the field name suggests otherwise; verify its pattern.
        string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
        string[] readings = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
        string pos = values[3];

        if (segmentation.Length != readings.Length)
        {
            // Fixed garbled word order in the message ("does not the match number" -> "does not match the number").
            throw RuntimeException.Create("Illegal user dictionary entry " + values[0] +
                " - the number of segmentations (" + segmentation.Length + ")" +
                " does not match the number of readings (" + readings.Length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length.... 
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.Length; i++)
        {
            wordIdAndLength[i + 1] = segmentation[i].Length;
            data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST
        string token = values[0];
        scratch.Grow(token.Length);
        scratch.Length = token.Length;
        for (int i = 0; i < token.Length; i++)
        {
            scratch.Int32s[i] = (int)token[i];
        }
        fstBuilder.Add(scratch, ord);
        segmentations.Add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
    this.data = data.ToArray(/*new string[data.Count]*/);
    this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
}