Example #1
0
        /// <summary>
        /// Loads the pre-built token-info FST from the embedded resource and
        /// wraps it in a <c>TokenInfoFST</c> for dictionary lookups.
        /// </summary>
        private TokenInfoDictionary()
        {
            FST<Int64> loadedFst;
            using (Stream resourceStream = GetResource(FST_FILENAME_SUFFIX))
            {
                loadedFst = new FST<Int64>(
                    new InputStreamDataInput(resourceStream),
                    PositiveInt32Outputs.Singleton);
            }
            // TODO: some way to configure?
            this.fst = new TokenInfoFST(loadedFst, true);
        }
Example #2
0
        /// <summary>
        /// Creates a user dictionary from CSV text read from <paramref name="reader"/>.
        /// Each non-empty line is expected to contain: text, segmentation,
        /// readings, and part-of-speech. Entries are sorted by surface form and
        /// compiled into an FST that maps each surface form to an ordinal, with
        /// per-entry word-id/segment-length data kept in parallel arrays.
        /// </summary>
        /// <param name="reader">Source of CSV dictionary lines; read to the end.</param>
        /// <exception cref="Exception">
        /// Thrown (via <c>RuntimeException.Create</c>) when an entry's number of
        /// segmentations does not equal its number of readings.
        /// </exception>
        public UserDictionary(TextReader reader)
        {
            string line   = null;
            int    wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;

            JCG.List <string[]> featureEntries = new JCG.List <string[]>();

            // text, segmentation, readings, POS
            while ((line = reader.ReadLine()) != null)
            {
                // Remove comments
                line = specialChars.Replace(line, "");

                // Skip empty lines or comment lines
                if (string.IsNullOrWhiteSpace(line))
                {
                    continue;
                }
                string[] values = CSVUtil.Parse(line);
                featureEntries.Add(values);
            }

            // TODO: should we allow multiple segmentations per input 'phrase'?
            // the old treemap didn't support this either, and i'm not sure if its needed/useful?

            // Sort by surface form (ordinal) — the FST builder below requires
            // its inputs to be added in sorted order.
            featureEntries.Sort(Comparer <string[]> .Create((left, right) => left[0].CompareToOrdinal(right[0])));

            JCG.List <string> data          = new JCG.List <string>(featureEntries.Count);
            JCG.List <int[]>  segmentations = new JCG.List <int[]>(featureEntries.Count);

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <Int64>      fstBuilder = new Builder <Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
            Int32sRef            scratch    = new Int32sRef();
            long ord = 0;

            foreach (string[] values in featureEntries)
            {
                // Segmentation and readings are whitespace-separated sub-lists
                // inside their CSV fields; they must be the same length so each
                // segment pairs with one reading.
                string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
                string[] readings     = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
                string   pos          = values[3];

                if (segmentation.Length != readings.Length)
                {
                    // Fixed garbled message wording ("does not the match number").
                    throw RuntimeException.Create("Illegal user dictionary entry " + values[0] +
                                                  " - the number of segmentations (" + segmentation.Length + ")" +
                                                  " does not match the number of readings (" + readings.Length + ")");
                }

                int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
                wordIdAndLength[0] = wordId;
                for (int i = 0; i < segmentation.Length; i++)
                {
                    wordIdAndLength[i + 1] = segmentation[i].Length;
                    // One feature row per segment: reading + POS.
                    data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
                    wordId++;
                }
                // add mapping to FST: surface form (as UTF-16 code units) -> ordinal
                string token = values[0];
                scratch.Grow(token.Length);
                scratch.Length = token.Length;
                for (int i = 0; i < token.Length; i++)
                {
                    scratch.Int32s[i] = (int)token[i];
                }
                fstBuilder.Add(scratch, ord);
                segmentations.Add(wordIdAndLength);
                ord++;
            }
            this.fst           = new TokenInfoFST(fstBuilder.Finish(), false);
            this.data          = data.ToArray(/*new string[data.Count]*/);
            this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
        }