public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
            {
                this.outerInstance = outerInstance;

                this.fieldInfo = fieldInfo;
                fstOutputs     = PositiveInt32Outputs.Singleton;
                fstBuilder     = new Builder <long?>(FST.INPUT_TYPE.BYTE1, fstOutputs);
                indexStart     = outerInstance.m_output.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                ////System.out.println("VGW: field=" + fieldInfo.name);

                // Always put empty string in
                fstBuilder.Add(new Int32sRef(), termsFilePointer);
                startTermsFilePointer = termsFilePointer;
            }
Example #2
            internal TermsWriter(FSTOrdTermsWriter outerInstance, FieldInfo fieldInfo)
            {
                _outerInstance = outerInstance;
                _numTerms      = 0;
                _fieldInfo     = fieldInfo;
                _longsSize     = outerInstance.postingsWriter.SetField(fieldInfo);
                _outputs       = PositiveInt32Outputs.Singleton;
                _builder       = new Builder <Int64>(FST.INPUT_TYPE.BYTE1, _outputs);

                _lastBlockStatsFp     = 0;
                _lastBlockMetaLongsFp = 0;
                _lastBlockMetaBytesFp = 0;
                _lastBlockLongs       = new long[_longsSize];

                _lastLongs       = new long[_longsSize];
                _lastMetaBytesFp = 0;
            }
Example #3
            private void LoadTermsIndex()
            {
                if (fst == null)
                {
                    using (IndexInput clone = (IndexInput)outerInstance.input.Clone())
                    {
                        clone.Seek(indexStart);
                        fst = new FST <long?>(clone, outerInstance.fstOutputs);
                    } // clone.Dispose();

                    /*
                     * final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
                     * Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
                     * Util.toDot(fst, w, false, false);
                     * System.out.println("FST INDEX: SAVED to " + dotFileName);
                     * w.close();
                     */

                    if (outerInstance.indexDivisor > 1)
                    {
                        // subsample
                        Int32sRef                           scratchIntsRef = new Int32sRef();
                        PositiveInt32Outputs                outputs        = PositiveInt32Outputs.Singleton;
                        Builder <long?>                     builder        = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);
                        BytesRefFSTEnum <long?>             fstEnum        = new BytesRefFSTEnum <long?>(fst);
                        BytesRefFSTEnum.InputOutput <long?> result;
                        int count = outerInstance.indexDivisor;
                        while (fstEnum.MoveNext())
                        {
                            result = fstEnum.Current;
                            if (count == outerInstance.indexDivisor)
                            {
                                builder.Add(Util.Fst.Util.ToInt32sRef(result.Input, scratchIntsRef), result.Output);
                                count = 0;
                            }
                            count++;
                        }
                        fst = builder.Finish();
                    }
                }
            }
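The subsampling loop above is the general way to walk an FST in term order. A hypothetical companion sketch (the DumpFst name and TextWriter parameter are invented, not part of the reader) that prints every (term, output) pair using the same BytesRefFSTEnum calls:

            // Assumes: using System.IO; using Lucene.Net.Util; using Lucene.Net.Util.Fst;
            internal static void DumpFst(FST<long?> fst, TextWriter @out)
            {
                var fstEnum = new BytesRefFSTEnum<long?>(fst);
                while (fstEnum.MoveNext())
                {
                    BytesRefFSTEnum.InputOutput<long?> pair = fstEnum.Current;
                    // pair.Input holds the term bytes; pair.Output its long value (here, a terms file pointer)
                    @out.WriteLine(pair.Input.Utf8ToString() + " -> " + pair.Output);
                }
            }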
Example #4
        private void WriteFST(FieldInfo field, IEnumerable <BytesRef> values)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte(MemoryDocValuesProducer.FST);
            meta.WriteInt64(data.GetFilePointer());
            PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
            var  builder = new Builder <long?>(INPUT_TYPE.BYTE1, outputs);
            var  scratch = new Int32sRef();
            long ord     = 0;

            foreach (BytesRef v in values)
            {
                builder.Add(Util.ToInt32sRef(v, scratch), ord);
                ord++;
            }
            FST <long?> fst = builder.Finish();

            if (fst != null)
            {
                fst.Save(data);
            }
            meta.WriteVInt64(ord);
        }
Example #5
        private void WriteFST(FieldInfo field, IEnumerable <BytesRef> values)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)Lucene42DocValuesProducer.FST);
            meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
            Builder <Int64>      builder = new Builder <Int64>(INPUT_TYPE.BYTE1, outputs);
            Int32sRef            scratch = new Int32sRef();
            long ord = 0;

            foreach (BytesRef v in values)
            {
                builder.Add(Util.ToInt32sRef(v, scratch), ord);
                ord++;
            }

            var fst = builder.Finish();

            if (fst != null)
            {
                fst.Save(data);
            }
            meta.WriteVInt64(ord);
        }
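Both WriteFST variants above follow the same recipe: feed already-sorted terms to a Builder backed by PositiveInt32Outputs, using the running ordinal as each term's output. A minimal, hypothetical sketch of that recipe in isolation, plus a point lookup (BuildOrdinalFst and LookupOrdinal are invented names; the long?-typed outputs variant is assumed, and Util.Get returning null for a missing term should be verified against your Lucene.NET version):

        // Assumes: using System.Collections.Generic; using Lucene.Net.Util; using Lucene.Net.Util.Fst;
        private static FST<long?> BuildOrdinalFst(IEnumerable<BytesRef> sortedTerms)
        {
            PositiveInt32Outputs outputs = PositiveInt32Outputs.Singleton;
            var  builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
            var  scratch = new Int32sRef();
            long ord     = 0;

            foreach (BytesRef term in sortedTerms) // terms must already be in BytesRef order
            {
                builder.Add(Util.ToInt32sRef(term, scratch), ord);
                ord++;
            }
            return builder.Finish(); // may be null if no terms were added
        }

        private static long? LookupOrdinal(FST<long?> fst, BytesRef term)
        {
            return Util.Get(fst, term); // assumed: null when the term is absent
        }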
Example #6
        public UserDictionary(TextReader reader)
        {
            string line   = null;
            int    wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;

            JCG.List <string[]> featureEntries = new JCG.List <string[]>();

            // text, segmentation, readings, POS
            while ((line = reader.ReadLine()) != null)
            {
                // Remove comments
                line = specialChars.Replace(line, "");

                // Skip empty lines or comment lines
                if (line.Trim().Length == 0)
                {
                    continue;
                }
                string[] values = CSVUtil.Parse(line);
                featureEntries.Add(values);
            }

            // TODO: should we allow multiple segmentations per input 'phrase'?
            // the old treemap didn't support this either, and I'm not sure if it's needed/useful?
            featureEntries.Sort(Comparer <string[]> .Create((left, right) => left[0].CompareToOrdinal(right[0])));

            JCG.List <string> data          = new JCG.List <string>(featureEntries.Count);
            JCG.List <int[]>  segmentations = new JCG.List <int[]>(featureEntries.Count);

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <Int64>      fstBuilder = new Builder <Int64>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
            Int32sRef            scratch    = new Int32sRef();
            long ord = 0;

            foreach (string[] values in featureEntries)
            {
                string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
                string[] readings     = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
                string   pos          = values[3];

                if (segmentation.Length != readings.Length)
                {
                    throw RuntimeException.Create("Illegal user dictionary entry " + values[0] +
                                                  " - the number of segmentations (" + segmentation.Length + ")" +
                                                  " does not the match number of readings (" + readings.Length + ")");
                }

                int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
                wordIdAndLength[0] = wordId;
                for (int i = 0; i < segmentation.Length; i++)
                {
                    wordIdAndLength[i + 1] = segmentation[i].Length;
                    data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
                    wordId++;
                }
                // add mapping to FST
                string token = values[0];
                scratch.Grow(token.Length);
                scratch.Length = token.Length;
                for (int i = 0; i < token.Length; i++)
                {
                    scratch.Int32s[i] = (int)token[i];
                }
                fstBuilder.Add(scratch, ord);
                segmentations.Add(wordIdAndLength);
                ord++;
            }
            this.fst           = new TokenInfoFST(fstBuilder.Finish(), false);
            this.data          = data.ToArray(/*new string[data.Count]*/);
            this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
        }
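Both Kuromoji builders (this constructor and BuildDictionary below) fill the Int32sRef scratch the same way before calling fstBuilder.Add: for a BYTE2 FST, each input symbol is one UTF-16 code unit of the surface form. A small hypothetical helper (the ToCharInt32sRef name is invented) capturing that pattern:

        // Assumes: using Lucene.Net.Util;
        private static Int32sRef ToCharInt32sRef(string token, Int32sRef scratch)
        {
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = token[i]; // char widens to int: one UTF-16 code unit per symbol
            }
            return scratch;
        }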
Example #7
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            List <string[]> lines = new List <string[]>(400000);

            foreach (string file in csvFiles)
            {
                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
                {
                    Encoding   decoder = Encoding.GetEncoding(encoding);
                    TextReader reader  = new StreamReader(inputStream, decoder);

                    string line = null;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] entry = CSVUtil.Parse(line);

                        if (entry.Length < 13)
                        {
                            Console.WriteLine("Entry in CSV is not valid: " + line);
                            continue;
                        }

                        string[] formatted = FormatEntry(entry);
                        lines.Add(formatted);

                        // NFKC normalize dictionary entry
                        if (normalizeEntries)
                        {
                            //if (normalizer.isNormalized(entry[0])){
                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
                            {
                                continue;
                            }
                            string[] normalizedEntry = new string[entry.Length];
                            for (int i = 0; i < entry.Length; i++)
                            {
                                //normalizedEntry[i] = normalizer.normalize(entry[i]);
                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                            }

                            formatted = FormatEntry(normalizedEntry);
                            lines.Add(formatted);
                        }
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and use a stable sort.
            lines.Sort(new ComparerAnonymousHelper());

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <long?>      fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef            scratch    = new Int32sRef();
            long   ord       = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST <long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return(dictionary);
        }
Example #8
            private void LoadTerms()
            {
                PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner = new PairOutputs <Int64, Int64>(posIntOutputs, posIntOutputs);
                var outputs      = new PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair>(posIntOutputs,
                                                                                             outputsInner);
                var        b   = new Builder <PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                IndexInput @in = (IndexInput)outerInstance.input.Clone();

                @in.Seek(termsStart);
                BytesRef    lastTerm       = new BytesRef(10);
                long        lastDocsStart  = -1;
                int         docFreq        = 0;
                long        totalTermFreq  = 0;
                FixedBitSet visitedDocs    = new FixedBitSet(maxDoc);
                Int32sRef   scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart,
                                                  outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
                        int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart,
                                                                                              outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                        int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality;
                fst      = b.Finish();

                /*
                 * PrintStream ps = new PrintStream("out.dot");
                 * fst.toDot(ps);
                 * ps.close();
                 * System.out.println("SAVED out.dot");
                 */
                //System.out.println("FST " + fst.sizeInBytes());
            }
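The nested PairOutputs construction at the top of LoadTerms is how three values get packed into a single FST output per term: the docs start pointer paired with a (docFreq, totalTermFreq) pair. A hypothetical, standalone illustration of the same composition (the literal values are made up; Int64 is the output type used by PositiveInt32Outputs above):

                // Assumes: using Lucene.Net.Util.Fst;
                var posIntOutputs = PositiveInt32Outputs.Singleton;
                var inner         = new PairOutputs<Int64, Int64>(posIntOutputs, posIntOutputs);
                var outputs       = new PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>(posIntOutputs, inner);

                // one term's output: (docsStart = 42, (docFreq = 3, totalTermFreq = 7))
                var termOutput = outputs.NewPair(42L, inner.NewPair(3L, 7L));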