예제 #1
0
            protected override bool AcceptResult(Int32sRef input, PairOutputs <long?, BytesRef> .Pair output)
            {
                // Dedup: when the input analyzes to a graph we
                // can get duplicate surface forms:
                if (seen.Contains(output.Output2))
                {
                    return(false);
                }
                seen.Add(output.Output2);

                if (!outerInstance.exactFirst)
                {
                    return(true);
                }
                else
                {
                    // In exactFirst mode, don't accept any paths
                    // matching the surface form since that will
                    // create duplicate results:
                    if (outerInstance.SameSurfaceForm(utf8Key, output.Output2))
                    {
                        // We found exact match, which means we should
                        // have already found it in the first search:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(results.Count == 1);
                        }
                        return(false);
                    }
                    else
                    {
                        return(true);
                    }
                }
            }
예제 #2
0
            private void LoadTerms()
            {
                var posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner  = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                var outputs       = new PairOutputs <long?, PairOutputs <long?, long?> .Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b     = new Builder <PairOutputs <long?, PairOutputs <long?, long?> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput)_outerInstance._input.Clone();

                input.Seek(_termsStart);

                var  lastTerm      = new BytesRef(10);
                long lastDocsStart = -1;
                int  docFreq       = 0;
                long totalTermFreq = 0;
                var  visitedDocs   = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                                                _scratchUtf16);
                        int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                                                _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.GetFilePointer();
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length    = len;
                        docFreq            = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq      = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst      = b.Finish();
            }
예제 #3
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
예제 #4
0
            private void LoadTerms()
            {
                var posIntOutputs = PositiveIntOutputs.Singleton;
                var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
                var outputs = new PairOutputs<long?, PairOutputs<long?,long?>.Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b = new Builder<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput) _outerInstance._input.Clone();
                input.Seek(_termsStart);

                var lastTerm = new BytesRef(10);
                long lastDocsStart = -1;
                int docFreq = 0;
                long totalTermFreq = 0;
                var visitedDocs = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new IntsRef();
                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                            _scratchUtf16);
                        int docId = ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                            _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.FilePointer;
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length = len;
                        docFreq = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst = b.Finish();
            }
예제 #5
0
 public int Compare(PairOutputs <long?, BytesRef> .Pair left, PairOutputs <long?, BytesRef> .Pair right)
 {
     return(Comparer <long?> .Default.Compare(left.Output1, right.Output1));
 }
예제 #6
0
 public int Compare(PairOutputs <long?, BytesRef> .Pair left, PairOutputs <long?, BytesRef> .Pair right)
 {
     return(left.Output1.CompareTo(right.Output1));
 }
예제 #7
0
            internal virtual void LoadTerms()
            {
                PositiveIntOutputs posIntOutputs = PositiveIntOutputs.Singleton;
                Builder <PairOutputs.Pair <long?, PairOutputs.Pair <long?, long?> > > b;
                PairOutputs <long?, long?> outputsInner = new PairOutputs <long?, long?>(posIntOutputs, posIntOutputs);
                PairOutputs <long?, PairOutputs.Pair <long?, long?> > outputs =
                    new PairOutputs <long?, PairOutputs.Pair <long?, long?> >(posIntOutputs, outputsInner);

                b = new Builder <>(FST.INPUT_TYPE.BYTE1, outputs);
                IndexInput @in = (IndexInput)outerInstance._input.Clone();

                @in.Seek(termsStart);

                BytesRef    lastTerm      = new BytesRef(10);
                long        lastDocsStart = -1;
                int         docFreq       = 0;
                long        totalTermFreq = 0;
                FixedBitSet visitedDocs   = new FixedBitSet(maxDoc);

                IntsRef scratchIntsRef = new IntsRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(END) || StringHelper.StartsWith(scratch, FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + DOC.length, scratch.Length - DOC.length,
                                                scratchUTF16);
                        int docID = ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + FREQ.length,
                                                scratch.Length - FREQ.length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt(scratchUTF16.Chars, 0, scratchUTF16.length);
                    }
                    else if (StringHelper.StartsWith(scratch, TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.FilePointer;
                        int len = scratch.Length - TERM.length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(scratch.Bytes, TERM.length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality();
                fst      = b.Finish();
            }
예제 #8
0
            private void LoadTerms()
            {
                PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
                var outputsInner = new PairOutputs <Int64, Int64>(posIntOutputs, posIntOutputs);
                var outputs      = new PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair>(posIntOutputs,
                                                                                             outputsInner);
                var        b   = new Builder <PairOutputs <Int64, PairOutputs <Int64, Int64> .Pair> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                IndexInput @in = (IndexInput)outerInstance.input.Clone();

                @in.Seek(termsStart);
                BytesRef    lastTerm       = new BytesRef(10);
                long        lastDocsStart  = -1;
                int         docFreq        = 0;
                long        totalTermFreq  = 0;
                FixedBitSet visitedDocs    = new FixedBitSet(maxDoc);
                Int32sRef   scratchIntsRef = new Int32sRef();

                while (true)
                {
                    SimpleTextUtil.ReadLine(@in, scratch);
                    if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
                                  outputs.NewPair(lastDocsStart,
                                                  outputsInner.NewPair((long)docFreq, totalTermFreq)));
                            sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
                        int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                        visitedDocs.Set(docID);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
                        totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
                    }
                    else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart,
                                                                                              outputsInner.NewPair((long)docFreq, totalTermFreq)));
                        }
                        lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                        int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length   = len;
                        docFreq           = 0;
                        sumTotalTermFreq += totalTermFreq;
                        totalTermFreq     = 0;
                        termCount++;
                    }
                }
                docCount = visitedDocs.Cardinality;
                fst      = b.Finish();

                /*
                 * PrintStream ps = new PrintStream("out.dot");
                 * fst.toDot(ps);
                 * ps.close();
                 * System.out.println("SAVED out.dot");
                 */
                //System.out.println("FST " + fst.sizeInBytes());
            }