Ejemplo n.º 1
0
        /// <summary>
        /// Serializes <paramref name="nodeIn"/> through <c>fst.addNode</c>, clears it so the
        /// instance can be reused, and wraps the resulting node address in a new
        /// <c>CompiledNode</c>.
        /// </summary>
        /// <param name="nodeIn">Pending node to serialize; it is cleared before this method returns.</param>
        /// <param name="tailLength">Not read by the current implementation (node de-duplication is still a TODO).</param>
        /// <returns>A new <c>CompiledNode</c> whose <c>node</c> is the value returned by <c>fst.addNode</c>.</returns>
        private CompiledNode compileNode(UnCompiledNode <T> nodeIn, int tailLength)
        {
            long positionBefore = bytes.getPosition();

            //TODO: deduphash
            long address = fst.addNode(this, nodeIn);
            Debug.Assert(address != -2);

            long positionAfter = bytes.getPosition();

            // The byte store position only moves when the FST actually wrote a new node;
            // only in that case does this node become the most recently frozen one.
            if (positionAfter != positionBefore)
            {
                Debug.Assert(positionAfter > positionBefore);
                lastFrozenNode = address;
            }

            nodeIn.clear();

            CompiledNode compiled = new CompiledNode();
            compiled.node = address;
            return compiled;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Freezes what is left of the last input's path, compiles the root node and
        /// finishes the FST.
        /// </summary>
        /// <returns>
        /// The final FST, or <c>null</c> if nothing is accepted by the FST: the root has an
        /// input count below one of the minimum suffix counts (or has no arcs) and either no
        /// empty output was recorded or one of the minimum suffix counts is greater than zero.
        /// </returns>
        public FST <T> finish()
        {
            UnCompiledNode <T> root = frontier[0];

            // minimize nodes in the last word's suffix
            freezeTail(0);

            bool rootRejected = root.inputCount < minSuffixCount1
                                || root.inputCount < minSuffixCount2
                                || root.numArcs == 0;

            if (rootRejected)
            {
                if (fst.emptyOutput == null)
                {
                    return null;
                }
                if (minSuffixCount1 > 0 || minSuffixCount2 > 0)
                {
                    // empty string got pruned
                    return null;
                }
            }
            else if (minSuffixCount2 != 0)
            {
                compileAllTargets(root, lastInput.length);
            }

            CompiledNode compiledRoot = compileNode(root, lastInput.length);
            fst.finish(compiledRoot.node);

            return fst;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Rewrites the arcs of a node that were just appended to <c>builder.bytes</c> so that
        /// every arc occupies exactly <paramref name="maxBytesPerArc"/> bytes, and writes the
        /// node header in front of them.
        /// </summary>
        /// <param name="builder">Builder that owns the byte store, the header scratch buffer and the per-arc byte counts.</param>
        /// <param name="nodeIn">Node whose <c>numArcs</c> arcs are being expanded.</param>
        /// <param name="startAddress">Position in <c>builder.bytes</c> where the node starts; the header is written here.</param>
        /// <param name="maxBytesPerArc">Fixed size, in bytes, that every arc is expanded to.</param>
        private void writeNodeForBinarySearch(Builder <T> builder, UnCompiledNode <T> nodeIn, long startAddress, int maxBytesPerArc)
        {
            // Build the header in a buffer.
            // It is a false/special arc which is in fact a node header with node flags followed by node metadata.
            builder.fixedLengthArcsBuffer
            .resetPosition()
            .writeByte(ARCS_FOR_BINARY_SEARCH)
            .writeVInt(nodeIn.numArcs)
            .writeVInt(maxBytesPerArc);
            int headerLen = builder.fixedLengthArcsBuffer.getPosition();

            // Expand the arcs in place, backwards.
            long srcPos  = builder.bytes.getPosition();
            // Widen to long before multiplying: numArcs * maxBytesPerArc as a 32-bit product
            // could wrap around before being added to the 64-bit address.
            long destPos = startAddress + headerLen + (long)nodeIn.numArcs * maxBytesPerArc;

            Debug.Assert(destPos >= srcPos);
            if (destPos > srcPos)
            {
                // Reserve the extra room, then move arcs from last to first so that no arc is
                // overwritten before it has been copied.
                builder.bytes.skipBytes((int)(destPos - srcPos));
                for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--)
                {
                    destPos -= maxBytesPerArc;
                    int arcLen = builder.numBytesPerArc[arcIdx];
                    srcPos -= arcLen;
                    if (srcPos != destPos)
                    {
                        Debug.Assert(destPos > srcPos, "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen + " nodeIn.numArcs=" + nodeIn.numArcs);
                        builder.bytes.copyBytes(srcPos, destPos, arcLen);
                    }
                }
            }

            // Write the header.
            builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Compiles every arc target of <paramref name="node"/> that is not compiled yet and
 /// replaces the arc's target with the compiled node.
 /// </summary>
 /// <param name="node">Node whose first <c>numArcs</c> arcs are visited.</param>
 /// <param name="tailLength">Tail length of <paramref name="node"/>; each target is compiled with <c>tailLength - 1</c>.</param>
 private void compileAllTargets(UnCompiledNode <T> node, int tailLength)
 {
     int targetTailLength = tailLength - 1;

     for (int i = 0; i < node.numArcs; i++)
     {
         Arc <T> current = node.arcs[i];
         if (current.target.isCompiled())
         {
             // already compiled, nothing to do for this arc
             continue;
         }

         UnCompiledNode <T> pending = (UnCompiledNode <T>)current.target;
         if (pending.numArcs == 0)
         {
             // A target without outgoing arcs is marked final, and so is the arc leading to it.
             pending.isFinal = true;
             current.isFinal = true;
         }
         current.target = compileNode(pending, targetTailLength);
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Adds the next input/output pair to the FST under construction. Inputs must be added
        /// in sorted order (checked by a debug assertion against the previously added input).
        /// </summary>
        /// <param name="input">Input sequence, read as <c>input.ints[input.offset .. input.offset + input.length)</c>.</param>
        /// <param name="output">Output associated with <paramref name="input"/>.</param>
        /// <returns>This builder, so that calls can be chained.</returns>
        public Builder <T> add(IntsRef input, T output)
        {
            Debug.Assert(lastInput.length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);
            if (input.length == 0)
            {
                // empty input: only allowed as first input.  we have
                // to special case this because the packed FST
                // format cannot represent the empty input since
                // 'finalness' is stored on the incoming arc, not on
                // the node
                frontier[0].inputCount++;
                frontier[0].isFinal = true;
                fst.setEmptyOutput(output);
                return(this);
            }

            // compare shared prefix length
            int pos1     = 0;
            int pos2     = input.offset;
            int pos1Stop = Math.Min(lastInput.length, input.length);

            while (true)
            {
                frontier[pos1].inputCount++;
                // lastInput is a clone of a previously added input, so it can carry its own
                // non-zero offset; index it relative to that offset, exactly as input is
                // indexed relative to input.offset.
                if (pos1 >= pos1Stop || lastInput.ints[lastInput.offset + pos1] != input.ints[pos2])
                {
                    break;
                }
                pos1++;
                pos2++;
            }
            int prefixLenPlus1 = pos1 + 1;

            // Make sure there is one frontier node per position of the input (plus the root).
            if (frontier.Length < input.length + 1)
            {
                UnCompiledNode <T>[] next = ArrayUtil.grow(frontier, input.length + 1);
                for (int idx = frontier.Length; idx < next.Length; idx++)
                {
                    next[idx] = new UnCompiledNode <T>(this, idx);
                }
                frontier = next;
            }

            // minimize/compile states from previous input's
            // orphan'd suffix
            freezeTail(prefixLenPlus1);

            // init tail states for current input
            for (int idx = prefixLenPlus1; idx <= input.length; idx++)
            {
                frontier[idx - 1].addArc(input.ints[input.offset + idx - 1],
                                         frontier[idx]);
                frontier[idx].inputCount++;
            }
            UnCompiledNode <T> lastNode = frontier[input.length];

            // Unless this is exactly the same input as the previous one, the last node is a
            // fresh final node with no output yet.
            if (lastInput.length != input.length || prefixLenPlus1 != input.length + 1)
            {
                lastNode.isFinal = true;
                lastNode.output  = NO_OUTPUT;
            }

            // push conflicting outputs forward, only as far as
            // needed
            for (int idx = 1; idx < prefixLenPlus1; idx++)
            {
                UnCompiledNode <T> node       = frontier[idx];
                UnCompiledNode <T> parentNode = frontier[idx - 1];

                T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]);

                T commonOutputPrefix;

                if (!lastOutput.Equals(NO_OUTPUT))
                {
                    // Keep the shared part of the output on the parent's arc and push the
                    // remainder of the previous output down onto the child node.
                    commonOutputPrefix = fst.outputs.common(output, lastOutput);
                    T wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
                    parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
                    node.prependOutput(wordSuffix);
                }
                else
                {
                    commonOutputPrefix = NO_OUTPUT;
                }

                output = fst.outputs.subtract(output, commonOutputPrefix);
            }
            if (lastInput.length == input.length && prefixLenPlus1 == 1 + input.length)
            {
                // same input more than 1 time in a row, mapping to
                // multiple outputs
                lastNode.output = fst.outputs.merge(lastNode.output, output);
            }
            else
            {
                // this new arc is private to this new input; set its
                // arc output to the leftover output:
                frontier[prefixLenPlus1 - 1].setLastOutput(input.ints[input.offset + prefixLenPlus1 - 1], output);
            }

            // save last input
            // NOTE(review): if IntsRef.Clone() is a shallow copy, lastInput shares the caller's
            // int[] and would change if the caller reuses that buffer before the next add() -- confirm.
            lastInput = (IntsRef)input.Clone();

            return(this);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Walks the frontier nodes of the previous input from its last position down to
        /// <c>Math.Max(1, prefixLenPlus1)</c> and, for each one, either prunes it, compiles it,
        /// or leaves it pending, updating the parent's last arc accordingly.
        /// </summary>
        /// <param name="prefixLenPlus1">
        /// One plus the length of the prefix shared with the next input; <c>0</c> is passed by
        /// <c>finish()</c> to freeze the whole remaining path.
        /// </param>
        private void freezeTail(int prefixLenPlus1)
        {
            int downTo = Math.Max(1, prefixLenPlus1);

            for (int idx = lastInput.length; idx >= downTo; idx--)
            {
                bool doPrune   = false;
                bool doCompile = false;

                UnCompiledNode <T> node   = frontier[idx];
                UnCompiledNode <T> parent = frontier[idx - 1];

                // Label of the arc parent -> node. lastInput is a clone of a caller-supplied
                // IntsRef and can carry a non-zero offset, so index relative to that offset.
                int label = lastInput.ints[lastInput.offset + idx - 1];

                if (node.inputCount < minSuffixCount1)
                {
                    doPrune   = true;
                    doCompile = true;
                }
                else if (idx > prefixLenPlus1)
                {
                    // prune if parent's inputCount is less than suffixMinCount2
                    if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1))
                    {
                        // my parent, about to be compiled, doesn't make the cut, so
                        // I'm definitely pruned

                        // if minSuffixCount2 is 1, we keep only up
                        // until the 'distinguished edge', ie we keep only the
                        // 'divergent' part of the FST. if my parent, about to be
                        // compiled, has inputCount 1 then we are already past the
                        // distinguished edge.  NOTE: this only works if
                        // the FST outputs are not "compressible" (simple
                        // ords ARE compressible).
                        doPrune = true;
                    }
                    else
                    {
                        // my parent, about to be compiled, does make the cut, so
                        // I'm definitely not pruned
                        doPrune = false;
                    }
                    doCompile = true;
                }
                else
                {
                    // if pruning is disabled (count is 0) we can always
                    // compile current node
                    doCompile = minSuffixCount2 == 0;
                }

                if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1))
                {
                    // drop all arcs
                    for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++)
                    {
                        UnCompiledNode <T> target = (UnCompiledNode <T>)node.arcs[arcIdx].target;
                        target.clear();
                    }
                    node.numArcs = 0;
                }

                if (doPrune)
                {
                    // this node doesn't make it -- deref it
                    node.clear();
                    parent.deleteLast(label, node);
                }
                else
                {
                    if (minSuffixCount2 != 0)
                    {
                        //TODO: minSuffixCount2 is always 0 for now.
                        //compileAllTargets(node, lastInput.length()-idx);
                    }
                    T nextFinalOutput = node.output;

                    // We "fake" the node as being final if it has no
                    // outgoing arcs; in theory we could leave it
                    // as non-final (the FST can represent this), but
                    // FSTEnum, Util, etc., have trouble w/ non-final
                    // dead-end states:
                    bool isFinal = node.isFinal || node.numArcs == 0;

                    if (doCompile)
                    {
                        // this node makes it and we now compile it.  first,
                        // compile any targets that were previously
                        // undecided:
                        parent.replaceLast(label,
                                           compileNode(node, 1 + lastInput.length - idx),
                                           nextFinalOutput,
                                           isFinal);
                    }
                    else
                    {
                        // replaceLast just to install
                        // nextFinalOutput/isFinal onto the arc
                        parent.replaceLast(label,
                                           node,
                                           nextFinalOutput,
                                           isFinal);
                        // this node will stay in play for now, since we are
                        // undecided on whether to prune it.  later, it
                        // will be either compiled or pruned, so we must
                        // allocate a new node:
                        frontier[idx] = new UnCompiledNode <T>(this, idx);
                    }
                }
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Serializes a new node by appending the bytes of its arcs to the end of
        /// <c>builder.bytes</c>, then reverses the bytes written for the node.
        /// </summary>
        /// <param name="builder">Builder that owns the byte store, the per-arc scratch arrays and the arc/node counters updated here.</param>
        /// <param name="nodeIn">Node to serialize; every arc target is cast to <c>CompiledNode</c>, so targets must already be compiled.</param>
        /// <returns>
        /// <c>FINAL_END_NODE</c> or <c>NON_FINAL_END_NODE</c> when the node has no arcs (nothing
        /// is written in that case); otherwise the position of the last byte written for the node.
        /// </returns>
        /// <exception cref="NotImplementedException">
        /// Thrown when an arc carries an output or final output other than the no-output value,
        /// or when direct addressing would be selected; none of these are implemented yet.
        /// </exception>
        public long addNode(Builder <T> builder, UnCompiledNode <T> nodeIn)
        {
            T NO_OUTPUT = outputs.getNoOutput();

            // A node without arcs is not written at all; it is represented by one of two
            // sentinel values depending on whether it is final.
            if (nodeIn.numArcs == 0)
            {
                if (nodeIn.isFinal)
                {
                    return(FINAL_END_NODE);
                }
                else
                {
                    return(NON_FINAL_END_NODE);
                }
            }
            long startAddress = builder.bytes.getPosition();

            bool doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);

            if (doFixedLengthArcs)
            {
                // Grow the per-arc scratch arrays so they can hold one entry per arc of this node.
                if (builder.numBytesPerArc.Length < nodeIn.numArcs)
                {
                    builder.numBytesPerArc      = new int[ArrayUtil.oversize(nodeIn.numArcs, 4)];
                    builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.Length];
                }
            }

            builder.arcCount += nodeIn.numArcs;
            int  lastArc                    = nodeIn.numArcs - 1;
            long lastArcStart               = builder.bytes.getPosition();
            int  maxBytesPerArc             = 0;
            int  maxBytesPerArcWithoutLabel = 0;

            // First pass: write each arc as flags byte, label, (outputs), and optionally the target address.
            for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++)
            {
                Arc <T>      arc    = nodeIn.arcs[arcIdx];
                CompiledNode target = (CompiledNode)arc.target;
                int          flags  = 0;

                if (arcIdx == lastArc)
                {
                    flags += BIT_LAST_ARC;
                }

                // The target address can be omitted when the target is the most recently
                // frozen node; this is never done for fixed-length arcs.
                if (builder.lastFrozenNode == target.node && !doFixedLengthArcs)
                {
                    // TODO: for better perf (but more RAM used) we
                    // could avoid this except when arc is "near" the
                    // last arc:
                    flags += BIT_TARGET_NEXT;
                }

                if (arc.isFinal)
                {
                    flags += BIT_FINAL_ARC;
                    if (!NO_OUTPUT.Equals(arc.nextFinalOutput))
                    {
                        flags += BIT_ARC_HAS_FINAL_OUTPUT;
                    }
                }
                else
                {
                    Debug.Assert(NO_OUTPUT.Equals(arc.nextFinalOutput));
                }

                // Only a positive target address is treated as a target that has arcs.
                bool targetHasArcs = target.node > 0;

                if (!targetHasArcs)
                {
                    flags += BIT_STOP_NODE;
                }

                if (!NO_OUTPUT.Equals(arc.output))
                {
                    flags += BIT_ARC_HAS_OUTPUT;
                }

                builder.bytes.writeByte((byte)flags);
                long labelStart = builder.bytes.getPosition();
                writeLabel(builder.bytes, arc.label);
                int numLabelBytes = (int)(builder.bytes.getPosition() - labelStart);

                // Writing outputs is not implemented yet. Note that the flags byte and the
                // label of this arc have already been written when these exceptions are thrown.
                if (!NO_OUTPUT.Equals(arc.output))
                {
                    throw new NotImplementedException();
                    //TODO: outputs.write(arc.output, builder.bytes);
                }

                if (!NO_OUTPUT.Equals(arc.nextFinalOutput))
                {
                    throw new NotImplementedException();
                    //TODO: outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
                }

                if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0)
                {
                    Debug.Assert(target.node > 0);
                    builder.bytes.writeVLong(target.node);
                }

                // just write the arcs "like normal" on first pass, but record how many bytes each one took
                // and max byte size:
                if (doFixedLengthArcs)
                {
                    int numArcBytes = (int)(builder.bytes.getPosition() - lastArcStart);
                    builder.numBytesPerArc[arcIdx]      = numArcBytes;
                    builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
                    lastArcStart               = builder.bytes.getPosition();
                    maxBytesPerArc             = Math.Max(maxBytesPerArc, numArcBytes);
                    maxBytesPerArcWithoutLabel = Math.Max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
                }
            }

            if (doFixedLengthArcs)
            {
                Debug.Assert(maxBytesPerArc > 0);

                // 2nd pass just "expands" all arcs to take up a fixed byte size
                // Number of labels spanned between the first and the last arc, inclusive.
                int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
                Debug.Assert(labelRange > 0);
                if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange))
                {
                    //writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
                    //builder.directAddressingNodeCount++;
                    throw new NotImplementedException();
                }
                else
                {
                    writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
                    builder.binarySearchNodeCount++;
                }
            }

            // The node's address is the position of its last written byte; the bytes of the
            // node are then reversed in place between its start and that address.
            long thisNodeAddress = builder.bytes.getPosition() - 1;

            builder.bytes.reverse(startAddress, thisNodeAddress);
            builder.nodeCount++;
            return(thisNodeAddress);
        }
Ejemplo n.º 8
0
 /// <summary>
 /// Decides whether the arcs of <paramref name="node"/> should be written with a fixed
 /// number of bytes per arc.
 /// </summary>
 /// <param name="builder">Builder whose <c>allowFixedLengthArcs</c> switch enables the feature.</param>
 /// <param name="node">Node whose <c>depth</c> and <c>numArcs</c> are checked against the thresholds.</param>
 /// <returns>
 /// <c>true</c> when fixed-length arcs are allowed and the node either has at least
 /// <c>FIXED_LENGTH_ARC_DEEP_NUM_ARCS</c> arcs, or is no deeper than
 /// <c>FIXED_LENGTH_ARC_SHALLOW_DEPTH</c> with at least <c>FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS</c> arcs.
 /// </returns>
 private bool shouldExpandNodeWithFixedLengthArcs(Builder <T> builder, UnCompiledNode <T> node)
 {
     if (!builder.allowFixedLengthArcs)
     {
         return false;
     }

     bool shallowAndWideEnough = node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH
                                 && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS;
     if (shallowAndWideEnough)
     {
         return true;
     }

     return node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS;
 }
Ejemplo n.º 9
0
 /// <summary>
 /// Decides whether a node should be written with direct addressing. Not implemented
 /// yet: the arguments are ignored and the answer is always <c>false</c>.
 /// </summary>
 /// <returns>Always <c>false</c>.</returns>
 private bool shouldExpandNodeWithDirectAddressing(Builder <T> builder, UnCompiledNode <T> nodeIn,
                                                   int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange)
 {
     //TODO
     bool useDirectAddressing = false;
     return useDirectAddressing;
 }