コード例 #1
0
        /// <summary>
        /// Adds <paramref name="nodeIn"/> to the FST and returns a <c>CompiledNode</c>
        /// whose <c>Node</c> holds the value returned by the add call.
        /// <paramref name="nodeIn"/> is cleared before returning.
        /// </summary>
        /// <param name="nodeIn"> The uncompiled node to add. </param>
        /// <param name="tailLength">
        ///    Compared against <c>ShareMaxTailLength</c> to decide whether the
        ///    de-dup hash may be used for this node.
        /// </param>
        private CompiledNode CompileNode(UnCompiledNode<T> nodeIn, int tailLength)
        {
            // The de-dup hash is consulted only when it exists, the sharing settings
            // allow it for this node, and the node has at least one arc; every other
            // case adds the node directly to the FST.
            bool viaDedupHash = DedupHash != null
                && (DoShareNonSingletonNodes || nodeIn.NumArcs <= 1)
                && tailLength <= ShareMaxTailLength
                && nodeIn.NumArcs != 0;

            long address = viaDedupHash ? DedupHash.Add(nodeIn) : Fst.AddNode(nodeIn);
            Debug.Assert(address != -2);

            nodeIn.Clear();

            return new CompiledNode { Node = address };
        }
コード例 #2
0
        /// <summary>
        /// Instantiates an FST/FSA builder with all the possible tuning and construction
        /// tweaks. Read parameter documentation carefully.
        /// </summary>
        /// <param name="inputType">
        ///    The input type (transition labels). Can be anything from <seealso cref="INPUT_TYPE"/>
        ///    enumeration. Shorter types will consume less memory. Strings (character sequences) are
        ///    represented as <seealso cref="INPUT_TYPE#BYTE4"/> (full unicode codepoints).
        /// </param>
        /// <param name="minSuffixCount1">
        ///    If pruning the input graph during construction, this threshold is used for telling
        ///    if a node is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node
        ///    is kept.
        /// </param>
        /// <param name="minSuffixCount2">
        ///    (Note: only Mike McCandless knows what this one is really doing...)
        ///    The built-in tail freezing treats a value of 0 as "pruning disabled".
        /// </param>
        /// <param name="doShareSuffix">
        ///    If <c>true</c>, the shared suffixes will be compacted into unique paths.
        ///    This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to
        ///    <c>false</c> creates a single suffix path for all input sequences. This will result in a larger
        ///    FST, but requires substantially less memory and CPU during building.
        /// </param>
        /// <param name="doShareNonSingletonNodes">
        ///    Only used if doShareSuffix is true.  Set this to
        ///    true to ensure FST is fully minimal, at cost of more
        ///    CPU and more RAM during building.
        /// </param>
        /// <param name="shareMaxTailLength">
        ///    Only used if doShareSuffix is true.  Set this to
        ///    <c>int.MaxValue</c> to ensure FST is fully minimal, at cost of more
        ///    CPU and more RAM during building.
        /// </param>
        /// <param name="outputs"> The output type for each input sequence. Applies only if building an FST. For
        ///    FSA, use <seealso cref="NoOutputs#getSingleton()"/> and <seealso cref="NoOutputs#getNoOutput()"/> as the
        ///    singleton output object.
        /// </param>
        /// <param name="freezeTail">
        ///    Optional custom tail-freezing plugin. When <c>null</c>, the builder's
        ///    built-in freeze logic is used instead.
        /// </param>
        /// <param name="doPackFST"> Pass true to create a packed FST.
        /// </param>
        /// <param name="acceptableOverheadRatio"> How to trade speed for space when building the FST. This option
        ///    is only relevant when doPackFST is true. <seealso cref="PackedInts#getMutable(int, int, float)"/>
        /// </param>
        /// <param name="allowArrayArcs"> Pass false to disable the array arc optimization
        ///    while building the FST; this will make the resulting
        ///    FST smaller but slower to traverse.
        /// </param>
        /// <param name="bytesPageBits"> How many bits wide to make each
        ///    byte[] block in the BytesStore; if you know the FST
        ///    will be large then make this larger.  For example 15
        ///    bits = 32768 byte pages. </param>
        public Builder(FST<T>.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, bool doShareSuffix, bool doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs, FreezeTail<T> freezeTail, bool doPackFST, float acceptableOverheadRatio, bool allowArrayArcs, int bytesPageBits)
        {
            this.MinSuffixCount1 = minSuffixCount1;
            this.MinSuffixCount2 = minSuffixCount2;
            this.FreezeTail_Renamed = freezeTail;
            this.DoShareNonSingletonNodes = doShareNonSingletonNodes;
            this.ShareMaxTailLength = shareMaxTailLength;
            this.DoPackFST = doPackFST;
            this.AcceptableOverheadRatio = acceptableOverheadRatio;

            // The FST must exist before the de-dup hash, which reads from the FST's bytes.
            Fst = new FST<T>(inputType, outputs, doPackFST, acceptableOverheadRatio, allowArrayArcs, bytesPageBits);

            // Suffix sharing is implemented through the node hash; without it no hash is kept.
            DedupHash = doShareSuffix
                ? new NodeHash<T>(Fst, Fst.Bytes.GetReverseReader(false))
                : null;

            NO_OUTPUT = outputs.NoOutput;

            // Start with a small frontier; each slot gets its own node created with its index.
            Frontier = new UnCompiledNode<T>[10];
            for (int idx = 0; idx < Frontier.Length; idx++)
            {
                Frontier[idx] = new UnCompiledNode<T>(this, idx);
            }
        }
コード例 #3
0
        /// <summary>
        /// Adds <paramref name="nodeIn"/> to the FST, either through the de-dup hash
        /// or directly, then clears <paramref name="nodeIn"/> and returns a
        /// <c>CompiledNode</c> carrying the value returned by the add call.
        /// </summary>
        /// <param name="nodeIn"> The uncompiled node to add. </param>
        /// <param name="tailLength">
        ///    Compared against <c>shareMaxTailLength</c> when deciding whether the
        ///    de-dup hash may be used.
        /// </param>
        private CompiledNode CompileNode(UnCompiledNode<T> nodeIn, int tailLength)
        {
            bool sharingAllowed = dedupHash != null
                && (doShareNonSingletonNodes || nodeIn.NumArcs <= 1)
                && tailLength <= shareMaxTailLength;

            long address;
            if (sharingAllowed && nodeIn.NumArcs != 0)
            {
                address = dedupHash.Add(nodeIn);
            }
            else
            {
                // Arc-less nodes, and nodes not eligible for sharing, bypass the hash.
                address = fst.AddNode(nodeIn);
            }

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(address != -2);
            }

            nodeIn.Clear();

            CompiledNode compiled = new CompiledNode();
            compiled.Node = address;
            return compiled;
        }
コード例 #4
0
 /// <summary>
 /// Compiles every arc target of <paramref name="node"/> that is not compiled yet,
 /// replacing the arc's target with the compiled result.
 /// </summary>
 /// <param name="node"> The node whose outgoing arcs are processed. </param>
 /// <param name="tailLength">
 ///    Tail length of <paramref name="node"/>; each target is compiled with
 ///    <c>tailLength - 1</c>.
 /// </param>
 private void CompileAllTargets(UnCompiledNode<T> node, int tailLength)
 {
     int targetTailLength = tailLength - 1;

     for (int i = 0; i < node.NumArcs; i++)
     {
         Arc<T> current = node.Arcs[i];
         if (current.Target.Compiled)
         {
             // already compiled, nothing to do for this arc
             continue;
         }

         UnCompiledNode<T> pending = (UnCompiledNode<T>)current.Target;
         if (pending.NumArcs == 0)
         {
             // a target without outgoing arcs is forced to be final, as is the arc into it
             pending.IsFinal = true;
             current.IsFinal = true;
         }

         current.Target = CompileNode(pending, targetTailLength);
     }
 }
コード例 #5
0
        /// <summary>
        /// Freezes the remaining tail, compiles the root node and returns the final FST.
        /// NOTE: this will return <c>null</c> if nothing is accepted by the FST.
        /// </summary>
        /// <returns>
        /// The finished FST (the packed form when <c>DoPackFST</c> is set), or
        /// <c>null</c> when the root did not survive and there is no usable empty output.
        /// </returns>
        public virtual FST<T> Finish()
        {
            UnCompiledNode<T> rootNode = Frontier[0];

            // minimize nodes in the last word's suffix
            DoFreezeTail(0);

            bool rootDidNotSurvive = rootNode.InputCount < MinSuffixCount1
                || rootNode.InputCount < MinSuffixCount2
                || rootNode.NumArcs == 0;

            if (rootDidNotSurvive)
            {
                // Either there is no empty output at all, or pruning is active and
                // the empty string got pruned.
                if (Fst.emptyOutput == null || MinSuffixCount1 > 0 || MinSuffixCount2 > 0)
                {
                    return null;
                }
            }
            else if (MinSuffixCount2 != 0)
            {
                CompileAllTargets(rootNode, LastInput.Length);
            }

            long rootAddress = CompileNode(rootNode, LastInput.Length).Node;
            Fst.Finish(rootAddress);

            if (DoPackFST)
            {
                return Fst.Pack(3, Math.Max(10, (int)(Fst.NodeCount / 4)), AcceptableOverheadRatio);
            }

            return Fst;
        }
コード例 #6
0
        // for debugging

        /*
         * private String toString(BytesRef b) {
         * try {
         *  return b.utf8ToString() + " " + b;
         * } catch (Throwable t) {
         *  return b.toString();
         * }
         * }
         */

        /// <summary>
        /// Adds one input/output pair to the FST under construction.
        /// <para>
        /// It's OK to add the same input twice in a row with
        /// different outputs, as long as the outputs implementation
        /// supports the merge method. Note that input is fully consumed
        /// after this method returns (so the caller is free to reuse it), but
        /// output is not.  So if your outputs are changeable (e.g.
        /// <seealso cref="ByteSequenceOutputs"/> or
        /// <seealso cref="IntSequenceOutputs"/>) then you cannot reuse them across
        /// calls.
        /// </para>
        /// </summary>
        /// <param name="input">
        ///    The input sequence. A debug assertion checks that it does not compare
        ///    lower than the previously added input. An empty input only marks the
        ///    root as final and records the FST's empty output.
        /// </param>
        /// <param name="output">
        ///    The output for <paramref name="input"/>. A value equal to
        ///    <c>NO_OUTPUT</c> is replaced by the <c>NO_OUTPUT</c> instance itself.
        /// </param>
        public virtual void Add(IntsRef input, T output)
        {
            /*
             * if (DEBUG) {
             * BytesRef b = new BytesRef(input.length);
             * for(int x=0;x<input.length;x++) {
             *  b.bytes[x] = (byte) input.ints[x];
             * }
             * b.length = input.length;
             * if (output == NO_OUTPUT) {
             *  System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
             * } else {
             *  System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
             * }
             * }
             */

            // De-dup NO_OUTPUT since it must be a singleton:
            if (output.Equals(NO_OUTPUT))
            {
                output = NO_OUTPUT;
            }

            Debug.Assert(LastInput.Length == 0 || input.CompareTo(LastInput) >= 0, "inputs are added out of order lastInput=" + LastInput + " vs input=" + input);
            Debug.Assert(ValidOutput(output));

            //System.out.println("\nadd: " + input);
            if (input.Length == 0)
            {
                // empty input: only allowed as first input.  we have
                // to special case this because the packed FST
                // format cannot represent the empty input since
                // 'finalness' is stored on the incoming arc, not on
                // the node
                Frontier[0].InputCount++;
                Frontier[0].IsFinal = true;
                Fst.EmptyOutput     = output;
                return;
            }

            // compare shared prefix length
            // pos1 indexes LastInput / Frontier, pos2 indexes input (starting at its offset)
            int pos1     = 0;
            int pos2     = input.Offset;
            int pos1Stop = Math.Min(LastInput.Length, input.Length);

            // Every frontier node on the shared prefix (including the node where the
            // inputs diverge) gets its input count bumped.
            while (true)
            {
                Frontier[pos1].InputCount++;
                //System.out.println("  incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]);
                if (pos1 >= pos1Stop || LastInput.Ints[pos1] != input.Ints[pos2])
                {
                    break;
                }
                pos1++;
                pos2++;
            }
            // length of the prefix shared with the previous input, plus one
            int prefixLenPlus1 = pos1 + 1;

            // Grow the frontier so it has one node per input position plus the root;
            // existing nodes are kept and the new slots are filled with fresh nodes.
            if (Frontier.Length < input.Length + 1)
            {
                UnCompiledNode <T>[] next = new UnCompiledNode <T> [ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
                Array.Copy(Frontier, 0, next, 0, Frontier.Length);
                for (int idx = Frontier.Length; idx < next.Length; idx++)
                {
                    next[idx] = new UnCompiledNode <T>(this, idx);
                }
                Frontier = next;
            }

            // minimize/compile states from previous input's
            // orphan'd suffix
            DoFreezeTail(prefixLenPlus1);

            // init tail states for current input
            for (int idx = prefixLenPlus1; idx <= input.Length; idx++)
            {
                Frontier[idx - 1].AddArc(input.Ints[input.Offset + idx - 1], Frontier[idx]);
                Frontier[idx].InputCount++;
            }

            UnCompiledNode <T> lastNode = Frontier[input.Length];

            // Skipped only when this input is identical to the previous one (same
            // length and fully shared prefix); in that case the node keeps its
            // existing output so it can be merged further below.
            if (LastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1)
            {
                lastNode.IsFinal = true;
                lastNode.Output  = NO_OUTPUT;
            }

            // push conflicting outputs forward, only as far as
            // needed
            for (int idx = 1; idx < prefixLenPlus1; idx++)
            {
                UnCompiledNode <T> node       = Frontier[idx];
                UnCompiledNode <T> parentNode = Frontier[idx - 1];

                T lastOutput = parentNode.GetLastOutput(input.Ints[input.Offset + idx - 1]);
                Debug.Assert(ValidOutput(lastOutput));

                T commonOutputPrefix;
                T wordSuffix;

                // reference comparison: NO_OUTPUT is treated as a singleton
                if ((object)lastOutput != (object)NO_OUTPUT)
                {
                    // keep only the part common to both outputs on the parent's arc
                    // and prepend the remainder of the old output to the child node
                    commonOutputPrefix = Fst.Outputs.Common(output, lastOutput);
                    Debug.Assert(ValidOutput(commonOutputPrefix));
                    wordSuffix = Fst.Outputs.Subtract(lastOutput, commonOutputPrefix);
                    Debug.Assert(ValidOutput(wordSuffix));
                    parentNode.SetLastOutput(input.Ints[input.Offset + idx - 1], commonOutputPrefix);
                    node.PrependOutput(wordSuffix);
                }
                else
                {
                    commonOutputPrefix = wordSuffix = NO_OUTPUT;
                }

                // what is left of this input's output after the shared part
                output = Fst.Outputs.Subtract(output, commonOutputPrefix);
                Debug.Assert(ValidOutput(output));
            }

            if (LastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length)
            {
                // same input more than 1 time in a row, mapping to
                // multiple outputs
                lastNode.Output = Fst.Outputs.Merge(lastNode.Output, output);
            }
            else
            {
                // this new arc is private to this new input; set its
                // arc output to the leftover output:
                Frontier[prefixLenPlus1 - 1].SetLastOutput(input.Ints[input.Offset + prefixLenPlus1 - 1], output);
            }

            // save last input
            LastInput.CopyInts(input);

            //System.out.println("  count[0]=" + frontier[0].inputCount);
        }
コード例 #7
0
        /// <summary>
        /// Freezes the tail of the previous input: walks the frontier nodes from
        /// index <c>LastInput.Length</c> down to <c>max(1, prefixLenPlus1)</c> and,
        /// for each one, prunes it, compiles it, or leaves it undecided.
        /// If a custom freeze-tail plugin was supplied, all of the work is
        /// delegated to it instead.
        /// </summary>
        /// <param name="prefixLenPlus1">
        ///    One plus the length of the prefix that the previous input shares with
        ///    the input being added; <c>Finish</c> passes 0.
        /// </param>
        private void DoFreezeTail(int prefixLenPlus1)
        {
            if (FreezeTail_Renamed != null)
            {
                // Custom plugin:
                FreezeTail_Renamed.Freeze(Frontier, prefixLenPlus1, LastInput);
            }
            else
            {
                //System.out.println("  compileTail " + prefixLenPlus1);
                // never process index 0 (the root): every node handled here has a parent
                int downTo = Math.Max(1, prefixLenPlus1);
                for (int idx = LastInput.Length; idx >= downTo; idx--)
                {
                    bool doPrune   = false;
                    bool doCompile = false;

                    UnCompiledNode <T> node   = Frontier[idx];
                    UnCompiledNode <T> parent = Frontier[idx - 1];

                    if (node.InputCount < MinSuffixCount1)
                    {
                        // too few inputs went through this node: prune it
                        doPrune   = true;
                        doCompile = true;
                    }
                    else if (idx > prefixLenPlus1)
                    {
                        // prune if parent's inputCount is less than suffixMinCount2
                        if (parent.InputCount < MinSuffixCount2 || (MinSuffixCount2 == 1 && parent.InputCount == 1 && idx > 1))
                        {
                            // my parent, about to be compiled, doesn't make the cut, so
                            // I'm definitely pruned

                            // if minSuffixCount2 is 1, we keep only up
                            // until the 'distinguished edge', ie we keep only the
                            // 'divergent' part of the FST. if my parent, about to be
                            // compiled, has inputCount 1 then we are already past the
                            // distinguished edge.  NOTE: this only works if
                            // the FST outputs are not "compressible" (simple
                            // ords ARE compressible).
                            doPrune = true;
                        }
                        else
                        {
                            // my parent, about to be compiled, does make the cut, so
                            // I'm definitely not pruned
                            doPrune = false;
                        }
                        doCompile = true;
                    }
                    else
                    {
                        // if pruning is disabled (count is 0) we can always
                        // compile current node
                        doCompile = MinSuffixCount2 == 0;
                    }

                    //System.out.println("    label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune);

                    // Same test as the parent check above, applied to this node itself:
                    // if it fails, all of its targets are cleared and its arcs dropped.
                    if (node.InputCount < MinSuffixCount2 || (MinSuffixCount2 == 1 && node.InputCount == 1 && idx > 1))
                    {
                        // drop all arcs
                        for (int arcIdx = 0; arcIdx < node.NumArcs; arcIdx++)
                        {
                            UnCompiledNode <T> target = (UnCompiledNode <T>)node.Arcs[arcIdx].Target;
                            target.Clear();
                        }
                        node.NumArcs = 0;
                    }

                    if (doPrune)
                    {
                        // this node doesn't make it -- deref it
                        node.Clear();
                        parent.DeleteLast(LastInput.Ints[LastInput.Offset + idx - 1], node);
                    }
                    else
                    {
                        if (MinSuffixCount2 != 0)
                        {
                            CompileAllTargets(node, LastInput.Length - idx);
                        }
                        // read the output before CompileNode below, which clears the node
                        T nextFinalOutput = node.Output;

                        // We "fake" the node as being final if it has no
                        // outgoing arcs; in theory we could leave it
                        // as non-final (the FST can represent this), but
                        // FSTEnum, Util, etc., have trouble w/ non-final
                        // dead-end states:
                        bool isFinal = node.IsFinal || node.NumArcs == 0;

                        if (doCompile)
                        {
                            // this node makes it and we now compile it.  first,
                            // compile any targets that were previously
                            // undecided:
                            parent.ReplaceLast(LastInput.Ints[LastInput.Offset + idx - 1], CompileNode(node, 1 + LastInput.Length - idx), nextFinalOutput, isFinal);
                        }
                        else
                        {
                            // replaceLast just to install
                            // nextFinalOutput/isFinal onto the arc
                            parent.ReplaceLast(LastInput.Ints[LastInput.Offset + idx - 1], node, nextFinalOutput, isFinal);
                            // this node will stay in play for now, since we are
                            // undecided on whether to prune it.  later, it
                            // will be either compiled or pruned, so we must
                            // allocate a new node:
                            Frontier[idx] = new UnCompiledNode <T>(this, idx);
                        }
                    }
                }
            }
        }