private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength)
{
    long node;
    long bytesPosStart = bytes.getPosition();
    //TODO: dedup hash
    node = fst.addNode(this, nodeIn);
    Debug.Assert(node != -2);
    long bytesPosEnd = bytes.getPosition();
    if (bytesPosEnd != bytesPosStart)
    {
        // The FST added a new node:
        Debug.Assert(bytesPosEnd > bytesPosStart);
        lastFrozenNode = node;
    }

    nodeIn.clear();

    CompiledNode fn = new CompiledNode();
    fn.node = node;
    return fn;
}
/// Returns the final FST. NOTE: this will return null if
/// nothing is accepted by the FST.
public FST<T> finish()
{
    UnCompiledNode<T> root = frontier[0];

    // minimize nodes in the last word's suffix
    freezeTail(0);
    if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0)
    {
        if (fst.emptyOutput == null)
        {
            return null;
        }
        else if (minSuffixCount1 > 0 || minSuffixCount2 > 0)
        {
            // empty string got pruned
            return null;
        }
    }
    else
    {
        if (minSuffixCount2 != 0)
        {
            compileAllTargets(root, lastInput.length);
        }
    }
    fst.finish(compileNode(root, lastInput.length).node);
    return fst;
}
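// Typical usage of this Builder, as a minimal sketch (not part of this class).
// The constructor arguments, the NoOutputs singleton, and the Util.toIntsRef
// helper shown here are assumptions modeled on the Lucene FST API and may not
// exist (or may differ) in this port. Inputs must be added in sorted order,
// and finish() may return null if nothing was accepted.
//
//   var outputs = NoOutputs.getSingleton();                 // assumed: no per-term outputs
//   var builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, outputs);
//   var scratch = new IntsRef();
//   foreach (var term in sortedTerms)                       // MUST be pre-sorted
//   {
//       builder.add(Util.toIntsRef(term, scratch), outputs.getNoOutput());
//   }
//   FST<object> fst = builder.finish();                     // null if nothing was accepted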
private void writeNodeForBinarySearch(Builder<T> builder, UnCompiledNode<T> nodeIn, long startAddress, int maxBytesPerArc)
{
    // Build the header in a buffer.
    // It is a false/special arc which is in fact a node header with node flags, followed by node metadata.
    builder.fixedLengthArcsBuffer
        .resetPosition()
        .writeByte(ARCS_FOR_BINARY_SEARCH)
        .writeVInt(nodeIn.numArcs)
        .writeVInt(maxBytesPerArc);
    int headerLen = builder.fixedLengthArcsBuffer.getPosition();

    // Expand the arcs in place, backwards.
    long srcPos = builder.bytes.getPosition();
    // Cast to long before multiplying to avoid int overflow on very large nodes.
    long destPos = startAddress + headerLen + nodeIn.numArcs * (long)maxBytesPerArc;
    Debug.Assert(destPos >= srcPos);
    if (destPos > srcPos)
    {
        builder.bytes.skipBytes((int)(destPos - srcPos));
        for (int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--)
        {
            destPos -= maxBytesPerArc;
            int arcLen = builder.numBytesPerArc[arcIdx];
            srcPos -= arcLen;
            if (srcPos != destPos)
            {
                Debug.Assert(destPos > srcPos,
                    "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx
                    + " maxBytesPerArc=" + maxBytesPerArc + " arcLen=" + arcLen
                    + " nodeIn.numArcs=" + nodeIn.numArcs);
                builder.bytes.copyBytes(srcPos, destPos, arcLen);
            }
        }
    }

    // Write the header.
    builder.bytes.writeBytes(startAddress, builder.fixedLengthArcsBuffer.getBytes(), 0, headerLen);
}
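// Sketch of the in-buffer layout this produces (before addNode() later reverses
// the node's bytes), purely as an illustration:
//
//   [header: ARCS_FOR_BINARY_SEARCH | numArcs | maxBytesPerArc]
//   [arc 0, padded to maxBytesPerArc bytes]
//   [arc 1, padded to maxBytesPerArc bytes]
//   ...
//   [arc numArcs-1, padded to maxBytesPerArc bytes]
//
// Because every arc slot has the same width, a reader can jump directly to arc i
// and binary-search the labels instead of scanning the arcs sequentially.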
private void compileAllTargets(UnCompiledNode<T> node, int tailLength)
{
    for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++)
    {
        Arc<T> arc = node.arcs[arcIdx];
        if (!arc.target.isCompiled())
        {
            // not yet compiled
            UnCompiledNode<T> n = (UnCompiledNode<T>)arc.target;
            if (n.numArcs == 0)
            {
                arc.isFinal = n.isFinal = true;
            }
            arc.target = compileNode(n, tailLength - 1);
        }
    }
}
public Builder<T> add(IntsRef input, T output)
{
    Debug.Assert(lastInput.length == 0 || input.CompareTo(lastInput) >= 0,
        "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);

    if (input.length == 0)
    {
        // empty input: only allowed as first input. we have
        // to special case this because the packed FST
        // format cannot represent the empty input since
        // 'finalness' is stored on the incoming arc, not on
        // the node
        frontier[0].inputCount++;
        frontier[0].isFinal = true;
        fst.setEmptyOutput(output);
        return this;
    }

    // compare shared prefix length
    int pos1 = 0;
    int pos2 = input.offset;
    int pos1Stop = Math.Min(lastInput.length, input.length);
    while (true)
    {
        frontier[pos1].inputCount++;
        if (pos1 >= pos1Stop || lastInput.ints[pos1] != input.ints[pos2])
        {
            break;
        }
        pos1++;
        pos2++;
    }
    int prefixLenPlus1 = pos1 + 1;

    if (frontier.Length < input.length + 1)
    {
        UnCompiledNode<T>[] next = ArrayUtil.grow(frontier, input.length + 1);
        for (int idx = frontier.Length; idx < next.Length; idx++)
        {
            next[idx] = new UnCompiledNode<T>(this, idx);
        }
        frontier = next;
    }

    // minimize/compile states from previous input's
    // orphan'd suffix
    freezeTail(prefixLenPlus1);

    // init tail states for current input
    for (int idx = prefixLenPlus1; idx <= input.length; idx++)
    {
        frontier[idx - 1].addArc(input.ints[input.offset + idx - 1], frontier[idx]);
        frontier[idx].inputCount++;
    }

    UnCompiledNode<T> lastNode = frontier[input.length];
    if (lastInput.length != input.length || prefixLenPlus1 != input.length + 1)
    {
        lastNode.isFinal = true;
        lastNode.output = NO_OUTPUT;
    }

    // push conflicting outputs forward, only as far as
    // needed
    for (int idx = 1; idx < prefixLenPlus1; idx++)
    {
        UnCompiledNode<T> node = frontier[idx];
        UnCompiledNode<T> parentNode = frontier[idx - 1];

        T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]);

        T commonOutputPrefix;
        T wordSuffix;

        if (!lastOutput.Equals(NO_OUTPUT))
        {
            commonOutputPrefix = fst.outputs.common(output, lastOutput);
            wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix);
            parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix);
            node.prependOutput(wordSuffix);
        }
        else
        {
            commonOutputPrefix = wordSuffix = NO_OUTPUT;
        }

        output = fst.outputs.subtract(output, commonOutputPrefix);
    }

    if (lastInput.length == input.length && prefixLenPlus1 == 1 + input.length)
    {
        // same input more than 1 time in a row, mapping to
        // multiple outputs
        lastNode.output = fst.outputs.merge(lastNode.output, output);
    }
    else
    {
        // this new arc is private to this new input; set its
        // arc output to the leftover output:
        frontier[prefixLenPlus1 - 1].setLastOutput(input.ints[input.offset + prefixLenPlus1 - 1], output);
    }

    // save last input
    lastInput = (IntsRef)input.Clone();

    return this;
}
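// Worked example of the output-pushing loop above (a sketch, assuming
// PositiveIntOutputs-style long outputs where common() is min and subtract()
// is arithmetic subtraction):
//
//   add("mop", 100)  -> for now the arc 'm' carries 100.
//   add("moth", 91)  -> shared prefix "mo"; common(91, 100) = 91, so arc 'm'
//                       is lowered to 91, the leftover 9 (100 - 91) is pushed
//                       down onto "mop"'s private suffix, and "moth" carries
//                       nothing extra beyond the shared prefix.
//
// The net effect is that each accepted input's output is recovered by summing
// the arc outputs along its path.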
private void freezeTail(int prefixLenPlus1)
{
    int downTo = Math.Max(1, prefixLenPlus1);
    for (int idx = lastInput.length; idx >= downTo; idx--)
    {
        bool doPrune = false;
        bool doCompile = false;

        UnCompiledNode<T> node = frontier[idx];
        UnCompiledNode<T> parent = frontier[idx - 1];

        if (node.inputCount < minSuffixCount1)
        {
            doPrune = true;
            doCompile = true;
        }
        else if (idx > prefixLenPlus1)
        {
            // prune if parent's inputCount is less than suffixMinCount2
            if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1))
            {
                // my parent, about to be compiled, doesn't make the cut, so
                // I'm definitely pruned

                // if minSuffixCount2 is 1, we keep only up
                // until the 'distinguished edge', ie we keep only the
                // 'divergent' part of the FST. if my parent, about to be
                // compiled, has inputCount 1 then we are already past the
                // distinguished edge. NOTE: this only works if
                // the FST outputs are not "compressible" (simple
                // ords ARE compressible).
                doPrune = true;
            }
            else
            {
                // my parent, about to be compiled, does make the cut, so
                // I'm definitely not pruned
                doPrune = false;
            }
            doCompile = true;
        }
        else
        {
            // if pruning is disabled (count is 0) we can always
            // compile current node
            doCompile = minSuffixCount2 == 0;
        }

        if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1))
        {
            // drop all arcs
            for (int arcIdx = 0; arcIdx < node.numArcs; arcIdx++)
            {
                UnCompiledNode<T> target = (UnCompiledNode<T>)node.arcs[arcIdx].target;
                target.clear();
            }
            node.numArcs = 0;
        }

        if (doPrune)
        {
            // this node doesn't make it -- deref it
            node.clear();
            parent.deleteLast(lastInput.ints[idx - 1], node);
        }
        else
        {
            if (minSuffixCount2 != 0)
            {
                //TODO: minSuffixCount2 is always 0 for now.
                //compileAllTargets(node, lastInput.length()-idx);
            }
            T nextFinalOutput = node.output;

            // We "fake" the node as being final if it has no
            // outgoing arcs; in theory we could leave it
            // as non-final (the FST can represent this), but
            // FSTEnum, Util, etc., have trouble w/ non-final
            // dead-end states:
            bool isFinal = node.isFinal || node.numArcs == 0;

            if (doCompile)
            {
                // this node makes it and we now compile it. first,
                // compile any targets that were previously
                // undecided:
                parent.replaceLast(lastInput.ints[idx - 1],
                    compileNode(node, 1 + lastInput.length - idx),
                    nextFinalOutput,
                    isFinal);
            }
            else
            {
                // replaceLast just to install
                // nextFinalOutput/isFinal onto the arc
                parent.replaceLast(lastInput.ints[idx - 1], node, nextFinalOutput, isFinal);

                // this node will stay in play for now, since we are
                // undecided on whether to prune it. later, it
                // will be either compiled or pruned, so we must
                // allocate a new node:
                frontier[idx] = new UnCompiledNode<T>(this, idx);
            }
        }
    }
}
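// Illustration (a sketch): after add("cat") and then add("cow"), add() calls
// freezeTail(2) because only the prefix "c" is shared. The loop walks the
// frontier from the end of the previous input ("cat", idx 3) back down to
// downTo = 2, so the nodes reached by "cat" and "ca" are compiled (or pruned,
// if the minSuffix counts say so) and detached, while frontier[0] ("") and
// frontier[1] ("c") stay live for the new input.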
// serializes new node by appending its bytes to the end
// of the current byte[]
public long addNode(Builder<T> builder, UnCompiledNode<T> nodeIn)
{
    T NO_OUTPUT = outputs.getNoOutput();

    if (nodeIn.numArcs == 0)
    {
        if (nodeIn.isFinal)
        {
            return FINAL_END_NODE;
        }
        else
        {
            return NON_FINAL_END_NODE;
        }
    }
    long startAddress = builder.bytes.getPosition();

    bool doFixedLengthArcs = shouldExpandNodeWithFixedLengthArcs(builder, nodeIn);
    if (doFixedLengthArcs)
    {
        if (builder.numBytesPerArc.Length < nodeIn.numArcs)
        {
            builder.numBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 4)];
            builder.numLabelBytesPerArc = new int[builder.numBytesPerArc.Length];
        }
    }

    builder.arcCount += nodeIn.numArcs;

    int lastArc = nodeIn.numArcs - 1;

    long lastArcStart = builder.bytes.getPosition();
    int maxBytesPerArc = 0;
    int maxBytesPerArcWithoutLabel = 0;
    for (int arcIdx = 0; arcIdx < nodeIn.numArcs; arcIdx++)
    {
        Arc<T> arc = nodeIn.arcs[arcIdx];
        CompiledNode target = (CompiledNode)arc.target;
        int flags = 0;

        if (arcIdx == lastArc)
        {
            flags += BIT_LAST_ARC;
        }

        if (builder.lastFrozenNode == target.node && !doFixedLengthArcs)
        {
            // TODO: for better perf (but more RAM used) we
            // could avoid this except when arc is "near" the
            // last arc:
            flags += BIT_TARGET_NEXT;
        }

        if (arc.isFinal)
        {
            flags += BIT_FINAL_ARC;
            if (!NO_OUTPUT.Equals(arc.nextFinalOutput))
            {
                flags += BIT_ARC_HAS_FINAL_OUTPUT;
            }
        }
        else
        {
            Debug.Assert(NO_OUTPUT.Equals(arc.nextFinalOutput));
        }

        bool targetHasArcs = target.node > 0;
        if (!targetHasArcs)
        {
            flags += BIT_STOP_NODE;
        }

        if (!NO_OUTPUT.Equals(arc.output))
        {
            flags += BIT_ARC_HAS_OUTPUT;
        }

        builder.bytes.writeByte((byte)flags);
        long labelStart = builder.bytes.getPosition();
        writeLabel(builder.bytes, arc.label);
        int numLabelBytes = (int)(builder.bytes.getPosition() - labelStart);

        if (!NO_OUTPUT.Equals(arc.output))
        {
            throw new NotImplementedException();
            //TODO: outputs.write(arc.output, builder.bytes);
        }

        if (!NO_OUTPUT.Equals(arc.nextFinalOutput))
        {
            throw new NotImplementedException();
            //TODO: outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes);
        }

        if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0)
        {
            Debug.Assert(target.node > 0);
            builder.bytes.writeVLong(target.node);
        }

        // just write the arcs "like normal" on the first pass, but record how many
        // bytes each one took and the max byte size:
        if (doFixedLengthArcs)
        {
            int numArcBytes = (int)(builder.bytes.getPosition() - lastArcStart);
            builder.numBytesPerArc[arcIdx] = numArcBytes;
            builder.numLabelBytesPerArc[arcIdx] = numLabelBytes;
            lastArcStart = builder.bytes.getPosition();
            maxBytesPerArc = Math.Max(maxBytesPerArc, numArcBytes);
            maxBytesPerArcWithoutLabel = Math.Max(maxBytesPerArcWithoutLabel, numArcBytes - numLabelBytes);
        }
    }

    if (doFixedLengthArcs)
    {
        Debug.Assert(maxBytesPerArc > 0);
        // 2nd pass just "expands" all arcs to take up a fixed byte size

        int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
        Debug.Assert(labelRange > 0);
        if (shouldExpandNodeWithDirectAddressing(builder, nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange))
        {
            //writeNodeForDirectAddressing(builder, nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
            //builder.directAddressingNodeCount++;
            throw new NotImplementedException();
        }
        else
        {
            writeNodeForBinarySearch(builder, nodeIn, startAddress, maxBytesPerArc);
            builder.binarySearchNodeCount++;
        }
    }

    long thisNodeAddress = builder.bytes.getPosition() - 1;
    builder.bytes.reverse(startAddress, thisNodeAddress);
    builder.nodeCount++;
    return thisNodeAddress;
}
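// Per-arc layout written by the first pass above (a sketch; which fields are
// present depends on the flag bits):
//
//   [flags: 1 byte] [label: vInt] [output?] [final output?] [target: vLong,
//    unless BIT_TARGET_NEXT or BIT_STOP_NODE is set]
//
// BIT_TARGET_NEXT is the main space saver: when the target is the node frozen
// immediately before this one, no target address is stored at all, because
// after reverse() the target simply follows this node in the byte array.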
private bool shouldExpandNodeWithFixedLengthArcs(Builder<T> builder, UnCompiledNode<T> node)
{
    return builder.allowFixedLengthArcs &&
           ((node.depth <= FIXED_LENGTH_ARC_SHALLOW_DEPTH && node.numArcs >= FIXED_LENGTH_ARC_SHALLOW_NUM_ARCS) ||
            node.numArcs >= FIXED_LENGTH_ARC_DEEP_NUM_ARCS);
}
private bool shouldExpandNodeWithDirectAddressing(Builder<T> builder, UnCompiledNode<T> nodeIn,
    int numBytesPerArc, int maxBytesPerArcWithoutLabel, int labelRange)
{
    //TODO: direct addressing is not implemented in this port yet; always fall
    // back to the binary-search layout.
    return false;
}