// Use the builder to create:
 private NormalizeCharMap(FST<CharsRef> map)
 {
     this.map = map;
     if (map != null)
     {
         try
         {
             // Pre-cache root arcs:
             var scratchArc = new FST.Arc<CharsRef>();
             FST.BytesReader fstReader = map.BytesReader;
             map.GetFirstArc(scratchArc);
             if (FST<CharsRef>.TargetHasArcs(scratchArc))
             {
                 map.ReadFirstRealTargetArc(scratchArc.Target, scratchArc, fstReader);
                 while (true)
                 {
                     Debug.Assert(scratchArc.Label != FST.END_LABEL);
                     cachedRootArcs[Convert.ToChar((char)scratchArc.Label)] = (new FST.Arc<CharsRef>()).CopyFrom(scratchArc);
                     if (scratchArc.IsLast)
                     {
                         break;
                     }
                     map.ReadNextRealArc(scratchArc, fstReader);
                 }
             }
             //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
         }
         catch (IOException ioe)
         {
             // Bogus FST IOExceptions!!  (will never happen)
             throw new Exception("Should never happen", ioe);
         }
     }
 }
	  /// <summary>
	  /// Default constructor that takes a <seealso cref="Reader"/>. </summary>
	  public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
	  {
		buffer.Reset(@in);

		map = normMap.map;
		cachedRootArcs = normMap.cachedRootArcs;

		if (map != null)
		{
		  fstReader = map.BytesReader;
		}
		else
		{
		  fstReader = null;
		}
	  }
        /// <summary>
        /// Default constructor that takes a <seealso cref="TextReader"/>. </summary>
        public MappingCharFilter(NormalizeCharMap normMap, TextReader @in) : base(@in)
        {
            //LUCENENET support to reset the reader.
            _input = GetBufferedReader(@in);
            _input.Mark(BufferedCharFilter.defaultCharBufferSize);
            buffer.Reset(_input);
            //buffer.Reset(@in);

            map = normMap.map;
            cachedRootArcs = normMap.cachedRootArcs;

            if (map != null)
            {
                fstReader = map.BytesReader;
            }
            else
            {
                fstReader = null;
            }
        }
	  // Use the builder to create:
	  private NormalizeCharMap(FST<CharsRef> map)
	  {
		this.map = map;
		if (map != null)
		{
		  try
		  {
			// Pre-cache root arcs:
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.CharsRef> scratchArc = new org.apache.lucene.util.fst.FST.Arc<>();
			FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.apache.lucene.util.fst.FST.BytesReader fstReader = map.getBytesReader();
			FST.BytesReader fstReader = map.BytesReader;
			map.getFirstArc(scratchArc);
			if (FST.targetHasArcs(scratchArc))
			{
			  map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
			  while (true)
			  {
				Debug.Assert(scratchArc.label != FST.END_LABEL);
				cachedRootArcs[Convert.ToChar((char) scratchArc.label)] = (new FST.Arc<CharsRef>()).copyFrom(scratchArc);
				if (scratchArc.Last)
				{
				  break;
				}
				map.readNextRealArc(scratchArc, fstReader);
			  }
			}
			//System.out.println("cached " + cachedRootArcs.size() + " root arcs");
		  }
		  catch (IOException ioe)
		  {
			// Bogus FST IOExceptions!!  (will never happen)
			throw new Exception(ioe);
		  }
		}
	  }
Beispiel #5
0
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum)
 {
     this.entry       = entry;
     this.docToOrd    = docToOrd;
     this.fst         = fst;
     this.@in         = @in;
     this.firstArc    = firstArc;
     this.scratchArc  = scratchArc;
     this.scratchInts = scratchInts;
     this.fstEnum     = fstEnum;
 }
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public TermsReader(index.FieldInfos fieldInfos, store.IndexInput in, int termCount) throws java.io.IOException
		public TermsReader(FieldInfos fieldInfos, IndexInput @in, int termCount)
		{
		  this.termCount = termCount;
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int fieldNumber = in.readVInt();
		  int fieldNumber = @in.readVInt();
		  field = fieldInfos.fieldInfo(fieldNumber);
		  if (field.IndexOptions != IndexOptions.DOCS_ONLY)
		  {
			sumTotalTermFreq = @in.readVLong();
		  }
		  else
		  {
			sumTotalTermFreq = -1;
		  }
		  sumDocFreq = @in.readVLong();
		  docCount = @in.readVInt();

		  fst = new FST<>(@in, outputs);
		}
Beispiel #7
0
        public override int Read()
        {
            //System.out.println("\nread");
            while (true)
            {
                if (replacement != null && replacementPointer < replacement.Length)
                {
                    //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
                    return(replacement.Chars[replacement.Offset + replacementPointer++]);
                }

                // TODO: a more efficient approach would be Aho/Corasick's
                // algorithm
                // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                // or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                //
                // I think this would be (almost?) equivalent to 1) adding
                // epsilon arcs from all final nodes back to the init
                // node in the FST, 2) adding a .* (skip any char)
                // loop on the initial node, and 3) determinizing
                // that.  Then we would not have to Restart matching
                // at each position.

                int      lastMatchLen = -1;
                CharsRef lastMatch    = null;

                int firstCH = buffer.Get(inputOff);
                if (firstCH != -1)
                {
                    // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
                    char key = Convert.ToChar((char)firstCH);
                    if (cachedRootArcs.TryGetValue(key, out FST.Arc <CharsRef> arc) && arc != null)
                    {
                        if (!FST.TargetHasArcs(arc))
                        {
                            // Fast pass for single character match:
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(arc.IsFinal);
                            }
                            lastMatchLen = 1;
                            lastMatch    = arc.Output;
                        }
                        else
                        {
                            int      lookahead = 0;
                            CharsRef output    = arc.Output;
                            while (true)
                            {
                                lookahead++;

                                if (arc.IsFinal)
                                {
                                    // Match! (to node is final)
                                    lastMatchLen = lookahead;
                                    lastMatch    = outputs.Add(output, arc.NextFinalOutput);
                                    // Greedy: keep searching to see if there's a
                                    // longer match...
                                }

                                if (!FST.TargetHasArcs(arc))
                                {
                                    break;
                                }

                                int ch = buffer.Get(inputOff + lookahead);
                                if (ch == -1)
                                {
                                    break;
                                }
                                if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) is null)
                                {
                                    // Dead end
                                    break;
                                }
                                output = outputs.Add(output, arc.Output);
                            }
                        }
                    }
                }

                if (lastMatch != null)
                {
                    inputOff += lastMatchLen;
                    //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);
                    int diff = lastMatchLen - lastMatch.Length;

                    if (diff != 0)
                    {
                        int prevCumulativeDiff = LastCumulativeDiff;
                        if (diff > 0)
                        {
                            // Replacement is shorter than matched input:
                            AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                        }
                        else
                        {
                            // Replacement is longer than matched input: remap
                            // the "extra" chars all back to the same input
                            // offset:
                            int outputStart = inputOff - prevCumulativeDiff;
                            for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                            {
                                AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                            }
                        }
                    }

                    replacement        = lastMatch;
                    replacementPointer = 0;
                }
                else
                {
                    int ret = buffer.Get(inputOff);
                    if (ret != -1)
                    {
                        inputOff++;
                        buffer.FreeBefore(inputOff);
                    }
                    return(ret);
                }
            }
        }
            private void LoadTermsIndex()
            {
                if (Fst != null) return;

                var clone = (IndexInput) _vgtir._input.Clone();
                clone.Seek(_indexStart);
                Fst = new FST<long?>(clone, _vgtir._fstOutputs);
                clone.Dispose();

                /*
                final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
                Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
                Util.toDot(fst, w, false, false);
                System.out.println("FST INDEX: SAVED to " + dotFileName);
                w.close();
                */

                if (_vgtir._indexDivisor > 1)
                {
                    // subsample
                    var scratchIntsRef = new IntsRef();
                    var outputs = PositiveIntOutputs.Singleton;
                    var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
                    var fstEnum = new BytesRefFSTEnum<long?>(Fst);
                    var count = _vgtir._indexDivisor;
                        
                    BytesRefFSTEnum<long?>.InputOutput<long?> result;
                    while ((result = fstEnum.Next()) != null)
                    {
                        if (count == _vgtir._indexDivisor)
                        {
                            builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
                            count = 0;
                        }
                        count++;
                    }
                    Fst = builder.Finish();
                }
            }
 public SimpleTextTermsEnum(SimpleTextFieldsReader outerInstance,
     FST<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair> fst, IndexOptions indexOptions)
 {
     _outerInstance = outerInstance;
     _indexOptions = indexOptions;
     _fstEnum = new BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(fst);
 }
Beispiel #10
0
        // Uncomment for debugging:

        /*
         * public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException {
         * Writer w = new OutputStreamWriter(new FileOutputStream(filePath));
         * toDot(fst, w, true, true);
         * w.close();
         * }
         */

        /// <summary>
        /// Reads the first arc greater or equal that the given label into the provided
        /// arc in place and returns it iff found, otherwise return <code>null</code>.
        /// </summary>
        /// <param name="label"> the label to ceil on </param>
        /// <param name="fst"> the fst to operate on </param>
        /// <param name="follow"> the arc to follow reading the label from </param>
        /// <param name="arc"> the arc to read into in place </param>
        /// <param name="in"> the fst's <seealso cref="BytesReader"/> </param>
        public static FST <T> .Arc <T> readCeilArc <T>(int label, FST <T> fst, FST <T> .Arc <T> follow, FST <T> .Arc <T> arc, FST <T> .BytesReader @in)
        {
            // TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum?
            if (label == FST <T> .END_LABEL)
            {
                if (follow.Final)
                {
                    if (follow.Target <= 0)
                    {
                        arc.Flags = (sbyte)FST <T> .BIT_LAST_ARC;
                    }
                    else
                    {
                        arc.Flags = 0;
                        // NOTE: nextArc is a node (not an address!) in this case:
                        arc.NextArc = follow.Target;
                        arc.Node    = follow.Target;
                    }
                    arc.Output = follow.NextFinalOutput;
                    arc.Label  = FST <T> .END_LABEL;
                    return(arc);
                }
                else
                {
                    return(null);
                }
            }

            if (!FST <T> .TargetHasArcs(follow))
            {
                return(null);
            }
            fst.ReadFirstTargetArc(follow, arc, @in);
            if (arc.BytesPerArc != 0 && arc.Label != FST <T> .END_LABEL)
            {
                // Arcs are fixed array -- use binary search to find
                // the target.

                int low  = arc.ArcIdx;
                int high = arc.NumArcs - 1;
                int mid  = 0;
                // System.out.println("do arc array low=" + low + " high=" + high +
                // " targetLabel=" + targetLabel);
                while (low <= high)
                {
                    mid          = (int)((uint)(low + high) >> 1);
                    @in.Position = arc.PosArcsStart;
                    @in.SkipBytes(arc.BytesPerArc * mid + 1);
                    int midLabel = fst.ReadLabel(@in);
                    int cmp      = midLabel - label;
                    // System.out.println("  cycle low=" + low + " high=" + high + " mid=" +
                    // mid + " midLabel=" + midLabel + " cmp=" + cmp);
                    if (cmp < 0)
                    {
                        low = mid + 1;
                    }
                    else if (cmp > 0)
                    {
                        high = mid - 1;
                    }
                    else
                    {
                        arc.ArcIdx = mid - 1;
                        return(fst.ReadNextRealArc(arc, @in));
                    }
                }
                if (low == arc.NumArcs)
                {
                    // DEAD END!
                    return(null);
                }

                arc.ArcIdx = (low > high ? high : low);
                return(fst.ReadNextRealArc(arc, @in));
            }

            // Linear scan
            fst.ReadFirstRealTargetArc(follow.Target, arc, @in);

            while (true)
            {
                // System.out.println("  non-bs cycle");
                // TODO: we should fix this code to not have to create
                // object for the output of every arc we scan... only
                // for the matching arc, if found
                if (arc.Label >= label)
                {
                    // System.out.println("    found!");
                    return(arc);
                }
                else if (arc.Last)
                {
                    return(null);
                }
                else
                {
                    fst.ReadNextRealArc(arc, @in);
                }
            }
        }
Beispiel #11
0
            /// <summary>
            /// Builds an <see cref="SynonymMap"/> and returns it.
            /// </summary>
            public virtual SynonymMap Build()
            {
                ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                // TODO: are we using the best sharing options?
                var builder = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

                BytesRef            scratch       = new BytesRef(64);
                ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

                ISet <int?> dedupSet;

                if (dedup)
                {
                    dedupSet = new JCG.HashSet <int?>();
                }
                else
                {
                    dedupSet = null;
                }


                var spare = new byte[5];

                ICollection <CharsRef> keys = workingSet.Keys;

                CharsRef[] sortedKeys = new CharsRef[keys.Count];
                keys.CopyTo(sortedKeys, 0);
#pragma warning disable 612, 618
                System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618


                Int32sRef scratchIntsRef = new Int32sRef();

                //System.out.println("fmap.build");
                for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
                {
                    CharsRef input  = sortedKeys[keyIdx];
                    MapEntry output = workingSet[input];

                    int numEntries = output.ords.Count;
                    // output size, assume the worst case
                    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

                    scratch.Grow(estimatedSize);
                    scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
                    Debug.Assert(scratch.Offset == 0);

                    // now write our output data:
                    int count = 0;
                    for (int i = 0; i < numEntries; i++)
                    {
                        if (dedupSet != null)
                        {
                            // box once
                            int?ent = output.ords[i];
                            if (dedupSet.Contains(ent))
                            {
                                continue;
                            }
                            dedupSet.Add(ent);
                        }
                        scratchOutput.WriteVInt32(output.ords[i]);
                        count++;
                    }

                    int pos = scratchOutput.Position;
                    scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
                    int pos2    = scratchOutput.Position;
                    int vIntLen = pos2 - pos;

                    // Move the count + includeOrig to the front of the byte[]:
                    Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
                    Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
                    Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

                    if (dedupSet != null)
                    {
                        dedupSet.Clear();
                    }

                    scratch.Length = scratchOutput.Position - scratch.Offset;
                    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
                }

                FST <BytesRef> fst = builder.Finish();
                return(new SynonymMap(fst, words, maxHorizontalContext));
            }
            //private boolean DEBUG;

            internal FieldReader(BlockTreeTermsReader outerInstance, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn)
            {
                this.OuterInstance = outerInstance;
                Debug.Assert(numTerms > 0);
                this.fieldInfo = fieldInfo;
                //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
                this.NumTerms = numTerms;
                this.SumTotalTermFreq_Renamed = sumTotalTermFreq;
                this.SumDocFreq_Renamed = sumDocFreq;
                this.DocCount_Renamed = docCount;
                this.IndexStartFP = indexStartFP;
                this.RootCode = rootCode;
                this.LongsSize = longsSize;
                // if (DEBUG) {
                //   System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
                // }

                RootBlockFP = (int)((uint)(new ByteArrayDataInput((byte[])(Array)rootCode.Bytes, rootCode.Offset, rootCode.Length)).ReadVLong() >> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS);

                if (indexIn != null)
                {
                    IndexInput clone = (IndexInput)indexIn.Clone();
                    //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
                    clone.Seek(indexStartFP);
                    Index = new FST<BytesRef>(clone, ByteSequenceOutputs.Singleton);

                    /*
                    if (false) {
                      final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
                      Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
                      Util.toDot(index, w, false, false);
                      System.out.println("FST INDEX: SAVED to " + dotFileName);
                      w.close();
                    }
                    */
                }
                else
                {
                    Index = null;
                }
            }
Beispiel #13
0
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            List <string[]> lines = new List <string[]>(400000);

            foreach (string file in csvFiles)
            {
                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
                {
                    Encoding   decoder = Encoding.GetEncoding(encoding);
                    TextReader reader  = new StreamReader(inputStream, decoder);

                    string line = null;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] entry = CSVUtil.Parse(line);

                        if (entry.Length < 13)
                        {
                            Console.WriteLine("Entry in CSV is not valid: " + line);
                            continue;
                        }

                        string[] formatted = FormatEntry(entry);
                        lines.Add(formatted);

                        // NFKC normalize dictionary entry
                        if (normalizeEntries)
                        {
                            //if (normalizer.isNormalized(entry[0])){
                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
                            {
                                continue;
                            }
                            string[] normalizedEntry = new string[entry.Length];
                            for (int i = 0; i < entry.Length; i++)
                            {
                                //normalizedEntry[i] = normalizer.normalize(entry[i]);
                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                            }

                            formatted = FormatEntry(normalizedEntry);
                            lines.Add(formatted);
                        }
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and use a stable sort.
            lines.Sort(new ComparerAnonymousHelper());

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <long?>      fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef            scratch    = new Int32sRef();
            long   ord       = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST <long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return(dictionary);
        }
        /// <param name="input"> input tokenstream </param>
        /// <param name="synonyms"> synonym map </param>
        /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character#toLowerCase(int)"/>.
        ///                   Note, if you set this to true, its your responsibility to lowercase
        ///                   the input entries when you create the <seealso cref="SynonymMap"/> </param>
        public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            posLenAtt = AddAttribute<IPositionLengthAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();

            this.synonyms = synonyms;
            this.ignoreCase = ignoreCase;
            this.fst = synonyms.fst;
            if (fst == null)
            {
                throw new System.ArgumentException("fst must be non-null");
            }
            this.fstReader = fst.BytesReader;

            // Must be 1+ so that when roll buffer is at full
            // lookahead we can distinguish this full buffer from
            // the empty buffer:
            rollBufferSize = 1 + synonyms.maxHorizontalContext;

            futureInputs = new PendingInput[rollBufferSize];
            futureOutputs = new PendingOutputs[rollBufferSize];
            for (int pos = 0; pos < rollBufferSize; pos++)
            {
                futureInputs[pos] = new PendingInput();
                futureOutputs[pos] = new PendingOutputs();
            }

            //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

            scratchArc = new FST.Arc<BytesRef>();
        }
 /// <summary>
 /// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
 /// </summary>
 public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
 {
     BytesRef pendingOutput = fst.Outputs.NoOutput;
     BytesRef matchOutput = null;
     int bufUpto = 0;
     fst.GetFirstArc(scratchArc);
     while (bufUpto < bufferLen)
     {
         int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
         if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
         {
             return null;
         }
         pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
         bufUpto += Character.CharCount(codePoint);
     }
     if (scratchArc.IsFinal)
     {
         matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
     }
     return matchOutput;
 }
 public override void Finish(long termsFilePointer)
 {
     fst = fstBuilder.Finish();
     if (fst != null)
     {
         fst.Save(output);
     }
 }
        protected internal override IList<FSTUtil.Path<Pair<long?, BytesRef>>> GetFullPrefixPaths(IList<FSTUtil.Path<Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<long?, BytesRef>> fst)
        {
            // TODO: right now there's no penalty for fuzzy/edits,
            // ie a completion whose prefix matched exactly what the
            // user typed gets no boost over completions that
            // required an edit, which get no boost over completions
            // requiring two edits.  I suspect a multiplicative
            // factor is appropriate (eg, say a fuzzy match must be at
            // least 2X better weight than the non-fuzzy match to
            // "compete") ... in which case I think the wFST needs
            // to be log weights or something ...

            Automaton levA = convertAutomaton(ToLevenshteinAutomata(lookupAutomaton));
            /*
              Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
              w.write(levA.toDot());
              w.close();
              System.out.println("Wrote LevA to out.dot");
            */
            return FSTUtil.IntersectPrefixPaths(levA, fst);
        }
		public FSTTermsEnum(FieldInfo field, FST<BytesRef> fst)
		{
		  this.field = field;
		  fstEnum = new BytesRefFSTEnum<>(fst);
		}
Beispiel #19
0
 internal FSTTermsEnum(FST <long?> fst)
 {
     this.fst    = fst;
     @in         = new BytesRefFSTEnum <long?>(fst);
     bytesReader = fst.GetBytesReader();
 }
Beispiel #20
0
        public void TestEnumerateAll()
        {
            // just for debugging
            int numTerms                     = 0;
            int numWords                     = 0;
            int lastWordId                   = -1;
            int lastSourceId                 = -1;
            TokenInfoDictionary      tid     = TokenInfoDictionary.Instance;
            ConnectionCosts          matrix  = ConnectionCosts.Instance;
            FST <Int64>              fst     = tid.FST.InternalFST;
            Int32sRefFSTEnum <Int64> fstEnum = new Int32sRefFSTEnum <Int64>(fst);

            Int32sRefFSTEnum.InputOutput <Int64> mapping;
            Int32sRef scratch = new Int32sRef();

            while (fstEnum.MoveNext())
            {
                mapping = fstEnum.Current;
                numTerms++;
                Int32sRef input = mapping.Input;
                char[]    chars = new char[input.Length];
                for (int i = 0; i < chars.Length; i++)
                {
                    chars[i] = (char)input.Int32s[input.Offset + i];
                }
                assertTrue(UnicodeUtil.ValidUTF16String(new string(chars)));

                long?output   = mapping.Output;
                int  sourceId = (int)output.Value;
                // we walk in order, terms, sourceIds, and wordIds should always be increasing
                assertTrue(sourceId > lastSourceId);
                lastSourceId = sourceId;
                tid.LookupWordIds(sourceId, scratch);
                for (int i = 0; i < scratch.Length; i++)
                {
                    numWords++;
                    int wordId = scratch.Int32s[scratch.Offset + i];
                    assertTrue(wordId > lastWordId);
                    lastWordId = wordId;

                    String baseForm = tid.GetBaseForm(wordId, chars, 0, chars.Length);
                    assertTrue(baseForm is null || UnicodeUtil.ValidUTF16String(baseForm));

                    String inflectionForm = tid.GetInflectionForm(wordId);
                    assertTrue(inflectionForm is null || UnicodeUtil.ValidUTF16String(inflectionForm));
                    if (inflectionForm != null)
                    {
                        // check that its actually an ipadic inflection form
                        assertNotNull(ToStringUtil.GetInflectedFormTranslation(inflectionForm));
                    }

                    String inflectionType = tid.GetInflectionType(wordId);
                    assertTrue(inflectionType is null || UnicodeUtil.ValidUTF16String(inflectionType));
                    if (inflectionType != null)
                    {
                        // check that its actually an ipadic inflection type
                        assertNotNull(ToStringUtil.GetInflectionTypeTranslation(inflectionType));
                    }

                    int leftId  = tid.GetLeftId(wordId);
                    int rightId = tid.GetRightId(wordId);

                    matrix.Get(rightId, leftId);

                    tid.GetWordCost(wordId);

                    String pos = tid.GetPartOfSpeech(wordId);
                    assertNotNull(pos);
                    assertTrue(UnicodeUtil.ValidUTF16String(pos));
                    // check that its actually an ipadic pos tag
                    assertNotNull(ToStringUtil.GetPOSTranslation(pos));

                    String pronunciation = tid.GetPronunciation(wordId, chars, 0, chars.Length);
                    assertNotNull(pronunciation);
                    assertTrue(UnicodeUtil.ValidUTF16String(pronunciation));

                    String reading = tid.GetReading(wordId, chars, 0, chars.Length);
                    assertNotNull(reading);
                    assertTrue(UnicodeUtil.ValidUTF16String(reading));
                }
            }
            if (Verbose)
            {
                Console.WriteLine("checked " + numTerms + " terms, " + numWords + " words.");
            }
        }
Beispiel #21
0
        public override void Build(IInputEnumerator enumerator)
        {
            // LUCENENET: Added guard clause for null
            if (enumerator is null)
            {
                throw new ArgumentNullException(nameof(enumerator));
            }

            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry entry = Fsts[field.Number];
            FST<long> instance;
            lock (this)
            {
                if (!FstInstances.TryGetValue(field.Number, out instance))
                {
                    Data.Seek(entry.Offset);
                    instance = new FST<long>(Data, PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(instance.SizeInBytes());
                    FstInstances[field.Number] = instance;
                }
            }
            NumericDocValues docToOrd = GetNumeric(field);
            FST<long> fst = instance;

            // per-thread resources
            FST<long>.BytesReader @in = fst.BytesReader;
            FST<long>.Arc<long> firstArc = new FST<long>.Arc<long>();
            FST<long>.Arc<long> scratchArc = new FST<long>.Arc<long>();
            IntsRef scratchInts = new IntsRef();
            BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst);

            return new SortedDocValuesAnonymousInnerClassHelper(this, entry, docToOrd, fst, @in, firstArc, scratchArc, scratchInts, fstEnum);
        }
Beispiel #23
0
 /// <summary>
 /// Sole constructor </summary>
 public FSTPath(T cost, FST <T> .Arc <T> arc, IntsRef input)
 {
     this.Arc   = (new FST <T> .Arc <T>()).CopyFrom(arc);
     this.Cost  = cost;
     this.Input = input;
 }
                // Pushes next'd frame or seek'd frame; we later
                // lazy-load the frame only when needed
                internal Frame PushFrame(FST<BytesRef>.Arc<BytesRef> arc, long fp, int length)
                {
                    Frame f = GetFrame(1 + CurrentFrame.Ord);
                    f.Arc = arc;
                    if (f.FpOrig == fp && f.NextEnt != -1)
                    {
                        //if (DEBUG) System.out.println("      push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix);
                        if (f.Prefix > TargetBeforeCurrentLength)
                        {
                            f.Rewind();
                        }
                        else
                        {
                            // if (DEBUG) {
                            //   System.out.println("        skip rewind!");
                            // }
                        }
                        Debug.Assert(length == f.Prefix);
                    }
                    else
                    {
                        f.NextEnt = -1;
                        f.Prefix = length;
                        f.State.TermBlockOrd = 0;
                        f.FpOrig = f.Fp = fp;
                        f.LastSubFP = -1;
                        // if (DEBUG) {
                        //   final int sav = term.length;
                        //   term.length = length;
                        //   System.out.println("      push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term));
                        //   term.length = sav;
                        // }
                    }

                    return f;
                }
		/// <summary>
		/// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
		/// </summary>
//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
//ORIGINAL LINE: public org.apache.lucene.util.BytesRef get(char[] buffer, int bufferLen, org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.BytesRef> scratchArc, org.apache.lucene.util.fst.FST.BytesReader fstReader) throws java.io.IOException
		public BytesRef get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
		{
		  BytesRef pendingOutput = fst.outputs.NoOutput;
		  BytesRef matchOutput = null;
		  int bufUpto = 0;
		  fst.getFirstArc(scratchArc);
		  while (bufUpto < bufferLen)
		  {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
			int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
			if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
			{
			  return null;
			}
			pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
			bufUpto += char.charCount(codePoint);
		  }
		  if (scratchArc.Final)
		  {
			matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
		  }
		  return matchOutput;
		}
Beispiel #26
0
        /// <summary>
        /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
        /// accumulating the <see cref="FST"/> end node and output for each path.
        /// </summary>
        public static IList <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst)
        {
            Debug.Assert(a.IsDeterministic);
            IList <Path <T> > queue    = new List <Path <T> >();
            List <Path <T> >  endNodes = new List <Path <T> >();

            queue.Add(new Path <T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new Int32sRef()));

            FST.Arc <T>     scratchArc = new FST.Arc <T>();
            FST.BytesReader fstReader  = fst.GetBytesReader();

            while (queue.Count != 0)
            {
                Path <T> path = queue.ElementAt(queue.Count - 1);
                queue.Remove(path);
                if (path.State.Accept)
                {
                    endNodes.Add(path);
                    // we can stop here if we accept this path,
                    // we accept all further paths too
                    continue;
                }

                Int32sRef currentInput = path.Input;
                foreach (Transition t in path.State.GetTransitions())
                {
                    int min = t.Min;
                    int max = t.Max;
                    if (min == max)
                    {
                        FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                        if (nextArc != null)
                        {
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = t.Min;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                        }
                    }
                    else
                    {
                        // TODO: if this transition's TO state is accepting, and
                        // it accepts the entire range possible in the FST (ie. 0 to 255),
                        // we can simply use the prefix as the accepted state instead of
                        // looking up all the ranges and terminate early
                        // here.  This just shifts the work from one queue
                        // (this one) to another (the completion search
                        // done in AnalyzingSuggester).

                        FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                        while (nextArc != null && nextArc.Label <= max)
                        {
                            Debug.Assert(nextArc.Label <= max);
                            Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
                            Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                            newInput.CopyInt32s(currentInput);
                            newInput.Int32s[currentInput.Length] = nextArc.Label;
                            newInput.Length = currentInput.Length + 1;
                            queue.Add(new Path <T>(t.Dest, new FST.Arc <T>()
                                                   .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                            int label = nextArc.Label; // used in assert
                            nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                            Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString()));
                        }
                    }
                }
            }
            return(endNodes);
        }
        /// <summary>
        /// Builds the final automaton from a list of added entries. This method may
        /// take a longer while as it needs to build the automaton.
        /// </summary>
        public virtual FSTCompletion Build()
        {
            this.automaton = BuildAutomaton(sorter);

            // Dispose of it if it is a disposable
            using (sorter as IDisposable)
            {

            }

            return new FSTCompletion(automaton);
        }
Beispiel #28
0
 public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, BinaryDocValues binaryDocValues, FST <long?> fst1,
                                                    FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc1, Int32sRef intsRef, BytesRefFSTEnum <long?> bytesRefFstEnum,
                                                    BytesRef @ref, ByteArrayDataInput byteArrayDataInput)
 {
     entry       = fstEntry;
     docToOrds   = binaryDocValues;
     fst         = fst1;
     this.@in    = @in;
     firstArc    = arc;
     scratchArc  = scratchArc1;
     scratchInts = intsRef;
     fstEnum     = bytesRefFstEnum;
     this.@ref   = @ref;
     input       = byteArrayDataInput;
 }
 public SortedDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, NumericDocValues docToOrd, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum)
 {
     this.OuterInstance = outerInstance;
     this.Entry = entry;
     this.DocToOrd = docToOrd;
     this.Fst = fst;
     this.@in = @in;
     this.FirstArc = firstArc;
     this.ScratchArc = scratchArc;
     this.ScratchInts = scratchInts;
     this.FstEnum = fstEnum;
 }
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST<long?>.BytesReader @in, FST<long?>.Arc<long?> firstArc, FST<long?>.Arc<long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long?> fstEnum)
 {
     this.Entry = entry;
     this.DocToOrd = docToOrd;
     this.Fst = fst;
     this.@in = @in;
     this.FirstArc = firstArc;
     this.ScratchArc = scratchArc;
     this.ScratchInts = scratchInts;
     this.FstEnum = fstEnum;
 }
Beispiel #31
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        ///  amount of temporary RAM while building.  Note that
        ///  the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();

            // LUCENENET specific - using GetRandomFileName() instead of picking a random int
            DirectoryInfo tempIndexPath = null;

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            try
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (true)
                    {
                        BytesRef surfaceForm = iterator.Next();
                        if (surfaceForm == null)
                        {
                            break;
                        }
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }
                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetIterator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (true)
                    {
                        BytesRef term = termsEnum.Next();
                        if (term == null)
                        {
                            break;
                        }
                        int ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Dispose(writer, reader);
                    }
                    else
                    {
                        IOUtils.DisposeWhileHandlingException(writer, reader);
                    }
                }
            }
            finally
            {
                try
                {
                    IOUtils.Dispose(dir);
                }
                finally
                {
                    // LUCENENET specific - since we are removing the entire directory anyway,
                    // it doesn't make sense to first do a loop in order remove the files.
                    // Let the System.IO.Directory.Delete() method handle that.
                    // We also need to dispose the Directory instance first before deleting from disk.
                    try
                    {
                        System.IO.Directory.Delete(tempIndexPath.FullName, true);
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                    }
                }
            }
        }
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry entry = Fsts[field.Number];
            if (entry.NumOrds == 0)
            {
                return DocValues.EMPTY_SORTED_SET; // empty FST!
            }
            FST<long> instance;
            lock (this)
            {
                if (!FstInstances.TryGetValue(field.Number, out instance))
                {
                    Data.Seek(entry.Offset);
                    instance = new FST<long>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton);
                    RamBytesUsed_Renamed.AddAndGet(instance.SizeInBytes());
                    FstInstances[field.Number] = instance;
                }
            }
            BinaryDocValues docToOrds = GetBinary(field);
            FST<long> fst = instance;

            // per-thread resources
            FST<long>.BytesReader @in = fst.BytesReader;
            FST<long>.Arc<long> firstArc = new FST<long>.Arc<long>();
            FST<long>.Arc<long> scratchArc = new FST<long>.Arc<long>();
            IntsRef scratchInts = new IntsRef();
            BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst);
            BytesRef @ref = new BytesRef();
            ByteArrayDataInput input = new ByteArrayDataInput();
            return new SortedSetDocValuesAnonymousInnerClassHelper(this, entry, docToOrds, fst, @in, firstArc, scratchArc, scratchInts, fstEnum, @ref, input);
        }
Beispiel #33
0
 public SortedDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry,
                                                 NumericDocValues numericDocValues, FST <long?> fst1, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc1,
                                                 Int32sRef intsRef, BytesRefFSTEnum <long?> bytesRefFstEnum)
 {
     entry       = fstEntry;
     docToOrd    = numericDocValues;
     fst         = fst1;
     this.@in    = @in;
     firstArc    = arc;
     scratchArc  = scratchArc1;
     scratchInts = intsRef;
     fstEnum     = bytesRefFstEnum;
 }
 public SortedSetDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, BinaryDocValues docToOrds, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum, BytesRef @ref, ByteArrayDataInput input)
 {
     this.OuterInstance = outerInstance;
     this.Entry = entry;
     this.DocToOrds = docToOrds;
     this.Fst = fst;
     this.@in = @in;
     this.FirstArc = firstArc;
     this.ScratchArc = scratchArc;
     this.ScratchInts = scratchInts;
     this.FstEnum = fstEnum;
     this.@ref = @ref;
     this.Input = input;
 }
Beispiel #35
0
 public SynonymMap(FST <BytesRef> fst, BytesRefHash words, int maxHorizontalContext)
 {
     this.fst   = fst;
     this.words = words;
     this.maxHorizontalContext = maxHorizontalContext;
 }
 internal FSTTermsEnum(FST<long> fst)
 {
     this.Fst = fst;
     @in = new BytesRefFSTEnum<long>(fst);
     BytesReader = fst.BytesReader;
 }
Beispiel #37
0
 internal bool CanGrow(Frame frame) // can walk forward on both fst&fsa
 {
     return(frame.fsaState != -1 && FST <Memory.FSTTermOutputs.TermData> .TargetHasArcs(frame.fstArc));
 }
Beispiel #38
0
        //static final boolean TEST = false;

        public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
        {
            string termsIndexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION);
            string termsBlockFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION);

            this.postingsReader = postingsReader;
            ChecksumIndexInput indexIn = null;
            IndexInput blockIn = null;
            bool success = false;
            try
            {
                indexIn = state.Directory.OpenChecksumInput(termsIndexFileName, state.Context);
                blockIn = state.Directory.OpenInput(termsBlockFileName, state.Context);
                version = ReadHeader(indexIn);
                ReadHeader(blockIn);
                if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM)
                {
                    CodecUtil.ChecksumEntireFile(blockIn);
                }

                this.postingsReader.Init(blockIn);
                SeekDir(blockIn);

                FieldInfos fieldInfos = state.FieldInfos;
                int numFields = blockIn.ReadVInt();
                for (int i = 0; i < numFields; i++)
                {
                    FieldInfo fieldInfo = fieldInfos.FieldInfo(blockIn.ReadVInt());
                    bool hasFreq = fieldInfo.FieldIndexOptions != FieldInfo.IndexOptions.DOCS_ONLY;
                    long numTerms = blockIn.ReadVLong();
                    long sumTotalTermFreq = hasFreq ? blockIn.ReadVLong() : -1;
                    long sumDocFreq = blockIn.ReadVLong();
                    int docCount = blockIn.ReadVInt();
                    int longsSize = blockIn.ReadVInt();
                    var index = new FST<long?>(indexIn, PositiveIntOutputs.Singleton);

                    var current = new TermsReader(this, fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index);
            
                    var previous = fields[fieldInfo.Name] = current;
                    CheckFieldSummary(state.SegmentInfo, indexIn, blockIn, current, previous);
                }
                if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM)
                {
                    CodecUtil.CheckFooter(indexIn);
                }
                else
                {
                    CodecUtil.CheckEOF(indexIn);
                }
                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Close(indexIn, blockIn);
                }
                else
                {
                    IOUtils.CloseWhileHandlingException(indexIn, blockIn);
                }
            }
        }
Beispiel #39
0
 public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum, BytesRef @ref, ByteArrayDataInput input)
 {
     this.entry       = entry;
     this.docToOrds   = docToOrds;
     this.fst         = fst;
     this.@in         = @in;
     this.firstArc    = firstArc;
     this.scratchArc  = scratchArc;
     this.scratchInts = scratchInts;
     this.fstEnum     = fstEnum;
     this.@ref        = @ref;
     this.input       = input;
 }
            private void loadTermsIndex()
            {
                if (Fst == null)
                {
                    IndexInput clone = input.Clone();
                    clone.Seek(indexStart);
                    Fst = new FST<>(clone, fstOutputs);
                    clone.Close();

                    /*
        final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
        Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
        Util.toDot(fst, w, false, false);
        System.out.println("FST INDEX: SAVED to " + dotFileName);
        w.close();
        */

                    if (indexDivisor > 1)
                    {
                        // subsample
                        IntsRef scratchIntsRef = new IntsRef();
                        PositiveIntOutputs outputs = PositiveIntOutputs.GetSingleton();
                        Builder<long> builder = new Builder<long>(FST.INPUT_TYPE.BYTE1, outputs);
                        BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst);
                        BytesRefFSTEnum.InputOutput<long> result;
                        int count = indexDivisor;
                        while ((result = fstEnum.Next()) != null)
                        {
                            if (count == indexDivisor)
                            {
                                builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
                                count = 0;
                            }
                            count++;
                        }
                        Fst = builder.Finish();
                    }
                }
            }
Beispiel #41
0
        // LUCENENET NOTE: InputOutput<T> was moved to the BytesRefFSTEnum class

        /// <summary>
        /// doFloor controls the behavior of advance: if it's true
        /// doFloor is true, advance positions to the biggest
        /// term before target.
        /// </summary>
        public BytesRefFSTEnum(FST <T> fst)
            : base(fst)
        {
            result.Input   = current;
            current.Offset = 1;
        }
Beispiel #42
0
 public IndexEnum(FST <long?> fst)
 {
     _fstEnum = new BytesRefFSTEnum <long?>(fst);
 }
Beispiel #43
0
 /// <summary>
 /// Returns all prefix paths to initialize the search.
 /// </summary>
 protected internal virtual IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > GetFullPrefixPaths(
     IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths, Automaton lookupAutomaton,
     FST <PairOutputs <long?, BytesRef> .Pair> fst)
 {
     return(prefixPaths);
 }
Beispiel #44
0
 public FSTTermsEnum(FieldInfo field, FST <BytesRef> fst)
 {
     this.field = field;
     fstEnum    = new BytesRefFSTEnum <BytesRef>(fst);
 }
Beispiel #45
0
        /// <summary>
        /// Expert: like <seealso cref="Util#getByOutput(FST, long)"/> except reusing
        /// BytesReader, initial and scratch Arc, and result.
        /// </summary>
        public static IntsRef GetByOutput(FST <long?> fst, long targetOutput, FST <long?> .BytesReader @in, FST <long?> .Arc <long?> arc, FST <long?> .Arc <long?> scratchArc, IntsRef result)
        {
            long output = arc.Output.Value;
            int  upto   = 0;

            //System.out.println("reverseLookup output=" + targetOutput);

            while (true)
            {
                //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
                if (arc.Final)
                {
                    long finalOutput = output + arc.NextFinalOutput.Value;
                    //System.out.println("  isFinal finalOutput=" + finalOutput);
                    if (finalOutput == targetOutput)
                    {
                        result.Length = upto;
                        //System.out.println("    found!");
                        return(result);
                    }
                    else if (finalOutput > targetOutput)
                    {
                        //System.out.println("    not found!");
                        return(null);
                    }
                }

                if (FST <long?> .TargetHasArcs(arc))
                {
                    //System.out.println("  targetHasArcs");
                    if (result.Ints.Length == upto)
                    {
                        result.Grow(1 + upto);
                    }

                    fst.ReadFirstRealTargetArc(arc.Target, arc, @in);

                    if (arc.BytesPerArc != 0)
                    {
                        int low  = 0;
                        int high = arc.NumArcs - 1;
                        int mid  = 0;
                        //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
                        bool exact = false;
                        while (low <= high)
                        {
                            mid          = (int)((uint)(low + high) >> 1);
                            @in.Position = arc.PosArcsStart;
                            @in.SkipBytes(arc.BytesPerArc * mid);
                            var flags = (sbyte)@in.ReadByte();
                            fst.ReadLabel(@in);
                            long minArcOutput;
                            if ((flags & FST <long> .BIT_ARC_HAS_OUTPUT) != 0)
                            {
                                long arcOutput = fst.Outputs.Read(@in).Value;
                                minArcOutput = output + arcOutput;
                            }
                            else
                            {
                                minArcOutput = output;
                            }
                            if (minArcOutput == targetOutput)
                            {
                                exact = true;
                                break;
                            }
                            else if (minArcOutput < targetOutput)
                            {
                                low = mid + 1;
                            }
                            else
                            {
                                high = mid - 1;
                            }
                        }

                        if (high == -1)
                        {
                            return(null);
                        }
                        else if (exact)
                        {
                            arc.ArcIdx = mid - 1;
                        }
                        else
                        {
                            arc.ArcIdx = low - 2;
                        }

                        fst.ReadNextRealArc(arc, @in);
                        result.Ints[upto++] = arc.Label;
                        output += arc.Output.Value;
                    }
                    else
                    {
                        FST <long?> .Arc <long?> prevArc = null;

                        while (true)
                        {
                            //System.out.println("    cycle label=" + arc.label + " output=" + arc.output);

                            // this is the min output we'd hit if we follow
                            // this arc:
                            long minArcOutput = output + arc.Output.Value;

                            if (minArcOutput == targetOutput)
                            {
                                // Recurse on this arc:
                                //System.out.println("  match!  break");
                                output = minArcOutput;
                                result.Ints[upto++] = arc.Label;
                                break;
                            }
                            else if (minArcOutput > targetOutput)
                            {
                                if (prevArc == null)
                                {
                                    // Output doesn't exist
                                    return(null);
                                }
                                else
                                {
                                    // Recurse on previous arc:
                                    arc.CopyFrom(prevArc);
                                    result.Ints[upto++] = arc.Label;
                                    output += arc.Output.Value;
                                    //System.out.println("    recurse prev label=" + (char) arc.label + " output=" + output);
                                    break;
                                }
                            }
                            else if (arc.Last)
                            {
                                // Recurse on this arc:
                                output = minArcOutput;
                                //System.out.println("    recurse last label=" + (char) arc.label + " output=" + output);
                                result.Ints[upto++] = arc.Label;
                                break;
                            }
                            else
                            {
                                // Read next arc in this node:
                                prevArc = scratchArc;
                                prevArc.CopyFrom(arc);
                                //System.out.println("      after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
                                fst.ReadNextRealArc(arc, @in);
                            }
                        }
                    }
                }
                else
                {
                    //System.out.println("  no target arcs; not found!");
                    return(null);
                }
            }
        }
Beispiel #46
0
            internal TermsReader(FSTOrdTermsReader outerInstance, FieldInfo fieldInfo, IndexInput blockIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST <long?> index)
            {
                this.outerInstance    = outerInstance;
                this.fieldInfo        = fieldInfo;
                this.numTerms         = numTerms;
                this.sumTotalTermFreq = sumTotalTermFreq;
                this.sumDocFreq       = sumDocFreq;
                this.docCount         = docCount;
                this.longsSize        = longsSize;
                this.index            = index;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert((numTerms & (~0xffffffffL)) == 0);
                }
                int numBlocks = (int)(numTerms + INTERVAL - 1) / INTERVAL;

                this.numSkipInfo    = longsSize + 3;
                this.skipInfo       = new long[numBlocks * numSkipInfo];
                this.statsBlock     = new byte[(int)blockIn.ReadVInt64()];
                this.metaLongsBlock = new byte[(int)blockIn.ReadVInt64()];
                this.metaBytesBlock = new byte[(int)blockIn.ReadVInt64()];

                int last = 0, next = 0;

                for (int i = 1; i < numBlocks; i++)
                {
                    next = numSkipInfo * i;
                    for (int j = 0; j < numSkipInfo; j++)
                    {
                        skipInfo[next + j] = skipInfo[last + j] + blockIn.ReadVInt64();
                    }
                    last = next;
                }
                blockIn.ReadBytes(statsBlock, 0, statsBlock.Length);
                blockIn.ReadBytes(metaLongsBlock, 0, metaLongsBlock.Length);
                blockIn.ReadBytes(metaBytesBlock, 0, metaBytesBlock.Length);
            }
Beispiel #47
0
        /// <summary>
        /// Dumps an <seealso cref="FST"/> to a GraphViz's <code>dot</code> language description
        /// for visualization. Example of use:
        ///
        /// <pre class="prettyprint">
        /// PrintWriter pw = new PrintWriter(&quot;out.dot&quot;);
        /// Util.toDot(fst, pw, true, true);
        /// pw.close();
        /// </pre>
        ///
        /// and then, from command line:
        ///
        /// <pre>
        /// dot -Tpng -o out.png out.dot
        /// </pre>
        ///
        /// <p>
        /// Note: larger FSTs (a few thousand nodes) won't even
        /// render, don't bother.  If the FST is > 2.1 GB in size
        /// then this method will throw strange exceptions.
        /// </summary>
        /// <param name="sameRank">
        ///          If <code>true</code>, the resulting <code>dot</code> file will try
        ///          to order states in layers of breadth-first traversal. this may
        ///          mess up arcs, but makes the output FST's structure a bit clearer.
        /// </param>
        /// <param name="labelStates">
        ///          If <code>true</code> states will have labels equal to their offsets in their
        ///          binary format. Expands the graph considerably.
        /// </param>
        /// <seealso cref= "http://www.graphviz.org/" </seealso>
        public static void toDot <T>(FST <T> fst, TextWriter @out, bool sameRank, bool labelStates)
        {
            const string expandedNodeColor = "blue";

            // this is the start arc in the automaton (from the epsilon state to the first state
            // with outgoing transitions.
            FST <T> .Arc <T> startArc = fst.GetFirstArc(new FST <T> .Arc <T>());

            // A queue of transitions to consider for the next level.
            IList <FST <T> .Arc <T> > thisLevelQueue = new List <FST <T> .Arc <T> >();

            // A queue of transitions to consider when processing the next level.
            IList <FST <T> .Arc <T> > nextLevelQueue = new List <FST <T> .Arc <T> >();

            nextLevelQueue.Add(startArc);
            //System.out.println("toDot: startArc: " + startArc);

            // A list of states on the same level (for ranking).
            IList <int?> sameLevelStates = new List <int?>();

            // A bitset of already seen states (target offset).
            BitArray seen = new BitArray(32);

            seen.SafeSet((int)startArc.Target, true);

            // Shape for states.
            const string stateShape      = "circle";
            const string finalStateShape = "doublecircle";

            // Emit DOT prologue.
            @out.Write("digraph FST {\n");
            @out.Write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

            if (!labelStates)
            {
                @out.Write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
            }

            EmitDotState(@out, "initial", "point", "white", "");

            T   NO_OUTPUT = fst.Outputs.NoOutput;
            var r         = fst.BytesReader;

            // final FST.Arc<T> scratchArc = new FST.Arc<>();

            {
                string stateColor;
                if (fst.IsExpandedTarget(startArc, r))
                {
                    stateColor = expandedNodeColor;
                }
                else
                {
                    stateColor = null;
                }

                bool isFinal;
                T    finalOutput;
                if (startArc.Final)
                {
                    isFinal     = true;
                    finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput;
                }
                else
                {
                    isFinal     = false;
                    finalOutput = default(T);
                }

                EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput));
            }

            @out.Write("  initial -> " + startArc.Target + "\n");

            int level = 0;

            while (nextLevelQueue.Count > 0)
            {
                // we could double buffer here, but it doesn't matter probably.
                //System.out.println("next level=" + level);
                thisLevelQueue.AddRange(nextLevelQueue);
                nextLevelQueue.Clear();

                level++;
                @out.Write("\n  // Transitions and states at level: " + level + "\n");
                while (thisLevelQueue.Count > 0)
                {
                    FST <T> .Arc <T> arc = thisLevelQueue[thisLevelQueue.Count - 1];
                    thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1);
                    //System.out.println("  pop: " + arc);
                    if (FST <T> .TargetHasArcs(arc))
                    {
                        // scan all target arcs
                        //System.out.println("  readFirstTarget...");

                        long node = arc.Target;

                        fst.ReadFirstRealTargetArc(arc.Target, arc, r);

                        //System.out.println("    firstTarget: " + arc);

                        while (true)
                        {
                            //System.out.println("  cycle arc=" + arc);
                            // Emit the unseen state and add it to the queue for the next level.
                            if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target))
                            {
                                /*
                                 * boolean isFinal = false;
                                 * T finalOutput = null;
                                 * fst.readFirstTargetArc(arc, scratchArc);
                                 * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
                                 * // target is final
                                 * isFinal = true;
                                 * finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
                                 * System.out.println("dot hit final label=" + (char) scratchArc.label);
                                 * }
                                 */
                                string stateColor;
                                if (fst.IsExpandedTarget(arc, r))
                                {
                                    stateColor = expandedNodeColor;
                                }
                                else
                                {
                                    stateColor = null;
                                }

                                string finalOutput;
                                if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                                {
                                    finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput);
                                }
                                else
                                {
                                    finalOutput = "";
                                }

                                EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput);
                                // To see the node address, use this instead:
                                //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
                                seen.SafeSet((int)arc.Target, true);
                                nextLevelQueue.Add((new FST <T> .Arc <T>()).CopyFrom(arc));
                                sameLevelStates.Add((int)arc.Target);
                            }

                            string outs;
                            if (!arc.Output.Equals(NO_OUTPUT))
                            {
                                outs = "/" + fst.Outputs.OutputToString(arc.Output);
                            }
                            else
                            {
                                outs = "";
                            }

                            if (!FST <T> .TargetHasArcs(arc) && arc.Final && !arc.NextFinalOutput.Equals(NO_OUTPUT))
                            {
                                // Tricky special case: sometimes, due to
                                // pruning, the builder can [sillily] produce
                                // an FST with an arc into the final end state
                                // (-1) but also with a next final output; in
                                // this case we pull that output up onto this
                                // arc
                                outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]";
                            }

                            string arcColor;
                            if (arc.Flag(FST <T> .BIT_TARGET_NEXT))
                            {
                                arcColor = "red";
                            }
                            else
                            {
                                arcColor = "black";
                            }

                            Debug.Assert(arc.Label != FST <T> .END_LABEL);
                            @out.Write("  " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.Final ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n");

                            // Break the loop if we're on the last arc of this state.
                            if (arc.Last)
                            {
                                //System.out.println("    break");
                                break;
                            }
                            fst.ReadNextRealArc(arc, r);
                        }
                    }
                }

                // Emit state ranking information.
                if (sameRank && sameLevelStates.Count > 1)
                {
                    @out.Write("  {rank=same; ");
                    foreach (int state in sameLevelStates)
                    {
                        @out.Write(state + "; ");
                    }
                    @out.Write(" }\n");
                }
                sameLevelStates.Clear();
            }

            // Emit terminating state (always there anyway).
            @out.Write("  -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
            @out.Write("  {rank=sink; -1 }\n");

            @out.Write("}\n");
            @out.Flush();
        }
Beispiel #48
0
        //static final boolean TEST = false;

        public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
        {
            string termsIndexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION);
            string termsBlockFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION);

            this.postingsReader = postingsReader;
            ChecksumIndexInput indexIn = null;
            IndexInput         blockIn = null;
            bool success = false;

            try
            {
                indexIn = state.Directory.OpenChecksumInput(termsIndexFileName, state.Context);
                blockIn = state.Directory.OpenInput(termsBlockFileName, state.Context);
                version = ReadHeader(indexIn);
                ReadHeader(blockIn);
                if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM)
                {
                    CodecUtil.ChecksumEntireFile(blockIn);
                }

                this.postingsReader.Init(blockIn);
                SeekDir(blockIn);

                FieldInfos fieldInfos = state.FieldInfos;
                int        numFields  = blockIn.ReadVInt32();
                for (int i = 0; i < numFields; i++)
                {
                    FieldInfo fieldInfo        = fieldInfos.FieldInfo(blockIn.ReadVInt32());
                    bool      hasFreq          = fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY;
                    long      numTerms         = blockIn.ReadVInt64();
                    long      sumTotalTermFreq = hasFreq ? blockIn.ReadVInt64() : -1;
                    long      sumDocFreq       = blockIn.ReadVInt64();
                    int       docCount         = blockIn.ReadVInt32();
                    int       longsSize        = blockIn.ReadVInt32();
                    var       index            = new FST <long?>(indexIn, PositiveInt32Outputs.Singleton);

                    var         current = new TermsReader(this, fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index);
                    TermsReader previous;
                    // LUCENENET NOTE: This simulates a put operation in Java,
                    // getting the prior value first before setting it.
                    fields.TryGetValue(fieldInfo.Name, out previous);
                    fields[fieldInfo.Name] = current;
                    CheckFieldSummary(state.SegmentInfo, indexIn, blockIn, current, previous);
                }
                if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM)
                {
                    CodecUtil.CheckFooter(indexIn);
                }
                else
                {
#pragma warning disable 612, 618
                    CodecUtil.CheckEOF(indexIn);
#pragma warning restore 612, 618
                }
                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(indexIn, blockIn);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(indexIn, blockIn);
                }
            }
        }
		/// <summary>
		/// Creates a new <seealso cref="StemmerOverrideMap"/> </summary>
		/// <param name="fst"> the fst to lookup the overrides </param>
		/// <param name="ignoreCase"> if the keys case should be ingored </param>
		public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
		{
		  this.fst = fst;
		  this.ignoreCase = ignoreCase;
		}
Beispiel #50
0
 private bool CanGrow(Frame frame) // can walk forward on both fst&fsa
 {
     return(frame.state != -1 && FST <long?> .TargetHasArcs(frame.arc));
 }
Beispiel #51
0
        private void MainForm_Load(object sender, EventArgs e)
        {
            this.Icon = Icon.ExtractAssociatedIcon(System.Reflection.Assembly.GetEntryAssembly().Location);

            int i;
            SettingsFile = new Xml("gecko.xml");
            SettingsFile.RootName = "gecko";

            gamename = "";
            gecko = new USBGecko();
            gecko.chunkUpdate += transfer;

            exceptionHandling = new ExceptionHandler(this);

            if (!Directory.Exists("DumpHistory"))
                Directory.CreateDirectory("DumpHistory");

            search = new MemSearch(gecko, SearchResults,
                PrvPage, NxtPage, ResList, UpDownSearchResultPage, exceptionHandling);

            viewer = new MemoryViewer(gecko, ValidMemory.ValidAreas[0].low, memViewGrid,
                memViewPAddress, memViewPValue, MemViewFPValue, exceptionHandling);

#if MONO
            disassembler = new Disassembly(gecko, "./vdappc", DisAssBox, DisScroll,
                DisRegion, AsAddress, AsText, exceptionHandling);
#else
            disassembler = new Disassembly(gecko, "vdappc.exe", DisAssBox, DisScroll,
                DisRegion, AsAddress, AsText, exceptionHandling);
#endif

            bpHandler = new Breakpoints(gecko, BPList, this, disassembler, BPDiss, BPClassic, BPCondList, exceptionHandling);
            //bpHandler = new Breakpoints(gecko, BPList, this, disassembler, richTextBox1, BPClassic, BPCondList, exceptionHandling);
            foreach (String reg in BPList.longRegNames)
                BPConditionRegSelect.Items.Add(reg.Trim());
            BPConditionRegSelect.Items.Add("VoA");

            BPConditionRegSelect.SelectedIndex = 0;
            BPConditionCompare.SelectedIndex = 0;

            bpHandler.BPSkip += BPSkipped;

            watcher = new WatchList(gecko, WatchList, WatchIntervalSet, exceptionHandling);
            addWatchDialog = null;
            watchValueInput = null;

            fst = new FST(gecko, FSTTreeView, FSTCodeData, FSTSetAsSource, FSTSetAsTarget,
                FSTGenSwap, FSTFileSource, FSTFileTarget, FSTSwapCode, FSTSwapNow, exceptionHandling);

            GCTCodeContents = new CodeController(GCTCodeList, GCTCodeValues);
            GCTCodeContents.codesModified += GCTModified;

            bpHandler.BPStop += BPStopped;

            for (i = 0; i < ValidMemory.ValidAreas.Length; i++)
            {
                memRange.Items.Add(
                    GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2));
                MemViewARange.Items.Add(
                    GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2));
                ToolsDumpRegions.Items.Add(
                    GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2));
            }

            codeWizard = new GCTWizard(GCTCodeContents);

            memRange.SelectedIndex = 0;
            MemViewARange.SelectedIndex = 0;
            MemViewShowMode.SelectedIndex = 0;
            MemViewSearchType.SelectedIndex = 0;
            ToolsDumpRegions.SelectedIndex = 0;

            comboBoxSearchDataType.SelectedIndex = 2;

            //UpperEnable.SelectedIndex = 0;


            //BPType.SelectedIndex = 0;
            comboBoxDisplayType.SelectedIndex = 0;

            WasAlreadyDisabled = new List<Control>();
            searchComparisons = new List<SearchComparisonInfo>();
            searchComparisons.Add(new SearchComparisonInfo());
            comboBoxComparisonType.SelectedIndex = 0;
            comboBoxComparisonRHS.SelectedIndex = 0;
            buttonCancelSearch.Enabled = false;
            SteppingOut = false;
            buttonUndoSearch.Enabled = search.CanUndo();

            TabLock = null;
            BPStepLogWriter = null;

            comboBoxPokeOperation.SelectedIndex = 0;

            SetComboboxValue("Screenshots", "Format", 0, ImgFormat);
            SetComboboxValue("Screenshots", "Sizing", 0, ShotSizingType);

            int value = SettingsFile.GetValue("Screenshots", "JPEGQuality", 85);
            if (value < 0 || value > 100)
                value = 85;
            JPGQual.Value = value;

            multiPokeAddr = new List<UInt32>();

            FormStop(false);
            CUSBGecko.Enabled = true;

            codesModified = false;

            AbtText.Text = "gecko dotNET Beta 0.63 by Link and dcx2\n\n"
                          + "Special thanks to:\n\n"
                          + "kenobi: for original WiiRd GUI!\n"
                          + "Nuke: for the USB Gecko!\n"
                          + "brkirch: for continuing Gecko OS!\n"
                          + "Y.S.: for the original code handler!\n"
                          + "Team Twiizers for bringing homebrew to the Wii\n"
                          + "DevKitPro team: No homebrew without them!\n"
                          + "Frank Wille: vdappc developer!\n"
                          + "various beta testers!\n"
                          + "and you!";

            notes = new NoteSheets();

            //Set MEM2 upper to 93400000
            MEM2UpperBoundary.SelectedIndex = 0;

            // Restore previous settings
            checkBoxAlwaysOnTop.Checked = GeckoApp.Properties.Settings.Default.AlwaysOnTop;
            numericUpDownFPS.Value = GeckoApp.Properties.Settings.Default.FPS;
            BPAddress.Text = GeckoApp.Properties.Settings.Default.BPAddr;
            memViewAValue.Text = GeckoApp.Properties.Settings.Default.MemViewAddr;
            BPType.SelectedIndex = GeckoApp.Properties.Settings.Default.BPType;
            checkBoxBPNext.Checked = GeckoApp.Properties.Settings.Default.BPNext;
            checkBoxPauseCodes.Checked = GeckoApp.Properties.Settings.Default.PauseCodes;
            Size = GeckoApp.Properties.Settings.Default.LastSize;
            int oldSplitter = GeckoApp.Properties.Settings.Default.LastSplitterSize;
            // The splitter gets moved when the breakpoint page is entered
            // so artificially force it to move
            MainControl.SelectedTab = BreakpointPage;
            MainControl.SelectedTab = searchPage;
            splitContainerRegASM.SplitterDistance = oldSplitter;
            toolStripTextBoxMemViewFontSize.Text = GeckoApp.Properties.Settings.Default.MemViewFontSize.ToString();
            toolStripTextBoxMemViewFontSize_KeyDown(null, new KeyEventArgs(Keys.Enter));
            viewFloatsInHexToolStripMenuItem.Checked = GeckoApp.Properties.Settings.Default.ViewFloatsInHex;
            //addressTextBox1.CopyStringToHistory(GeckoApp.Properties.Settings.Default.addressHistory);
            //addressTextBox1.AutoHistory = true;

        }
Beispiel #52
0
        public virtual void Test()
        {
            int[]     ints  = new int[7];
            Int32sRef input = new Int32sRef(ints, 0, ints.Length);
            int       seed  = Random.Next();

            Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

            for (int doPackIter = 0; doPackIter < 2; doPackIter++)
            {
                bool doPack = doPackIter == 1;

                // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
                if (!doPack)
                {
                    Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                    Outputs <object> outputs   = NoOutputs.Singleton;
                    object           NO_OUTPUT = outputs.NoOutput;
                    Builder <object> b         = new Builder <object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    int       count  = 0;
                    Random    r      = new Random(seed);
                    int[]     ints2  = new int[200];
                    Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        for (int i = 10; i < ints2.Length; i++)
                        {
                            ints2[i] = r.Next(256);
                        }
                        b.Add(input2, NO_OUTPUT);
                        count++;
                        if (count % 100000 == 0)
                        {
                            Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                        }
                        if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                        {
                            break;
                        }
                        NextInput(r, ints2);
                    }

                    FST <object> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                            NextInput(r, ints2);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <object> fstEnum = new Int32sRefFSTEnum <object>(fst);

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <object> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(input2, pair.Input);
                            Assert.AreEqual(NO_OUTPUT, pair.Output);
                            upto++;
                            NextInput(r, ints2);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <object>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ ByteSequenceOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                    Outputs <BytesRef> outputs = ByteSequenceOutputs.Singleton;
                    Builder <BytesRef> b       = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    var      outputBytes = new byte[20];
                    BytesRef output      = new BytesRef(outputBytes);
                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        r.NextBytes(outputBytes);
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, BytesRef.DeepCopyOf(output));
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                        }
                        if (b.GetFstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <BytesRef> fst = b.Finish();
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        r = new Random(seed);
                        Arrays.Fill(ints, 0);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            r.NextBytes(outputBytes);
                            Assert.AreEqual(output, Util.Get(fst, input));
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <BytesRef> fstEnum = new Int32sRefFSTEnum <BytesRef>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <BytesRef> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            r.NextBytes(outputBytes);
                            Assert.AreEqual(output, pair.Output);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <BytesRef>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ PositiveIntOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> b       = new Builder <long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    long output = 1;

                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, output);
                        output += 1 + r.Next(10);
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                        }
                        if (b.GetFstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <long?> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints, 0);

                        output = 1;
                        r      = new Random(seed);
                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }

                            // forward lookup:
                            Assert.AreEqual(output, (long)Util.Get(fst, input));
                            // reverse lookup:
                            Assert.AreEqual(input, Util.GetByOutput(fst, output));
                            output += 1 + r.Next(10);
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <long?> fstEnum = new Int32sRefFSTEnum <long?>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        output = 1;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <long?> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            Assert.AreEqual(output, pair.Output.Value);
                            output += 1 + r.Next(10);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <long?>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }
            }
            dir.Dispose();
        }
 public IndexEnum(FST<long?> fst)
 {
     _fstEnum = new BytesRefFSTEnum<long?>(fst);
 }
Beispiel #54
0
 public override bool Load(DataInput input)
 {
     count    = input.ReadVInt64();
     this.fst = new FST <long?>(input, PositiveInt32Outputs.Singleton);
     return(true);
 }
            private void LoadTerms()
            {
                var posIntOutputs = PositiveIntOutputs.Singleton;
                var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
                var outputs = new PairOutputs<long?, PairOutputs<long?,long?>.Pair>(posIntOutputs, outputsInner);

                // honestly, wtf kind of generic mess is this.
                var b = new Builder<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
                var input = (IndexInput) _outerInstance._input.Clone();
                input.Seek(_termsStart);

                var lastTerm = new BytesRef(10);
                long lastDocsStart = -1;
                int docFreq = 0;
                long totalTermFreq = 0;
                var visitedDocs = new FixedBitSet(_maxDoc);

                var scratchIntsRef = new IntsRef();
                while (true)
                {
                    SimpleTextUtil.ReadLine(input, _scratch);
                    if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                            _sumTotalTermFreq += totalTermFreq;
                        }
                        break;
                    }

                    if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
                    {
                        docFreq++;
                        _sumDocFreq++;
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
                            _scratchUtf16);
                        int docId = ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                        visitedDocs.Set(docId);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
                    {
                        UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
                            _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
                        totalTermFreq += ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
                    }
                    else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
                    {
                        if (lastDocsStart != -1)
                        {
                            b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef),
                                outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                        }
                        lastDocsStart = input.FilePointer;
                        int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
                        if (len > lastTerm.Length)
                        {
                            lastTerm.Grow(len);
                        }
                        Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
                        lastTerm.Length = len;
                        docFreq = 0;
                        _sumTotalTermFreq += totalTermFreq;
                        totalTermFreq = 0;
                        _termCount++;
                    }
                }
                _docCount = visitedDocs.Cardinality();
                _fst = b.Finish();
            }
 /// <summary>
 /// Creates a new <seealso cref="StemmerOverrideMap"/> </summary>
 /// <param name="fst"> the fst to lookup the overrides </param>
 /// <param name="ignoreCase"> if the keys case should be ingored </param>
 public StemmerOverrideMap(FST <BytesRef> fst, bool ignoreCase)
 {
     this.fst        = fst;
     this.ignoreCase = ignoreCase;
 }
Beispiel #57
0
 public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq,
     int docCount, int longsSize, FST<FSTTermOutputs.TermData> fst)
 {
     FieldInfo = fieldInfo;
     NumTerms = numTerms;
     SumTotalTermFreq = sumTotalTermFreq;
     SumDocFreq = sumDocFreq;
     DocCount = docCount;
     LongsSize = longsSize;
     Dict = fst;
 }
 internal FSTTermsEnum(FST <long?> fst)
 {
     this.Fst    = fst;
     @in         = new BytesRefFSTEnum <long?>(fst);
     BytesReader = fst.BytesReader;
 }
Beispiel #59
0
        public virtual void Test()
        {
            int[] ints = new int[7];
            IntsRef input = new IntsRef(ints, 0, ints.Length);
            int seed = Random().Next();

            Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

            for (int doPackIter = 0; doPackIter < 2; doPackIter++)
            {
                bool doPack = doPackIter == 1;

                // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
                if (!doPack)
                {
                    Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                    Outputs<object> outputs = NoOutputs.Singleton;
                    object NO_OUTPUT = outputs.NoOutput;
                    Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    int count = 0;
                    Random r = new Random(seed);
                    int[] ints2 = new int[200];
                    IntsRef input2 = new IntsRef(ints2, 0, ints2.Length);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        for (int i = 10; i < ints2.Length; i++)
                        {
                            ints2[i] = r.Next(256);
                        }
                        b.Add(input2, NO_OUTPUT);
                        count++;
                        if (count % 100000 == 0)
                        {
                            Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                        }
                        if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                        {
                            break;
                        }
                        NextInput(r, ints2);
                    }

                    FST<object> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                            NextInput(r, ints2);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<object> fstEnum = new IntsRefFSTEnum<object>(fst);

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum<object>.InputOutput<object> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(input2, pair.Input);
                            Assert.AreEqual(NO_OUTPUT, pair.Output);
                            upto++;
                            NextInput(r, ints2);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<object>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ ByteSequenceOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                    Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
                    Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    var outputBytes = new byte[20];
                    BytesRef output = new BytesRef(outputBytes);
                    Arrays.Fill(ints, 0);
                    int count = 0;
                    Random r = new Random(seed);
                    while (true)
                    {
                        r.NextBytes(outputBytes);
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, BytesRef.DeepCopyOf(output));
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST<BytesRef> fst = b.Finish();
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        r = new Random(seed);
                        Arrays.Fill(ints, 0);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, Util.Get(fst, input));
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            IntsRefFSTEnum<BytesRef>.InputOutput<BytesRef> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            r.NextBytes((byte[])(Array)outputBytes);
                            Assert.AreEqual(output, pair.Output);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<BytesRef>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ PositiveIntOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                    Outputs<long?> outputs = PositiveIntOutputs.Singleton;
                    Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15);

                    long output = 1;

                    Arrays.Fill(ints, 0);
                    int count = 0;
                    Random r = new Random(seed);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, output);
                        output += 1 + r.Next(10);
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes");
                        }
                        if (b.FstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST<long?> fst = b.Finish();

                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints, 0);

                        output = 1;
                        r = new Random(seed);
                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }

                            // forward lookup:
                            Assert.AreEqual(output, (long)Util.Get(fst, input));
                            // reverse lookup:
                            Assert.AreEqual(input, Util.GetByOutput(fst, output));
                            output += 1 + r.Next(10);
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        IntsRefFSTEnum<long?> fstEnum = new IntsRefFSTEnum<long?>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        output = 1;
                        while (true)
                        {
                            IntsRefFSTEnum<long?>.InputOutput<long?> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            Assert.AreEqual(output, pair.Output.Value);
                            output += 1 + r.Next(10);
                            upto++;
                            NextInput(r, ints);
                        }
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST<long?>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }
            }
            dir.Dispose();
        }
Beispiel #60
0
	  public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext)
	  {
		this.fst = fst;
		this.words = words;
		this.maxHorizontalContext = maxHorizontalContext;
	  }