// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map)
{
    this.map = map;
    if (map != null)
    {
        try
        {
            // Pre-cache root arcs:
            var scratchArc = new FST.Arc<CharsRef>();
            FST.BytesReader fstReader = map.BytesReader;
            map.GetFirstArc(scratchArc);
            if (FST<CharsRef>.TargetHasArcs(scratchArc))
            {
                map.ReadFirstRealTargetArc(scratchArc.Target, scratchArc, fstReader);
                while (true)
                {
                    Debug.Assert(scratchArc.Label != FST.END_LABEL);
                    cachedRootArcs[Convert.ToChar((char)scratchArc.Label)] = (new FST.Arc<CharsRef>()).CopyFrom(scratchArc);
                    if (scratchArc.IsLast)
                    {
                        break;
                    }
                    map.ReadNextRealArc(scratchArc, fstReader);
                }
            }
            //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
        }
        catch (IOException ioe)
        {
            // Bogus FST IOExceptions!! (will never happen)
            throw new Exception("Should never happen", ioe);
        }
    }
}
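// Hedged usage sketch (illustrative, not from the snippets above): the private
// constructor is only reachable through the public NormalizeCharMap.Builder,
// which collects match -> replacement pairs and builds the FST. The helper name
// and the mapping entries below are hypothetical.
private static NormalizeCharMap BuildExampleMap()
{
    var builder = new NormalizeCharMap.Builder();
    builder.Add("ß", "ss");   // match -> replacement
    builder.Add("ll", "l");
    return builder.Build();   // Build() invokes the private constructor above
}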
/// <summary>
/// Default constructor that takes a <seealso cref="TextReader"/>. </summary>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    buffer.Reset(@in);
    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;
    if (map != null)
    {
        fstReader = map.BytesReader;
    }
    else
    {
        fstReader = null;
    }
}
/// <summary>
/// Default constructor that takes a <seealso cref="TextReader"/>. </summary>
public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
    : base(@in)
{
    //LUCENENET support to reset the reader.
    _input = GetBufferedReader(@in);
    _input.Mark(BufferedCharFilter.defaultCharBufferSize);
    buffer.Reset(_input);
    //buffer.Reset(@in);
    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;
    if (map != null)
    {
        fstReader = map.BytesReader;
    }
    else
    {
        fstReader = null;
    }
}
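// Hedged usage sketch (illustrative): draining a MappingCharFilter wrapped around
// a StringReader. The helper name is hypothetical; assumes using System.IO and
// System.Text, and a map built as in BuildExampleMap() above.
private static string NormalizeExample(NormalizeCharMap map, string text)
{
    var sb = new StringBuilder();
    using (var filter = new MappingCharFilter(map, new StringReader(text)))
    {
        int ch;
        while ((ch = filter.Read()) != -1)
        {
            sb.Append((char)ch); // offsets are corrected internally via AddOffCorrectMap
        }
    }
    return sb.ToString();
}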
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum) { this.entry = entry; this.docToOrd = docToOrd; this.fst = fst; this.@in = @in; this.firstArc = firstArc; this.scratchArc = scratchArc; this.scratchInts = scratchInts; this.fstEnum = fstEnum; }
public TermsReader(FieldInfos fieldInfos, IndexInput @in, int termCount)
{
    this.termCount = termCount;
    int fieldNumber = @in.ReadVInt32();
    field = fieldInfos.FieldInfo(fieldNumber);
    if (field.IndexOptions != IndexOptions.DOCS_ONLY)
    {
        sumTotalTermFreq = @in.ReadVInt64();
    }
    else
    {
        sumTotalTermFreq = -1;
    }
    sumDocFreq = @in.ReadVInt64();
    docCount = @in.ReadVInt32();
    // outputs is ByteSequenceOutputs here, so the FST payload type is BytesRef
    // (C# requires the explicit type argument that Java's diamond omitted):
    fst = new FST<BytesRef>(@in, outputs);
}
public override int Read()
{
    //System.out.println("\nread");
    while (true)
    {
        if (replacement != null && replacementPointer < replacement.Length)
        {
            //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
            return replacement.Chars[replacement.Offset + replacementPointer++];
        }

        // TODO: a more efficient approach would be Aho/Corasick's
        // algorithm
        // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
        // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
        //
        // I think this would be (almost?) equivalent to 1) adding
        // epsilon arcs from all final nodes back to the init
        // node in the FST, 2) adding a .* (skip any char)
        // loop on the initial node, and 3) determinizing
        // that. Then we would not have to restart matching
        // at each position.

        int lastMatchLen = -1;
        CharsRef lastMatch = null;

        int firstCH = buffer.Get(inputOff);
        if (firstCH != -1)
        {
            // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
            char key = Convert.ToChar((char)firstCH);
            if (cachedRootArcs.TryGetValue(key, out FST.Arc<CharsRef> arc) && arc != null)
            {
                if (!FST.TargetHasArcs(arc))
                {
                    // Fast pass for single character match:
                    if (Debugging.AssertsEnabled) Debugging.Assert(arc.IsFinal);
                    lastMatchLen = 1;
                    lastMatch = arc.Output;
                }
                else
                {
                    int lookahead = 0;
                    CharsRef output = arc.Output;
                    while (true)
                    {
                        lookahead++;

                        if (arc.IsFinal)
                        {
                            // Match! (to node is final)
                            lastMatchLen = lookahead;
                            lastMatch = outputs.Add(output, arc.NextFinalOutput);
                            // Greedy: keep searching to see if there's a
                            // longer match...
                        }

                        if (!FST.TargetHasArcs(arc))
                        {
                            break;
                        }

                        int ch = buffer.Get(inputOff + lookahead);
                        if (ch == -1)
                        {
                            break;
                        }
                        if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) is null)
                        {
                            // Dead end
                            break;
                        }
                        output = outputs.Add(output, arc.Output);
                    }
                }
            }
        }

        if (lastMatch != null)
        {
            inputOff += lastMatchLen;
            //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);

            int diff = lastMatchLen - lastMatch.Length;

            if (diff != 0)
            {
                int prevCumulativeDiff = LastCumulativeDiff;
                if (diff > 0)
                {
                    // Replacement is shorter than matched input:
                    AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                }
                else
                {
                    // Replacement is longer than matched input: remap
                    // the "extra" chars all back to the same input
                    // offset:
                    int outputStart = inputOff - prevCumulativeDiff;
                    for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                    {
                        AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                    }
                }
            }

            replacement = lastMatch;
            replacementPointer = 0;
        }
        else
        {
            int ret = buffer.Get(inputOff);
            if (ret != -1)
            {
                inputOff++;
                buffer.FreeBefore(inputOff);
            }
            return ret;
        }
    }
}
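// Behavioral sketch (hypothetical helper and mappings): the loop above is greedy,
// always keeping the longest match that ends on a final arc before emitting.
private static void GreedyMatchExample()
{
    var b = new NormalizeCharMap.Builder();
    b.Add("a", "1");
    b.Add("aa", "2");
    var filter = new MappingCharFilter(b.Build(), new StringReader("aaa"));
    // Draining the filter yields "21": at position 0 the longer "aa" -> "2" wins
    // over "a" -> "1", then the trailing "a" maps to "1".
}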
private void LoadTermsIndex() { if (Fst != null) return; var clone = (IndexInput) _vgtir._input.Clone(); clone.Seek(_indexStart); Fst = new FST<long?>(clone, _vgtir._fstOutputs); clone.Dispose(); /* final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); Util.toDot(fst, w, false, false); System.out.println("FST INDEX: SAVED to " + dotFileName); w.close(); */ if (_vgtir._indexDivisor > 1) { // subsample var scratchIntsRef = new IntsRef(); var outputs = PositiveIntOutputs.Singleton; var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs); var fstEnum = new BytesRefFSTEnum<long?>(Fst); var count = _vgtir._indexDivisor; BytesRefFSTEnum<long?>.InputOutput<long?> result; while ((result = fstEnum.Next()) != null) { if (count == _vgtir._indexDivisor) { builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output); count = 0; } count++; } Fst = builder.Finish(); } }
public SimpleTextTermsEnum(SimpleTextFieldsReader outerInstance, FST<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair> fst, IndexOptions indexOptions) { _outerInstance = outerInstance; _indexOptions = indexOptions; _fstEnum = new BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(fst); }
// Uncomment for debugging:
/*
 * public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException {
 *   Writer w = new OutputStreamWriter(new FileOutputStream(filePath));
 *   toDot(fst, w, true, true);
 *   w.close();
 * }
 */

/// <summary>
/// Reads the first arc greater than or equal to the given label into the provided
/// arc in place and returns it iff found; otherwise returns <code>null</code>.
/// </summary>
/// <param name="label"> the label to ceil on </param>
/// <param name="fst"> the fst to operate on </param>
/// <param name="follow"> the arc to follow reading the label from </param>
/// <param name="arc"> the arc to read into in place </param>
/// <param name="in"> the fst's <seealso cref="BytesReader"/> </param>
public static FST<T>.Arc<T> readCeilArc<T>(int label, FST<T> fst, FST<T>.Arc<T> follow, FST<T>.Arc<T> arc, FST<T>.BytesReader @in)
{
    // TODO: maybe this is useful in the FST class - we could simplify some other code like FSTEnum?
    if (label == FST<T>.END_LABEL)
    {
        if (follow.Final)
        {
            if (follow.Target <= 0)
            {
                arc.Flags = (sbyte)FST<T>.BIT_LAST_ARC;
            }
            else
            {
                arc.Flags = 0;
                // NOTE: nextArc is a node (not an address!) in this case:
                arc.NextArc = follow.Target;
                arc.Node = follow.Target;
            }
            arc.Output = follow.NextFinalOutput;
            arc.Label = FST<T>.END_LABEL;
            return arc;
        }
        else
        {
            return null;
        }
    }

    if (!FST<T>.TargetHasArcs(follow))
    {
        return null;
    }
    fst.ReadFirstTargetArc(follow, arc, @in);
    if (arc.BytesPerArc != 0 && arc.Label != FST<T>.END_LABEL)
    {
        // Arcs are fixed array -- use binary search to find
        // the target.
        int low = arc.ArcIdx;
        int high = arc.NumArcs - 1;
        int mid = 0;
        // System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" + targetLabel);
        while (low <= high)
        {
            mid = (int)((uint)(low + high) >> 1);
            @in.Position = arc.PosArcsStart;
            @in.SkipBytes(arc.BytesPerArc * mid + 1);
            int midLabel = fst.ReadLabel(@in);
            int cmp = midLabel - label;
            // System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + " midLabel=" + midLabel + " cmp=" + cmp);
            if (cmp < 0)
            {
                low = mid + 1;
            }
            else if (cmp > 0)
            {
                high = mid - 1;
            }
            else
            {
                arc.ArcIdx = mid - 1;
                return fst.ReadNextRealArc(arc, @in);
            }
        }
        if (low == arc.NumArcs)
        {
            // DEAD END!
            return null;
        }
        arc.ArcIdx = (low > high ? high : low);
        return fst.ReadNextRealArc(arc, @in);
    }

    // Linear scan
    fst.ReadFirstRealTargetArc(follow.Target, arc, @in);
    while (true)
    {
        // System.out.println("  non-bs cycle");
        // TODO: we should fix this code to not have to create
        // object for the output of every arc we scan... only
        // for the matching arc, if found
        if (arc.Label >= label)
        {
            // System.out.println("    found!");
            return arc;
        }
        else if (arc.Last)
        {
            return null;
        }
        else
        {
            fst.ReadNextRealArc(arc, @in);
        }
    }
}
/// <summary>
/// Builds an <see cref="SynonymMap"/> and returns it.
/// </summary>
public virtual SynonymMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);

    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

    ISet<int?> dedupSet;
    if (dedup)
    {
        dedupSet = new JCG.HashSet<int?>();
    }
    else
    {
        dedupSet = null;
    }

    var spare = new byte[5];

    ICollection<CharsRef> keys = workingSet.Keys;
    CharsRef[] sortedKeys = new CharsRef[keys.Count];
    keys.CopyTo(sortedKeys, 0);
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618

    Int32sRef scratchIntsRef = new Int32sRef();

    //System.out.println("fmap.build");
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];

        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry

        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        Debug.Assert(scratch.Offset == 0);

        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }

        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;

        // Move the count + includeOrig to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);

        if (dedupSet != null)
        {
            dedupSet.Clear();
        }

        scratch.Length = scratchOutput.Position - scratch.Offset;
        //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }

    FST<BytesRef> fst = builder.Finish();
    return new SynonymMap(fst, words, maxHorizontalContext);
}
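// Hedged usage sketch (hypothetical helper, illustrative entries): the public
// SynonymMap.Builder API that feeds the Build() method above.
private static SynonymMap BuildExampleSynonyms()
{
    var builder = new SynonymMap.Builder(true); // true = dedup identical rules
    builder.Add(new CharsRef("fast"), new CharsRef("quick"), true);  // true = includeOrig
    builder.Add(new CharsRef("fast"), new CharsRef("speedy"), true);
    return builder.Build(); // runs the FST construction shown above
}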
//private boolean DEBUG; internal FieldReader(BlockTreeTermsReader outerInstance, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, long indexStartFP, int longsSize, IndexInput indexIn) { this.OuterInstance = outerInstance; Debug.Assert(numTerms > 0); this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); this.NumTerms = numTerms; this.SumTotalTermFreq_Renamed = sumTotalTermFreq; this.SumDocFreq_Renamed = sumDocFreq; this.DocCount_Renamed = docCount; this.IndexStartFP = indexStartFP; this.RootCode = rootCode; this.LongsSize = longsSize; // if (DEBUG) { // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor); // } RootBlockFP = (int)((uint)(new ByteArrayDataInput((byte[])(Array)rootCode.Bytes, rootCode.Offset, rootCode.Length)).ReadVLong() >> BlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS); if (indexIn != null) { IndexInput clone = (IndexInput)indexIn.Clone(); //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name); clone.Seek(indexStartFP); Index = new FST<BytesRef>(clone, ByteSequenceOutputs.Singleton); /* if (false) { final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); Util.toDot(index, w, false, false); System.out.println("FST INDEX: SAVED to " + dotFileName); w.close(); } */ } else { Index = null; } }
public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles) { TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024); // all lines in the file Console.WriteLine(" parse..."); List <string[]> lines = new List <string[]>(400000); foreach (string file in csvFiles) { using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read)) { Encoding decoder = Encoding.GetEncoding(encoding); TextReader reader = new StreamReader(inputStream, decoder); string line = null; while ((line = reader.ReadLine()) != null) { string[] entry = CSVUtil.Parse(line); if (entry.Length < 13) { Console.WriteLine("Entry in CSV is not valid: " + line); continue; } string[] formatted = FormatEntry(entry); lines.Add(formatted); // NFKC normalize dictionary entry if (normalizeEntries) { //if (normalizer.isNormalized(entry[0])){ if (entry[0].IsNormalized(NormalizationForm.FormKC)) { continue; } string[] normalizedEntry = new string[entry.Length]; for (int i = 0; i < entry.Length; i++) { //normalizedEntry[i] = normalizer.normalize(entry[i]); normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC); } formatted = FormatEntry(normalizedEntry); lines.Add(formatted); } } } } Console.WriteLine(" sort..."); // sort by term: we sorted the files already and use a stable sort. lines.Sort(new ComparerAnonymousHelper()); Console.WriteLine(" encode..."); PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton; Builder <long?> fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15); Int32sRef scratch = new Int32sRef(); long ord = -1; // first ord will be 0 string lastValue = null; // build tokeninfo dictionary foreach (string[] entry in lines) { int next = dictionary.Put(entry); if (next == offset) { Console.WriteLine("Failed to process line: " + Collections.ToString(entry)); continue; } string token = entry[0]; if (!token.Equals(lastValue, StringComparison.Ordinal)) { // new word to add to fst ord++; lastValue = token; scratch.Grow(token.Length); scratch.Length = token.Length; for (int i = 0; i < token.Length; i++) { scratch.Int32s[i] = (int)token[i]; } fstBuilder.Add(scratch, ord); } dictionary.AddMapping((int)ord, offset); offset = next; } FST <long?> fst = fstBuilder.Finish(); Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... "); dictionary.SetFST(fst); Console.WriteLine(" done"); return(dictionary); }
/// <param name="input"> input tokenstream </param> /// <param name="synonyms"> synonym map </param> /// <param name="ignoreCase"> case-folds input for matching with <seealso cref="Character#toLowerCase(int)"/>. /// Note, if you set this to true, its your responsibility to lowercase /// the input entries when you create the <seealso cref="SynonymMap"/> </param> public SynonymFilter(TokenStream input, SynonymMap synonyms, bool ignoreCase) : base(input) { termAtt = AddAttribute<ICharTermAttribute>(); posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); posLenAtt = AddAttribute<IPositionLengthAttribute>(); typeAtt = AddAttribute<ITypeAttribute>(); offsetAtt = AddAttribute<IOffsetAttribute>(); this.synonyms = synonyms; this.ignoreCase = ignoreCase; this.fst = synonyms.fst; if (fst == null) { throw new System.ArgumentException("fst must be non-null"); } this.fstReader = fst.BytesReader; // Must be 1+ so that when roll buffer is at full // lookahead we can distinguish this full buffer from // the empty buffer: rollBufferSize = 1 + synonyms.maxHorizontalContext; futureInputs = new PendingInput[rollBufferSize]; futureOutputs = new PendingOutputs[rollBufferSize]; for (int pos = 0; pos < rollBufferSize; pos++) { futureInputs[pos] = new PendingInput(); futureOutputs[pos] = new PendingOutputs(); } //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext); scratchArc = new FST.Arc<BytesRef>(); }
/// <summary> /// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary. /// </summary> public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader) { BytesRef pendingOutput = fst.Outputs.NoOutput; BytesRef matchOutput = null; int bufUpto = 0; fst.GetFirstArc(scratchArc); while (bufUpto < bufferLen) { int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen); if (fst.FindTargetArc(ignoreCase ? Character.ToLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { return null; } pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output); bufUpto += Character.CharCount(codePoint); } if (scratchArc.IsFinal) { matchOutput = fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput); } return matchOutput; }
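// Hedged usage sketch (hypothetical helper, illustrative entries): the map is
// normally produced by StemmerOverrideFilter.Builder and then queried per token
// via the Get() method above.
private static StemmerOverrideMap BuildExampleOverrides()
{
    var builder = new StemmerOverrideFilter.Builder(true); // true = ignoreCase
    builder.Add("children", "child"); // input term -> forced stem
    return builder.Build();
}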
public override void Finish(long termsFilePointer) { fst = fstBuilder.Finish(); if (fst != null) { fst.Save(output); } }
protected internal override IList<FSTUtil.Path<Pair<long?, BytesRef>>> GetFullPrefixPaths(IList<FSTUtil.Path<Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<long?, BytesRef>> fst) { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(ToLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.IntersectPrefixPaths(levA, fst); }
internal FSTTermsEnum(FST <long?> fst) { this.fst = fst; @in = new BytesRefFSTEnum <long?>(fst); bytesReader = fst.GetBytesReader(); }
public void TestEnumerateAll()
{
    // just for debugging
    int numTerms = 0;
    int numWords = 0;
    int lastWordId = -1;
    int lastSourceId = -1;
    TokenInfoDictionary tid = TokenInfoDictionary.Instance;
    ConnectionCosts matrix = ConnectionCosts.Instance;
    FST<Int64> fst = tid.FST.InternalFST;
    Int32sRefFSTEnum<Int64> fstEnum = new Int32sRefFSTEnum<Int64>(fst);
    Int32sRefFSTEnum.InputOutput<Int64> mapping;
    Int32sRef scratch = new Int32sRef();
    while (fstEnum.MoveNext())
    {
        mapping = fstEnum.Current;
        numTerms++;
        Int32sRef input = mapping.Input;
        char[] chars = new char[input.Length];
        for (int i = 0; i < chars.Length; i++)
        {
            chars[i] = (char)input.Int32s[input.Offset + i];
        }
        assertTrue(UnicodeUtil.ValidUTF16String(new string(chars)));

        long? output = mapping.Output;
        int sourceId = (int)output.Value;
        // we walk in order, terms, sourceIds, and wordIds should always be increasing
        assertTrue(sourceId > lastSourceId);
        lastSourceId = sourceId;
        tid.LookupWordIds(sourceId, scratch);
        for (int i = 0; i < scratch.Length; i++)
        {
            numWords++;
            int wordId = scratch.Int32s[scratch.Offset + i];
            assertTrue(wordId > lastWordId);
            lastWordId = wordId;

            String baseForm = tid.GetBaseForm(wordId, chars, 0, chars.Length);
            assertTrue(baseForm is null || UnicodeUtil.ValidUTF16String(baseForm));

            String inflectionForm = tid.GetInflectionForm(wordId);
            assertTrue(inflectionForm is null || UnicodeUtil.ValidUTF16String(inflectionForm));
            if (inflectionForm != null)
            {
                // check that it's actually an ipadic inflection form
                assertNotNull(ToStringUtil.GetInflectedFormTranslation(inflectionForm));
            }

            String inflectionType = tid.GetInflectionType(wordId);
            assertTrue(inflectionType is null || UnicodeUtil.ValidUTF16String(inflectionType));
            if (inflectionType != null)
            {
                // check that it's actually an ipadic inflection type
                assertNotNull(ToStringUtil.GetInflectionTypeTranslation(inflectionType));
            }

            int leftId = tid.GetLeftId(wordId);
            int rightId = tid.GetRightId(wordId);

            matrix.Get(rightId, leftId);

            tid.GetWordCost(wordId);

            String pos = tid.GetPartOfSpeech(wordId);
            assertNotNull(pos);
            assertTrue(UnicodeUtil.ValidUTF16String(pos));
            // check that it's actually an ipadic pos tag
            assertNotNull(ToStringUtil.GetPOSTranslation(pos));

            String pronunciation = tid.GetPronunciation(wordId, chars, 0, chars.Length);
            assertNotNull(pronunciation);
            assertTrue(UnicodeUtil.ValidUTF16String(pronunciation));

            String reading = tid.GetReading(wordId, chars, 0, chars.Length);
            assertNotNull(reading);
            assertTrue(UnicodeUtil.ValidUTF16String(reading));
        }
    }
    if (Verbose)
    {
        Console.WriteLine("checked " + numTerms + " terms, " + numWords + " words.");
    }
}
public override void Build(IInputEnumerator enumerator)
{
    // LUCENENET: Added guard clause for null
    if (enumerator is null)
    {
        throw new ArgumentNullException(nameof(enumerator));
    }
    if (enumerator.HasContexts)
    {
        throw new ArgumentException("this suggester doesn't support contexts");
    }
    string prefix = this.GetType().Name;
    var directory = OfflineSorter.DefaultTempDir();
    var tempInput = FileSupport.CreateTempFile(prefix, ".input", directory);
    var tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

    hasPayloads = enumerator.HasPayloads;

    var writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    var scratch = new BytesRef();

    TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

    bool success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try
    {
        var output = new ByteArrayDataOutput(buffer);
        BytesRef surfaceForm;

        while (enumerator.MoveNext())
        {
            surfaceForm = enumerator.Current;
            ISet<Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

            maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

            foreach (Int32sRef path in paths)
            {
                Util.Fst.Util.ToBytesRef(path, scratch);

                // length of the analyzed text (FST input)
                if (scratch.Length > ushort.MaxValue - 2)
                {
                    throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) + " in length (got " + scratch.Length + ")");
                }
                ushort analyzedLength = (ushort)scratch.Length;

                // compute the required length:
                // analyzed sequence + weight (4) + surface + analyzedLength (short)
                int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                BytesRef payload;

                if (hasPayloads)
                {
                    if (surfaceForm.Length > (ushort.MaxValue - 2))
                    {
                        throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) + " in length (got " + surfaceForm.Length + ")");
                    }
                    payload = enumerator.Payload;
                    // payload + surfaceLength (short)
                    requiredLength += payload.Length + 2;
                }
                else
                {
                    payload = null;
                }

                buffer = ArrayUtil.Grow(buffer, requiredLength);

                output.Reset(buffer);

                output.WriteInt16((short)analyzedLength);

                output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                output.WriteInt32(EncodeWeight(enumerator.Weight));

                if (hasPayloads)
                {
                    for (int i = 0; i < surfaceForm.Length; i++)
                    {
                        if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                        {
                            throw new ArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
                        }
                    }
                    output.WriteInt16((short)surfaceForm.Length);
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                    output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                }
                else
                {
                    output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(output.Position == requiredLength, "{0} vs {1}", output.Position, requiredLength);
                }

                writer.Write(buffer, 0, output.Position);
            }
            count++;
        }
        writer.Dispose();

        // Sort all input/output pairs (required by FST.Builder):
        (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

        // Free disk space:
        tempInput.Delete();

        reader = new OfflineSorter.ByteSequencesReader(tempSorted);

        var outputs = new PairOutputs<long?, BytesRef>(PositiveInt32Outputs.Singleton, ByteSequenceOutputs.Singleton);
        var builder = new Builder<PairOutputs<long?, BytesRef>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);

        // Build FST:
        BytesRef previousAnalyzed = null;
        BytesRef analyzed = new BytesRef();
        BytesRef surface = new BytesRef();
        Int32sRef scratchInts = new Int32sRef();
        var input = new ByteArrayDataInput();

        // Used to remove duplicate surface forms (but we
        // still index the highest-weight one). We clear
        // this when we see a new analyzed form, so it cannot
        // grow unbounded (at most 256 entries):
        var seenSurfaceForms = new JCG.HashSet<BytesRef>();

        var dedup = 0;
        while (reader.Read(scratch))
        {
            input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
            ushort analyzedLength = (ushort)input.ReadInt16();
            analyzed.Grow(analyzedLength + 2);
            input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
            analyzed.Length = analyzedLength;

            long cost = input.ReadInt32();

            surface.Bytes = scratch.Bytes;
            if (hasPayloads)
            {
                surface.Length = (ushort)input.ReadInt16();
                surface.Offset = input.Position;
            }
            else
            {
                surface.Offset = input.Position;
                surface.Length = scratch.Length - surface.Offset;
            }

            if (previousAnalyzed == null)
            {
                previousAnalyzed = new BytesRef();
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else if (analyzed.Equals(previousAnalyzed))
            {
                dedup++;
                if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                {
                    // More than maxSurfaceFormsPerAnalyzedForm
                    // dups: skip the rest:
                    continue;
                }
                if (seenSurfaceForms.Contains(surface))
                {
                    continue;
                }
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }
            else
            {
                dedup = 0;
                previousAnalyzed.CopyBytes(analyzed);
                seenSurfaceForms.Clear();
                seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
            }

            // TODO: I think we can avoid the extra 2 bytes when
            // there is no dup (dedup==0), but we'd have to fix
            // the exactFirst logic ... which would be sort of
            // hairy because we'd need to special case the two
            // (dup/not dup)...

            // NOTE: must be byte 0 so we sort before whatever
            // is next
            analyzed.Bytes[analyzed.Offset + analyzed.Length] = 0;
            analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
            analyzed.Length += 2;

            Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
            //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
            if (!hasPayloads)
            {
                builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
            }
            else
            {
                int payloadOffset = input.Position + surface.Length;
                int payloadLength = scratch.Length - payloadOffset;
                BytesRef br = new BytesRef(surface.Length + 1 + payloadLength);
                Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                br.Bytes[surface.Length] = PAYLOAD_SEP;
                Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                br.Length = br.Bytes.Length;
                builder.Add(scratchInts, outputs.NewPair(cost, br));
            }
        }
        fst = builder.Finish();

        //Util.dotToFile(fst, "/tmp/suggest.dot");

        success = true;
    }
    finally
    {
        if (success)
        {
            IOUtils.Dispose(reader, writer);
        }
        else
        {
            IOUtils.DisposeWhileHandlingException(reader, writer);
        }

        tempInput.Delete();
        tempSorted.Delete();
    }
}
public override SortedDocValues GetSorted(FieldInfo field) { FSTEntry entry = Fsts[field.Number]; FST<long> instance; lock (this) { if (!FstInstances.TryGetValue(field.Number, out instance)) { Data.Seek(entry.Offset); instance = new FST<long>(Data, PositiveIntOutputs.Singleton); RamBytesUsed_Renamed.AddAndGet(instance.SizeInBytes()); FstInstances[field.Number] = instance; } } NumericDocValues docToOrd = GetNumeric(field); FST<long> fst = instance; // per-thread resources FST<long>.BytesReader @in = fst.BytesReader; FST<long>.Arc<long> firstArc = new FST<long>.Arc<long>(); FST<long>.Arc<long> scratchArc = new FST<long>.Arc<long>(); IntsRef scratchInts = new IntsRef(); BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst); return new SortedDocValuesAnonymousInnerClassHelper(this, entry, docToOrd, fst, @in, firstArc, scratchArc, scratchInts, fstEnum); }
/// <summary> /// Sole constructor </summary> public FSTPath(T cost, FST <T> .Arc <T> arc, IntsRef input) { this.Arc = (new FST <T> .Arc <T>()).CopyFrom(arc); this.Cost = cost; this.Input = input; }
// Pushes next'd frame or seek'd frame; we later // lazy-load the frame only when needed internal Frame PushFrame(FST<BytesRef>.Arc<BytesRef> arc, long fp, int length) { Frame f = GetFrame(1 + CurrentFrame.Ord); f.Arc = arc; if (f.FpOrig == fp && f.NextEnt != -1) { //if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + term.length + " vs prefix=" + f.prefix); if (f.Prefix > TargetBeforeCurrentLength) { f.Rewind(); } else { // if (DEBUG) { // System.out.println(" skip rewind!"); // } } Debug.Assert(length == f.Prefix); } else { f.NextEnt = -1; f.Prefix = length; f.State.TermBlockOrd = 0; f.FpOrig = f.Fp = fp; f.LastSubFP = -1; // if (DEBUG) { // final int sav = term.length; // term.length = length; // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " pref=" + brToString(term)); // term.length = sav; // } } return f; }
/// <summary> /// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>, /// accumulating the <see cref="FST"/> end node and output for each path. /// </summary> public static IList <Path <T> > IntersectPrefixPaths <T>(Automaton a, FST <T> fst) { Debug.Assert(a.IsDeterministic); IList <Path <T> > queue = new List <Path <T> >(); List <Path <T> > endNodes = new List <Path <T> >(); queue.Add(new Path <T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc <T>()), fst.Outputs.NoOutput, new Int32sRef())); FST.Arc <T> scratchArc = new FST.Arc <T>(); FST.BytesReader fstReader = fst.GetBytesReader(); while (queue.Count != 0) { Path <T> path = queue.ElementAt(queue.Count - 1); queue.Remove(path); if (path.State.Accept) { endNodes.Add(path); // we can stop here if we accept this path, // we accept all further paths too continue; } Int32sRef currentInput = path.Input; foreach (Transition t in path.State.GetTransitions()) { int min = t.Min; int max = t.Max; if (min == max) { FST.Arc <T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader); if (nextArc != null) { Int32sRef newInput = new Int32sRef(currentInput.Length + 1); newInput.CopyInt32s(currentInput); newInput.Int32s[currentInput.Length] = t.Min; newInput.Length = currentInput.Length + 1; queue.Add(new Path <T>(t.Dest, new FST.Arc <T>() .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput)); } } else { // TODO: if this transition's TO state is accepting, and // it accepts the entire range possible in the FST (ie. 0 to 255), // we can simply use the prefix as the accepted state instead of // looking up all the ranges and terminate early // here. This just shifts the work from one queue // (this one) to another (the completion search // done in AnalyzingSuggester). FST.Arc <T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader); while (nextArc != null && nextArc.Label <= max) { Debug.Assert(nextArc.Label <= max); Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min); Int32sRef newInput = new Int32sRef(currentInput.Length + 1); newInput.CopyInt32s(currentInput); newInput.Int32s[currentInput.Length] = nextArc.Label; newInput.Length = currentInput.Length + 1; queue.Add(new Path <T>(t.Dest, new FST.Arc <T>() .CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput)); int label = nextArc.Label; // used in assert nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader); Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "" : nextArc.Label.ToString())); } } } } return(endNodes); }
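// Hedged usage sketch (hypothetical helper): intersecting the automaton of a
// fixed prefix with a suggestion FST. BasicAutomata.MakeString produces a
// deterministic automaton, satisfying the assertion above.
private static IList<FSTUtil.Path<T>> PrefixPathsExample<T>(FST<T> fst)
{
    Automaton prefix = BasicAutomata.MakeString("sug");
    return FSTUtil.IntersectPrefixPaths(prefix, fst);
}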
/// <summary>
/// Builds the final automaton from a list of added entries. This method
/// may take a while, as it needs to build the automaton.
/// </summary>
public virtual FSTCompletion Build()
{
    this.automaton = BuildAutomaton(sorter);

    // Dispose of the sorter if it is disposable
    using (sorter as IDisposable) { }

    return new FSTCompletion(automaton);
}
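// Hedged usage sketch (hypothetical helper, illustrative terms and buckets):
private static FSTCompletion BuildExampleCompletion()
{
    var builder = new FSTCompletionBuilder();
    builder.Add(new BytesRef("forest"), 2);  // term + bucket (higher = more relevant)
    builder.Add(new BytesRef("forever"), 1);
    return builder.Build(); // triggers the automaton construction above
}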
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, BinaryDocValues binaryDocValues, FST <long?> fst1, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc1, Int32sRef intsRef, BytesRefFSTEnum <long?> bytesRefFstEnum, BytesRef @ref, ByteArrayDataInput byteArrayDataInput) { entry = fstEntry; docToOrds = binaryDocValues; fst = fst1; this.@in = @in; firstArc = arc; scratchArc = scratchArc1; scratchInts = intsRef; fstEnum = bytesRefFstEnum; this.@ref = @ref; input = byteArrayDataInput; }
public SortedDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, NumericDocValues docToOrd, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum) { this.OuterInstance = outerInstance; this.Entry = entry; this.DocToOrd = docToOrd; this.Fst = fst; this.@in = @in; this.FirstArc = firstArc; this.ScratchArc = scratchArc; this.ScratchInts = scratchInts; this.FstEnum = fstEnum; }
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST<long?>.BytesReader @in, FST<long?>.Arc<long?> firstArc, FST<long?>.Arc<long?> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long?> fstEnum) { this.Entry = entry; this.DocToOrd = docToOrd; this.Fst = fst; this.@in = @in; this.FirstArc = firstArc; this.ScratchArc = scratchArc; this.ScratchInts = scratchInts; this.FstEnum = fstEnum; }
/// <summary> /// Build the suggest index, using up to the specified /// amount of temporary RAM while building. Note that /// the weights for the suggestions are ignored. /// </summary> public virtual void Build(IInputIterator iterator, double ramBufferSizeMB) { if (iterator.HasPayloads) { throw new System.ArgumentException("this suggester doesn't support payloads"); } if (iterator.HasContexts) { throw new System.ArgumentException("this suggester doesn't support contexts"); } string prefix = this.GetType().Name; var directory = OfflineSorter.DefaultTempDir(); // LUCENENET specific - using GetRandomFileName() instead of picking a random int DirectoryInfo tempIndexPath = null; while (true) { tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName()))); tempIndexPath.Create(); if (System.IO.Directory.Exists(tempIndexPath.FullName)) { break; } } Directory dir = FSDirectory.Open(tempIndexPath); try { #pragma warning disable 612, 618 IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer); #pragma warning restore 612, 618 iwc.SetOpenMode(OpenMode.CREATE); iwc.SetRAMBufferSizeMB(ramBufferSizeMB); IndexWriter writer = new IndexWriter(dir, iwc); var ft = new FieldType(TextField.TYPE_NOT_STORED); // TODO: if only we had IndexOptions.TERMS_ONLY... ft.IndexOptions = IndexOptions.DOCS_AND_FREQS; ft.OmitNorms = true; ft.Freeze(); Document doc = new Document(); Field field = new Field("body", "", ft); doc.Add(field); totTokens = 0; IndexReader reader = null; bool success = false; count = 0; try { while (true) { BytesRef surfaceForm = iterator.Next(); if (surfaceForm == null) { break; } field.SetStringValue(surfaceForm.Utf8ToString()); writer.AddDocument(doc); count++; } reader = DirectoryReader.Open(writer, false); Terms terms = MultiFields.GetTerms(reader, "body"); if (terms == null) { throw new System.ArgumentException("need at least one suggestion"); } // Move all ngrams into an FST: TermsEnum termsEnum = terms.GetIterator(null); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs); Int32sRef scratchInts = new Int32sRef(); while (true) { BytesRef term = termsEnum.Next(); if (term == null) { break; } int ngramCount = CountGrams(term); if (ngramCount > grams) { throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams); } if (ngramCount == 1) { totTokens += termsEnum.TotalTermFreq; } builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq)); } fst = builder.Finish(); if (fst == null) { throw new System.ArgumentException("need at least one suggestion"); } //System.out.println("FST: " + fst.getNodeCount() + " nodes"); /* * PrintWriter pw = new PrintWriter("/x/tmp/out.dot"); * Util.toDot(fst, pw, true, true); * pw.close(); */ success = true; } finally { if (success) { IOUtils.Dispose(writer, reader); } else { IOUtils.DisposeWhileHandlingException(writer, reader); } } } finally { try { IOUtils.Dispose(dir); } finally { // LUCENENET specific - since we are removing the entire directory anyway, // it doesn't make sense to first do a loop in order remove the files. // Let the System.IO.Directory.Delete() method handle that. // We also need to dispose the Directory instance first before deleting from disk. 
try { System.IO.Directory.Delete(tempIndexPath.FullName, true); } catch (Exception e) { throw new InvalidOperationException("failed to remove " + tempIndexPath, e); } } } }
public override SortedSetDocValues GetSortedSet(FieldInfo field) { FSTEntry entry = Fsts[field.Number]; if (entry.NumOrds == 0) { return DocValues.EMPTY_SORTED_SET; // empty FST! } FST<long> instance; lock (this) { if (!FstInstances.TryGetValue(field.Number, out instance)) { Data.Seek(entry.Offset); instance = new FST<long>((DataInput)Data, Lucene.Net.Util.Fst.PositiveIntOutputs.Singleton); RamBytesUsed_Renamed.AddAndGet(instance.SizeInBytes()); FstInstances[field.Number] = instance; } } BinaryDocValues docToOrds = GetBinary(field); FST<long> fst = instance; // per-thread resources FST<long>.BytesReader @in = fst.BytesReader; FST<long>.Arc<long> firstArc = new FST<long>.Arc<long>(); FST<long>.Arc<long> scratchArc = new FST<long>.Arc<long>(); IntsRef scratchInts = new IntsRef(); BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst); BytesRef @ref = new BytesRef(); ByteArrayDataInput input = new ByteArrayDataInput(); return new SortedSetDocValuesAnonymousInnerClassHelper(this, entry, docToOrds, fst, @in, firstArc, scratchArc, scratchInts, fstEnum, @ref, input); }
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, NumericDocValues numericDocValues, FST <long?> fst1, FST.BytesReader @in, FST.Arc <long?> arc, FST.Arc <long?> scratchArc1, Int32sRef intsRef, BytesRefFSTEnum <long?> bytesRefFstEnum) { entry = fstEntry; docToOrd = numericDocValues; fst = fst1; this.@in = @in; firstArc = arc; scratchArc = scratchArc1; scratchInts = intsRef; fstEnum = bytesRefFstEnum; }
public SortedSetDocValuesAnonymousInnerClassHelper(Lucene42DocValuesProducer outerInstance, FSTEntry entry, BinaryDocValues docToOrds, FST<long> fst, FST<long>.BytesReader @in, FST<long>.Arc<long> firstArc, FST<long>.Arc<long> scratchArc, IntsRef scratchInts, BytesRefFSTEnum<long> fstEnum, BytesRef @ref, ByteArrayDataInput input) { this.OuterInstance = outerInstance; this.Entry = entry; this.DocToOrds = docToOrds; this.Fst = fst; this.@in = @in; this.FirstArc = firstArc; this.ScratchArc = scratchArc; this.ScratchInts = scratchInts; this.FstEnum = fstEnum; this.@ref = @ref; this.Input = input; }
public SynonymMap(FST <BytesRef> fst, BytesRefHash words, int maxHorizontalContext) { this.fst = fst; this.words = words; this.maxHorizontalContext = maxHorizontalContext; }
internal FSTTermsEnum(FST<long> fst) { this.Fst = fst; @in = new BytesRefFSTEnum<long>(fst); BytesReader = fst.BytesReader; }
internal bool CanGrow(Frame frame) // can walk forward on both fst&fsa { return(frame.fsaState != -1 && FST <Memory.FSTTermOutputs.TermData> .TargetHasArcs(frame.fstArc)); }
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST <long?> fst, FST.BytesReader @in, FST.Arc <long?> firstArc, FST.Arc <long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum <long?> fstEnum, BytesRef @ref, ByteArrayDataInput input) { this.entry = entry; this.docToOrds = docToOrds; this.fst = fst; this.@in = @in; this.firstArc = firstArc; this.scratchArc = scratchArc; this.scratchInts = scratchInts; this.fstEnum = fstEnum; this.@ref = @ref; this.input = input; }
// LUCENENET NOTE: InputOutput<T> was moved to the BytesRefFSTEnum class

/// <summary>
/// doFloor controls the behavior of advance: if it's true,
/// advance positions to the biggest term before target.
/// </summary>
public BytesRefFSTEnum(FST<T> fst)
    : base(fst)
{
    result.Input = current;
    current.Offset = 1;
}
public IndexEnum(FST <long?> fst) { _fstEnum = new BytesRefFSTEnum <long?>(fst); }
/// <summary> /// Returns all prefix paths to initialize the search. /// </summary> protected internal virtual IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > GetFullPrefixPaths( IList <FSTUtil.Path <PairOutputs <long?, BytesRef> .Pair> > prefixPaths, Automaton lookupAutomaton, FST <PairOutputs <long?, BytesRef> .Pair> fst) { return(prefixPaths); }
public FSTTermsEnum(FieldInfo field, FST <BytesRef> fst) { this.field = field; fstEnum = new BytesRefFSTEnum <BytesRef>(fst); }
/// <summary> /// Expert: like <seealso cref="Util#getByOutput(FST, long)"/> except reusing /// BytesReader, initial and scratch Arc, and result. /// </summary> public static IntsRef GetByOutput(FST <long?> fst, long targetOutput, FST <long?> .BytesReader @in, FST <long?> .Arc <long?> arc, FST <long?> .Arc <long?> scratchArc, IntsRef result) { long output = arc.Output.Value; int upto = 0; //System.out.println("reverseLookup output=" + targetOutput); while (true) { //System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc); if (arc.Final) { long finalOutput = output + arc.NextFinalOutput.Value; //System.out.println(" isFinal finalOutput=" + finalOutput); if (finalOutput == targetOutput) { result.Length = upto; //System.out.println(" found!"); return(result); } else if (finalOutput > targetOutput) { //System.out.println(" not found!"); return(null); } } if (FST <long?> .TargetHasArcs(arc)) { //System.out.println(" targetHasArcs"); if (result.Ints.Length == upto) { result.Grow(1 + upto); } fst.ReadFirstRealTargetArc(arc.Target, arc, @in); if (arc.BytesPerArc != 0) { int low = 0; int high = arc.NumArcs - 1; int mid = 0; //System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output); bool exact = false; while (low <= high) { mid = (int)((uint)(low + high) >> 1); @in.Position = arc.PosArcsStart; @in.SkipBytes(arc.BytesPerArc * mid); var flags = (sbyte)@in.ReadByte(); fst.ReadLabel(@in); long minArcOutput; if ((flags & FST <long> .BIT_ARC_HAS_OUTPUT) != 0) { long arcOutput = fst.Outputs.Read(@in).Value; minArcOutput = output + arcOutput; } else { minArcOutput = output; } if (minArcOutput == targetOutput) { exact = true; break; } else if (minArcOutput < targetOutput) { low = mid + 1; } else { high = mid - 1; } } if (high == -1) { return(null); } else if (exact) { arc.ArcIdx = mid - 1; } else { arc.ArcIdx = low - 2; } fst.ReadNextRealArc(arc, @in); result.Ints[upto++] = arc.Label; output += arc.Output.Value; } else { FST <long?> .Arc <long?> prevArc = null; while (true) { //System.out.println(" cycle label=" + arc.label + " output=" + arc.output); // this is the min output we'd hit if we follow // this arc: long minArcOutput = output + arc.Output.Value; if (minArcOutput == targetOutput) { // Recurse on this arc: //System.out.println(" match! break"); output = minArcOutput; result.Ints[upto++] = arc.Label; break; } else if (minArcOutput > targetOutput) { if (prevArc == null) { // Output doesn't exist return(null); } else { // Recurse on previous arc: arc.CopyFrom(prevArc); result.Ints[upto++] = arc.Label; output += arc.Output.Value; //System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output); break; } } else if (arc.Last) { // Recurse on this arc: output = minArcOutput; //System.out.println(" recurse last label=" + (char) arc.label + " output=" + output); result.Ints[upto++] = arc.Label; break; } else { // Read next arc in this node: prevArc = scratchArc; prevArc.CopyFrom(arc); //System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label); fst.ReadNextRealArc(arc, @in); } } } } else { //System.out.println(" no target arcs; not found!"); return(null); } } }
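// Hedged sketch (hypothetical helper): the non-expert entry point, which
// allocates the BytesReader and scratch arcs itself; naming assumed from the
// same Util class as above.
private static IntsRef LookupByOutputExample(FST<long?> fst)
{
    // Returns the input (as int labels) whose accumulated output equals 42,
    // or null if no such input exists.
    return Util.GetByOutput(fst, 42L);
}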
internal TermsReader(FSTOrdTermsReader outerInstance, FieldInfo fieldInfo, IndexInput blockIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST <long?> index) { this.outerInstance = outerInstance; this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; this.longsSize = longsSize; this.index = index; if (Debugging.AssertsEnabled) { Debugging.Assert((numTerms & (~0xffffffffL)) == 0); } int numBlocks = (int)(numTerms + INTERVAL - 1) / INTERVAL; this.numSkipInfo = longsSize + 3; this.skipInfo = new long[numBlocks * numSkipInfo]; this.statsBlock = new byte[(int)blockIn.ReadVInt64()]; this.metaLongsBlock = new byte[(int)blockIn.ReadVInt64()]; this.metaBytesBlock = new byte[(int)blockIn.ReadVInt64()]; int last = 0, next = 0; for (int i = 1; i < numBlocks; i++) { next = numSkipInfo * i; for (int j = 0; j < numSkipInfo; j++) { skipInfo[next + j] = skipInfo[last + j] + blockIn.ReadVInt64(); } last = next; } blockIn.ReadBytes(statsBlock, 0, statsBlock.Length); blockIn.ReadBytes(metaLongsBlock, 0, metaLongsBlock.Length); blockIn.ReadBytes(metaBytesBlock, 0, metaBytesBlock.Length); }
/// <summary> /// Dumps an <seealso cref="FST"/> to a GraphViz's <code>dot</code> language description /// for visualization. Example of use: /// /// <pre class="prettyprint"> /// PrintWriter pw = new PrintWriter("out.dot"); /// Util.toDot(fst, pw, true, true); /// pw.close(); /// </pre> /// /// and then, from command line: /// /// <pre> /// dot -Tpng -o out.png out.dot /// </pre> /// /// <p> /// Note: larger FSTs (a few thousand nodes) won't even /// render, don't bother. If the FST is > 2.1 GB in size /// then this method will throw strange exceptions. /// </summary> /// <param name="sameRank"> /// If <code>true</code>, the resulting <code>dot</code> file will try /// to order states in layers of breadth-first traversal. this may /// mess up arcs, but makes the output FST's structure a bit clearer. /// </param> /// <param name="labelStates"> /// If <code>true</code> states will have labels equal to their offsets in their /// binary format. Expands the graph considerably. /// </param> /// <seealso cref= "http://www.graphviz.org/" </seealso> public static void toDot <T>(FST <T> fst, TextWriter @out, bool sameRank, bool labelStates) { const string expandedNodeColor = "blue"; // this is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. FST <T> .Arc <T> startArc = fst.GetFirstArc(new FST <T> .Arc <T>()); // A queue of transitions to consider for the next level. IList <FST <T> .Arc <T> > thisLevelQueue = new List <FST <T> .Arc <T> >(); // A queue of transitions to consider when processing the next level. IList <FST <T> .Arc <T> > nextLevelQueue = new List <FST <T> .Arc <T> >(); nextLevelQueue.Add(startArc); //System.out.println("toDot: startArc: " + startArc); // A list of states on the same level (for ranking). IList <int?> sameLevelStates = new List <int?>(); // A bitset of already seen states (target offset). BitArray seen = new BitArray(32); seen.SafeSet((int)startArc.Target, true); // Shape for states. const string stateShape = "circle"; const string finalStateShape = "doublecircle"; // Emit DOT prologue. @out.Write("digraph FST {\n"); @out.Write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); if (!labelStates) { @out.Write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); } EmitDotState(@out, "initial", "point", "white", ""); T NO_OUTPUT = fst.Outputs.NoOutput; var r = fst.BytesReader; // final FST.Arc<T> scratchArc = new FST.Arc<>(); { string stateColor; if (fst.IsExpandedTarget(startArc, r)) { stateColor = expandedNodeColor; } else { stateColor = null; } bool isFinal; T finalOutput; if (startArc.Final) { isFinal = true; finalOutput = startArc.NextFinalOutput.Equals(NO_OUTPUT) ? default(T) : startArc.NextFinalOutput; } else { isFinal = false; finalOutput = default(T); } EmitDotState(@out, Convert.ToString(startArc.Target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.Outputs.OutputToString(finalOutput)); } @out.Write(" initial -> " + startArc.Target + "\n"); int level = 0; while (nextLevelQueue.Count > 0) { // we could double buffer here, but it doesn't matter probably. 
//System.out.println("next level=" + level); thisLevelQueue.AddRange(nextLevelQueue); nextLevelQueue.Clear(); level++; @out.Write("\n // Transitions and states at level: " + level + "\n"); while (thisLevelQueue.Count > 0) { FST <T> .Arc <T> arc = thisLevelQueue[thisLevelQueue.Count - 1]; thisLevelQueue.RemoveAt(thisLevelQueue.Count - 1); //System.out.println(" pop: " + arc); if (FST <T> .TargetHasArcs(arc)) { // scan all target arcs //System.out.println(" readFirstTarget..."); long node = arc.Target; fst.ReadFirstRealTargetArc(arc.Target, arc, r); //System.out.println(" firstTarget: " + arc); while (true) { //System.out.println(" cycle arc=" + arc); // Emit the unseen state and add it to the queue for the next level. if (arc.Target >= 0 && !seen.SafeGet((int)arc.Target)) { /* * boolean isFinal = false; * T finalOutput = null; * fst.readFirstTargetArc(arc, scratchArc); * if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) { * // target is final * isFinal = true; * finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output; * System.out.println("dot hit final label=" + (char) scratchArc.label); * } */ string stateColor; if (fst.IsExpandedTarget(arc, r)) { stateColor = expandedNodeColor; } else { stateColor = null; } string finalOutput; if (arc.NextFinalOutput != null && !arc.NextFinalOutput.Equals(NO_OUTPUT)) { finalOutput = fst.Outputs.OutputToString(arc.NextFinalOutput); } else { finalOutput = ""; } EmitDotState(@out, Convert.ToString(arc.Target), stateShape, stateColor, finalOutput); // To see the node address, use this instead: //emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target)); seen.SafeSet((int)arc.Target, true); nextLevelQueue.Add((new FST <T> .Arc <T>()).CopyFrom(arc)); sameLevelStates.Add((int)arc.Target); } string outs; if (!arc.Output.Equals(NO_OUTPUT)) { outs = "/" + fst.Outputs.OutputToString(arc.Output); } else { outs = ""; } if (!FST <T> .TargetHasArcs(arc) && arc.Final && !arc.NextFinalOutput.Equals(NO_OUTPUT)) { // Tricky special case: sometimes, due to // pruning, the builder can [sillily] produce // an FST with an arc into the final end state // (-1) but also with a next final output; in // this case we pull that output up onto this // arc outs = outs + "/[" + fst.Outputs.OutputToString(arc.NextFinalOutput) + "]"; } string arcColor; if (arc.Flag(FST <T> .BIT_TARGET_NEXT)) { arcColor = "red"; } else { arcColor = "black"; } Debug.Assert(arc.Label != FST <T> .END_LABEL); @out.Write(" " + node + " -> " + arc.Target + " [label=\"" + PrintableLabel(arc.Label) + outs + "\"" + (arc.Final ? " style=\"bold\"" : "") + " color=\"" + arcColor + "\"]\n"); // Break the loop if we're on the last arc of this state. if (arc.Last) { //System.out.println(" break"); break; } fst.ReadNextRealArc(arc, r); } } } // Emit state ranking information. if (sameRank && sameLevelStates.Count > 1) { @out.Write(" {rank=same; "); foreach (int state in sameLevelStates) { @out.Write(state + "; "); } @out.Write(" }\n"); } sameLevelStates.Clear(); } // Emit terminating state (always there anyway). @out.Write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n"); @out.Write(" {rank=sink; -1 }\n"); @out.Write("}\n"); @out.Flush(); }
//static final boolean TEST = false; public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) { string termsIndexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION); string termsBlockFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION); this.postingsReader = postingsReader; ChecksumIndexInput indexIn = null; IndexInput blockIn = null; bool success = false; try { indexIn = state.Directory.OpenChecksumInput(termsIndexFileName, state.Context); blockIn = state.Directory.OpenInput(termsBlockFileName, state.Context); version = ReadHeader(indexIn); ReadHeader(blockIn); if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM) { CodecUtil.ChecksumEntireFile(blockIn); } this.postingsReader.Init(blockIn); SeekDir(blockIn); FieldInfos fieldInfos = state.FieldInfos; int numFields = blockIn.ReadVInt32(); for (int i = 0; i < numFields; i++) { FieldInfo fieldInfo = fieldInfos.FieldInfo(blockIn.ReadVInt32()); bool hasFreq = fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY; long numTerms = blockIn.ReadVInt64(); long sumTotalTermFreq = hasFreq ? blockIn.ReadVInt64() : -1; long sumDocFreq = blockIn.ReadVInt64(); int docCount = blockIn.ReadVInt32(); int longsSize = blockIn.ReadVInt32(); var index = new FST <long?>(indexIn, PositiveInt32Outputs.Singleton); var current = new TermsReader(this, fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index); TermsReader previous; // LUCENENET NOTE: This simulates a put operation in Java, // getting the prior value first before setting it. fields.TryGetValue(fieldInfo.Name, out previous); fields[fieldInfo.Name] = current; CheckFieldSummary(state.SegmentInfo, indexIn, blockIn, current, previous); } if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM) { CodecUtil.CheckFooter(indexIn); } else { #pragma warning disable 612, 618 CodecUtil.CheckEOF(indexIn); #pragma warning restore 612, 618 } success = true; } finally { if (success) { IOUtils.Dispose(indexIn, blockIn); } else { IOUtils.DisposeWhileHandlingException(indexIn, blockIn); } } }
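// The success/finally idiom in the constructor above is worth calling out: on any
// exception, the half-opened inputs are disposed without masking the original
// error; on success they are disposed normally once fully read. A hedged sketch of
// the same pattern ("a.dat", "b.dat", dir and context are illustrative locals, not
// from the original source):
IndexInput a = null, b = null;
bool ok = false;
try
{
    a = dir.OpenInput("a.dat", context);
    b = dir.OpenInput("b.dat", context);
    // ... read headers and payload; any of these may throw ...
    ok = true;
}
finally
{
    if (ok)
    {
        IOUtils.Dispose(a, b); // normal path: let a Dispose failure propagate
    }
    else
    {
        IOUtils.DisposeWhileHandlingException(a, b); // error path: don't mask the cause
    }
}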
/// <summary>
/// Creates a new <seealso cref="StemmerOverrideMap"/>. </summary>
/// <param name="fst"> the FST used to look up the overrides </param>
/// <param name="ignoreCase"> if the keys' case should be ignored </param>
public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
{
    this.fst = fst;
    this.ignoreCase = ignoreCase;
}
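// In practice a StemmerOverrideMap is obtained from StemmerOverrideFilter.Builder
// rather than constructed by hand; a hedged sketch (Add/Build per the Lucene 4.x
// API, adjust casing for the port at hand):
var overrides = new StemmerOverrideFilter.Builder(true /* ignoreCase */);
overrides.Add("cats", "cat");    // map the inflected form to its override stem
overrides.Add("geese", "goose");
StemmerOverrideMap map = overrides.Build(); // builds the backing FST<BytesRef>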
private bool CanGrow(Frame frame) // can we walk forward on both the FST and the FSA?
{
    return frame.state != -1 && FST<long?>.TargetHasArcs(frame.arc);
}
private void MainForm_Load(object sender, EventArgs e) { this.Icon = Icon.ExtractAssociatedIcon(System.Reflection.Assembly.GetEntryAssembly().Location); int i; SettingsFile = new Xml("gecko.xml"); SettingsFile.RootName = "gecko"; gamename = ""; gecko = new USBGecko(); gecko.chunkUpdate += transfer; exceptionHandling = new ExceptionHandler(this); if (!Directory.Exists("DumpHistory")) Directory.CreateDirectory("DumpHistory"); search = new MemSearch(gecko, SearchResults, PrvPage, NxtPage, ResList, UpDownSearchResultPage, exceptionHandling); viewer = new MemoryViewer(gecko, ValidMemory.ValidAreas[0].low, memViewGrid, memViewPAddress, memViewPValue, MemViewFPValue, exceptionHandling); #if MONO disassembler = new Disassembly(gecko, "./vdappc", DisAssBox, DisScroll, DisRegion, AsAddress, AsText, exceptionHandling); #else disassembler = new Disassembly(gecko, "vdappc.exe", DisAssBox, DisScroll, DisRegion, AsAddress, AsText, exceptionHandling); #endif bpHandler = new Breakpoints(gecko, BPList, this, disassembler, BPDiss, BPClassic, BPCondList, exceptionHandling); //bpHandler = new Breakpoints(gecko, BPList, this, disassembler, richTextBox1, BPClassic, BPCondList, exceptionHandling); foreach (String reg in BPList.longRegNames) BPConditionRegSelect.Items.Add(reg.Trim()); BPConditionRegSelect.Items.Add("VoA"); BPConditionRegSelect.SelectedIndex = 0; BPConditionCompare.SelectedIndex = 0; bpHandler.BPSkip += BPSkipped; watcher = new WatchList(gecko, WatchList, WatchIntervalSet, exceptionHandling); addWatchDialog = null; watchValueInput = null; fst = new FST(gecko, FSTTreeView, FSTCodeData, FSTSetAsSource, FSTSetAsTarget, FSTGenSwap, FSTFileSource, FSTFileTarget, FSTSwapCode, FSTSwapNow, exceptionHandling); GCTCodeContents = new CodeController(GCTCodeList, GCTCodeValues); GCTCodeContents.codesModified += GCTModified; bpHandler.BPStop += BPStopped; for (i = 0; i < ValidMemory.ValidAreas.Length; i++) { memRange.Items.Add( GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2)); MemViewARange.Items.Add( GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2)); ToolsDumpRegions.Items.Add( GlobalFunctions.toHex(ValidMemory.ValidAreas[i].id, 2)); } codeWizard = new GCTWizard(GCTCodeContents); memRange.SelectedIndex = 0; MemViewARange.SelectedIndex = 0; MemViewShowMode.SelectedIndex = 0; MemViewSearchType.SelectedIndex = 0; ToolsDumpRegions.SelectedIndex = 0; comboBoxSearchDataType.SelectedIndex = 2; //UpperEnable.SelectedIndex = 0; //BPType.SelectedIndex = 0; comboBoxDisplayType.SelectedIndex = 0; WasAlreadyDisabled = new List<Control>(); searchComparisons = new List<SearchComparisonInfo>(); searchComparisons.Add(new SearchComparisonInfo()); comboBoxComparisonType.SelectedIndex = 0; comboBoxComparisonRHS.SelectedIndex = 0; buttonCancelSearch.Enabled = false; SteppingOut = false; buttonUndoSearch.Enabled = search.CanUndo(); TabLock = null; BPStepLogWriter = null; comboBoxPokeOperation.SelectedIndex = 0; SetComboboxValue("Screenshots", "Format", 0, ImgFormat); SetComboboxValue("Screenshots", "Sizing", 0, ShotSizingType); int value = SettingsFile.GetValue("Screenshots", "JPEGQuality", 85); if (value < 0 || value > 100) value = 85; JPGQual.Value = value; multiPokeAddr = new List<UInt32>(); FormStop(false); CUSBGecko.Enabled = true; codesModified = false; AbtText.Text = "gecko dotNET Beta 0.63 by Link and dcx2\n\n" + "Special thanks to:\n\n" + "kenobi: for original WiiRd GUI!\n" + "Nuke: for the USB Gecko!\n" + "brkirch: for continuing Gecko OS!\n" + "Y.S.: for the original code handler!\n" + "Team Twiizers for 
bringing homebrew to the Wii\n" + "DevKitPro team: No homebrew without them!\n" + "Frank Wille: vdappc developer!\n" + "various beta testers!\n" + "and you!"; notes = new NoteSheets(); //Set MEM2 upper to 93400000 MEM2UpperBoundary.SelectedIndex = 0; // Restore previous settings checkBoxAlwaysOnTop.Checked = GeckoApp.Properties.Settings.Default.AlwaysOnTop; numericUpDownFPS.Value = GeckoApp.Properties.Settings.Default.FPS; BPAddress.Text = GeckoApp.Properties.Settings.Default.BPAddr; memViewAValue.Text = GeckoApp.Properties.Settings.Default.MemViewAddr; BPType.SelectedIndex = GeckoApp.Properties.Settings.Default.BPType; checkBoxBPNext.Checked = GeckoApp.Properties.Settings.Default.BPNext; checkBoxPauseCodes.Checked = GeckoApp.Properties.Settings.Default.PauseCodes; Size = GeckoApp.Properties.Settings.Default.LastSize; int oldSplitter = GeckoApp.Properties.Settings.Default.LastSplitterSize; // The splitter gets moved when the breakpoint page is entered // so artificially force it to move MainControl.SelectedTab = BreakpointPage; MainControl.SelectedTab = searchPage; splitContainerRegASM.SplitterDistance = oldSplitter; toolStripTextBoxMemViewFontSize.Text = GeckoApp.Properties.Settings.Default.MemViewFontSize.ToString(); toolStripTextBoxMemViewFontSize_KeyDown(null, new KeyEventArgs(Keys.Enter)); viewFloatsInHexToolStripMenuItem.Checked = GeckoApp.Properties.Settings.Default.ViewFloatsInHex; //addressTextBox1.CopyStringToHistory(GeckoApp.Properties.Settings.Default.addressHistory); //addressTextBox1.AutoHistory = true; }
public virtual void Test() { int[] ints = new int[7]; Int32sRef input = new Int32sRef(ints, 0, ints.Length); int seed = Random.Next(); Directory dir = new MMapDirectory(CreateTempDir("2BFST")); for (int doPackIter = 0; doPackIter < 2; doPackIter++) { bool doPack = doPackIter == 1; // Build FST w/ NoOutputs and stop when nodeCount > 2.2B if (!doPack) { Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); Outputs <object> outputs = NoOutputs.Singleton; object NO_OUTPUT = outputs.NoOutput; Builder <object> b = new Builder <object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15); int count = 0; Random r = new Random(seed); int[] ints2 = new int[200]; Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length); while (true) { //System.out.println("add: " + input + " -> " + output); for (int i = 10; i < ints2.Length; i++) { ints2[i] = r.Next(256); } b.Add(input2, NO_OUTPUT); count++; if (count % 100000 == 0) { Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes"); } if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024) { break; } NextInput(r, ints2); } FST <object> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); Arrays.Fill(ints2, 0); r = new Random(seed); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } for (int j = 10; j < ints2.Length; j++) { ints2[j] = r.Next(256); } Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2)); NextInput(r, ints2); } Console.WriteLine("\nTEST: enum all input/outputs"); Int32sRefFSTEnum <object> fstEnum = new Int32sRefFSTEnum <object>(fst); Arrays.Fill(ints2, 0); r = new Random(seed); int upto = 0; while (true) { Int32sRefFSTEnum.InputOutput <object> pair = fstEnum.Next(); if (pair == null) { break; } for (int j = 10; j < ints2.Length; j++) { ints2[j] = r.Next(256); } Assert.AreEqual(input2, pair.Input); Assert.AreEqual(NO_OUTPUT, pair.Output); upto++; NextInput(r, ints2); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST <object>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } // Build FST w/ ByteSequenceOutputs and stop when FST // size = 3GB { Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes"); Outputs <BytesRef> outputs = ByteSequenceOutputs.Singleton; Builder <BytesRef> b = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15); var outputBytes = new byte[20]; BytesRef output = new BytesRef(outputBytes); Arrays.Fill(ints, 0); int count = 0; Random r = new Random(seed); while (true) { r.NextBytes(outputBytes); //System.out.println("add: " + input + " -> " + output); b.Add(input, BytesRef.DeepCopyOf(output)); count++; if (count % 1000000 == 0) { Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes"); } if (b.GetFstSizeInBytes() > LIMIT) { break; } NextInput(r, ints); } FST <BytesRef> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); r = 
new Random(seed); Arrays.Fill(ints, 0); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } r.NextBytes(outputBytes); Assert.AreEqual(output, Util.Get(fst, input)); NextInput(r, ints); } Console.WriteLine("\nTEST: enum all input/outputs"); Int32sRefFSTEnum <BytesRef> fstEnum = new Int32sRefFSTEnum <BytesRef>(fst); Arrays.Fill(ints, 0); r = new Random(seed); int upto = 0; while (true) { Int32sRefFSTEnum.InputOutput <BytesRef> pair = fstEnum.Next(); if (pair == null) { break; } Assert.AreEqual(input, pair.Input); r.NextBytes(outputBytes); Assert.AreEqual(output, pair.Output); upto++; NextInput(r, ints); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST <BytesRef>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } // Build FST w/ PositiveIntOutputs and stop when FST // size = 3GB { Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long"); Outputs <long?> outputs = PositiveInt32Outputs.Singleton; Builder <long?> b = new Builder <long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15); long output = 1; Arrays.Fill(ints, 0); int count = 0; Random r = new Random(seed); while (true) { //System.out.println("add: " + input + " -> " + output); b.Add(input, output); output += 1 + r.Next(10); count++; if (count % 1000000 == 0) { Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes"); } if (b.GetFstSizeInBytes() > LIMIT) { break; } NextInput(r, ints); } FST <long?> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); Arrays.Fill(ints, 0); output = 1; r = new Random(seed); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } // forward lookup: Assert.AreEqual(output, (long)Util.Get(fst, input)); // reverse lookup: Assert.AreEqual(input, Util.GetByOutput(fst, output)); output += 1 + r.Next(10); NextInput(r, ints); } Console.WriteLine("\nTEST: enum all input/outputs"); Int32sRefFSTEnum <long?> fstEnum = new Int32sRefFSTEnum <long?>(fst); Arrays.Fill(ints, 0); r = new Random(seed); int upto = 0; output = 1; while (true) { Int32sRefFSTEnum.InputOutput <long?> pair = fstEnum.Next(); if (pair == null) { break; } Assert.AreEqual(input, pair.Input); Assert.AreEqual(output, pair.Output.Value); output += 1 + r.Next(10); upto++; NextInput(r, ints); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST <long?>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } } dir.Dispose(); }
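// The test above relies on a NextInput helper that is not shown in this snippet.
// Whatever its exact body, it must advance the int[] so successive inputs stay in
// strictly increasing order, since the FST Builder requires sorted input. A
// plausible sketch (an assumption, not the original implementation), treating the
// array as a counter bumped by a small random increment:
private static void NextInput(Random r, int[] ints)
{
    int downTo = ints.Length - 1;
    while (downTo >= 0)
    {
        long v = ints[downTo] + (long)r.Next(10) + 1; // random bump keeps inputs varied
        if (v <= int.MaxValue)
        {
            ints[downTo] = (int)v;
            break;
        }
        ints[downTo] = 0; // overflow: carry into the next-higher position
        downTo--;
    }
}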
public IndexEnum(FST<long?> fst) { _fstEnum = new BytesRefFSTEnum<long?>(fst); }
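// A hedged sketch of how such an enum is typically driven (SeekCeil/Next per the
// BytesRefFSTEnum API; method casing varies by port). The long? output carries the
// term ordinal in this index:
BytesRefFSTEnum<long?> fstEnum = new BytesRefFSTEnum<long?>(fst);
var io = fstEnum.SeekCeil(new BytesRef("abc")); // smallest term >= "abc", or null
while (io != null)
{
    long ord = io.Output.GetValueOrDefault();
    // ... use io.Input (the term bytes) and ord ...
    io = fstEnum.Next();
}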
public override bool Load(DataInput input)
{
    count = input.ReadVInt64();
    this.fst = new FST<long?>(input, PositiveInt32Outputs.Singleton);
    return true;
}
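// Load's natural counterpart serializes the same two fields in the same order. The
// body below is a hedged sketch based only on the Load method above, not the
// original source:
public override bool Store(DataOutput output)
{
    if (fst == null)
    {
        return false; // nothing built yet
    }
    output.WriteVInt64(count);
    fst.Save(output);
    return true;
}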
private void LoadTerms() { var posIntOutputs = PositiveIntOutputs.Singleton; var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs); var outputs = new PairOutputs<long?, PairOutputs<long?,long?>.Pair>(posIntOutputs, outputsInner); // honestly, wtf kind of generic mess is this. var b = new Builder<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs); var input = (IndexInput) _outerInstance._input.Clone(); input.Seek(_termsStart); var lastTerm = new BytesRef(10); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; var visitedDocs = new FixedBitSet(_maxDoc); var scratchIntsRef = new IntsRef(); while (true) { SimpleTextUtil.ReadLine(input, _scratch); if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD)) { if (lastDocsStart != -1) { b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq))); _sumTotalTermFreq += totalTermFreq; } break; } if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC)) { docFreq++; _sumDocFreq++; UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length, _scratchUtf16); int docId = ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length); visitedDocs.Set(docId); } else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ)) { UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16); totalTermFreq += ArrayUtil.ParseInt(_scratchUtf16.Chars, 0, _scratchUtf16.Length); } else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM)) { if (lastDocsStart != -1) { b.Add(Util.ToIntsRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq))); } lastDocsStart = input.FilePointer; int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length; if (len > lastTerm.Length) { lastTerm.Grow(len); } Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len); lastTerm.Length = len; docFreq = 0; _sumTotalTermFreq += totalTermFreq; totalTermFreq = 0; _termCount++; } } _docCount = visitedDocs.Cardinality(); _fst = b.Finish(); }
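// Reading a term's metadata back out of the finished FST mirrors the nesting above:
// the outer pair holds the docs start pointer, the inner pair docFreq and
// totalTermFreq. A hedged sketch ("term" is an assumed BytesRef in scope, and the
// Output1/Output2 member casing varies by port):
var lookupScratch = new IntsRef();
var entry = Util.Get(_fst, Util.ToIntsRef(term, lookupScratch)); // null if term absent
if (entry != null)
{
    long docsStart = entry.Output1.GetValueOrDefault();
    long docFreq = entry.Output2.Output1.GetValueOrDefault();
    long totalTermFreq = entry.Output2.Output2.GetValueOrDefault();
}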
public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<FSTTermOutputs.TermData> fst) { FieldInfo = fieldInfo; NumTerms = numTerms; SumTotalTermFreq = sumTotalTermFreq; SumDocFreq = sumDocFreq; DocCount = docCount; LongsSize = longsSize; Dict = fst; }
internal FSTTermsEnum(FST <long?> fst) { this.Fst = fst; @in = new BytesRefFSTEnum <long?>(fst); BytesReader = fst.BytesReader; }
public virtual void Test() { int[] ints = new int[7]; IntsRef input = new IntsRef(ints, 0, ints.Length); int seed = Random().Next(); Directory dir = new MMapDirectory(CreateTempDir("2BFST")); for (int doPackIter = 0; doPackIter < 2; doPackIter++) { bool doPack = doPackIter == 1; // Build FST w/ NoOutputs and stop when nodeCount > 2.2B if (!doPack) { Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); Outputs<object> outputs = NoOutputs.Singleton; object NO_OUTPUT = outputs.NoOutput; Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15); int count = 0; Random r = new Random(seed); int[] ints2 = new int[200]; IntsRef input2 = new IntsRef(ints2, 0, ints2.Length); while (true) { //System.out.println("add: " + input + " -> " + output); for (int i = 10; i < ints2.Length; i++) { ints2[i] = r.Next(256); } b.Add(input2, NO_OUTPUT); count++; if (count % 100000 == 0) { Console.WriteLine(count + ": " + b.FstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes"); } if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024) { break; } NextInput(r, ints2); } FST<object> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); Arrays.Fill(ints2, 0); r = new Random(seed); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } for (int j = 10; j < ints2.Length; j++) { ints2[j] = r.Next(256); } Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2)); NextInput(r, ints2); } Console.WriteLine("\nTEST: enum all input/outputs"); IntsRefFSTEnum<object> fstEnum = new IntsRefFSTEnum<object>(fst); Arrays.Fill(ints2, 0); r = new Random(seed); int upto = 0; while (true) { IntsRefFSTEnum<object>.InputOutput<object> pair = fstEnum.Next(); if (pair == null) { break; } for (int j = 10; j < ints2.Length; j++) { ints2[j] = r.Next(256); } Assert.AreEqual(input2, pair.Input); Assert.AreEqual(NO_OUTPUT, pair.Output); upto++; NextInput(r, ints2); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST<object>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } // Build FST w/ ByteSequenceOutputs and stop when FST // size = 3GB { Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes"); Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton; Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15); var outputBytes = new byte[20]; BytesRef output = new BytesRef(outputBytes); Arrays.Fill(ints, 0); int count = 0; Random r = new Random(seed); while (true) { r.NextBytes(outputBytes); //System.out.println("add: " + input + " -> " + output); b.Add(input, BytesRef.DeepCopyOf(output)); count++; if (count % 1000000 == 0) { Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes"); } if (b.FstSizeInBytes() > LIMIT) { break; } NextInput(r, ints); } FST<BytesRef> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); r = new Random(seed); 
Arrays.Fill(ints, 0); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } r.NextBytes((byte[])(Array)outputBytes); Assert.AreEqual(output, Util.Get(fst, input)); NextInput(r, ints); } Console.WriteLine("\nTEST: enum all input/outputs"); IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst); Arrays.Fill(ints, 0); r = new Random(seed); int upto = 0; while (true) { IntsRefFSTEnum<BytesRef>.InputOutput<BytesRef> pair = fstEnum.Next(); if (pair == null) { break; } Assert.AreEqual(input, pair.Input); r.NextBytes((byte[])(Array)outputBytes); Assert.AreEqual(output, pair.Output); upto++; NextInput(r, ints); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST<BytesRef>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } // Build FST w/ PositiveIntOutputs and stop when FST // size = 3GB { Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long"); Outputs<long?> outputs = PositiveIntOutputs.Singleton; Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInts.COMPACT, true, 15); long output = 1; Arrays.Fill(ints, 0); int count = 0; Random r = new Random(seed); while (true) { //System.out.println("add: " + input + " -> " + output); b.Add(input, output); output += 1 + r.Next(10); count++; if (count % 1000000 == 0) { Console.WriteLine(count + "...: " + b.FstSizeInBytes() + " bytes"); } if (b.FstSizeInBytes() > LIMIT) { break; } NextInput(r, ints); } FST<long?> fst = b.Finish(); for (int verify = 0; verify < 2; verify++) { Console.WriteLine("\nTEST: now verify [fst size=" + fst.SizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]"); Arrays.Fill(ints, 0); output = 1; r = new Random(seed); for (int i = 0; i < count; i++) { if (i % 1000000 == 0) { Console.WriteLine(i + "...: "); } // forward lookup: Assert.AreEqual(output, (long)Util.Get(fst, input)); // reverse lookup: Assert.AreEqual(input, Util.GetByOutput(fst, output)); output += 1 + r.Next(10); NextInput(r, ints); } Console.WriteLine("\nTEST: enum all input/outputs"); IntsRefFSTEnum<long?> fstEnum = new IntsRefFSTEnum<long?>(fst); Arrays.Fill(ints, 0); r = new Random(seed); int upto = 0; output = 1; while (true) { IntsRefFSTEnum<long?>.InputOutput<long?> pair = fstEnum.Next(); if (pair == null) { break; } Assert.AreEqual(input, pair.Input); Assert.AreEqual(output, pair.Output.Value); output += 1 + r.Next(10); upto++; NextInput(r, ints); } Assert.AreEqual(count, upto); if (verify == 0) { Console.WriteLine("\nTEST: save/load FST and re-verify"); IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT); fst.Save(@out); @out.Dispose(); IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT); fst = new FST<long?>(@in, outputs); @in.Dispose(); } else { dir.DeleteFile("fst"); } } } } dir.Dispose(); }
public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) { this.fst = fst; this.words = words; this.maxHorizontalContext = maxHorizontalContext; }
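// SynonymMap instances are normally produced by SynonymMap.Builder, which
// constructs the FST and the BytesRefHash for you; a hedged sketch (Builder,
// Add and Build per the Lucene 4.x API; multi-word entries additionally need the
// builder's word-separator handling, omitted here for brevity):
var synBuilder = new SynonymMap.Builder(true /* dedup */);
synBuilder.Add(new CharsRef("usa"), new CharsRef("america"), true /* includeOrig */);
SynonymMap synonyms = synBuilder.Build();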