/// <summary>
/// Enumerates all minimal prefix paths in the automaton that also intersect the <see cref="FST"/>,
/// accumulating the <see cref="FST"/> end node and output for each path.
/// </summary>
/// <param name="a">Deterministic automaton whose prefix paths are walked in lock-step with <paramref name="fst"/>.</param>
/// <param name="fst">The FST to intersect with.</param>
/// <returns>All accepted paths; each carries the FST arc reached, the accumulated output and the input consumed so far.</returns>
public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
{
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(a.IsDeterministic);
    }
    IList<Path<T>> queue = new List<Path<T>>();
    List<Path<T>> endNodes = new List<Path<T>>();
    // Seed with the automaton's initial state paired with the FST's root arc and empty input/output.
    queue.Add(new Path<T>(a.GetInitialState(), fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new Int32sRef()));

    FST.Arc<T> scratchArc = new FST.Arc<T>();
    FST.BytesReader fstReader = fst.GetBytesReader();

    while (queue.Count != 0)
    {
        // Pop the last element directly. The previous code called queue.Remove(path),
        // which scans the list from the front (O(n)) to delete the element we already
        // know is at the end; RemoveAt is O(1) here and cannot match a wrong element.
        Path<T> path = queue[queue.Count - 1];
        queue.RemoveAt(queue.Count - 1);

        if (path.State.Accept)
        {
            endNodes.Add(path);
            // we can stop here if we accept this path,
            // we accept all further paths too
            continue;
        }

        Int32sRef currentInput = path.Input;
        foreach (Transition t in path.State.GetTransitions())
        {
            int min = t.Min;
            int max = t.Max;
            if (min == max)
            {
                // Single-label transition: one direct target-arc lookup suffices.
                FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.FstNode, scratchArc, fstReader);
                if (nextArc != null)
                {
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = t.Min;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                }
            }
            else
            {
                // TODO: if this transition's TO state is accepting, and
                // it accepts the entire range possible in the FST (ie. 0 to 255),
                // we can simply use the prefix as the accepted state instead of
                // looking up all the ranges and terminate early
                // here.  This just shifts the work from one queue
                // (this one) to another (the completion search
                // done in AnalyzingSuggester).

                // Range transition: walk the FST arcs from the smallest label >= min
                // up to max, enqueueing one extended path per matching arc.
                FST.Arc<T> nextArc = Lucene.Net.Util.Fst.Util.ReadCeilArc(min, fst, path.FstNode, scratchArc, fstReader);
                while (nextArc != null && nextArc.Label <= max)
                {
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label <= max);
                    }
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc.Label >= min, "{0} {1}", nextArc.Label, min);
                    }
                    Int32sRef newInput = new Int32sRef(currentInput.Length + 1);
                    newInput.CopyInt32s(currentInput);
                    newInput.Int32s[currentInput.Length] = nextArc.Label;
                    newInput.Length = currentInput.Length + 1;
                    queue.Add(new Path<T>(t.Dest, new FST.Arc<T>().CopyFrom(nextArc), fst.Outputs.Add(path.Output, nextArc.Output), newInput));
                    int label = nextArc.Label; // used in assert
                    nextArc = nextArc.IsLast ? null : fst.ReadNextRealArc(nextArc, fstReader);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(nextArc == null || label < nextArc.Label, "last: {0} next: {1}", label, nextArc?.Label);
                    }
                }
            }
        }
    }
    return endNodes;
}
/// <summary>
/// Builds a user dictionary from CSV input read from <paramref name="reader"/>.
/// Each non-empty line is parsed as (text, segmentation, readings, POS); entries are
/// sorted by surface text and compiled into an FST mapping surface form -> entry ordinal.
/// </summary>
/// <param name="reader">Source of CSV dictionary lines; not disposed by this constructor.</param>
public UserDictionary(TextReader reader)
{
    string line = null;
    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    List<string[]> featureEntries = new List<string[]>(); // text, segmentation, readings, POS
    while ((line = reader.ReadLine()) != null)
    {
        // Remove comments
        line = specialChars.Replace(line, "");
        // Skip empty lines or comment lines
        if (line.Trim().Length == 0)
        {
            continue;
        }
        string[] values = CSVUtil.Parse(line);
        featureEntries.Add(values);
    }

    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if its needed/useful?
    featureEntries.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));

    List<string> data = new List<string>(featureEntries.Count);
    List<int[]> segmentations = new List<int[]>(featureEntries.Count);

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
    Int32sRef scratch = new Int32sRef();
    long ord = 0; // FST output: ordinal of the entry in sorted order

    foreach (string[] values in featureEntries)
    {
        // Segmentation and readings are whitespace-separated lists; they must be parallel.
        string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
        string[] readings = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
        string pos = values[3];

        if (segmentation.Length != readings.Length)
        {
            throw new Exception("Illegal user dictionary entry " + values[0] +
                                " - the number of segmentations (" + segmentation.Length + ")" +
                                " does not the match number of readings (" + readings.Length + ")");
        }

        int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
        wordIdAndLength[0] = wordId;
        for (int i = 0; i < segmentation.Length; i++)
        {
            wordIdAndLength[i + 1] = segmentation[i].Length;
            // One feature record per segment: reading + POS, joined by the internal separator.
            data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
            wordId++;
        }
        // add mapping to FST
        string token = values[0];
        scratch.Grow(token.Length);
        scratch.Length = token.Length;
        for (int i = 0; i < token.Length; i++)
        {
            // UTF-16 code units are used directly as FST labels (INPUT_TYPE.BYTE2).
            scratch.Int32s[i] = (int)token[i];
        }
        fstBuilder.Add(scratch, ord);
        segmentations.Add(wordIdAndLength);
        ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
    this.data = data.ToArray(/*new string[data.Count]*/);
    this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
}
/// <summary>
/// Collects the strings producible from state <paramref name="s"/> into <paramref name="strings"/>.
/// Returns <c>false</c> when a cycle is reached (infinitely many strings) or when more than
/// <paramref name="limit"/> strings are found; a negative <paramref name="limit"/> means "no limit".
/// </summary>
private static bool GetFiniteStrings(State s, HashSet<State> pathstates, HashSet<Int32sRef> strings, Int32sRef path, int limit)
{
    // Mark the current state as being on the active DFS path for cycle detection.
    pathstates.Add(s);
    foreach (Transition transition in s.GetTransitions())
    {
        State dest = transition.to;
        if (pathstates.Contains(dest))
        {
            // Reached a state already on the current path: the language is infinite.
            return false;
        }
        for (int label = transition.min; label <= transition.max; label++)
        {
            // Push this label onto the path being built.
            path.Grow(path.Length + 1);
            path.Int32s[path.Length] = label;
            path.Length++;
            if (dest.accept)
            {
                strings.Add(Int32sRef.DeepCopyOf(path));
                if (limit >= 0 && strings.Count > limit)
                {
                    return false;
                }
            }
            bool withinLimit = GetFiniteStrings(dest, pathstates, strings, path, limit);
            if (!withinLimit)
            {
                return false;
            }
            // Pop the label before trying the next one.
            path.Length--;
        }
    }
    pathstates.Remove(s);
    return true;
}
/// <summary>
/// Generates a list of stems for the provided word
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> length </param>
/// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
/// <param name="recursionDepth"> current recursiondepth </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
///        if we are removing a suffix, and it has no continuation requirements, its ok.
///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
///        this means inner most suffix must also contain circumfix flag. </param>
/// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant)
{
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        FST<Int32sRef> fst = dictionary.prefixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = prefixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        // If fullStrip is not allowed, at least one character must remain after stripping.
        int limit = dictionary.fullStrip ? length : length - 1;
        for (int i = 0; i < limit; i++)
        {
            if (i > 0)
            {
                // Advance the FST by one more character of the candidate prefix.
                int ch = word[i - 1];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }
            Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                prefixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < prefixes.Length; j++)
            {
                int prefix = prefixes.Int32s[prefixes.Offset + j];
                if (prefix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal, condition(+crossProduct bit), append flag.
                affixReader.Position = 8 * prefix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the affix removed and the strip characters restored.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
                    stems.AddRange(stemList);
                }
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        FST<Int32sRef> fst = dictionary.suffixes;
        Outputs<Int32sRef> outputs = fst.Outputs;
        FST.BytesReader bytesReader = suffixReaders[recursionDepth];
        FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth];
        fst.GetFirstArc(arc);
        Int32sRef NO_OUTPUT = outputs.NoOutput;
        Int32sRef output = NO_OUTPUT;
        // If fullStrip is not allowed, at least one character must remain after stripping.
        int limit = dictionary.fullStrip ? 0 : 1;
        for (int i = length; i >= limit; i--)
        {
            if (i < length)
            {
                // Advance the FST by one more character of the candidate suffix (read right-to-left).
                int ch = word[i];
                if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null)
                {
                    break;
                }
                else if (arc.Output != NO_OUTPUT)
                {
                    output = fst.Outputs.Add(output, arc.Output);
                }
            }
            Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment
            if (!arc.IsFinal)
            {
                continue;
            }
            else
            {
                suffixes = fst.Outputs.Add(output, arc.NextFinalOutput);
            }

            for (int j = 0; j < suffixes.Length; j++)
            {
                int suffix = suffixes.Int32s[suffixes.Offset + j];
                if (suffix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal, condition(+crossProduct bit), append flag.
                affixReader.Position = 8 * suffix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    if (dictionary.onlyincompound == -1)
                    {
                        compatible = true;
                    }
                    else
                    {
                        // check if affix is allowed in a non-compound word
                        dictionary.flagLookup.Get(append, scratch);
                        char[] appendFlags = Dictionary.DecodeFlags(scratch);
                        compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    }
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    bool allowed = dictionary.onlyincompound == -1 || !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound);
                    // BUGFIX: 'allowed' was computed but never applied here (unlike the
                    // prefix branch above), silently skipping the ONLYINCOMPOUND check
                    // for suffixes. Combine it with the cross-check, matching upstream.
                    compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the suffix removed and the strip characters appended.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
                    stems.AddRange(stemList);
                }
            }
        }
    }

    return stems;
}
/// <summary>
/// Resolves the facet fields grouped under each index field to taxonomy ordinals,
/// adds the drill-down terms for every path prefix, and stores the deduplicated,
/// encoded ordinals as a binary doc-values field on <paramref name="doc"/>.
/// </summary>
private void ProcessFacetFields(ITaxonomyWriter taxoWriter, IDictionary<string, IList<FacetField>> byField, Document doc)
{
    foreach (KeyValuePair<string, IList<FacetField>> fieldGroup in byField)
    {
        string indexFieldName = fieldGroup.Key;
        Int32sRef ords = new Int32sRef(32);

        foreach (FacetField facetField in fieldGroup.Value)
        {
            FacetsConfig.DimConfig dimConfig = GetDimConfig(facetField.Dim);
            if (facetField.Path.Length > 1 && !dimConfig.IsHierarchical)
            {
                throw new System.ArgumentException("dimension \"" + facetField.Dim + "\" is not hierarchical yet has " + facetField.Path.Length + " components");
            }

            FacetLabel label = new FacetLabel(facetField.Dim, facetField.Path);

            CheckTaxoWriter(taxoWriter);
            int ordinal = taxoWriter.AddCategory(label);

            // Append the category ordinal, growing the buffer when full.
            if (ords.Length == ords.Int32s.Length)
            {
                ords.Grow(ords.Length + 1);
            }
            ords.Int32s[ords.Length++] = ordinal;

            if (dimConfig.IsMultiValued && (dimConfig.IsHierarchical || dimConfig.RequireDimCount))
            {
                // Add all parents too:
                for (int parent = taxoWriter.GetParent(ordinal); parent > 0; parent = taxoWriter.GetParent(parent))
                {
                    if (ords.Int32s.Length == ords.Length)
                    {
                        ords.Grow(ords.Length + 1);
                    }
                    ords.Int32s[ords.Length++] = parent;
                }

                if (!dimConfig.RequireDimCount)
                {
                    // Remove last (dimension) ord:
                    ords.Length--;
                }
            }

            // Drill down: one term per path prefix.
            for (int i = 1; i <= label.Length; i++)
            {
                doc.Add(new StringField(indexFieldName, PathToString(label.Components, i), Field.Store.NO));
            }
        }

        // Facet counts:
        // DocValues are considered stored fields:
        doc.Add(new BinaryDocValuesField(indexFieldName, DedupAndEncode(ords)));
    }
}
// ================================================= Helper Methods ================================================

/// <summary>
/// Generates a list of stems for the provided word
/// </summary>
/// <param name="word"> Word to generate the stems for </param>
/// <param name="length"> length </param>
/// <param name="previous"> previous affix that was removed (so we dont remove same one twice) </param>
/// <param name="prevFlag"> Flag from a previous stemming step that need to be cross-checked with any affixes in this recursive step </param>
/// <param name="prefixFlag"> flag of the most inner removed prefix, so that when removing a suffix, its also checked against the word </param>
/// <param name="recursionDepth"> current recursiondepth </param>
/// <param name="doPrefix"> true if we should remove prefixes </param>
/// <param name="doSuffix"> true if we should remove suffixes </param>
/// <param name="previousWasPrefix"> true if the previous removal was a prefix:
///        if we are removing a suffix, and it has no continuation requirements, its ok.
///        but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
///        this means inner most suffix must also contain circumfix flag. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns>
private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix)
{
    // TODO: allow this stuff to be reused by tokenfilter
    List<CharsRef> stems = new List<CharsRef>();

    if (doPrefix && dictionary.prefixes != null)
    {
        // Try every prefix of the word, longest first.
        for (int i = length - 1; i >= 0; i--)
        {
            Int32sRef prefixes = dictionary.LookupPrefix(word, 0, i);
            if (prefixes == null)
            {
                continue;
            }

            for (int j = 0; j < prefixes.Length; j++)
            {
                int prefix = prefixes.Int32s[prefixes.Offset + j];
                if (prefix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal, condition (+crossProduct bit), append flag.
                affixReader.Position = 8 * prefix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    compatible = true;
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, false);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int deAffixedStart = i;
                    int deAffixedLength = length - deAffixedStart;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the prefix removed and the strip characters restored.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
                    Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix);
                    stems.AddRange(stemList);
                }
            }
        }
    }

    if (doSuffix && dictionary.suffixes != null)
    {
        // Try every suffix of the word, longest first.
        for (int i = 0; i < length; i++)
        {
            Int32sRef suffixes = dictionary.LookupSuffix(word, i, length - i);
            if (suffixes == null)
            {
                continue;
            }

            for (int j = 0; j < suffixes.Length; j++)
            {
                int suffix = suffixes.Int32s[suffixes.Offset + j];
                if (suffix == previous)
                {
                    continue;
                }
                // Each affix record is 8 bytes: flag, strip ordinal, condition (+crossProduct bit), append flag.
                affixReader.Position = 8 * suffix;
                char flag = (char)(affixReader.ReadInt16() & 0xffff);
                char stripOrd = (char)(affixReader.ReadInt16() & 0xffff);
                int condition = (char)(affixReader.ReadInt16() & 0xffff);
                bool crossProduct = (condition & 1) == 1;
                condition = condition.TripleShift(1);
                char append = (char)(affixReader.ReadInt16() & 0xffff);

                bool compatible;
                if (recursionDepth == 0)
                {
                    compatible = true;
                }
                else if (crossProduct)
                {
                    // cross check incoming continuation class (flag of previous affix) against list.
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(prevFlag >= 0);
                    }
                    compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
                }
                else
                {
                    compatible = false;
                }

                if (compatible)
                {
                    int appendLength = length - i;
                    int deAffixedLength = length - appendLength;

                    int stripStart = dictionary.stripOffsets[stripOrd];
                    int stripEnd = dictionary.stripOffsets[stripOrd + 1];
                    int stripLength = stripEnd - stripStart;

                    if (!CheckCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength))
                    {
                        continue;
                    }

                    // Rebuild the word with the suffix removed and the strip characters appended.
                    char[] strippedWord = new char[stripLength + deAffixedLength];
                    Array.Copy(word, 0, strippedWord, 0, deAffixedLength);
                    Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);

                    IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix);
                    stems.AddRange(stemList);
                }
            }
        }
    }

    return stems;
}
/// <summary>
/// Hook invoked by the builder to freeze (compile) pending nodes in <paramref name="frontier"/>.
/// NOTE(review): semantics are implementation-defined; the parameter names suggest it freezes
/// the tail beyond the prefix of length <paramref name="prefixLenPlus1"/> - 1 shared with
/// <paramref name="prevInput"/> — confirm against the concrete implementation.
/// </summary>
/// <param name="frontier">Pending (un-compiled) nodes.</param>
/// <param name="prefixLenPlus1">Length of the shared input prefix, plus one.</param>
/// <param name="prevInput">The previously added input.</param>
public abstract void Freeze(UnCompiledNode<S>[] frontier, int prefixLenPlus1, Int32sRef prevInput);
/// <summary>
/// Creates a pair holding the given <paramref name="input"/> and its associated <paramref name="output"/>.
/// </summary>
public InputOutput(Int32sRef input, T1 output)
{
    Input = input;
    Output = output;
}
// FST is complete
/// <summary>
/// Verifies an un-pruned <paramref name="fst"/> against all expected input/output
/// <c>Pairs</c>: every pair is accepted with the right output; <see cref="Int32sRefFSTEnum{T}"/>
/// enumerates them in order; reverse lookup by output works (when <c>DoReverseLookup</c>);
/// random accepted words are valid; and SeekExact/SeekFloor/SeekCeil behave correctly,
/// both standalone and mixed with Next().
/// </summary>
private void VerifyUnPruned(int inputMode, FST<T> fst)
{
    FST<long?> fstLong;
    ISet<long?> validOutputs;
    long minLong = long.MaxValue;
    long maxLong = long.MinValue;

    if (DoReverseLookup)
    {
        // Collect the output range and the set of valid outputs for GetByOutput checks below.
        FST<long?> fstLong0 = fst as FST<long?>;
        fstLong = fstLong0;
        validOutputs = new HashSet<long?>();
        foreach (InputOutput<T> pair in Pairs)
        {
            long? output = pair.Output as long?;
            maxLong = Math.Max(maxLong, output.Value);
            minLong = Math.Min(minLong, output.Value);
            validOutputs.Add(output.Value);
        }
    }
    else
    {
        fstLong = null;
        validOutputs = null;
    }

    // An empty input set must have produced no FST at all.
    if (Pairs.Count == 0)
    {
        Assert.IsNull(fst);
        return;
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now verify " + Pairs.Count + " terms");
        foreach (InputOutput<T> pair in Pairs)
        {
            Assert.IsNotNull(pair);
            Assert.IsNotNull(pair.Input);
            Assert.IsNotNull(pair.Output);
            Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + Outputs.OutputToString(pair.Output));
        }
    }

    Assert.IsNotNull(fst);

    // visit valid pairs in order -- make sure all words
    // are accepted, and FSTEnum's next() steps through
    // them correctly
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: check valid terms/next()");
    }
    {
        Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst);
        foreach (InputOutput<T> pair in Pairs)
        {
            Int32sRef term = pair.Input;
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: check term=" + InputToString(inputMode, term) + " output=" + fst.Outputs.OutputToString(pair.Output));
            }
            T output = Run(fst, term, null);
            Assert.IsNotNull(output, "term " + InputToString(inputMode, term) + " is not accepted");
            Assert.IsTrue(OutputsEqual(pair.Output, output));

            // verify enum's next
            Int32sRefFSTEnum.InputOutput<T> t = fstEnum.Next();
            Assert.IsNotNull(t);
            Assert.AreEqual(term, t.Input, "expected input=" + InputToString(inputMode, term) + " but fstEnum returned " + InputToString(inputMode, t.Input));
            Assert.IsTrue(OutputsEqual(pair.Output, t.Output));
        }
        // The enum must be exhausted after all pairs.
        Assert.IsNull(fstEnum.Next());
    }

    IDictionary<Int32sRef, T> termsMap = new Dictionary<Int32sRef, T>();
    foreach (InputOutput<T> pair in Pairs)
    {
        termsMap[pair.Input] = pair.Output;
    }

    if (DoReverseLookup && maxLong > minLong)
    {
        // Do random lookups so we test null (output doesn't
        // exist) case:
        Assert.IsNull(Util.GetByOutput(fstLong, minLong - 7));
        Assert.IsNull(Util.GetByOutput(fstLong, maxLong + 7));

        int num = LuceneTestCase.AtLeast(Random, 100);
        for (int iter = 0; iter < num; iter++)
        {
            long v = TestUtil.NextLong(Random, minLong, maxLong);
            Int32sRef input = Util.GetByOutput(fstLong, v);
            Assert.IsTrue(validOutputs.Contains(v) || input == null);
        }
    }

    // find random matching word and make sure it's valid
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify random accepted terms");
    }
    Int32sRef scratch = new Int32sRef(10);
    int num_ = LuceneTestCase.AtLeast(Random, 500);
    for (int iter = 0; iter < num_; iter++)
    {
        T output = RandomAcceptedWord(fst, scratch);
        Assert.IsTrue(termsMap.ContainsKey(scratch), "accepted word " + InputToString(inputMode, scratch) + " is not valid");
        Assert.IsTrue(OutputsEqual(termsMap[scratch], output));

        if (DoReverseLookup)
        {
            //System.out.println("lookup output=" + output + " outs=" + fst.Outputs);
            Int32sRef input = Util.GetByOutput(fstLong, (output as long?).Value);
            Assert.IsNotNull(input);
            //System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
            Assert.AreEqual(scratch, input);
        }
    }

    // test IntsRefFSTEnum.Seek:
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify seek");
    }
    Int32sRefFSTEnum<T> fstEnum_ = new Int32sRefFSTEnum<T>(fst);
    num_ = LuceneTestCase.AtLeast(Random, 100);
    for (int iter = 0; iter < num_; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" iter=" + iter);
        }
        if (Random.NextBoolean())
        {
            // seek to term that doesn't exist:
            while (true)
            {
                Int32sRef term = ToIntsRef(GetRandomString(Random), inputMode);
                int pos = Pairs.BinarySearch(new InputOutput<T>(term, default(T)));
                if (pos < 0)
                {
                    // Convert the BinarySearch result to the insertion point (index of the ceiling term).
                    pos = -(pos + 1);
                    // ok doesn't exist
                    //System.out.println(" seek " + inputToString(inputMode, term));
                    Int32sRefFSTEnum.InputOutput<T> seekResult;
                    if (Random.Next(3) == 0)
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekExact term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekExact(term);
                        // seekExact on a missing term must return null; pos = -1 routes into the null branch below.
                        pos = -1;
                    }
                    else if (Random.NextBoolean())
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekFloor term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekFloor(term);
                        // Floor of a missing term is the entry just before the insertion point.
                        pos--;
                    }
                    else
                    {
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" do non-exist seekCeil term=" + InputToString(inputMode, term));
                        }
                        seekResult = fstEnum_.SeekCeil(term);
                    }

                    if (pos != -1 && pos < Pairs.Count)
                    {
                        //System.out.println("    got " + inputToString(inputMode,seekResult.input) + " output=" + fst.Outputs.outputToString(seekResult.Output));
                        Assert.IsNotNull(seekResult, "got null but expected term=" + InputToString(inputMode, Pairs[pos].Input));
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" got " + InputToString(inputMode, seekResult.Input));
                        }
                        Assert.AreEqual(Pairs[pos].Input, seekResult.Input, "expected " + InputToString(inputMode, Pairs[pos].Input) + " but got " + InputToString(inputMode, seekResult.Input));
                        Assert.IsTrue(OutputsEqual(Pairs[pos].Output, seekResult.Output));
                    }
                    else
                    {
                        // seeked before start or beyond end
                        //System.out.println("seek=" + seekTerm);
                        Assert.IsNull(seekResult, "expected null but got " + (seekResult == null ? "null" : InputToString(inputMode, seekResult.Input)));
                        if (LuceneTestCase.VERBOSE)
                        {
                            Console.WriteLine(" got null");
                        }
                    }

                    break;
                }
            }
        }
        else
        {
            // seek to term that does exist:
            InputOutput<T> pair = Pairs[Random.Next(Pairs.Count)];
            Int32sRefFSTEnum.InputOutput<T> seekResult;
            if (Random.Next(3) == 2)
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekExact term=" + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekExact(pair.Input);
            }
            else if (Random.NextBoolean())
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekFloor " + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekFloor(pair.Input);
            }
            else
            {
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do exists seekCeil " + InputToString(inputMode, pair.Input));
                }
                seekResult = fstEnum_.SeekCeil(pair.Input);
            }
            Assert.IsNotNull(seekResult);
            Assert.AreEqual(pair.Input, seekResult.Input, "got " + InputToString(inputMode, seekResult.Input) + " but expected " + InputToString(inputMode, pair.Input));
            Assert.IsTrue(OutputsEqual(pair.Output, seekResult.Output));
        }
    }

    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: mixed next/seek");
    }

    // test mixed next/seek
    num_ = LuceneTestCase.AtLeast(Random, 100);
    for (int iter = 0; iter < num_; iter++)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine("TEST: iter " + iter);
        }
        // reset:
        fstEnum_ = new Int32sRefFSTEnum<T>(fst);
        int upto = -1;
        while (true)
        {
            bool isDone = false;
            if (upto == Pairs.Count - 1 || Random.NextBoolean())
            {
                // next
                upto++;
                if (LuceneTestCase.VERBOSE)
                {
                    Console.WriteLine(" do next");
                }
                isDone = fstEnum_.Next() == null;
            }
            else if (upto != -1 && upto < 0.75 * Pairs.Count && Random.NextBoolean())
            {
                // Try (up to 10 times) to find a random non-existent term beyond the current position,
                // then seekFloor/seekCeil to it and recompute 'upto' from its insertion point.
                int attempt = 0;
                for (; attempt < 10; attempt++)
                {
                    Int32sRef term = ToIntsRef(GetRandomString(Random), inputMode);
                    if (!termsMap.ContainsKey(term) && term.CompareTo(Pairs[upto].Input) > 0)
                    {
                        int pos = Pairs.BinarySearch(new InputOutput<T>(term, default(T)));
                        Debug.Assert(pos < 0);
                        upto = -(pos + 1);

                        if (Random.NextBoolean())
                        {
                            upto--;
                            Assert.IsTrue(upto != -1);
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine(" do non-exist seekFloor(" + InputToString(inputMode, term) + ")");
                            }
                            isDone = fstEnum_.SeekFloor(term) == null;
                        }
                        else
                        {
                            if (LuceneTestCase.VERBOSE)
                            {
                                Console.WriteLine(" do non-exist seekCeil(" + InputToString(inputMode, term) + ")");
                            }
                            isDone = fstEnum_.SeekCeil(term) == null;
                        }

                        break;
                    }
                }
                if (attempt == 10)
                {
                    continue;
                }
            }
            else
            {
                // Seek forward to a randomly chosen existing term.
                int inc = Random.Next(Pairs.Count - upto - 1);
                upto += inc;
                if (upto == -1)
                {
                    upto = 0;
                }

                if (Random.NextBoolean())
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekCeil(" + InputToString(inputMode, Pairs[upto].Input) + ")");
                    }
                    isDone = fstEnum_.SeekCeil(Pairs[upto].Input) == null;
                }
                else
                {
                    if (LuceneTestCase.VERBOSE)
                    {
                        Console.WriteLine(" do seekFloor(" + InputToString(inputMode, Pairs[upto].Input) + ")");
                    }
                    isDone = fstEnum_.SeekFloor(Pairs[upto].Input) == null;
                }
            }
            if (LuceneTestCase.VERBOSE)
            {
                if (!isDone)
                {
                    Console.WriteLine(" got " + InputToString(inputMode, fstEnum_.Current.Input));
                }
                else
                {
                    Console.WriteLine(" got null");
                }
            }

            if (upto == Pairs.Count)
            {
                Assert.IsTrue(isDone);
                break;
            }
            else
            {
                Assert.IsFalse(isDone);
                Assert.AreEqual(Pairs[upto].Input, fstEnum_.Current.Input);
                Assert.IsTrue(OutputsEqual(Pairs[upto].Output, fstEnum_.Current.Output));

                /*
                 * if (upto < pairs.size()-1) {
                 *   int tryCount = 0;
                 *   while(tryCount < 10) {
                 *     final IntsRef t = toIntsRef(getRandomString(), inputMode);
                 *     if (pairs.get(upto).input.compareTo(t) < 0) {
                 *       final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
                 *       if (LuceneTestCase.VERBOSE) {
                 *         System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
                 *       }
                 *       Assert.AreEqual(expected, fstEnum.beforeNext(t));
                 *       break;
                 *     }
                 *     tryCount++;
                 *   }
                 * }
                 */
            }
        }
    }
}
/// <summary>
/// Captures the FST, scratch buffers and per-document ordinal source used by this
/// anonymous <c>SortedSetDocValues</c> implementation.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry fstEntry, BinaryDocValues binaryDocValues, FST<long?> fst1, FST.BytesReader @in, FST.Arc<long?> arc, FST.Arc<long?> scratchArc1, Int32sRef intsRef, BytesRefFSTEnum<long?> bytesRefFstEnum, BytesRef @ref, ByteArrayDataInput byteArrayDataInput)
{
    this.entry = fstEntry;
    this.docToOrds = binaryDocValues;
    this.fst = fst1;
    this.@in = @in;
    this.firstArc = arc;
    this.scratchArc = scratchArc1;
    this.scratchInts = intsRef;
    this.fstEnum = bytesRefFstEnum;
    this.@ref = @ref;
    this.input = byteArrayDataInput;
}
/// <summary>
/// Stress test that builds three very large FSTs (one per output type: no outputs,
/// byte-sequence outputs, and positive-long outputs), verifies every entry by
/// lookup and by full enumeration, then round-trips each FST through save/load on
/// disk and verifies again. Inputs are generated deterministically from a captured
/// seed so each verification pass can replay the exact build sequence.
/// NOTE(review): LIMIT and NextInput are defined elsewhere in this class;
/// LIMIT presumably caps FST size near 3 GB — confirm against the class header.
/// </summary>
public virtual void Test()
{
    int[] ints = new int[7];
    Int32sRef input = new Int32sRef(ints, 0, ints.Length);
    int seed = Random.Next();
    Directory dir = new MMapDirectory(CreateTempDir("2BFST"));
    for (int doPackIter = 0; doPackIter < 2; doPackIter++)
    {
        bool doPack = doPackIter == 1;
        // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
        if (!doPack)
        {
            Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
            Outputs<object> outputs = NoOutputs.Singleton;
            object NO_OUTPUT = outputs.NoOutput;
            Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);
            int count = 0;
            Random r = new Random(seed);
            int[] ints2 = new int[200];
            Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                for (int i = 10; i < ints2.Length; i++)
                {
                    ints2[i] = r.Next(256);
                }
                b.Add(input2, NO_OUTPUT);
                count++;
                if (count % 100000 == 0)
                {
                    Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                }
                if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                {
                    break;
                }
                NextInput(r, ints2);
            }
            FST<object> fst = b.Finish();
            // Two passes: pass 0 verifies the in-memory FST then saves/loads it;
            // pass 1 verifies the reloaded copy.
            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");
                Arrays.Fill(ints2, 0);
                // Re-seed so the same pseudo-random inputs are regenerated for lookup.
                r = new Random(seed);
                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                    NextInput(r, ints2);
                }
                Console.WriteLine("\nTEST: enum all input/outputs");
                Int32sRefFSTEnum<object> fstEnum = new Int32sRefFSTEnum<object>(fst);
                Arrays.Fill(ints2, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    Int32sRefFSTEnum.InputOutput<object> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    for (int j = 10; j < ints2.Length; j++)
                    {
                        ints2[j] = r.Next(256);
                    }
                    Assert.AreEqual(input2, pair.Input);
                    Assert.AreEqual(NO_OUTPUT, pair.Output);
                    upto++;
                    NextInput(r, ints2);
                }
                Assert.AreEqual(count, upto);
                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<object>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }
        // Build FST w/ ByteSequenceOutputs and stop when FST
        // size = 3GB
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
            Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
            Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);
            var outputBytes = new byte[20];
            BytesRef output = new BytesRef(outputBytes);
            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                r.NextBytes(outputBytes);
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, BytesRef.DeepCopyOf(output));
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                }
                if (b.GetFstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }
            FST<BytesRef> fst = b.Finish();
            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");
                r = new Random(seed);
                Arrays.Fill(ints, 0);
                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    // Regenerate the same output bytes the builder stored for this input.
                    r.NextBytes(outputBytes);
                    Assert.AreEqual(output, Util.Get(fst, input));
                    NextInput(r, ints);
                }
                Console.WriteLine("\nTEST: enum all input/outputs");
                Int32sRefFSTEnum<BytesRef> fstEnum = new Int32sRefFSTEnum<BytesRef>(fst);
                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                while (true)
                {
                    Int32sRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    r.NextBytes(outputBytes);
                    Assert.AreEqual(output, pair.Output);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);
                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<BytesRef>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }
        // Build FST w/ PositiveIntOutputs and stop when FST
        // size = 3GB
        {
            Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
            Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
            Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);
            long output = 1;
            Arrays.Fill(ints, 0);
            int count = 0;
            Random r = new Random(seed);
            while (true)
            {
                //System.out.println("add: " + input + " -> " + output);
                b.Add(input, output);
                // Strictly increasing outputs make reverse (output -> input) lookup valid.
                output += 1 + r.Next(10);
                count++;
                if (count % 1000000 == 0)
                {
                    Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                }
                if (b.GetFstSizeInBytes() > LIMIT)
                {
                    break;
                }
                NextInput(r, ints);
            }
            FST<long?> fst = b.Finish();
            for (int verify = 0; verify < 2; verify++)
            {
                Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");
                Arrays.Fill(ints, 0);
                output = 1;
                r = new Random(seed);
                for (int i = 0; i < count; i++)
                {
                    if (i % 1000000 == 0)
                    {
                        Console.WriteLine(i + "...: ");
                    }
                    // forward lookup:
                    Assert.AreEqual(output, (long)Util.Get(fst, input));
                    // reverse lookup:
                    Assert.AreEqual(input, Util.GetByOutput(fst, output));
                    output += 1 + r.Next(10);
                    NextInput(r, ints);
                }
                Console.WriteLine("\nTEST: enum all input/outputs");
                Int32sRefFSTEnum<long?> fstEnum = new Int32sRefFSTEnum<long?>(fst);
                Arrays.Fill(ints, 0);
                r = new Random(seed);
                int upto = 0;
                output = 1;
                while (true)
                {
                    Int32sRefFSTEnum.InputOutput<long?> pair = fstEnum.Next();
                    if (pair == null)
                    {
                        break;
                    }
                    Assert.AreEqual(input, pair.Input);
                    Assert.AreEqual(output, pair.Output.Value);
                    output += 1 + r.Next(10);
                    upto++;
                    NextInput(r, ints);
                }
                Assert.AreEqual(count, upto);
                if (verify == 0)
                {
                    Console.WriteLine("\nTEST: save/load FST and re-verify");
                    IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                    fst.Save(@out);
                    @out.Dispose();
                    IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                    fst = new FST<long?>(@in, outputs);
                    @in.Dispose();
                }
                else
                {
                    dir.DeleteFile("fst");
                }
            }
        }
    }
    dir.Dispose();
}
/// <summary>
/// Builds an <see cref="SynonymMap"/> and returns it.
/// </summary>
/// <remarks>
/// For each input phrase the ordinals of its synonyms are written as VInts into a
/// scratch buffer, followed by a header VInt encoding (count &lt;&lt; 1) | includeOrig;
/// the header is then rotated to the front of the buffer before the entry is added
/// to the FST. Keys are sorted in UTF-8 order first, as required by the FST builder.
/// </remarks>
public virtual SynonymMap Build()
{
    ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
    // TODO: are we using the best sharing options?
    var builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
    BytesRef scratch = new BytesRef(64);
    ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
    // Used to skip duplicate ordinals within one entry (only when dedup is enabled).
    ISet<int?> dedupSet;
    if (dedup)
    {
        dedupSet = new JCG.HashSet<int?>();
    }
    else
    {
        dedupSet = null;
    }
    // Temporary home for the header VInt while it is moved to the front (max VInt = 5 bytes).
    var spare = new byte[5];
    ICollection<CharsRef> keys = workingSet.Keys;
    CharsRef[] sortedKeys = new CharsRef[keys.Count];
    keys.CopyTo(sortedKeys, 0);
#pragma warning disable 612, 618
    System.Array.Sort(sortedKeys, CharsRef.UTF16SortedAsUTF8Comparer);
#pragma warning restore 612, 618
    Int32sRef scratchIntsRef = new Int32sRef();
    //System.out.println("fmap.build");
    for (int keyIdx = 0; keyIdx < sortedKeys.Length; keyIdx++)
    {
        CharsRef input = sortedKeys[keyIdx];
        MapEntry output = workingSet[input];
        int numEntries = output.ords.Count;
        // output size, assume the worst case
        int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
        scratch.Grow(estimatedSize);
        scratchOutput.Reset(scratch.Bytes, scratch.Offset, scratch.Bytes.Length);
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(scratch.Offset == 0);
        }
        // now write our output data:
        int count = 0;
        for (int i = 0; i < numEntries; i++)
        {
            if (dedupSet != null)
            {
                // box once
                int? ent = output.ords[i];
                if (dedupSet.Contains(ent))
                {
                    continue;
                }
                dedupSet.Add(ent);
            }
            scratchOutput.WriteVInt32(output.ords[i]);
            count++;
        }
        int pos = scratchOutput.Position;
        scratchOutput.WriteVInt32(count << 1 | (output.includeOrig ? 0 : 1));
        int pos2 = scratchOutput.Position;
        int vIntLen = pos2 - pos;
        // Move the count + includeOrig to the front of the byte[]:
        Array.Copy(scratch.Bytes, pos, spare, 0, vIntLen);
        Array.Copy(scratch.Bytes, 0, scratch.Bytes, vIntLen, pos);
        Array.Copy(spare, 0, scratch.Bytes, 0, vIntLen);
        if (dedupSet != null)
        {
            dedupSet.Clear();
        }
        scratch.Length = scratchOutput.Position - scratch.Offset;
        //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
        builder.Add(Lucene.Net.Util.Fst.Util.ToUTF32(input.ToString(), scratchIntsRef), BytesRef.DeepCopyOf(scratch));
    }
    FST<BytesRef> fst = builder.Finish();
    return (new SynonymMap(fst, words, maxHorizontalContext));
}
// FST is pruned
/// <summary>
/// Brute-force validates a pruned FST: tallies every prefix of every input with its
/// "common" output, applies the same prune1/prune2 thresholds the builder used, and
/// then asserts the FST contains exactly that pruned prefix set (checked both by
/// enumerating the FST and by running each surviving prefix through it).
/// </summary>
private void VerifyPruned(int inputMode, FST<T> fst, int prune1, int prune2)
{
    if (LuceneTestCase.Verbose)
    {
        Console.WriteLine("TEST: now verify pruned " + pairs.Count + " terms; outputs=" + outputs);
        foreach (InputOutput<T> pair in pairs)
        {
            Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + outputs.OutputToString(pair.Output));
        }
    }
    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.
    // NOTE: Crazy RAM intensive!!
    //System.out.println("TEST: tally prefixes");
    // build all prefixes
    // LUCENENET: We use ConcurrentDictionary<TKey, TValue> because Dictionary<TKey, TValue> doesn't support
    // deletion while iterating, but ConcurrentDictionary does.
    IDictionary<Int32sRef, CountMinOutput<T>> prefixes = new ConcurrentDictionary<Int32sRef, CountMinOutput<T>>();
    Int32sRef scratch = new Int32sRef(10);
    foreach (InputOutput<T> pair in pairs)
    {
        scratch.CopyInt32s(pair.Input);
        // Every prefix length of the input (including the empty prefix and the full input).
        for (int idx = 0; idx <= pair.Input.Length; idx++)
        {
            scratch.Length = idx;
            if (!prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo) || cmo == null)
            {
                cmo = new CountMinOutput<T>();
                cmo.Count = 1;
                cmo.Output = pair.Output;
                prefixes[Int32sRef.DeepCopyOf(scratch)] = cmo;
            }
            else
            {
                cmo.Count++;
                // Canonicalize to the NoOutput singleton before computing the common prefix.
                T output1 = cmo.Output;
                if (output1.Equals(outputs.NoOutput))
                {
                    output1 = outputs.NoOutput;
                }
                T output2 = pair.Output;
                if (output2.Equals(outputs.NoOutput))
                {
                    output2 = outputs.NoOutput;
                }
                cmo.Output = outputs.Common(output1, output2);
            }
            if (idx == pair.Input.Length)
            {
                cmo.IsFinal = true;
                cmo.FinalOutput = cmo.Output;
            }
        }
    }
    if (LuceneTestCase.Verbose)
    {
        Console.WriteLine("TEST: now prune");
    }
    // prune 'em
    using (var it = prefixes.GetEnumerator())
    {
        while (it.MoveNext())
        {
            var ent = it.Current;
            Int32sRef prefix = ent.Key;
            CountMinOutput<T> cmo = ent.Value;
            if (LuceneTestCase.Verbose)
            {
                Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal);
            }
            bool keep;
            if (prune1 > 0)
            {
                keep = cmo.Count >= prune1;
            }
            else
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(prune2 > 0);
                }
                if (prune2 > 1 && cmo.Count >= prune2)
                {
                    keep = true;
                }
                else if (prefix.Length > 0)
                {
                    // consult our parent
                    scratch.Length = prefix.Length - 1;
                    Array.Copy(prefix.Int32s, prefix.Offset, scratch.Int32s, 0, scratch.Length);
                    keep = prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo2) && cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1)));
                    //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
                }
                else if (cmo.Count >= prune2)
                {
                    keep = true;
                }
                else
                {
                    keep = false;
                }
            }
            if (!keep)
            {
                //it.remove();
                prefixes.Remove(ent);
                //System.out.println(" remove");
            }
            else
            {
                // clear isLeaf for all ancestors
                //System.out.println(" keep");
                scratch.CopyInt32s(prefix);
                scratch.Length--;
                while (scratch.Length >= 0)
                {
                    if (prefixes.TryGetValue(scratch, out CountMinOutput<T> cmo2) && cmo2 != null)
                    {
                        //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
                        cmo2.IsLeaf = false;
                    }
                    scratch.Length--;
                }
            }
        }
    }
    if (LuceneTestCase.Verbose)
    {
        Console.WriteLine("TEST: after prune");
        foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes)
        {
            Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal);
            if (ent.Value.IsFinal)
            {
                Console.WriteLine(" finalOutput=" + outputs.OutputToString(ent.Value.FinalOutput));
            }
        }
    }
    // Only the empty prefix survived => the builder should have produced no FST at all.
    if (prefixes.Count <= 1)
    {
        Assert.IsNull(fst);
        return;
    }
    Assert.IsNotNull(fst);
    // make sure FST only enums valid prefixes
    if (LuceneTestCase.Verbose)
    {
        Console.WriteLine("TEST: check pruned enum");
    }
    Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst);
    Int32sRefFSTEnum.InputOutput<T> current;
    while ((current = fstEnum.Next()) != null)
    {
        if (LuceneTestCase.Verbose)
        {
            Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + outputs.OutputToString(current.Output));
        }
        prefixes.TryGetValue(current.Input, out CountMinOutput<T> cmo);
        Assert.IsNotNull(cmo);
        Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal);
        //if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.IsFinal)
        {
            Assert.AreEqual(cmo.FinalOutput, current.Output);
        }
        else
        {
            Assert.AreEqual(cmo.Output, current.Output);
        }
    }
    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.Verbose)
    {
        Console.WriteLine("TEST: verify all prefixes");
    }
    int[] stopNode = new int[1];
    foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes)
    {
        if (ent.Key.Length > 0)
        {
            CountMinOutput<T> cmo = ent.Value;
            T output = Run(fst, ent.Key, stopNode);
            if (LuceneTestCase.Verbose)
            {
                Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + outputs.OutputToString(cmo.Output));
            }
            // if (cmo.isFinal && !cmo.isLeaf) {
            if (cmo.IsFinal)
            {
                Assert.AreEqual(cmo.FinalOutput, output);
            }
            else
            {
                Assert.AreEqual(cmo.Output, output);
            }
            Assert.AreEqual(ent.Key.Length, stopNode[0]);
        }
    }
}
/// <summary>
/// Encodes ordinals into a <see cref="BytesRef"/>; expert: subclass can
/// override this to change encoding.
/// </summary>
/// <remarks>
/// Sorts the ordinals in place, then writes the delta between consecutive distinct
/// ordinals as a most-significant-byte-first VInt (1-5 bytes, high bit set on every
/// byte except the last). Duplicate ordinals are skipped.
/// </remarks>
protected virtual BytesRef DedupAndEncode(Int32sRef ordinals)
{
    // Sorting makes duplicates adjacent and guarantees non-negative deltas.
    Array.Sort(ordinals.Int32s, ordinals.Offset, ordinals.Length);
    // Worst case: every ordinal needs the full 5-byte encoding.
    byte[] buffer = new byte[5 * ordinals.Length];
    int written = 0;
    int previousOrd = -1;
    for (int i = 0; i < ordinals.Length; i++)
    {
        int ord = ordinals.Int32s[ordinals.Offset + i];
        // ord could be == previousOrd, so we must dedup:
        if (ord <= previousOrd)
        {
            continue;
        }
        int delta = previousOrd == -1 ? ord : ord - previousOrd;
        if ((delta & ~0x7F) == 0)
        {
            // fits in 1 byte
            buffer[written++] = (byte)delta;
        }
        else if ((delta & ~0x3FFF) == 0)
        {
            // fits in 2 bytes
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x3F80) >> 7)));
            buffer[written++] = (byte)(delta & 0x7F);
        }
        else if ((delta & ~0x1FFFFF) == 0)
        {
            // fits in 3 bytes
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x1FC000) >> 14)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x3F80) >> 7)));
            buffer[written++] = (byte)(delta & 0x7F);
        }
        else if ((delta & ~0xFFFFFFF) == 0)
        {
            // fits in 4 bytes
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0xFE00000) >> 21)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x1FC000) >> 14)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x3F80) >> 7)));
            buffer[written++] = (byte)(delta & 0x7F);
        }
        else
        {
            // needs the full 5 bytes
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0xF0000000) >> 28)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0xFE00000) >> 21)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x1FC000) >> 14)));
            buffer[written++] = unchecked((byte)(0x80 | ((delta & 0x3F80) >> 7)));
            buffer[written++] = (byte)(delta & 0x7F);
        }
        previousOrd = ord;
    }
    return new BytesRef(buffer, 0, written);
}
/// <summary>
/// Stores the FST, its reader, and the reusable scratch objects used by the
/// enclosing anonymous <c>SortedDocValues</c> implementation. No other work is done.
/// </summary>
public SortedDocValuesAnonymousInnerClassHelper(FSTEntry entry, NumericDocValues docToOrd, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum<long?> fstEnum)
{
    // Data sources:
    this.entry = entry;
    this.docToOrd = docToOrd;
    this.fst = fst;
    this.@in = @in;
    this.fstEnum = fstEnum;
    // Reusable scratch objects:
    this.firstArc = firstArc;
    this.scratchArc = scratchArc;
    this.scratchInts = scratchInts;
}
/// <summary>
/// Convenience overload that delegates to the three-argument
/// <c>InputToString</c>, always passing <c>true</c> for the final flag.
/// </summary>
internal static string InputToString(int inputMode, Int32sRef term)
    => InputToString(inputMode, term, true);
/// <summary>
/// Stores the FST, its reader, and the reusable scratch/decoding objects used by the
/// enclosing anonymous <c>SortedSetDocValues</c> implementation. No other work is done.
/// </summary>
public SortedSetDocValuesAnonymousInnerClassHelper(FSTEntry entry, BinaryDocValues docToOrds, FST<long?> fst, FST.BytesReader @in, FST.Arc<long?> firstArc, FST.Arc<long?> scratchArc, Int32sRef scratchInts, BytesRefFSTEnum<long?> fstEnum, BytesRef @ref, ByteArrayDataInput input)
{
    // Data sources:
    this.entry = entry;
    this.docToOrds = docToOrds;
    this.fst = fst;
    this.@in = @in;
    this.fstEnum = fstEnum;
    // Reusable scratch/decoding objects:
    this.firstArc = firstArc;
    this.scratchArc = scratchArc;
    this.scratchInts = scratchInts;
    this.@ref = @ref;
    this.input = input;
}
// FST is pruned
/// <summary>
/// Brute-force validates a pruned FST: tallies every prefix of every input pair with
/// its "common" output, applies the prune1/prune2 thresholds the builder used, then
/// asserts the FST contains exactly the surviving prefix set (verified both by
/// enumerating the FST and by running each prefix through it).
/// NOTE(review): the reverse indexed loop below removes entries from a HashMap while
/// re-reading it via ElementAt(i); this relies on HashMap's iteration order staying
/// stable for the not-yet-visited portion after a removal — confirm against the
/// J2N/Lucene.NET HashMap implementation.
/// </summary>
private void VerifyPruned(int inputMode, FST<T> fst, int prune1, int prune2)
{
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now verify pruned " + Pairs.Count + " terms; outputs=" + Outputs);
        foreach (InputOutput<T> pair in Pairs)
        {
            Console.WriteLine(" " + InputToString(inputMode, pair.Input) + ": " + Outputs.OutputToString(pair.Output));
        }
    }
    // To validate the FST, we brute-force compute all prefixes
    // in the terms, matched to their "common" outputs, prune that
    // set according to the prune thresholds, then assert the FST
    // matches that same set.
    // NOTE: Crazy RAM intensive!!
    //System.out.println("TEST: tally prefixes");
    // build all prefixes
    IDictionary<Int32sRef, CountMinOutput<T>> prefixes = new HashMap<Int32sRef, CountMinOutput<T>>();
    Int32sRef scratch = new Int32sRef(10);
    foreach (InputOutput<T> pair in Pairs)
    {
        scratch.CopyInt32s(pair.Input);
        // Every prefix length, including the empty prefix and the full input.
        for (int idx = 0; idx <= pair.Input.Length; idx++)
        {
            scratch.Length = idx;
            CountMinOutput<T> cmo = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null;
            if (cmo == null)
            {
                cmo = new CountMinOutput<T>();
                cmo.Count = 1;
                cmo.Output = pair.Output;
                prefixes[Int32sRef.DeepCopyOf(scratch)] = cmo;
            }
            else
            {
                cmo.Count++;
                // Canonicalize to the NoOutput singleton before computing the common prefix.
                T output1 = cmo.Output;
                if (output1.Equals(Outputs.NoOutput))
                {
                    output1 = Outputs.NoOutput;
                }
                T output2 = pair.Output;
                if (output2.Equals(Outputs.NoOutput))
                {
                    output2 = Outputs.NoOutput;
                }
                cmo.Output = Outputs.Common(output1, output2);
            }
            if (idx == pair.Input.Length)
            {
                cmo.IsFinal = true;
                cmo.FinalOutput = cmo.Output;
            }
        }
    }
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: now prune");
    }
    // prune 'em
    // LUCENENET NOTE: Altered this a bit to go in reverse rather than use an enumerator since
    // in .NET you cannot delete records while enumerating forward through a dictionary.
    for (int i = prefixes.Count - 1; i >= 0; i--)
    {
        KeyValuePair<Int32sRef, CountMinOutput<T>> ent = prefixes.ElementAt(i);
        Int32sRef prefix = ent.Key;
        CountMinOutput<T> cmo = ent.Value;
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" term prefix=" + InputToString(inputMode, prefix, false) + " count=" + cmo.Count + " isLeaf=" + cmo.IsLeaf + " output=" + Outputs.OutputToString(cmo.Output) + " isFinal=" + cmo.IsFinal);
        }
        bool keep;
        if (prune1 > 0)
        {
            keep = cmo.Count >= prune1;
        }
        else
        {
            Debug.Assert(prune2 > 0);
            if (prune2 > 1 && cmo.Count >= prune2)
            {
                keep = true;
            }
            else if (prefix.Length > 0)
            {
                // consult our parent
                scratch.Length = prefix.Length - 1;
                Array.Copy(prefix.Int32s, prefix.Offset, scratch.Int32s, 0, scratch.Length);
                CountMinOutput<T> cmo2 = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null;
                //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
                keep = cmo2 != null && ((prune2 > 1 && cmo2.Count >= prune2) || (prune2 == 1 && (cmo2.Count >= 2 || prefix.Length <= 1)));
            }
            else if (cmo.Count >= prune2)
            {
                keep = true;
            }
            else
            {
                keep = false;
            }
        }
        if (!keep)
        {
            prefixes.Remove(prefix);
            //System.out.println(" remove");
        }
        else
        {
            // clear isLeaf for all ancestors
            //System.out.println(" keep");
            scratch.CopyInt32s(prefix);
            scratch.Length--;
            while (scratch.Length >= 0)
            {
                CountMinOutput<T> cmo2 = prefixes.ContainsKey(scratch) ? prefixes[scratch] : null;
                if (cmo2 != null)
                {
                    //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
                    cmo2.IsLeaf = false;
                }
                scratch.Length--;
            }
        }
    }
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: after prune");
        foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes)
        {
            Console.WriteLine(" " + InputToString(inputMode, ent.Key, false) + ": isLeaf=" + ent.Value.IsLeaf + " isFinal=" + ent.Value.IsFinal);
            if (ent.Value.IsFinal)
            {
                Console.WriteLine(" finalOutput=" + Outputs.OutputToString(ent.Value.FinalOutput));
            }
        }
    }
    // Only the empty prefix survived => the builder should have produced no FST at all.
    if (prefixes.Count <= 1)
    {
        Assert.IsNull(fst);
        return;
    }
    Assert.IsNotNull(fst);
    // make sure FST only enums valid prefixes
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: check pruned enum");
    }
    Int32sRefFSTEnum<T> fstEnum = new Int32sRefFSTEnum<T>(fst);
    Int32sRefFSTEnum.InputOutput<T> current;
    while ((current = fstEnum.Next()) != null)
    {
        if (LuceneTestCase.VERBOSE)
        {
            Console.WriteLine(" fstEnum.next prefix=" + InputToString(inputMode, current.Input, false) + " output=" + Outputs.OutputToString(current.Output));
        }
        CountMinOutput<T> cmo = prefixes.ContainsKey(current.Input) ? prefixes[current.Input] : null;
        Assert.IsNotNull(cmo);
        Assert.IsTrue(cmo.IsLeaf || cmo.IsFinal);
        //if (cmo.isFinal && !cmo.isLeaf) {
        if (cmo.IsFinal)
        {
            Assert.AreEqual(cmo.FinalOutput, current.Output);
        }
        else
        {
            Assert.AreEqual(cmo.Output, current.Output);
        }
    }
    // make sure all non-pruned prefixes are present in the FST
    if (LuceneTestCase.VERBOSE)
    {
        Console.WriteLine("TEST: verify all prefixes");
    }
    int[] stopNode = new int[1];
    foreach (KeyValuePair<Int32sRef, CountMinOutput<T>> ent in prefixes)
    {
        if (ent.Key.Length > 0)
        {
            CountMinOutput<T> cmo = ent.Value;
            T output = Run(fst, ent.Key, stopNode);
            if (LuceneTestCase.VERBOSE)
            {
                Console.WriteLine("TEST: verify prefix=" + InputToString(inputMode, ent.Key, false) + " output=" + Outputs.OutputToString(cmo.Output));
            }
            // if (cmo.isFinal && !cmo.isLeaf) {
            if (cmo.IsFinal)
            {
                Assert.AreEqual(cmo.FinalOutput, output);
            }
            else
            {
                Assert.AreEqual(cmo.Output, output);
            }
            Assert.AreEqual(ent.Key.Length, stopNode[0]);
        }
    }
}
// for debugging
/*
 * private String toString(BytesRef b) {
 *   try {
 *     return b.utf8ToString() + " " + b;
 *   } catch (Throwable t) {
 *     return b.toString();
 *   }
 * }
 */

/// <summary>
/// It's OK to add the same input twice in a row with
/// different outputs, as long as outputs impls the merge
/// method. Note that input is fully consumed after this
/// method is returned (so caller is free to reuse), but
/// output is not. So if your outputs are changeable (eg
/// <see cref="ByteSequenceOutputs"/> or
/// <see cref="Int32SequenceOutputs"/>) then you cannot reuse across
/// calls.
/// </summary>
public virtual void Add(Int32sRef input, T output)
{
    /*
     * if (DEBUG) {
     *   BytesRef b = new BytesRef(input.length);
     *   for(int x=0;x<input.length;x++) {
     *     b.bytes[x] = (byte) input.ints[x];
     *   }
     *   b.length = input.length;
     *   if (output == NO_OUTPUT) {
     *     System.out.println("\nFST ADD: input=" + toString(b) + " " + b);
     *   } else {
     *     System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output));
     *   }
     * }
     */
    // De-dup NO_OUTPUT since it must be a singleton:
    if (output.Equals(NO_OUTPUT))
    {
        output = NO_OUTPUT;
    }
    Debug.Assert(lastInput.Length == 0 || input.CompareTo(lastInput) >= 0, "inputs are added out of order lastInput=" + lastInput + " vs input=" + input);
    Debug.Assert(ValidOutput(output));
    //System.out.println("\nadd: " + input);
    if (input.Length == 0)
    {
        // empty input: only allowed as first input. we have
        // to special case this because the packed FST
        // format cannot represent the empty input since
        // 'finalness' is stored on the incoming arc, not on
        // the node
        frontier[0].InputCount++;
        frontier[0].IsFinal = true;
        fst.EmptyOutput = output;
        return;
    }
    // compare shared prefix length
    int pos1 = 0;
    int pos2 = input.Offset;
    int pos1Stop = Math.Min(lastInput.Length, input.Length);
    while (true)
    {
        frontier[pos1].InputCount++;
        //System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]);
        if (pos1 >= pos1Stop || lastInput.Int32s[pos1] != input.Int32s[pos2])
        {
            break;
        }
        pos1++;
        pos2++;
    }
    int prefixLenPlus1 = pos1 + 1;
    // Grow the frontier (one UnCompiledNode per input position) if this input is
    // longer than any previous one.
    if (frontier.Length < input.Length + 1)
    {
        UnCompiledNode<T>[] next = new UnCompiledNode<T>[ArrayUtil.Oversize(input.Length + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        Array.Copy(frontier, 0, next, 0, frontier.Length);
        for (int idx = frontier.Length; idx < next.Length; idx++)
        {
            next[idx] = new UnCompiledNode<T>(this, idx);
        }
        frontier = next;
    }
    // minimize/compile states from previous input's
    // orphan'd suffix
    DoFreezeTail(prefixLenPlus1);
    // init tail states for current input
    for (int idx = prefixLenPlus1; idx <= input.Length; idx++)
    {
        frontier[idx - 1].AddArc(input.Int32s[input.Offset + idx - 1], frontier[idx]);
        frontier[idx].InputCount++;
    }
    UnCompiledNode<T> lastNode = frontier[input.Length];
    if (lastInput.Length != input.Length || prefixLenPlus1 != input.Length + 1)
    {
        lastNode.IsFinal = true;
        lastNode.Output = NO_OUTPUT;
    }
    // push conflicting outputs forward, only as far as
    // needed
    for (int idx = 1; idx < prefixLenPlus1; idx++)
    {
        UnCompiledNode<T> node = frontier[idx];
        UnCompiledNode<T> parentNode = frontier[idx - 1];
        T lastOutput = parentNode.GetLastOutput(input.Int32s[input.Offset + idx - 1]);
        Debug.Assert(ValidOutput(lastOutput));
        T commonOutputPrefix;
        T wordSuffix;
        if (!lastOutput.Equals(NO_OUTPUT))
        {
            // Keep only the common output on the shared arc; push the remainder down.
            commonOutputPrefix = fst.Outputs.Common(output, lastOutput);
            Debug.Assert(ValidOutput(commonOutputPrefix));
            wordSuffix = fst.Outputs.Subtract(lastOutput, commonOutputPrefix);
            Debug.Assert(ValidOutput(wordSuffix));
            parentNode.SetLastOutput(input.Int32s[input.Offset + idx - 1], commonOutputPrefix);
            node.PrependOutput(wordSuffix);
        }
        else
        {
            commonOutputPrefix = wordSuffix = NO_OUTPUT;
        }
        output = fst.Outputs.Subtract(output, commonOutputPrefix);
        Debug.Assert(ValidOutput(output));
    }
    if (lastInput.Length == input.Length && prefixLenPlus1 == 1 + input.Length)
    {
        // same input more than 1 time in a row, mapping to
        // multiple outputs
        lastNode.Output = fst.Outputs.Merge(lastNode.Output, output);
    }
    else
    {
        // this new arc is private to this new input; set its
        // arc output to the leftover output:
        frontier[prefixLenPlus1 - 1].SetLastOutput(input.Int32s[input.Offset + prefixLenPlus1 - 1], output);
    }
    // save last input
    lastInput.CopyInt32s(input);
    //System.out.println(" count[0]=" + frontier[0].inputCount);
}
/// <summary>
/// Parses the given CSV dictionary source files, optionally adding an NFKC-normalized
/// variant of each non-normalized entry, sorts all entries by surface form, and builds
/// a <see cref="TokenInfoDictionaryWriter"/> whose term index is an FST mapping each
/// distinct surface form to an ordinal.
/// </summary>
/// <param name="csvFiles">Paths of the CSV files to read; each line must have at least 13 fields.</param>
/// <returns>The populated dictionary writer, with its FST already set.</returns>
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    // LUCENENET: hoisted out of the loop — the encoding name does not change per file.
    Encoding decoder = Encoding.GetEncoding(encoding);
    foreach (string file in csvFiles)
    {
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, decoder)) // was leaked before; now deterministically disposed
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string[] entry = CSVUtil.Parse(line);
                if (entry.Length < 13)
                {
                    Console.WriteLine("Entry in CSV is not valid: " + line);
                    continue;
                }
                string[] formatted = FormatEntry(entry);
                lines.Add(formatted);
                // NFKC normalize dictionary entry
                if (normalizeEntries)
                {
                    //if (normalizer.isNormalized(entry[0])){
                    if (entry[0].IsNormalized(NormalizationForm.FormKC))
                    {
                        continue; // already normalized; no variant needed
                    }
                    string[] normalizedEntry = new string[entry.Length];
                    for (int i = 0; i < entry.Length; i++)
                    {
                        //normalizedEntry[i] = normalizer.normalize(entry[i]);
                        normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                    }
                    formatted = FormatEntry(normalizedEntry);
                    lines.Add(formatted);
                }
            }
        }
    }

    Console.WriteLine(" sort...");
    // sort by term: we sorted the files already and use a stable sort.
    lines.Sort(new ComparerAnonymousHelper());

    Console.WriteLine(" encode...");
    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);
        // NOTE(review): Put() appears to signal failure by returning the unchanged
        // offset — confirm against TokenInfoDictionaryWriter.
        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }
        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }
        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();
    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");
    return (dictionary);
}
/// <summary>
/// Reads the SimpleText terms section starting at <c>_termsStart</c> and builds the
/// in-memory term FST (<c>_fst</c>), mapping each term to a pair of (docs-start file
/// pointer, (docFreq, totalTermFreq)). Also accumulates <c>_sumDocFreq</c>,
/// <c>_sumTotalTermFreq</c>, <c>_termCount</c>, and <c>_docCount</c> as side effects.
/// A term's stats are flushed into the builder only when the next TERM marker (or the
/// END/FIELD terminator) is seen, since its postings lines follow the TERM line.
/// NOTE(review): the cloned IndexInput is never disposed here — presumably clones of
/// <c>_outerInstance._input</c> do not own resources; confirm.
/// </summary>
private void LoadTerms()
{
    var posIntOutputs = PositiveInt32Outputs.Singleton;
    var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
    var outputs = new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs, outputsInner);
    // honestly, wtf kind of generic mess is this.
    var b = new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
    var input = (IndexInput)_outerInstance._input.Clone();
    input.Seek(_termsStart);
    var lastTerm = new BytesRef(10);
    long lastDocsStart = -1;
    int docFreq = 0;
    long totalTermFreq = 0;
    var visitedDocs = new FixedBitSet(_maxDoc);
    var scratchIntsRef = new Int32sRef();
    while (true)
    {
        SimpleTextUtil.ReadLine(input, _scratch);
        if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
        {
            // End of this field's terms: flush the pending term, if any.
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
                _sumTotalTermFreq += totalTermFreq;
            }
            break;
        }
        if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
        {
            docFreq++;
            _sumDocFreq++;
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length, _scratchUtf16);
            int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
            visitedDocs.Set(docId);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
        {
            UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
            totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
        }
        else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
        {
            // A new term begins: flush the previous term's accumulated stats first.
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
            }
            lastDocsStart = input.GetFilePointer();
            int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            _sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            _termCount++;
        }
    }
    _docCount = visitedDocs.Cardinality();
    _fst = b.Finish();
}
/// <summary>
/// Walks every entry of the TokenInfoDictionary FST in order and sanity-checks each
/// surface form and all word ids mapped to it: valid UTF-16 strings, strictly
/// increasing source/word ids, known ipadic inflection/POS translations, and
/// resolvable connection costs. Prints a summary when VERBOSE is enabled.
/// </summary>
public void TestEnumerateAll()
{
    // Counters feed only the optional debug summary at the end.
    int termCount = 0;
    int wordCount = 0;
    int previousWordId = -1;
    int previousSourceId = -1;

    TokenInfoDictionary dictionary = TokenInfoDictionary.Instance;
    ConnectionCosts costs = ConnectionCosts.Instance;
    FST<long?> fst = dictionary.FST.InternalFST;
    var enumerator = new Int32sRefFSTEnum<long?>(fst);
    var wordIdsScratch = new Int32sRef();

    Int32sRefFSTEnum.InputOutput<long?> entry;
    while ((entry = enumerator.Next()) != null)
    {
        termCount++;

        // Reconstruct the surface form from the FST input labels.
        Int32sRef input = entry.Input;
        var surface = new char[input.Length];
        for (int i = 0; i < surface.Length; i++)
        {
            surface[i] = (char)input.Int32s[input.Offset + i];
        }
        assertTrue(UnicodeUtil.ValidUTF16String(new string(surface)));

        long? output = entry.Output;
        int sourceId = (int)output.Value;
        // We walk in term order, so source ids (and word ids below) must strictly increase.
        assertTrue(sourceId > previousSourceId);
        previousSourceId = sourceId;

        dictionary.LookupWordIds(sourceId, wordIdsScratch);
        for (int i = 0; i < wordIdsScratch.Length; i++)
        {
            wordCount++;
            int wordId = wordIdsScratch.Int32s[wordIdsScratch.Offset + i];
            assertTrue(wordId > previousWordId);
            previousWordId = wordId;

            String baseForm = dictionary.GetBaseForm(wordId, surface, 0, surface.Length);
            assertTrue(baseForm == null || UnicodeUtil.ValidUTF16String(baseForm));

            String inflectionForm = dictionary.GetInflectionForm(wordId);
            assertTrue(inflectionForm == null || UnicodeUtil.ValidUTF16String(inflectionForm));
            if (inflectionForm != null)
            {
                // Must be a known ipadic inflection form.
                assertNotNull(ToStringUtil.GetInflectedFormTranslation(inflectionForm));
            }

            String inflectionType = dictionary.GetInflectionType(wordId);
            assertTrue(inflectionType == null || UnicodeUtil.ValidUTF16String(inflectionType));
            if (inflectionType != null)
            {
                // Must be a known ipadic inflection type.
                assertNotNull(ToStringUtil.GetInflectionTypeTranslation(inflectionType));
            }

            // These lookups are exercised only to verify they do not throw.
            int leftId = dictionary.GetLeftId(wordId);
            int rightId = dictionary.GetRightId(wordId);
            costs.Get(rightId, leftId);
            dictionary.GetWordCost(wordId);

            String pos = dictionary.GetPartOfSpeech(wordId);
            assertNotNull(pos);
            assertTrue(UnicodeUtil.ValidUTF16String(pos));
            // Must be a known ipadic pos tag.
            assertNotNull(ToStringUtil.GetPOSTranslation(pos));

            String pronunciation = dictionary.GetPronunciation(wordId, surface, 0, surface.Length);
            assertNotNull(pronunciation);
            assertTrue(UnicodeUtil.ValidUTF16String(pronunciation));

            String reading = dictionary.GetReading(wordId, surface, 0, surface.Length);
            assertNotNull(reading);
            assertTrue(UnicodeUtil.ValidUTF16String(reading));
        }
    }

    if (VERBOSE)
    {
        Console.WriteLine("checked " + termCount + " terms, " + wordCount + " words.");
    }
}
/// <summary>
/// Applies the affix rule to the given word, producing a list of stems if any are found
/// </summary>
/// <param name="strippedWord"> Word the affix has been removed and the strip added </param>
/// <param name="length"> valid length of stripped word </param>
/// <param name="affix"> HunspellAffix representing the affix rule itself </param>
/// <param name="prefixFlag"> when we already stripped a prefix, we cant simply recurse and check the suffix, unless both are compatible
/// so we must check dictionary form against both to add it as a stem! </param>
/// <param name="recursionDepth"> current recursion depth </param>
/// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param>
/// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix
/// this means inner most suffix must also contain circumfix flag. </param>
/// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param>
/// <returns> <see cref="IList{CharsRef}"/> of stems for the word, or an empty list if none are found </returns>
internal IList<CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix, bool caseVariant)
{
    // TODO: just pass this in from before, no need to decode it twice
    // Each affix record is 8 bytes: flag, strip ordinal, condition(+crossProduct bit), append ordinal.
    affixReader.Position = 8 * affix;
    char flag = (char)(affixReader.ReadInt16() & 0xffff);
    affixReader.SkipBytes(2); // strip
    int condition = (char)(affixReader.ReadInt16() & 0xffff);
    // Low bit of the stored condition encodes the cross-product capability; the rest is the condition id.
    bool crossProduct = (condition & 1) == 1;
    condition = condition.TripleShift(1);
    // Ordinal of the append flags entry in flagLookup (used for continuation-class checks below).
    char append = (char)(affixReader.ReadInt16() & 0xffff);

    List<CharsRef> stems = new List<CharsRef>();

    // Look the stripped word up in the dictionary; forms holds one entry per homonym, formStep apart.
    Int32sRef forms = dictionary.LookupWord(strippedWord, 0, length);
    if (forms != null)
    {
        for (int i = 0; i < forms.Length; i += formStep)
        {
            dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch);
            char[] wordFlags = Dictionary.DecodeFlags(scratch);
            // The dictionary entry must carry this affix's flag to be a candidate stem.
            if (Dictionary.HasFlag(wordFlags, flag))
            {
                // confusing: in this one exception, we already chained the first prefix against the second,
                // so it doesnt need to be checked against the word
                bool chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
                if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.HasFlag(wordFlags, (char)prefixFlag))
                {
                    // see if we can chain prefix thru the suffix continuation class (only if it has any!)
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    if (!HasCrossCheckedFlag((char)prefixFlag, appendFlags, false))
                    {
                        continue;
                    }
                }

                // if circumfix was previously set by a prefix, we must check this suffix,
                // to ensure it has it, and vice versa
                if (dictionary.circumfix != -1)
                {
                    dictionary.flagLookup.Get(append, scratch);
                    char[] appendFlags = Dictionary.DecodeFlags(scratch);
                    bool suffixCircumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
                    if (circumfix != suffixCircumfix)
                    {
                        continue;
                    }
                }

                // we are looking for a case variant, but this word does not allow it
                if (caseVariant && dictionary.keepcase != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase))
                {
                    continue;
                }
                // we aren't decompounding (yet)
                if (dictionary.onlyincompound != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound))
                {
                    continue;
                }
                stems.Add(NewStem(strippedWord, length, forms, i));
            }
        }
    }

    // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
    if (dictionary.circumfix != -1 && !circumfix && prefix)
    {
        dictionary.flagLookup.Get(append, scratch);
        char[] appendFlags = Dictionary.DecodeFlags(scratch);
        circumfix = Dictionary.HasFlag(appendFlags, (char)dictionary.circumfix);
    }

    // Cross-product affixes may combine with further affixes; recurse via Stem (at most two levels deep).
    if (crossProduct)
    {
        if (recursionDepth == 0)
        {
            if (prefix)
            {
                // we took away the first prefix.
                // COMPLEXPREFIXES = true: combine with a second prefix and another suffix
                // COMPLEXPREFIXES = false: combine with a suffix
                stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
            }
            else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix)
            {
                // we took away a suffix.
                // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
                // COMPLEXPREFIXES = false: combine with another suffix
                stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
            }
        }
        else if (recursionDepth == 1)
        {
            if (prefix && dictionary.complexPrefixes)
            {
                // we took away the second prefix: go look for another suffix
                stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
            }
            else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix)
            {
                // we took away a prefix, then a suffix: go look for another suffix
                stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
            }
        }
    }
    return (stems);
}
/// <summary>
/// Reads the plain-text terms section starting at <c>termsStart</c> and builds the
/// in-memory FST mapping each term to (docs-start file pointer, (docFreq, totalTermFreq)).
/// As a side effect accumulates the per-field statistics:
/// <c>sumDocFreq</c>, <c>sumTotalTermFreq</c>, <c>docCount</c> and <c>termCount</c>.
/// </summary>
private void LoadTerms()
{
    PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
    // Inner pair carries (docFreq, totalTermFreq); the outer pair prepends the docs-start pointer.
    var outputsInner = new PairOutputs<Int64, Int64>(posIntOutputs, posIntOutputs);
    var outputs = new PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>(posIntOutputs, outputsInner);
    var b = new Builder<PairOutputs<Int64, PairOutputs<Int64, Int64>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
    IndexInput @in = (IndexInput)outerInstance.input.Clone();
    @in.Seek(termsStart);
    BytesRef lastTerm = new BytesRef(10);
    long lastDocsStart = -1; // -1 means no term has been started yet
    int docFreq = 0;
    long totalTermFreq = 0;
    FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
    Int32sRef scratchIntsRef = new Int32sRef();
    while (true)
    {
        SimpleTextUtil.ReadLine(@in, scratch);
        if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
        {
            // End of this field: flush the final pending term, then stop.
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
                sumTotalTermFreq += totalTermFreq;
            }
            break;
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
        {
            docFreq++;
            sumDocFreq++;
            // Parse the doc id that follows the DOC prefix and mark it as seen.
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
            int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
            visitedDocs.Set(docID);
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
        {
            // Parse the within-doc frequency that follows the FREQ prefix.
            UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
            totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
        }
        else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
        {
            // New term: flush the previous one, then start accumulating stats for this term.
            if (lastDocsStart != -1)
            {
                b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart, outputsInner.NewPair((long)docFreq, totalTermFreq)));
            }
            lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
            if (len > lastTerm.Length)
            {
                lastTerm.Grow(len);
            }
            System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
            lastTerm.Length = len;
            docFreq = 0;
            sumTotalTermFreq += totalTermFreq;
            totalTermFreq = 0;
            termCount++;
        }
    }
    docCount = visitedDocs.Cardinality;
    fst = b.Finish();
    /*
     * PrintStream ps = new PrintStream("out.dot");
     * fst.toDot(ps);
     * ps.close();
     * System.out.println("SAVED out.dot");
     */
    //System.out.println("FST " + fst.sizeInBytes());
}