/// <summary>
/// Builds the <see cref="SortedSetDocValues"/> for <paramref name="field"/>.
/// <para/>
/// When the field's FST entry reports zero ords, <c>DocValues.EMPTY_SORTED_SET</c>
/// is returned without touching the data input. Otherwise the FST is read from
/// <c>data</c> the first time it is needed, stored in <c>fstInstances</c>, and its
/// size is added to <c>ramBytesUsed</c>; all of that happens while holding the
/// lock on this instance.
/// </summary>
/// <param name="field">Field whose <c>Number</c> selects the FST entry and cache slot.</param>
/// <returns>A sorted-set view backed by the cached FST and the field's binary doc values.</returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    FSTEntry fstEntry = fsts[field.Number];
    if (fstEntry.NumOrds == 0)
    {
        // empty FST!
        return DocValues.EMPTY_SORTED_SET;
    }

    FST<long?> cachedFst;
    lock (this)
    {
        bool alreadyLoaded = fstInstances.TryGetValue(field.Number, out cachedFst) && cachedFst != null;
        if (!alreadyLoaded)
        {
            // First request for this field: read the FST at the recorded offset and cache it.
            data.Seek(fstEntry.Offset);
            cachedFst = new FST<long?>(data, PositiveInt32Outputs.Singleton);
            ramBytesUsed.AddAndGet(cachedFst.GetSizeInBytes());
            fstInstances[field.Number] = cachedFst;
        }
    }

    BinaryDocValues ordsPerDoc = GetBinary(field);

    // Scratch objects are allocated fresh on every call (the original note
    // describes them as per-thread resources).
    var bytesReader = cachedFst.GetBytesReader();
    var rootArc = new FST.Arc<long?>();
    var workArc = new FST.Arc<long?>();
    var workInts = new Int32sRef();
    var termEnum = new BytesRefFSTEnum<long?>(cachedFst);
    var workBytes = new BytesRef();
    var ordsInput = new ByteArrayDataInput();

    return new SortedSetDocValuesAnonymousInnerClassHelper(
        fstEntry, ordsPerDoc, cachedFst, bytesReader, rootArc, workArc, workInts, termEnum, workBytes, ordsInput);
}
/// <summary>
/// Builds the <see cref="SortedDocValues"/> for <paramref name="field"/>.
/// <para/>
/// The FST for the field is read from <c>data</c> on first use, stored in
/// <c>fstInstances</c>, and its size is added to <c>ramBytesUsed</c>; this is done
/// while holding the lock on this instance. Later calls reuse the cached FST.
/// </summary>
/// <param name="field">Field whose <c>Number</c> selects the FST entry and cache slot.</param>
/// <returns>A sorted view backed by the cached FST and the field's numeric doc values.</returns>
public override SortedDocValues GetSorted(FieldInfo field)
{
    FSTEntry fstEntry = fsts[field.Number];

    FST<long?> cachedFst;
    lock (this)
    {
        bool alreadyLoaded = fstInstances.TryGetValue(field.Number, out cachedFst) && cachedFst != null;
        if (!alreadyLoaded)
        {
            // First request for this field: read the FST at the recorded offset and cache it.
            data.Seek(fstEntry.Offset);
            cachedFst = new FST<long?>(data, PositiveInt32Outputs.Singleton);
            ramBytesUsed.AddAndGet(cachedFst.GetSizeInBytes());
            fstInstances[field.Number] = cachedFst;
        }
    }

    var ordPerDoc = GetNumeric(field);

    // Scratch objects are allocated fresh on every call (the original note
    // describes them as per-thread resources).
    var bytesReader = cachedFst.GetBytesReader();
    var rootArc = new FST.Arc<long?>();
    var workArc = new FST.Arc<long?>();
    var workInts = new Int32sRef();
    var termEnum = new BytesRefFSTEnum<long?>(cachedFst);

    return new SortedDocValuesAnonymousInnerClassHelper(
        fstEntry, ordPerDoc, cachedFst, bytesReader, rootArc, workArc, workInts, termEnum);
}
/// <summary>
/// Reports the size in bytes of the wrapped FST, or 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>, or 0 if <c>fst</c> is <c>null</c>.</returns>
public override long GetSizeInBytes()
{
    return fst == null ? 0 : fst.GetSizeInBytes();
}
/// <summary>
/// Builds the <see cref="SortedSetDocValues"/> for <paramref name="field"/>.
/// <para/>
/// When the field's FST entry reports zero ords, <c>DocValues.EMPTY_SORTED_SET</c>
/// is returned without touching the data input. Otherwise the FST is read from
/// <c>data</c> the first time it is needed, stored in <c>fstInstances</c>, and its
/// size is added to <c>ramBytesUsed</c>. The cache lookup and load run between
/// <c>UninterruptableMonitor.Enter(this)</c> and <c>UninterruptableMonitor.Exit(this)</c>.
/// </summary>
/// <param name="field">Field whose <c>Number</c> selects the FST entry and cache slot.</param>
/// <returns>A sorted-set view backed by the cached FST and the field's binary doc values.</returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
{
    var fstEntry = fsts[field.Number];
    if (fstEntry.numOrds == 0)
    {
        // empty FST!
        return DocValues.EMPTY_SORTED_SET;
    }

    FST<Int64> cachedFst;
    UninterruptableMonitor.Enter(this);
    try
    {
        bool alreadyLoaded = fstInstances.TryGetValue(field.Number, out cachedFst);
        if (!alreadyLoaded)
        {
            // First request for this field: read the FST at the recorded offset and cache it.
            data.Seek(fstEntry.offset);
            cachedFst = new FST<Int64>(data, PositiveInt32Outputs.Singleton);
            ramBytesUsed.AddAndGet(cachedFst.GetSizeInBytes());
            fstInstances[field.Number] = cachedFst;
        }
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    var ordsPerDoc = GetBinary(field);

    // Scratch objects are allocated fresh on every call (the original note
    // describes them as per-thread resources).
    var bytesReader = cachedFst.GetBytesReader();
    var rootArc = new FST.Arc<Int64>();
    var workArc = new FST.Arc<Int64>();
    var workInts = new Int32sRef();
    var termEnum = new BytesRefFSTEnum<Int64>(cachedFst);
    var workBytes = new BytesRef();
    var ordsInput = new ByteArrayDataInput();

    return new SortedSetDocValuesAnonymousClass(
        fstEntry, ordsPerDoc, cachedFst, bytesReader, rootArc, workArc, workInts, termEnum, workBytes, ordsInput);
}
/// <summary>
/// Builds the <see cref="SortedDocValues"/> for <paramref name="field"/>.
/// <para/>
/// When the field's FST entry reports zero ords, <c>DocValues.EMPTY_SORTED</c> is
/// returned without touching the data input. Otherwise the FST is read from
/// <c>data</c> the first time it is needed, stored in <c>fstInstances</c>, and its
/// size is added to <c>ramBytesUsed</c>. The cache lookup and load run between
/// <c>UninterruptableMonitor.Enter(this)</c> and <c>UninterruptableMonitor.Exit(this)</c>.
/// </summary>
/// <param name="field">Field whose <c>Number</c> selects the FST entry and cache slot.</param>
/// <returns>A sorted view backed by the cached FST and the field's numeric doc values.</returns>
public override SortedDocValues GetSorted(FieldInfo field)
{
    FSTEntry fstEntry = fsts[field.Number];
    if (fstEntry.numOrds == 0)
    {
        return DocValues.EMPTY_SORTED;
    }

    FST<long?> cachedFst;
    UninterruptableMonitor.Enter(this);
    try
    {
        bool alreadyLoaded = fstInstances.TryGetValue(field.Number, out cachedFst);
        if (!alreadyLoaded)
        {
            // First request for this field: read the FST at the recorded offset and cache it.
            data.Seek(fstEntry.offset);
            cachedFst = new FST<long?>(data, PositiveInt32Outputs.Singleton);
            ramBytesUsed.AddAndGet(cachedFst.GetSizeInBytes());
            fstInstances[field.Number] = cachedFst;
        }
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    var ordPerDoc = GetNumeric(field);

    // Scratch objects are allocated fresh on every call (the original note
    // describes them as per-thread resources).
    var bytesReader = cachedFst.GetBytesReader();
    var rootArc = new FST.Arc<long?>();
    var workArc = new FST.Arc<long?>();
    var workInts = new Int32sRef();
    var termEnum = new BytesRefFSTEnum<long?>(cachedFst);

    return new SortedDocValuesAnonymousClass(
        fstEntry, ordPerDoc, cachedFst, bytesReader, rootArc, workArc, workInts, termEnum);
}
/// <summary>
/// Reports the size in bytes of the wrapped FST, or 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>, or 0 if <c>fst</c> is <c>null</c>.</returns>
public override long GetSizeInBytes()
{
    if (fst == null)
    {
        return 0;
    }

    return fst.GetSizeInBytes();
}
/// <summary>
/// Reports the size in bytes of the held FST, or 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>, or 0 if <c>fst</c> is <c>null</c>.</returns>
public long RamBytesUsed()
{
    if (fst == null)
    {
        return 0;
    }

    return fst.GetSizeInBytes();
}
/// <summary>
/// Reports the size in bytes of the held FST as the RAM usage figure;
/// 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>, or 0 if <c>fst</c> is <c>null</c>.</returns>
public virtual long RamBytesUsed()
{
    if (fst != null)
    {
        return fst.GetSizeInBytes();
    }

    return 0;
}
// LUCENENET specific: moved Arc<S> to Builder type

// NOTE: not many instances of Node or CompiledNode are in
// memory while the FST is being built; it's only the
// current "frontier":

// LUCENENET specific: moved INode to Builder type

/// <summary>
/// Reports the current size in bytes of the FST held in <c>fst</c>,
/// as returned by its <c>GetSizeInBytes()</c> method.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>.</returns>
public virtual long GetFstSizeInBytes()
{
    long sizeInBytes = fst.GetSizeInBytes();
    return sizeInBytes;
}
/// <summary>
/// Reports the size in bytes of the held FST as the RAM usage figure;
/// 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>_fst.GetSizeInBytes()</c>, or 0 if <c>_fst</c> is <c>null</c>.</returns>
public virtual long RamBytesUsed()
{
    if (_fst == null)
    {
        return 0;
    }

    return _fst.GetSizeInBytes();
}
/// <summary>
/// Reports the size in bytes of the wrapped FST, or 0 when no FST is set.
/// </summary>
/// <returns>The value of <c>fst.GetSizeInBytes()</c>, or 0 if <c>fst</c> is <c>null</c>.</returns>
public override long GetSizeInBytes()
{
    if (fst is null)
    {
        return 0;
    }

    return fst.GetSizeInBytes();
}
/// <summary>
/// Builds very large FSTs with three output types (no outputs, byte sequences,
/// positive integers), then verifies each one twice: once as built and once after
/// saving it to a temporary <c>MMapDirectory</c> and loading it back.
/// <para/>
/// Each verification replays the same seeded <see cref="Random"/> sequence used
/// while building, so the order of calls on the random instance must match the
/// build loop exactly. The directory and every index input/output are wrapped in
/// <c>using</c> so they are released even when an assertion or I/O call throws.
/// </summary>
public virtual void Test()
{
    int[] ints = new int[7];
    Int32sRef input = new Int32sRef(ints, 0, ints.Length);
    int seed = Random.Next();

    using (Directory dir = new MMapDirectory(CreateTempDir("2BFST")))
    {
        for (int doPackIter = 0; doPackIter < 2; doPackIter++)
        {
            bool doPack = doPackIter == 1;

            // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
            if (!doPack)
            {
                Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                Outputs<object> outputs = NoOutputs.Singleton;
                object NO_OUTPUT = outputs.NoOutput;
                Builder<object> b = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                int count = 0;
                Random r = new Random(seed);
                int[] ints2 = new int[200];
                Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length);
                while (true)
                {
                    // Only positions 10.. are randomized; NextInput advances the array between adds.
                    for (int i = 10; i < ints2.Length; i++)
                    {
                        ints2[i] = r.Next(256);
                    }
                    b.Add(input2, NO_OUTPUT);
                    count++;
                    if (count % 100000 == 0)
                    {
                        Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                    }
                    if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                    {
                        break;
                    }
                    NextInput(r, ints2);
                }

                FST<object> fst = b.Finish();

                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    // Replay the build sequence from the same seed.
                    Arrays.Fill(ints2, 0);
                    r = new Random(seed);

                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }
                        for (int j = 10; j < ints2.Length; j++)
                        {
                            ints2[j] = r.Next(256);
                        }
                        Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                        NextInput(r, ints2);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<object> fstEnum = new Int32sRefFSTEnum<object>(fst);

                    Arrays.Fill(ints2, 0);
                    r = new Random(seed);
                    int upto = 0;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<object> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        for (int j = 10; j < ints2.Length; j++)
                        {
                            ints2[j] = r.Next(256);
                        }
                        Assert.AreEqual(input2, pair.Input);
                        Assert.AreEqual(NO_OUTPUT, pair.Output);
                        upto++;
                        NextInput(r, ints2);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<object>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }

            // Build FST w/ ByteSequenceOutputs and stop when FST
            // size = 3GB
            {
                Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                Outputs<BytesRef> outputs = ByteSequenceOutputs.Singleton;
                Builder<BytesRef> b = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                // 'output' wraps 'outputBytes', so refilling the array changes the expected value in place.
                var outputBytes = new byte[20];
                BytesRef output = new BytesRef(outputBytes);
                Arrays.Fill(ints, 0);
                int count = 0;
                Random r = new Random(seed);
                while (true)
                {
                    r.NextBytes(outputBytes);
                    b.Add(input, BytesRef.DeepCopyOf(output));
                    count++;
                    if (count % 1000000 == 0)
                    {
                        Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                    }
                    if (b.GetFstSizeInBytes() > LIMIT)
                    {
                        break;
                    }
                    NextInput(r, ints);
                }

                FST<BytesRef> fst = b.Finish();

                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    r = new Random(seed);
                    Arrays.Fill(ints, 0);

                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }
                        r.NextBytes(outputBytes);
                        Assert.AreEqual(output, Util.Get(fst, input));
                        NextInput(r, ints);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<BytesRef> fstEnum = new Int32sRefFSTEnum<BytesRef>(fst);

                    Arrays.Fill(ints, 0);
                    r = new Random(seed);
                    int upto = 0;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        Assert.AreEqual(input, pair.Input);
                        r.NextBytes(outputBytes);
                        Assert.AreEqual(output, pair.Output);
                        upto++;
                        NextInput(r, ints);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<BytesRef>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }

            // Build FST w/ PositiveIntOutputs and stop when FST
            // size = 3GB
            {
                Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                Outputs<long?> outputs = PositiveInt32Outputs.Singleton;
                Builder<long?> b = new Builder<long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                long output = 1;

                Arrays.Fill(ints, 0);
                int count = 0;
                Random r = new Random(seed);
                while (true)
                {
                    b.Add(input, output);
                    output += 1 + r.Next(10);
                    count++;
                    if (count % 1000000 == 0)
                    {
                        Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                    }
                    if (b.GetFstSizeInBytes() > LIMIT)
                    {
                        break;
                    }
                    NextInput(r, ints);
                }

                FST<long?> fst = b.Finish();

                for (int verify = 0; verify < 2; verify++)
                {
                    Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                    Arrays.Fill(ints, 0);

                    output = 1;
                    r = new Random(seed);
                    for (int i = 0; i < count; i++)
                    {
                        if (i % 1000000 == 0)
                        {
                            Console.WriteLine(i + "...: ");
                        }

                        // forward lookup:
                        Assert.AreEqual(output, (long)Util.Get(fst, input));
                        // reverse lookup:
                        Assert.AreEqual(input, Util.GetByOutput(fst, output));
                        output += 1 + r.Next(10);
                        NextInput(r, ints);
                    }

                    Console.WriteLine("\nTEST: enum all input/outputs");
                    Int32sRefFSTEnum<long?> fstEnum = new Int32sRefFSTEnum<long?>(fst);

                    Arrays.Fill(ints, 0);
                    r = new Random(seed);
                    int upto = 0;
                    output = 1;
                    while (true)
                    {
                        Int32sRefFSTEnum.InputOutput<long?> pair = fstEnum.Next();
                        if (pair == null)
                        {
                            break;
                        }
                        Assert.AreEqual(input, pair.Input);
                        Assert.AreEqual(output, pair.Output.Value);
                        output += 1 + r.Next(10);
                        upto++;
                        NextInput(r, ints);
                    }
                    Assert.AreEqual(count, upto);

                    if (verify == 0)
                    {
                        Console.WriteLine("\nTEST: save/load FST and re-verify");
                        using (IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT))
                        {
                            fst.Save(@out);
                        }
                        using (IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT))
                        {
                            fst = new FST<long?>(@in, outputs);
                        }
                    }
                    else
                    {
                        dir.DeleteFile("fst");
                    }
                }
            }
        }
    }
}
/// <summary>
/// Reads the given CSV files, formats every valid row (adding an NFKC-normalized
/// copy when <c>normalizeEntries</c> is set and the surface form is not already
/// normalized), sorts the rows, and writes them into a new
/// <see cref="TokenInfoDictionaryWriter"/> together with an FST that maps each
/// distinct term to its ordinal.
/// </summary>
/// <param name="csvFiles">Paths of the CSV files to read, processed in list order.</param>
/// <returns>The populated dictionary writer with its FST set.</returns>
/// <remarks>
/// Rows with fewer than 13 columns are reported on the console and skipped.
/// This method reads and updates the <c>offset</c> member, which is declared
/// outside this block.
/// </remarks>
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        {
            Encoding decoder = Encoding.GetEncoding(encoding);

            // Dispose the reader as well as the stream it wraps.
            using (TextReader reader = new StreamReader(inputStream, decoder))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] entry = CSVUtil.Parse(line);

                    if (entry.Length < 13)
                    {
                        Console.WriteLine("Entry in CSV is not valid: " + line);
                        continue;
                    }

                    string[] formatted = FormatEntry(entry);
                    lines.Add(formatted);

                    // NFKC normalize dictionary entry
                    if (normalizeEntries)
                    {
                        if (entry[0].IsNormalized(NormalizationForm.FormKC))
                        {
                            continue;
                        }

                        string[] normalizedEntry = new string[entry.Length];
                        for (int i = 0; i < entry.Length; i++)
                        {
                            normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                        }

                        formatted = FormatEntry(normalizedEntry);
                        lines.Add(formatted);
                    }
                }
            }
        }
    }

    Console.WriteLine(" sort...");

    // sort by term: we sorted the files already and use a stable sort.
    // List<T>.Sort is documented as unstable, so rows that compare equal could be
    // reordered. Enumerable.OrderBy is documented as stable and keeps them in input
    // order. It is fully qualified so no extra using directive is required.
    lines = System.Linq.Enumerable.ToList(
        System.Linq.Enumerable.OrderBy<string[], string[]>(lines, row => row, new ComparerAnonymousHelper()));

    Console.WriteLine(" encode...");

    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;

    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);

        // An unchanged offset is treated as a failed Put; the row is reported and skipped.
        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }

        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }
        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }

    FST<long?> fst = fstBuilder.Finish();
    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");

    return dictionary;
}