/// <summary>
/// Returns the sorted-set doc values for <paramref name="field"/>. The field's FST is
/// read from <c>data</c> and cached in <c>fstInstances</c> the first time it is requested.
/// </summary>
/// <param name="field">Field whose <c>Number</c> selects the FST entry.</param>
/// <returns>
/// <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords; otherwise a new
/// doc-values view over the field's FST.
/// </returns>
public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            FSTEntry fstEntry = fsts[field.Number];
            if (fstEntry.NumOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<long?> loaded;
            lock (this)
            {
                bool isCached = fstInstances.TryGetValue(field.Number, out loaded);
                if (!isCached || loaded == null)
                {
                    // Not cached yet (or cached as null): read the FST at its recorded offset,
                    // account for its size and remember it for later calls.
                    data.Seek(fstEntry.Offset);
                    loaded = new FST<long?>(data, PositiveInt32Outputs.Singleton);
                    ramBytesUsed.AddAndGet(loaded.GetSizeInBytes());
                    fstInstances[field.Number] = loaded;
                }
            }

            BinaryDocValues ordsPerDoc = GetBinary(field);

            // per-thread resources: created fresh for every returned instance
            var bytesReader = loaded.GetBytesReader();
            var rootArc     = new FST.Arc<long?>();
            var workArc     = new FST.Arc<long?>();
            var workInts    = new Int32sRef();
            var termsEnum   = new BytesRefFSTEnum<long?>(loaded);
            var bytesRef    = new BytesRef();
            var ordsInput   = new ByteArrayDataInput();

            return new SortedSetDocValuesAnonymousInnerClassHelper(
                fstEntry, ordsPerDoc, loaded, bytesReader, rootArc, workArc, workInts, termsEnum, bytesRef, ordsInput);
        }
        /// <summary>
        /// Returns the sorted doc values for <paramref name="field"/>. The field's FST is
        /// read from <c>data</c> and cached in <c>fstInstances</c> the first time it is requested.
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> selects the FST entry.</param>
        /// <returns>A new doc-values view over the field's FST and its per-document ords.</returns>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry fstEntry = fsts[field.Number];

            FST<long?> loaded;
            lock (this)
            {
                bool isCached = fstInstances.TryGetValue(field.Number, out loaded);
                if (!isCached || loaded == null)
                {
                    // Not cached yet (or cached as null): read the FST at its recorded offset,
                    // account for its size and remember it for later calls.
                    data.Seek(fstEntry.Offset);
                    loaded = new FST<long?>(data, PositiveInt32Outputs.Singleton);
                    ramBytesUsed.AddAndGet(loaded.GetSizeInBytes());
                    fstInstances[field.Number] = loaded;
                }
            }

            var ordPerDoc = GetNumeric(field);

            // per-thread resources: created fresh for every returned instance
            var bytesReader = loaded.GetBytesReader();
            var rootArc     = new FST.Arc<long?>();
            var workArc     = new FST.Arc<long?>();
            var workInts    = new Int32sRef();
            var termsEnum   = new BytesRefFSTEnum<long?>(loaded);

            return new SortedDocValuesAnonymousInnerClassHelper(
                fstEntry, ordPerDoc, loaded, bytesReader, rootArc, workArc, workInts, termsEnum);
        }
Example #3
0
 /// <summary>
 /// Returns the byte size of the underlying FST, or 0 when there is no FST.
 /// </summary>
 public override long GetSizeInBytes()
 {
     return fst == null ? 0 : fst.GetSizeInBytes();
 }
Example #4
0
        /// <summary>
        /// Returns the sorted-set doc values for <paramref name="field"/>. The field's FST is
        /// read from <c>data</c> and cached in <c>fstInstances</c> the first time it is requested.
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> selects the FST entry.</param>
        /// <returns>
        /// <c>DocValues.EMPTY_SORTED_SET</c> when the entry has no ords; otherwise a new
        /// doc-values view over the field's FST.
        /// </returns>
        public override SortedSetDocValues GetSortedSet(FieldInfo field)
        {
            var fstEntry = fsts[field.Number];
            if (fstEntry.numOrds == 0)
            {
                // empty FST!
                return DocValues.EMPTY_SORTED_SET;
            }

            FST<Int64> loaded;
            UninterruptableMonitor.Enter(this);
            try
            {
                bool isCached = fstInstances.TryGetValue(field.Number, out loaded);
                if (!isCached)
                {
                    // First request for this field: read the FST at its recorded offset,
                    // account for its size and remember it for later calls.
                    data.Seek(fstEntry.offset);
                    loaded = new FST<Int64>(data, PositiveInt32Outputs.Singleton);
                    ramBytesUsed.AddAndGet(loaded.GetSizeInBytes());
                    fstInstances[field.Number] = loaded;
                }
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }

            var ordsPerDoc = GetBinary(field);

            // per-thread resources: created fresh for every returned instance
            var bytesReader = loaded.GetBytesReader();
            var rootArc     = new FST.Arc<Int64>();
            var workArc     = new FST.Arc<Int64>();
            var workInts    = new Int32sRef();
            var termsEnum   = new BytesRefFSTEnum<Int64>(loaded);
            var bytesRef    = new BytesRef();
            var ordsInput   = new ByteArrayDataInput();

            return new SortedSetDocValuesAnonymousClass(
                fstEntry, ordsPerDoc, loaded, bytesReader, rootArc, workArc, workInts, termsEnum, bytesRef, ordsInput);
        }
Example #5
0
        /// <summary>
        /// Returns the sorted doc values for <paramref name="field"/>. The field's FST is
        /// read from <c>data</c> and cached in <c>fstInstances</c> the first time it is requested.
        /// </summary>
        /// <param name="field">Field whose <c>Number</c> selects the FST entry.</param>
        /// <returns>
        /// <c>DocValues.EMPTY_SORTED</c> when the entry has no ords; otherwise a new
        /// doc-values view over the field's FST.
        /// </returns>
        public override SortedDocValues GetSorted(FieldInfo field)
        {
            FSTEntry fstEntry = fsts[field.Number];
            if (fstEntry.numOrds == 0)
            {
                return DocValues.EMPTY_SORTED;
            }

            FST<long?> loaded;
            UninterruptableMonitor.Enter(this);
            try
            {
                bool isCached = fstInstances.TryGetValue(field.Number, out loaded);
                if (!isCached)
                {
                    // First request for this field: read the FST at its recorded offset,
                    // account for its size and remember it for later calls.
                    data.Seek(fstEntry.offset);
                    loaded = new FST<long?>(data, PositiveInt32Outputs.Singleton);
                    ramBytesUsed.AddAndGet(loaded.GetSizeInBytes());
                    fstInstances[field.Number] = loaded;
                }
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }

            var ordPerDoc = GetNumeric(field);

            // per-thread resources: created fresh for every returned instance
            var bytesReader = loaded.GetBytesReader();
            var rootArc     = new FST.Arc<long?>();
            var workArc     = new FST.Arc<long?>();
            var workInts    = new Int32sRef();
            var termsEnum   = new BytesRefFSTEnum<long?>(loaded);

            return new SortedDocValuesAnonymousClass(
                fstEntry, ordPerDoc, loaded, bytesReader, rootArc, workArc, workInts, termsEnum);
        }
Example #6
0
 /// <summary>
 /// Returns the byte size of the underlying FST, or 0 when there is no FST.
 /// </summary>
 public override long GetSizeInBytes()
 {
     if (fst == null)
     {
         return 0;
     }

     return fst.GetSizeInBytes();
 }
 /// <summary>
 /// Returns the byte size reported by the FST, or 0 when there is no FST.
 /// </summary>
 public long RamBytesUsed()
 {
     if (fst != null)
     {
         return fst.GetSizeInBytes();
     }

     return 0;
 }
 /// <summary>
 /// Returns approximate RAM bytes used: the FST's byte size, or 0 when there is no FST.
 /// </summary>
 public virtual long RamBytesUsed()
 {
     if (fst == null)
     {
         return 0;
     }

     return fst.GetSizeInBytes();
 }
Example #9
0
        // LUCENENET specific: moved Arc<S> to Builder type

        // NOTE: not many instances of Node or CompiledNode are in
        // memory while the FST is being built; it's only the
        // current "frontier":

        // LUCENENET specific: moved INode to Builder type

        /// <summary>
        /// Returns the size in bytes reported by the FST held in <c>fst</c>.
        /// </summary>
        public virtual long GetFstSizeInBytes()
        {
            long sizeInBytes = fst.GetSizeInBytes();
            return sizeInBytes;
        }
 /// <summary>
 /// Returns approximate RAM bytes used: the FST's byte size, or 0 when there is no FST.
 /// </summary>
 public virtual long RamBytesUsed()
 {
     if (_fst != null)
     {
         return _fst.GetSizeInBytes();
     }

     return 0;
 }
Example #11
0
 /// <summary>
 /// Returns the byte size of the underlying FST, or 0 when there is no FST.
 /// </summary>
 public override long GetSizeInBytes()
 {
     if (fst is null)
     {
         return 0;
     }

     return fst.GetSizeInBytes();
 }
Example #12
0
        /// <summary>
        /// Builds very large FSTs and checks them by lookup and by full enumeration, both
        /// as built and after a save/load round trip through an <see cref="MMapDirectory"/>.
        /// <para/>
        /// The outer loop runs twice (doPack = false, then true). The NoOutputs case only
        /// runs when doPack is false; the ByteSequenceOutputs and PositiveInt32Outputs
        /// cases run on both iterations.
        /// </summary>
        public virtual void Test()
        {
            int[]     ints  = new int[7];
            Int32sRef input = new Int32sRef(ints, 0, ints.Length);
            int       seed  = Random.Next();

            Directory dir = new MMapDirectory(CreateTempDir("2BFST"));

            for (int doPackIter = 0; doPackIter < 2; doPackIter++)
            {
                bool doPack = doPackIter == 1;

                // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
                if (!doPack)
                {
                    Console.WriteLine("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
                    Outputs <object> outputs   = NoOutputs.Singleton;
                    object           NO_OUTPUT = outputs.NoOutput;
                    Builder <object> b         = new Builder <object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    int       count  = 0;
                    Random    r      = new Random(seed);
                    int[]     ints2  = new int[200];
                    Int32sRef input2 = new Int32sRef(ints2, 0, ints2.Length);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        // Positions 10 and up are refilled with random values for every input.
                        for (int i = 10; i < ints2.Length; i++)
                        {
                            ints2[i] = r.Next(256);
                        }
                        b.Add(input2, NO_OUTPUT);
                        count++;
                        if (count % 100000 == 0)
                        {
                            Console.WriteLine(count + ": " + b.GetFstSizeInBytes() + " bytes; " + b.TotStateCount + " nodes");
                        }
                        // Stop once the state count exceeds int.MaxValue by more than 100 * 1024 * 1024.
                        if (b.TotStateCount > int.MaxValue + 100L * 1024 * 1024)
                        {
                            break;
                        }
                        // NextInput is defined outside this method; presumably it advances the
                        // array to the next input -- confirm against its definition.
                        NextInput(r, ints2);
                    }

                    FST <object> fst = b.Finish();

                    // Pass 0 verifies the FST as built and then saves/reloads it;
                    // pass 1 verifies the reloaded copy and then deletes the file.
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        // Reset the input array and re-seed so the build-time sequence is replayed.
                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(NO_OUTPUT, Util.Get(fst, input2));
                            NextInput(r, ints2);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <object> fstEnum = new Int32sRefFSTEnum <object>(fst);

                        Arrays.Fill(ints2, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <object> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            for (int j = 10; j < ints2.Length; j++)
                            {
                                ints2[j] = r.Next(256);
                            }
                            Assert.AreEqual(input2, pair.Input);
                            Assert.AreEqual(NO_OUTPUT, pair.Output);
                            upto++;
                            NextInput(r, ints2);
                        }
                        // The enumeration must yield exactly as many entries as were added.
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <object>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ ByteSequenceOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=bytes");
                    Outputs <BytesRef> outputs = ByteSequenceOutputs.Singleton;
                    Builder <BytesRef> b       = new Builder <BytesRef>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    var      outputBytes = new byte[20];
                    BytesRef output      = new BytesRef(outputBytes);
                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        r.NextBytes(outputBytes);
                        //System.out.println("add: " + input + " -> " + output);
                        // outputBytes is overwritten on every iteration, so the builder is given a copy.
                        b.Add(input, BytesRef.DeepCopyOf(output));
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                        }
                        // LIMIT is declared outside this method (presumably the 3 GB mentioned above -- confirm).
                        if (b.GetFstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <BytesRef> fst = b.Finish();
                    // Pass 0 verifies the FST as built and then saves/reloads it;
                    // pass 1 verifies the reloaded copy and then deletes the file.
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        // Re-seed and reset the input array so the build-time sequence is replayed.
                        r = new Random(seed);
                        Arrays.Fill(ints, 0);

                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }
                            r.NextBytes(outputBytes);
                            Assert.AreEqual(output, Util.Get(fst, input));
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <BytesRef> fstEnum = new Int32sRefFSTEnum <BytesRef>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <BytesRef> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            r.NextBytes(outputBytes);
                            Assert.AreEqual(output, pair.Output);
                            upto++;
                            NextInput(r, ints);
                        }
                        // The enumeration must yield exactly as many entries as were added.
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <BytesRef>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }

                // Build FST w/ PositiveIntOutputs and stop when FST
                // size = 3GB
                {
                    Console.WriteLine("\nTEST: 3 GB size; doPack=" + doPack + " outputs=long");
                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> b       = new Builder <long?>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, int.MaxValue, outputs, null, doPack, PackedInt32s.COMPACT, true, 15);

                    long output = 1;

                    Arrays.Fill(ints, 0);
                    int    count = 0;
                    Random r     = new Random(seed);
                    while (true)
                    {
                        //System.out.println("add: " + input + " -> " + output);
                        b.Add(input, output);
                        // Outputs grow by at least 1 per input, so they are strictly increasing.
                        output += 1 + r.Next(10);
                        count++;
                        if (count % 1000000 == 0)
                        {
                            Console.WriteLine(count + "...: " + b.GetFstSizeInBytes() + " bytes");
                        }
                        // LIMIT is declared outside this method (presumably the 3 GB mentioned above -- confirm).
                        if (b.GetFstSizeInBytes() > LIMIT)
                        {
                            break;
                        }
                        NextInput(r, ints);
                    }

                    FST <long?> fst = b.Finish();

                    // Pass 0 verifies the FST as built and then saves/reloads it;
                    // pass 1 verifies the reloaded copy and then deletes the file.
                    for (int verify = 0; verify < 2; verify++)
                    {
                        Console.WriteLine("\nTEST: now verify [fst size=" + fst.GetSizeInBytes() + "; nodeCount=" + fst.NodeCount + "; arcCount=" + fst.ArcCount + "]");

                        Arrays.Fill(ints, 0);

                        // Restart the output counter and re-seed so the build-time sequence is replayed.
                        output = 1;
                        r      = new Random(seed);
                        for (int i = 0; i < count; i++)
                        {
                            if (i % 1000000 == 0)
                            {
                                Console.WriteLine(i + "...: ");
                            }

                            // forward lookup:
                            Assert.AreEqual(output, (long)Util.Get(fst, input));
                            // reverse lookup:
                            Assert.AreEqual(input, Util.GetByOutput(fst, output));
                            output += 1 + r.Next(10);
                            NextInput(r, ints);
                        }

                        Console.WriteLine("\nTEST: enum all input/outputs");
                        Int32sRefFSTEnum <long?> fstEnum = new Int32sRefFSTEnum <long?>(fst);

                        Arrays.Fill(ints, 0);
                        r = new Random(seed);
                        int upto = 0;
                        output = 1;
                        while (true)
                        {
                            Int32sRefFSTEnum.InputOutput <long?> pair = fstEnum.Next();
                            if (pair == null)
                            {
                                break;
                            }
                            Assert.AreEqual(input, pair.Input);
                            Assert.AreEqual(output, pair.Output.Value);
                            output += 1 + r.Next(10);
                            upto++;
                            NextInput(r, ints);
                        }
                        // The enumeration must yield exactly as many entries as were added.
                        Assert.AreEqual(count, upto);

                        if (verify == 0)
                        {
                            Console.WriteLine("\nTEST: save/load FST and re-verify");
                            IndexOutput @out = dir.CreateOutput("fst", IOContext.DEFAULT);
                            fst.Save(@out);
                            @out.Dispose();
                            IndexInput @in = dir.OpenInput("fst", IOContext.DEFAULT);
                            fst = new FST <long?>(@in, outputs);
                            @in.Dispose();
                        }
                        else
                        {
                            dir.DeleteFile("fst");
                        }
                    }
                }
            }
            dir.Dispose();
        }
Example #13
0
        /// <summary>
        /// Parses the given CSV files, sorts the resulting entries by term and encodes them
        /// into a <see cref="TokenInfoDictionaryWriter"/>, together with an FST that maps
        /// each distinct term to its ordinal.
        /// </summary>
        /// <param name="csvFiles">Paths of the CSV files to read, decoded with the configured encoding.</param>
        /// <returns>The populated dictionary writer, with its FST already set.</returns>
        public virtual TokenInfoDictionaryWriter BuildDictionary(IList <string> csvFiles)
        {
            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

            // all lines in the file
            Console.WriteLine("  parse...");
            List <string[]> lines = new List <string[]>(400000);

            foreach (string file in csvFiles)
            {
                // Dispose the reader as well as the stream so neither outlives the loop iteration.
                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
                using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        string[] entry = CSVUtil.Parse(line);

                        if (entry.Length < 13)
                        {
                            Console.WriteLine("Entry in CSV is not valid: " + line);
                            continue;
                        }

                        string[] formatted = FormatEntry(entry);
                        lines.Add(formatted);

                        // NFKC normalize dictionary entry
                        if (normalizeEntries)
                        {
                            // Already normalized: the entry added above is all that is needed.
                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
                            {
                                continue;
                            }

                            // Otherwise add a second, fully normalized copy of the entry.
                            string[] normalizedEntry = new string[entry.Length];
                            for (int i = 0; i < entry.Length; i++)
                            {
                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                            }

                            formatted = FormatEntry(normalizedEntry);
                            lines.Add(formatted);
                        }
                    }
                }
            }

            Console.WriteLine("  sort...");

            // sort by term: we sorted the files already and need a stable sort so that entries
            // with equal terms keep their relative order. List<T>.Sort is documented as
            // unstable, so use Enumerable.OrderBy, which is documented as stable.
            lines = System.Linq.Enumerable.ToList(
                System.Linq.Enumerable.OrderBy(lines, row => row, new ComparerAnonymousHelper()));

            Console.WriteLine("  encode...");

            PositiveInt32Outputs fstOutput  = PositiveInt32Outputs.Singleton;
            Builder <long?>      fstBuilder = new Builder <long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
            Int32sRef            scratch    = new Int32sRef();
            long   ord       = -1; // first ord will be 0
            string lastValue = null;

            // build tokeninfo dictionary
            foreach (string[] entry in lines)
            {
                int next = dictionary.Put(entry);

                // An unchanged offset is treated as a failure to store the entry.
                if (next == offset)
                {
                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                    continue;
                }

                string token = entry[0];
                if (!token.Equals(lastValue, StringComparison.Ordinal))
                {
                    // new word to add to fst
                    ord++;
                    lastValue = token;
                    scratch.Grow(token.Length);
                    scratch.Length = token.Length;
                    for (int i = 0; i < token.Length; i++)
                    {
                        scratch.Int32s[i] = (int)token[i];
                    }
                    fstBuilder.Add(scratch, ord);
                }

                // Every entry, including repeats of the same term, is mapped from the term's ord.
                dictionary.AddMapping((int)ord, offset);
                offset = next;
            }

            FST <long?> fst = fstBuilder.Finish();

            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
            dictionary.SetFST(fst);
            Console.WriteLine(" done");

            return dictionary;
        }