Пример #1
0
        private OfflineSorter.ByteSequencesReader Sort()
        {
            string        prefix    = this.GetType().Name;
            DirectoryInfo directory = OfflineSorter.DefaultTempDir();

            tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            var  writer  = new OfflineSorter.ByteSequencesWriter(tempInput);
            bool success = false;

            try
            {
                BytesRef spare;
                byte[]   buffer = new byte[0];
                var      output = new ByteArrayDataOutput(buffer);

                while ((spare = source.Next()) != null)
                {
                    Encode(writer, output, buffer, spare, source.Payload, source.Contexts, source.Weight);
                }
                writer.Dispose();
                (new OfflineSorter(tieBreakByCostComparer)).Sort(tempInput, tempSorted);
                var reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                success = true;
                return(reader);
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(writer);
                }
                else
                {
                    try
                    {
                        IOUtils.DisposeWhileHandlingException(writer);
                    }
                    finally
                    {
                        Dispose();
                    }
                }
            }
        }
Пример #2
0
        private OfflineSorter.ByteSequencesReader Sort()
        {
            string        prefix    = this.GetType().Name;
            DirectoryInfo directory = OfflineSorter.DefaultTempDir();

            tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            var  writer  = new OfflineSorter.ByteSequencesWriter(tempInput);
            bool success = false;

            try
            {
                byte[] buffer = Arrays.Empty <byte>();
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

                while (source.MoveNext())
                {
                    Encode(writer, output, buffer, source.Current, source.Weight);
                }
                writer.Dispose();
                (new OfflineSorter(tieBreakByCostComparer)).Sort(tempInput, tempSorted);
                OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                success = true;
                return(reader);
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(writer);
                }
                else
                {
                    try
                    {
                        IOUtils.DisposeWhileHandlingException(writer);
                    }
                    finally
                    {
                        Close();
                    }
                }
            }
        }
Пример #3
0
        public override void Build(IInputEnumerator enumerator)
        {
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }
            string prefix     = this.GetType().Name;
            var    directory  = OfflineSorter.DefaultTempDir();
            var    tempInput  = FileSupport.CreateTempFile(prefix, ".input", directory);
            var    tempSorted = FileSupport.CreateTempFile(prefix, ".sorted", directory);

            hasPayloads = enumerator.HasPayloads;

            var writer = new OfflineSorter.ByteSequencesWriter(tempInput);

            OfflineSorter.ByteSequencesReader reader = null;
            var scratch = new BytesRef();

            TokenStreamToAutomaton ts2a = GetTokenStreamToAutomaton();

            bool success = false;

            count = 0;
            byte[] buffer = new byte[8];
            try
            {
                var      output = new ByteArrayDataOutput(buffer);
                BytesRef surfaceForm;

                while (enumerator.MoveNext())
                {
                    surfaceForm = enumerator.Current;
                    ISet <Int32sRef> paths = ToFiniteStrings(surfaceForm, ts2a);

                    maxAnalyzedPathsForOneInput = Math.Max(maxAnalyzedPathsForOneInput, paths.Count);

                    foreach (Int32sRef path in paths)
                    {
                        Util.Fst.Util.ToBytesRef(path, scratch);

                        // length of the analyzed text (FST input)
                        if (scratch.Length > ushort.MaxValue - 2)
                        {
                            throw new ArgumentException("cannot handle analyzed forms > " + (ushort.MaxValue - 2) +
                                                        " in length (got " + scratch.Length + ")");
                        }
                        ushort analyzedLength = (ushort)scratch.Length;

                        // compute the required length:
                        // analyzed sequence + weight (4) + surface + analyzedLength (short)
                        int requiredLength = analyzedLength + 4 + surfaceForm.Length + 2;

                        BytesRef payload;

                        if (hasPayloads)
                        {
                            if (surfaceForm.Length > (ushort.MaxValue - 2))
                            {
                                throw new ArgumentException("cannot handle surface form > " + (ushort.MaxValue - 2) +
                                                            " in length (got " + surfaceForm.Length + ")");
                            }
                            payload = enumerator.Payload;
                            // payload + surfaceLength (short)
                            requiredLength += payload.Length + 2;
                        }
                        else
                        {
                            payload = null;
                        }

                        buffer = ArrayUtil.Grow(buffer, requiredLength);

                        output.Reset(buffer);

                        output.WriteInt16((short)analyzedLength);

                        output.WriteBytes(scratch.Bytes, scratch.Offset, scratch.Length);

                        output.WriteInt32(EncodeWeight(enumerator.Weight));

                        if (hasPayloads)
                        {
                            for (int i = 0; i < surfaceForm.Length; i++)
                            {
                                if (surfaceForm.Bytes[i] == PAYLOAD_SEP)
                                {
                                    throw new ArgumentException(
                                              "surface form cannot contain unit separator character U+001F; this character is reserved");
                                }
                            }
                            output.WriteInt16((short)surfaceForm.Length);
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                            output.WriteBytes(payload.Bytes, payload.Offset, payload.Length);
                        }
                        else
                        {
                            output.WriteBytes(surfaceForm.Bytes, surfaceForm.Offset, surfaceForm.Length);
                        }

                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(output.Position == requiredLength, () => output.Position + " vs " + requiredLength);
                        }

                        writer.Write(buffer, 0, output.Position);
                    }
                    count++;
                }
                writer.Dispose();

                // Sort all input/output pairs (required by FST.Builder):
                (new OfflineSorter(new AnalyzingComparer(hasPayloads))).Sort(tempInput, tempSorted);

                // Free disk space:
                tempInput.Delete();

                reader = new OfflineSorter.ByteSequencesReader(tempSorted);

                var outputs = new PairOutputs <long?, BytesRef>(PositiveInt32Outputs.Singleton,
                                                                ByteSequenceOutputs.Singleton);
                var builder = new Builder <PairOutputs <long?, BytesRef> .Pair>(FST.INPUT_TYPE.BYTE1, outputs);

                // Build FST:
                BytesRef  previousAnalyzed = null;
                BytesRef  analyzed         = new BytesRef();
                BytesRef  surface          = new BytesRef();
                Int32sRef scratchInts      = new Int32sRef();
                var       input            = new ByteArrayDataInput();

                // Used to remove duplicate surface forms (but we
                // still index the hightest-weight one).  We clear
                // this when we see a new analyzed form, so it cannot
                // grow unbounded (at most 256 entries):
                var seenSurfaceForms = new JCG.HashSet <BytesRef>();

                var dedup = 0;
                while (reader.Read(scratch))
                {
                    input.Reset(scratch.Bytes, scratch.Offset, scratch.Length);
                    ushort analyzedLength = (ushort)input.ReadInt16();
                    analyzed.Grow(analyzedLength + 2);
                    input.ReadBytes(analyzed.Bytes, 0, analyzedLength);
                    analyzed.Length = analyzedLength;

                    long cost = input.ReadInt32();

                    surface.Bytes = scratch.Bytes;
                    if (hasPayloads)
                    {
                        surface.Length = (ushort)input.ReadInt16();
                        surface.Offset = input.Position;
                    }
                    else
                    {
                        surface.Offset = input.Position;
                        surface.Length = scratch.Length - surface.Offset;
                    }

                    if (previousAnalyzed == null)
                    {
                        previousAnalyzed = new BytesRef();
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else if (analyzed.Equals(previousAnalyzed))
                    {
                        dedup++;
                        if (dedup >= maxSurfaceFormsPerAnalyzedForm)
                        {
                            // More than maxSurfaceFormsPerAnalyzedForm
                            // dups: skip the rest:
                            continue;
                        }
                        if (seenSurfaceForms.Contains(surface))
                        {
                            continue;
                        }
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }
                    else
                    {
                        dedup = 0;
                        previousAnalyzed.CopyBytes(analyzed);
                        seenSurfaceForms.Clear();
                        seenSurfaceForms.Add(BytesRef.DeepCopyOf(surface));
                    }

                    // TODO: I think we can avoid the extra 2 bytes when
                    // there is no dup (dedup==0), but we'd have to fix
                    // the exactFirst logic ... which would be sort of
                    // hairy because we'd need to special case the two
                    // (dup/not dup)...

                    // NOTE: must be byte 0 so we sort before whatever
                    // is next
                    analyzed.Bytes[analyzed.Offset + analyzed.Length]     = 0;
                    analyzed.Bytes[analyzed.Offset + analyzed.Length + 1] = (byte)dedup;
                    analyzed.Length += 2;

                    Util.Fst.Util.ToInt32sRef(analyzed, scratchInts);
                    //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
                    if (!hasPayloads)
                    {
                        builder.Add(scratchInts, outputs.NewPair(cost, BytesRef.DeepCopyOf(surface)));
                    }
                    else
                    {
                        int      payloadOffset = input.Position + surface.Length;
                        int      payloadLength = scratch.Length - payloadOffset;
                        BytesRef br            = new BytesRef(surface.Length + 1 + payloadLength);
                        Array.Copy(surface.Bytes, surface.Offset, br.Bytes, 0, surface.Length);
                        br.Bytes[surface.Length] = PAYLOAD_SEP;
                        Array.Copy(scratch.Bytes, payloadOffset, br.Bytes, surface.Length + 1, payloadLength);
                        br.Length = br.Bytes.Length;
                        builder.Add(scratchInts, outputs.NewPair(cost, br));
                    }
                }
                fst = builder.Finish();

                //Util.dotToFile(fst, "/tmp/suggest.dot");

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(reader, writer);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(reader, writer);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
Пример #4
0
 /// <summary>
 /// Will buffer all sequences to a temporary file and then sort (all on-disk).
 /// </summary>
 public ExternalRefSorter(OfflineSorter sort)
 {
     this.sort   = sort;
     this.input  = new FileInfo(Path.GetTempFileName());
     this.writer = new OfflineSorter.ByteSequencesWriter(input);
 }
Пример #5
0
        public override void Build(IInputIterator iterator)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }
            FileInfo tempInput  = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.DefaultTempDir());
            FileInfo tempSorted = FileSupport.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.DefaultTempDir());

            OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
            OfflineSorter.ByteSequencesReader reader = null;
            ExternalRefSorter sorter = null;

            // Push floats up front before sequences to sort them. For now, assume they are non-negative.
            // If negative floats are allowed some trickery needs to be done to find their byte order.
            bool success = false;

            count = 0;
            try
            {
                byte[] buffer = new byte[0];
                ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
                BytesRef            spare;
                while ((spare = iterator.Next()) != null)
                {
                    if (spare.Length + 4 >= buffer.Length)
                    {
                        buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
                    }

                    output.Reset(buffer);
                    output.WriteInt32(EncodeWeight(iterator.Weight));
                    output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
                    writer.Write(buffer, 0, output.Position);
                }
                writer.Dispose();

                // We don't know the distribution of scores and we need to bucket them, so we'll sort
                // and divide into equal buckets.
                OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
                tempInput.Delete();
                FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);

                int inputLines = info.Lines;
                reader = new OfflineSorter.ByteSequencesReader(tempSorted);
                long line                = 0;
                int  previousBucket      = 0;
                int  previousScore       = 0;
                ByteArrayDataInput input = new ByteArrayDataInput();
                BytesRef           tmp1  = new BytesRef();
                BytesRef           tmp2  = new BytesRef();
                while (reader.Read(tmp1))
                {
                    input.Reset(tmp1.Bytes);
                    int currentScore = input.ReadInt32();

                    int bucket;
                    if (line > 0 && currentScore == previousScore)
                    {
                        bucket = previousBucket;
                    }
                    else
                    {
                        bucket = (int)(line * buckets / inputLines);
                    }
                    previousScore  = currentScore;
                    previousBucket = bucket;

                    // Only append the input, discard the weight.
                    tmp2.Bytes  = tmp1.Bytes;
                    tmp2.Offset = input.Position;
                    tmp2.Length = tmp1.Length - input.Position;
                    builder.Add(tmp2, bucket);

                    line++;
                    count++;
                }

                // The two FSTCompletions share the same automaton.
                this.higherWeightsCompletion = builder.Build();
                this.normalCompletion        = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);

                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Close(reader, writer, sorter);
                }
                else
                {
                    IOUtils.CloseWhileHandlingException(reader, writer, sorter);
                }

                tempInput.Delete();
                tempSorted.Delete();
            }
        }
Пример #6
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        ///  amount of temporary RAM while building.  Note that
        ///  the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputIterator iterator, double ramBufferSizeMB)
        {
            if (iterator.HasPayloads)
            {
                throw new System.ArgumentException("this suggester doesn't support payloads");
            }
            if (iterator.HasContexts)
            {
                throw new System.ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();
            // TODO: messy ... java7 has Files.createTempDirectory
            // ... but 4.x is java6:
            DirectoryInfo tempIndexPath = null;
            Random        random        = new Random();

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            using (Directory dir = FSDirectory.Open(tempIndexPath))
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (true)
                    {
                        BytesRef surfaceForm = iterator.Next();
                        if (surfaceForm == null)
                        {
                            break;
                        }
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }
                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetIterator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (true)
                    {
                        BytesRef term = termsEnum.Next();
                        if (term == null)
                        {
                            break;
                        }
                        int ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new System.ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    try
                    {
                        if (success)
                        {
                            IOUtils.Dispose(writer, reader);
                        }
                        else
                        {
                            IOUtils.DisposeWhileHandlingException(writer, reader);
                        }
                    }
                    finally
                    {
                        foreach (string file in dir.ListAll())
                        {
                            FileInfo path = new FileInfo(Path.Combine(tempIndexPath.FullName, file));
                            try
                            {
                                path.Delete();
                            }
                            catch (Exception e)
                            {
                                throw new InvalidOperationException("failed to remove " + path, e);
                            }
                        }

                        try
                        {
                            tempIndexPath.Delete();
                        }
                        catch (Exception e)
                        {
                            throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                        }
                    }
                }
            }
        }
Пример #7
0
        /// <summary>
        /// Build the suggest index, using up to the specified
        /// amount of temporary RAM while building.  Note that
        /// the weights for the suggestions are ignored.
        /// </summary>
        public virtual void Build(IInputEnumerator enumerator, double ramBufferSizeMB)
        {
            if (enumerator.HasPayloads)
            {
                throw new ArgumentException("this suggester doesn't support payloads");
            }
            if (enumerator.HasContexts)
            {
                throw new ArgumentException("this suggester doesn't support contexts");
            }

            string prefix    = this.GetType().Name;
            var    directory = OfflineSorter.DefaultTempDir();

            // LUCENENET specific - using GetRandomFileName() instead of picking a random int
            DirectoryInfo tempIndexPath = null;

            while (true)
            {
                tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + Path.GetFileNameWithoutExtension(Path.GetRandomFileName())));
                tempIndexPath.Create();
                if (System.IO.Directory.Exists(tempIndexPath.FullName))
                {
                    break;
                }
            }

            Directory dir = FSDirectory.Open(tempIndexPath);

            try
            {
#pragma warning disable 612, 618
                IndexWriterConfig iwc = new IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, indexAnalyzer);
#pragma warning restore 612, 618
                iwc.SetOpenMode(OpenMode.CREATE);
                iwc.SetRAMBufferSizeMB(ramBufferSizeMB);
                IndexWriter writer = new IndexWriter(dir, iwc);

                var ft = new FieldType(TextField.TYPE_NOT_STORED);
                // TODO: if only we had IndexOptions.TERMS_ONLY...
                ft.IndexOptions = IndexOptions.DOCS_AND_FREQS;
                ft.OmitNorms    = true;
                ft.Freeze();

                Document doc   = new Document();
                Field    field = new Field("body", "", ft);
                doc.Add(field);

                totTokens = 0;
                IndexReader reader = null;

                bool success = false;
                count = 0;
                try
                {
                    while (enumerator.MoveNext())
                    {
                        BytesRef surfaceForm = enumerator.Current;
                        field.SetStringValue(surfaceForm.Utf8ToString());
                        writer.AddDocument(doc);
                        count++;
                    }

                    reader = DirectoryReader.Open(writer, false);

                    Terms terms = MultiFields.GetTerms(reader, "body");
                    if (terms == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }

                    // Move all ngrams into an FST:
                    TermsEnum termsEnum = terms.GetEnumerator(null);

                    Outputs <long?> outputs = PositiveInt32Outputs.Singleton;
                    Builder <long?> builder = new Builder <long?>(FST.INPUT_TYPE.BYTE1, outputs);

                    Int32sRef scratchInts = new Int32sRef();
                    while (termsEnum.MoveNext())
                    {
                        BytesRef term       = termsEnum.Term;
                        int      ngramCount = CountGrams(term);
                        if (ngramCount > grams)
                        {
                            throw new ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
                        }
                        if (ngramCount == 1)
                        {
                            totTokens += termsEnum.TotalTermFreq;
                        }

                        builder.Add(Lucene.Net.Util.Fst.Util.ToInt32sRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq));
                    }

                    fst = builder.Finish();
                    if (fst == null)
                    {
                        throw new ArgumentException("need at least one suggestion");
                    }
                    //System.out.println("FST: " + fst.getNodeCount() + " nodes");

                    /*
                     * PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
                     * Util.toDot(fst, pw, true, true);
                     * pw.close();
                     */

                    success = true;
                }
                finally
                {
                    if (success)
                    {
                        IOUtils.Dispose(writer, reader);
                    }
                    else
                    {
                        IOUtils.DisposeWhileHandlingException(writer, reader);
                    }
                }
            }
            finally
            {
                try
                {
                    IOUtils.Dispose(dir);
                }
                finally
                {
                    // LUCENENET specific - since we are removing the entire directory anyway,
                    // it doesn't make sense to first do a loop in order remove the files.
                    // Let the System.IO.Directory.Delete() method handle that.
                    // We also need to dispose the Directory instance first before deleting from disk.
                    try
                    {
                        System.IO.Directory.Delete(tempIndexPath.FullName, true);
                    }
                    catch (Exception e)
                    {
                        throw new InvalidOperationException("failed to remove " + tempIndexPath, e);
                    }
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Reads the dictionary file through the provided InputStreams, building up the words map
        /// </summary>
        /// <param name="dictionaries"> InputStreams to read the dictionary file through </param>
        /// <param name="decoder"> CharsetDecoder used to decode the contents of the file </param>
        /// <exception cref="IOException"> Can be thrown while reading from the file </exception>
        private void ReadDictionaryFiles(IList <Stream> dictionaries, Encoding decoder, Builder <IntsRef> words)
        {
            BytesRef flagsScratch = new BytesRef();
            IntsRef  scratchInts  = new IntsRef();

            StringBuilder sb = new StringBuilder();

            FileInfo unsorted = FileSupport.CreateTempFile("unsorted", "dat", tempDir);

            using (OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(unsorted))
            {
                foreach (Stream dictionary in dictionaries)
                {
                    var    lines = new StreamReader(dictionary, decoder);
                    string line  = lines.ReadLine(); // first line is number of entries (approximately, sometimes)

                    while ((line = lines.ReadLine()) != null)
                    {
                        line = UnescapeEntry(line);
                        if (needsInputCleaning)
                        {
                            int flagSep = line.LastIndexOf(FLAG_SEPARATOR);
                            if (flagSep == -1)
                            {
                                string cleansed = CleanInput(line, sb);
                                writer.Write(cleansed.ToString().GetBytes(Encoding.UTF8));
                            }
                            else
                            {
                                string text     = line.Substring(0, flagSep - 0);
                                string cleansed = CleanInput(text, sb);
                                if (cleansed != sb.ToString())
                                {
                                    sb.Length = 0;
                                    sb.Append(cleansed);
                                }
                                sb.Append(line.Substring(flagSep));
                                writer.Write(sb.ToString().GetBytes(Encoding.UTF8));
                            }
                        }
                        else
                        {
                            writer.Write(line.GetBytes(Encoding.UTF8));
                        }
                    }
                }
            }

            FileInfo sorted = FileSupport.CreateTempFile("sorted", "dat", tempDir);

            OfflineSorter sorter = new OfflineSorter(new ComparatorAnonymousInnerClassHelper(this));

            sorter.Sort(unsorted, sorted);
            try
            {
                unsorted.Delete();
            }
            catch
            {
                // ignore
            }

            using (OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(sorted))
            {
                BytesRef scratchLine = new BytesRef();

                // TODO: the flags themselves can be double-chars (long) or also numeric
                // either way the trick is to encode them as char... but they must be parsed differently

                string  currentEntry = null;
                IntsRef currentOrds  = new IntsRef();

                string line2;
                while (reader.Read(scratchLine))
                {
                    line2 = scratchLine.Utf8ToString();
                    string entry;
                    char[] wordForm;

                    int flagSep = line2.LastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1)
                    {
                        wordForm = NOFLAGS;
                        entry    = line2;
                    }
                    else
                    {
                        // note, there can be comments (morph description) after a flag.
                        // we should really look for any whitespace: currently just tab and space
                        int end = line2.IndexOf('\t', flagSep);
                        if (end == -1)
                        {
                            end = line2.Length;
                        }
                        int end2 = line2.IndexOf(' ', flagSep);
                        if (end2 == -1)
                        {
                            end2 = line2.Length;
                        }
                        end = Math.Min(end, end2);

                        string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1));
                        if (aliasCount > 0)
                        {
                            flagPart = GetAliasValue(int.Parse(flagPart, CultureInfo.InvariantCulture));
                        }

                        wordForm = flagParsingStrategy.ParseFlags(flagPart);
                        Array.Sort(wordForm);
                        entry = line2.Substring(0, flagSep - 0);
                    }
                    // LUCENENET NOTE: CompareToOrdinal is an extension method that works similarly to
                    // Java's String.compareTo method.
                    int cmp = currentEntry == null ? 1 : entry.CompareToOrdinal(currentEntry);
                    if (cmp < 0)
                    {
                        throw new System.ArgumentException("out of order: " + entry + " < " + currentEntry);
                    }
                    else
                    {
                        EncodeFlags(flagsScratch, wordForm);
                        int ord = flagLookup.Add(flagsScratch);
                        if (ord < 0)
                        {
                            // already exists in our hash
                            ord = (-ord) - 1;
                        }
                        // finalize current entry, and switch "current" if necessary
                        if (cmp > 0 && currentEntry != null)
                        {
                            Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                            words.Add(scratchInts, currentOrds);
                        }
                        // swap current
                        if (cmp > 0 || currentEntry == null)
                        {
                            currentEntry = entry;
                            currentOrds  = new IntsRef(); // must be this way
                        }
                        currentOrds.Grow(currentOrds.Length + 1);
                        currentOrds.Ints[currentOrds.Length++] = ord;
                    }
                }

                // finalize last entry
                Lucene.Net.Util.Fst.Util.ToUTF32(currentEntry, scratchInts);
                words.Add(scratchInts, currentOrds);
            }
            try
            {
                sorted.Delete();
            }
            catch
            {
                // ignore
            }
        }
Пример #9
0
 /// <summary>
 /// Will buffer all sequences to a temporary file and then sort (all on-disk).
 /// </summary>
 public ExternalRefSorter(OfflineSorter sort)
 {
     this.sort = sort;
     this.input = new FileInfo(Path.GetTempFileName());
     this.writer = new OfflineSorter.ByteSequencesWriter(input);
 }
Пример #10
0
 public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader, IComparer<BytesRef> comparator)
 {
     this.reader = reader;
     this.comparator = comparator;
 }