Code example #1
File: Dictionary.cs  Project: wwb/lucenenet
        /// <summary>
        /// Creates a new Dictionary containing the information read from the provided streams
        /// for the hunspell affix and dictionary files.
        /// The provided streams are not closed; the caller must dispose them.
        /// </summary>
        /// <param name="affix"> Stream for reading the hunspell affix file (won't be closed). </param>
        /// <param name="dictionaries"> Streams for reading the hunspell dictionary files (won't be closed). </param>
        /// <param name="ignoreCase"> <c>true</c> if case should be ignored when matching entries. </param>
        /// <exception cref="IOException"> Can be thrown while reading from the streams. </exception>
        /// <exception cref="ParseException"> Can be thrown if the content of the files does not meet expected formats. </exception>
        public Dictionary(Stream affix, IList<Stream> dictionaries, bool ignoreCase)
        {
            this.ignoreCase          = ignoreCase;
            this.needsInputCleaning  = ignoreCase;
            this.needsOutputCleaning = false; // set if we have an OCONV
            flagLookup.Add(new BytesRef());   // no flags -> ord 0

            FileInfo aff = FileSupport.CreateTempFile("affix", "aff", tempDir);

            using (Stream @out = aff.Open(FileMode.Open, FileAccess.ReadWrite))
            {
                // copy contents of affix stream to temp file
                affix.CopyTo(@out);
            }

            // pass 1: get encoding
            string encoding;

            using (Stream aff1 = aff.Open(FileMode.Open, FileAccess.Read))
            {
                encoding = GetDictionaryEncoding(aff1);
            }

            // pass 2: parse affixes
            Encoding decoder = GetSystemEncoding(encoding);

            using (Stream aff2 = aff.Open(FileMode.Open, FileAccess.Read))
            {
                ReadAffixFile(aff2, decoder);
            }

            // read dictionary entries
            IntSequenceOutputs o = IntSequenceOutputs.Singleton;
            Builder<IntsRef>   b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);

            ReadDictionaryFiles(dictionaries, decoder, b);
            words   = b.Finish();
            aliases = null; // no longer needed

            try
            {
                aff.Delete();
            }
            catch
            {
                // ignore
            }
        }
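
As a usage sketch grounded only in the signature above: the caller opens the affix and dictionary streams, passes them in, and disposes them afterwards, since the constructor documents that it will not close them. The file names here are illustrative placeholders.

        // Hypothetical caller; requires System.IO and System.Collections.Generic.
        using (Stream affixStream = File.OpenRead("en_US.aff"))
        using (Stream dicStream = File.OpenRead("en_US.dic"))
        {
            var dict = new Dictionary(affixStream, new List<Stream> { dicStream }, ignoreCase: true);
            // ... hand dict to a Hunspell-based stemmer; disposing the streams here is
            // safe because the constructor has already read them in full.
        }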
Code example #2
            /// <summary>
            /// Adds an input string and its stemmer override output to this builder.
            /// </summary>
            /// <param name="input"> the input char sequence </param>
            /// <param name="output"> the stemmer override output char sequence </param>
            /// <returns> <c>false</c> if the input has already been added to this builder, otherwise <c>true</c>. </returns>
            public virtual bool Add(string input, string output)
            {
                int length = input.Length;

                if (ignoreCase)
                {
                    // convert on the fly to lowercase
                    charsSpare.Grow(length);
                    char[] buffer = charsSpare.Chars;
                    for (int i = 0; i < length;)
                    {
                        i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
                    }
                    UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                }
                else
                {
                    UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
                }
                if (hash.Add(spare) >= 0)
                {
                    outputValues.Add(output);
                    return true;
                }
                return false;
            }
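
The boolean return value is the duplicate check: a second Add with the same input (case-folded when ignoreCase is set) is rejected. A minimal sketch, assuming the enclosing type is StemmerOverrideFilter.Builder as in the Lucene API:

            var builder = new StemmerOverrideFilter.Builder(ignoreCase: true);
            bool added = builder.Add("running", "run"); // true: new mapping
            bool again = builder.Add("Running", "ran"); // false: folds to "running", already present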
Code example #3
        private void AddOneValue(BytesRef value)
        {
            int termID = hash.Add(value);

            if (termID < 0)
            {
                termID = -termID - 1;
            }
            else
            {
                // reserve additional space for each unique value:
                // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
                //    TODO: can this same OOM happen in THPF?
                // 2. when flushing, we need 1 int per value (slot in the ordMap).
                iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
            }

            if (currentUpto == currentValues.Length)
            {
                currentValues = ArrayUtil.Grow(currentValues, currentValues.Length + 1);
                // reserve additional space for max # values per-doc
                // when flushing, we need an int[] to sort the mapped-ords within the doc
                iwBytesUsed.AddAndGet((currentValues.Length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT32);
            }

            currentValues[currentUpto] = termID;
            currentUpto++;
        }
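
The sign check on termID reflects BytesRefHash.Add's contract: a new value gets the next ord (>= 0), while a duplicate returns -(ord + 1) so the existing ord can be recovered. A standalone sketch:

        var hash = new BytesRefHash();
        int first = hash.Add(new BytesRef("apple")); // 0: new value, ord assigned
        int dup   = hash.Add(new BytesRef("apple")); // -1: duplicate, encodes -(0 + 1)
        int ord   = -dup - 1;                        // recovers 0, same as first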
Code example #4
            public override bool Collect(BytesRef bytes)
            {
                int       e     = terms.Add(bytes);
                TermState state = termsEnum.GetTermState();

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(state != null);
                }
                if (e < 0)
                {
                    // duplicate term: update docFreq
                    int pos = (-e) - 1;
                    array.termState[pos].Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(array.boost[pos] == boostAtt.Boost, "boost should be equal in all segment TermsEnums");
                    }
                }
                else
                {
                    // new entry: we populate the entry initially
                    array.boost[e]     = boostAtt.Boost;
                    array.termState[e] = new TermContext(m_topReaderContext, state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    outerInstance.CheckMaxClauseCount(terms.Count);
                }
                return true;
            }
Code example #5
            public override bool Collect(BytesRef bytes)
            {
                int       e     = terms.Add(bytes);
                TermState state = termsEnum.GetTermState();

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(state != null);
                }
                if (e < 0)
                {
                    // duplicate term: update docFreq
                    int pos = (-e) - 1;
                    array.termState[pos].Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(NumericUtils.SingleToSortableInt32(array.boost[pos]) == NumericUtils.SingleToSortableInt32(boostAtt.Boost), "boost should be equal in all segment TermsEnums");
                    }
                }
                else
                {
                    // new entry: we populate the entry initially
                    array.boost[e]     = boostAtt.Boost;
                    array.termState[e] = new TermContext(m_topReaderContext, state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    outerInstance.CheckMaxClauseCount(terms.Count);
                }
                return true;
            }
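
The bit comparison in this variant addresses a real x86 pitfall: under the legacy .NET Framework JIT with optimizations enabled, one float can sit in an 80-bit x87 register while the other has been truncated to 32 bits, so `==` can fail for logically equal boosts. NumericUtils.SingleToSortableInt32 forces both values through the same 32-bit representation first. A sketch, where GetBoost is a hypothetical source of the two values:

                float a = GetBoost(); // hypothetical
                float b = GetBoost(); // hypothetical; logically equal to a
                // May be false on x86 .NET Framework when one value is still in an x87 register:
                bool direct = a == b;
                // Stable: both floats are reduced to the same sortable 32-bit integer form first.
                bool viaBits = NumericUtils.SingleToSortableInt32(a) == NumericUtils.SingleToSortableInt32(b);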
Code example #6
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                // first convert to UTF-8
                if (numInputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (input.Length <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(input.Length), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (numOutputWords <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }
                if (output.Length <= 0)
                {
                    throw new ArgumentOutOfRangeException(nameof(output.Length), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(!HasHoles(input), "input has holes: {0}", input);
                    Debugging.Assert(!HasHoles(output), "output has holes: {0}", output);
                }

                //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                // lookup in hash
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash
                    ord = (-ord) - 1;
                    //System.out.println("  output=" + output + " old ord=" + ord);
                }
                else
                {
                    //System.out.println("  output=" + output + " new ord=" + ord);
                }

                if (!workingSet.TryGetValue(input, out MapEntry e) || e is null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
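
Callers do not invoke this worker directly; the word counts are recomputed internally, as the NOTE above explains. Typical usage goes through the public Builder.Add overload, sketched here under the assumption that Lucene.Net mirrors the Lucene SynonymMap.Builder API (Join concatenates words with the unit separator this worker expects):

            var builder = new SynonymMap.Builder(dedup: true);
            CharsRef input  = SynonymMap.Builder.Join(new[] { "usa" }, new CharsRef());
            CharsRef output = SynonymMap.Builder.Join(new[] { "united", "states" }, new CharsRef());
            builder.Add(input, output, includeOrig: true); // delegates to the internal worker above
            SynonymMap map = builder.Build();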
Code example #7
File: SynonymMap.cs  Project: sycct/lucenenet
            // NOTE: while it's tempting to make this public, since
            // caller's parser likely knows the
            // numInput/numOutputWords, sneaky exceptions, much later
            // on, will result if these values are wrong; so we always
            // recompute ourselves to be safe:
            internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
            {
                // first convert to UTF-8
                if (numInputWords <= 0)
                {
                    throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
                }
                if (input.Length <= 0)
                {
                    throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
                }
                if (numOutputWords <= 0)
                {
                    throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
                }
                if (output.Length <= 0)
                {
                    throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
                }

                Debug.Assert(!HasHoles(input), "input has holes: " + input);
                Debug.Assert(!HasHoles(output), "output has holes: " + output);

                //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
                UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
                // lookup in hash
                int ord = words.Add(utf8Scratch);

                if (ord < 0)
                {
                    // already exists in our hash
                    ord = (-ord) - 1;
                    //System.out.println("  output=" + output + " old ord=" + ord);
                }
                else
                {
                    //System.out.println("  output=" + output + " new ord=" + ord);
                }

                MapEntry e = workingSet.ContainsKey(input) ? workingSet[input] : null;

                if (e == null)
                {
                    e = new MapEntry();
                    workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
                }

                e.ords.Add(ord);
                e.includeOrig       |= includeOrig;
                maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
                maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
            }
Code example #8
        private void AddOneValue(BytesRef value)
        {
            int termID = hash.Add(value);

            if (termID < 0)
            {
                termID = -termID - 1;
            }
            else
            {
                // reserve additional space for each unique value:
                // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
                //    TODO: can this same OOM happen in THPF?
                // 2. when flushing, we need 1 int per value (slot in the ordMap).
                iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
            }

            pending.Add(termID);
            UpdateBytesUsed();
        }
Code example #9
            public override bool Collect(BytesRef bytes)
            {
                int       e     = Terms.Add(bytes);
                TermState state = TermsEnum.TermState();

                Debug.Assert(state != null);
                if (e < 0)
                {
                    // duplicate term: update docFreq
                    int pos = (-e) - 1;
                    Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                    Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
                }
                else
                {
                    // new entry: we populate the entry initially
                    Array.Boost[e]     = BoostAtt.Boost;
                    Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
                    OuterInstance.CheckMaxClauseCount(Terms.Size());
                }
                return true;
            }
Code example #10
        // [Test] // LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass
        public virtual void TestRandomSortedBytes()
        {
            Directory dir = NewDirectory();
            IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
            if (!DefaultCodecSupportsDocsWithField())
            {
                // if the codec doesn't support missing, we expect missing to be mapped to byte[]
                // by the impersonator, but we have to give it a chance to merge them to this
                cfg.SetMergePolicy(NewLogMergePolicy());
            }
            RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
            int numDocs = AtLeast(100);
            BytesRefHash hash = new BytesRefHash();
            IDictionary<string, string> docToString = new Dictionary<string, string>();
            int maxLength = TestUtil.NextInt(Random(), 1, 50);
            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                doc.Add(NewTextField("id", "" + i, Field.Store.YES));
                string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
                BytesRef br = new BytesRef(@string);
                doc.Add(new SortedDocValuesField("field", br));
                hash.Add(br);
                docToString["" + i] = @string;
                w.AddDocument(doc);
            }
            if (Rarely())
            {
                w.Commit();
            }
            int numDocsNoValue = AtLeast(10);
            for (int i = 0; i < numDocsNoValue; i++)
            {
                Document doc = new Document();
                doc.Add(NewTextField("id", "noValue", Field.Store.YES));
                w.AddDocument(doc);
            }
            if (!DefaultCodecSupportsDocsWithField())
            {
                BytesRef bytesRef = new BytesRef();
                hash.Add(bytesRef); // add empty value for the gaps
            }
            if (Rarely())
            {
                w.Commit();
            }
            if (!DefaultCodecSupportsDocsWithField())
            {
                // if the codec doesn't support missing, we expect missing to be mapped to byte[]
                // by the impersonator, but we have to give it a chance to merge them to this
                w.ForceMerge(1);
            }
            for (int i = 0; i < numDocs; i++)
            {
                Document doc = new Document();
                string id = "" + i + numDocs;
                doc.Add(NewTextField("id", id, Field.Store.YES));
                string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
                BytesRef br = new BytesRef(@string);
                hash.Add(br);
                docToString[id] = @string;
                doc.Add(new SortedDocValuesField("field", br));
                w.AddDocument(doc);
            }
            w.Commit();
            IndexReader reader = w.Reader;
            SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
            int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            BytesRef expected = new BytesRef();
            BytesRef actual = new BytesRef();
            Assert.AreEqual(hash.Size(), docValues.ValueCount);
            for (int i = 0; i < hash.Size(); i++)
            {
                hash.Get(sort[i], expected);
                docValues.LookupOrd(i, actual);
                Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
                int ord = docValues.LookupTerm(expected);
                Assert.AreEqual(i, ord);
            }
            AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);

            foreach (KeyValuePair<string, string> entry in docToString)
            {
                // pk lookup
                DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
                int docId = termDocsEnum.NextDoc();
                expected = new BytesRef(entry.Value);
                docValues.Get(docId, actual);
                Assert.AreEqual(expected, actual);
            }

            reader.Dispose();
            w.Dispose();
            dir.Dispose();
        }
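
The verification loop above works because BytesRefHash.Sort returns the ords reordered by the comparer, while SortedDocValues assigns its ords in the same unicode sort order. A compact sketch of that correspondence, with illustrative values:

            var check = new BytesRefHash();
            check.Add(new BytesRef("b")); // ord 0 (insertion order)
            check.Add(new BytesRef("a")); // ord 1
            int[] sorted = check.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
            // sorted[0] == 1 and sorted[1] == 0: "a" sorts before "b".
            BytesRef scratch = new BytesRef();
            check.Get(sorted[0], scratch); // scratch now holds "a"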