/// <summary>
/// Creates a new Dictionary containing the information read from the provided streams for the hunspell affix
/// and dictionary files.
/// You have to dispose the provided streams yourself.
/// </summary>
/// <param name="affix"> Stream for reading the hunspell affix file (won't be closed). </param>
/// <param name="dictionaries"> Streams for reading the hunspell dictionary files (won't be closed). </param>
/// <param name="ignoreCase"> ignore case? </param>
/// <exception cref="IOException"> Can be thrown while reading from the streams </exception>
/// <exception cref="ParseException"> Can be thrown if the content of the files does not meet expected formats </exception>
public Dictionary(Stream affix, IList<Stream> dictionaries, bool ignoreCase)
{
    this.ignoreCase = ignoreCase;
    this.needsInputCleaning = ignoreCase;
    this.needsOutputCleaning = false; // set if we have an OCONV
    flagLookup.Add(new BytesRef()); // no flags -> ord 0

    FileInfo aff = FileSupport.CreateTempFile("affix", "aff", tempDir);
    using (Stream @out = aff.Open(FileMode.Open, FileAccess.ReadWrite))
    {
        // copy contents of affix stream to temp file
        affix.CopyTo(@out);
    }

    // pass 1: get encoding
    string encoding;
    using (Stream aff1 = aff.Open(FileMode.Open, FileAccess.Read))
    {
        encoding = GetDictionaryEncoding(aff1);
    }

    // pass 2: parse affixes
    Encoding decoder = GetSystemEncoding(encoding);
    using (Stream aff2 = aff.Open(FileMode.Open, FileAccess.Read))
    {
        ReadAffixFile(aff2, decoder);
    }

    // read dictionary entries
    IntSequenceOutputs o = IntSequenceOutputs.Singleton;
    Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
    ReadDictionaryFiles(dictionaries, decoder, b);
    words = b.Finish();
    aliases = null; // no longer needed

    try
    {
        aff.Delete();
    }
    catch
    {
        // ignore
    }
}
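// Usage sketch for the constructor above (an assumption, not from the source):
// the file names are illustrative, and the caller keeps ownership of the streams
// because the constructor does not close them.
private static void BuildHunspellDictionaryExample()
{
    using (Stream affixStream = File.OpenRead("en_US.aff"))  // hypothetical path
    using (Stream dictStream = File.OpenRead("en_US.dic"))   // hypothetical path
    {
        var dictionary = new Dictionary(affixStream, new List<Stream> { dictStream }, ignoreCase: true);
        // safe to dispose the streams here; the constructor copied what it needed to a temp file
    }
}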
/// <summary>
/// Adds an input string and its stemmer override output to this builder.
/// </summary>
/// <param name="input"> the input char sequence </param>
/// <param name="output"> the stemmer override output char sequence </param>
/// <returns> <c>false</c> if the input has already been added to this builder; otherwise <c>true</c>. </returns>
public virtual bool Add(string input, string output)
{
    int length = input.Length;
    if (ignoreCase)
    {
        // convert on the fly to lowercase
        charsSpare.Grow(length);
        char[] buffer = charsSpare.Chars;
        for (int i = 0; i < length;)
        {
            i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i);
        }
        UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
    }
    else
    {
        UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
    }
    if (hash.Add(spare) >= 0)
    {
        outputValues.Add(output);
        return true;
    }
    return false;
}
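// Illustrative driver for the Add method above, assuming it lives on
// StemmerOverrideFilter.Builder (Lucene.Net.Analysis.Miscellaneous); with
// ignoreCase enabled, inputs that lowercase to the same key count as duplicates.
private static void StemmerOverrideBuilderExample()
{
    var builder = new StemmerOverrideFilter.Builder(ignoreCase: true);
    bool added = builder.Add("running", "run");     // true: first time this input key is seen
    bool duplicate = builder.Add("RUNNING", "run"); // false: lowercases to the same key, already present
}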
private void AddOneValue(BytesRef value)
{
    int termID = hash.Add(value);
    if (termID < 0)
    {
        termID = -termID - 1;
    }
    else
    {
        // reserve additional space for each unique value:
        // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
        //    TODO: can this same OOM happen in THPF?
        // 2. when flushing, we need 1 int per value (slot in the ordMap).
        iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
    }

    if (currentUpto == currentValues.Length)
    {
        currentValues = ArrayUtil.Grow(currentValues, currentValues.Length + 1);
        // reserve additional space for max # values per-doc
        // when flushing, we need an int[] to sort the mapped-ords within the doc
        iwBytesUsed.AddAndGet((currentValues.Length - currentUpto) * 2 * RamUsageEstimator.NUM_BYTES_INT32);
    }

    currentValues[currentUpto] = termID;
    currentUpto++;
}
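// All of the Add/AddOneValue/Collect snippets here rely on the same BytesRefHash.Add
// contract. A minimal sketch of that contract (Lucene.Net.Util; member names assumed
// from the 4.x line): Add returns a non-negative ord for a new value, and -(ord + 1)
// when the value is already present, so -result - 1 recovers the original ord.
private static void BytesRefHashAddContractExample()
{
    var hash = new BytesRefHash();
    int first = hash.Add(new BytesRef("foo"));  // >= 0: newly assigned ord
    int again = hash.Add(new BytesRef("foo"));  // < 0: duplicate value
    int ord = -again - 1;                       // recovers the original ord; ord == first
}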
public override bool Collect(BytesRef bytes)
{
    int e = terms.Add(bytes);
    TermState state = termsEnum.GetTermState();
    if (Debugging.AssertsEnabled) Debugging.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        array.termState[pos].Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        if (Debugging.AssertsEnabled) Debugging.Assert(array.boost[pos] == boostAtt.Boost, "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        array.boost[e] = boostAtt.Boost;
        array.termState[e] = new TermContext(m_topReaderContext, state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        outerInstance.CheckMaxClauseCount(terms.Count);
    }
    return true;
}
public override bool Collect(BytesRef bytes)
{
    int e = terms.Add(bytes);
    TermState state = termsEnum.GetTermState();
    if (Debugging.AssertsEnabled) Debugging.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        array.termState[pos].Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        // LUCENENET specific - compare bits rather than using equality operators to prevent
        // these comparisons from failing in x86 in .NET Framework with optimizations enabled
        if (Debugging.AssertsEnabled) Debugging.Assert(NumericUtils.SingleToSortableInt32(array.boost[pos]) == NumericUtils.SingleToSortableInt32(boostAtt.Boost), "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        array.boost[e] = boostAtt.Boost;
        array.termState[e] = new TermContext(m_topReaderContext, state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        outerInstance.CheckMaxClauseCount(terms.Count);
    }
    return true;
}
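// Why the assertion above compares bit patterns instead of using ==: on x86
// .NET Framework with optimizations enabled, one float operand can be held in an
// 80-bit x87 register while the other has been truncated to 32 bits, so logically
// equal boosts may compare unequal. Converting both sides with
// NumericUtils.SingleToSortableInt32 (Lucene.Net.Util) compares the canonical
// 32-bit patterns instead. A minimal sketch:
private static bool BoostBitsEqual(float a, float b)
{
    return NumericUtils.SingleToSortableInt32(a) == NumericUtils.SingleToSortableInt32(b);
}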
// NOTE: while it's tempting to make this public, since
// caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions, much later
// on, will result if these values are wrong; so we always
// recompute ourselves to be safe:
internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
{
    // first convert to UTF-8
    if (numInputWords <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numInputWords), "numInputWords must be > 0 (got " + numInputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (input.Length <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(input.Length), "input.Length must be > 0 (got " + input.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (numOutputWords <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numOutputWords), "numOutputWords must be > 0 (got " + numOutputWords + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }
    if (output.Length <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(output.Length), "output.Length must be > 0 (got " + output.Length + ")"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
    }

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(!HasHoles(input), "input has holes: {0}", input);
        Debugging.Assert(!HasHoles(output), "output has holes: {0}", output);
    }

    //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
    UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
    // lookup in hash
    int ord = words.Add(utf8Scratch);
    if (ord < 0)
    {
        // already exists in our hash
        ord = (-ord) - 1;
        //System.out.println(" output=" + output + " old ord=" + ord);
    }
    else
    {
        //System.out.println(" output=" + output + " new ord=" + ord);
    }

    if (!workingSet.TryGetValue(input, out MapEntry e) || e is null)
    {
        e = new MapEntry();
        workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
    }

    e.ords.Add(ord);
    e.includeOrig |= includeOrig;
    maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
    maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
}
// NOTE: while it's tempting to make this public, since
// caller's parser likely knows the
// numInput/numOutputWords, sneaky exceptions, much later
// on, will result if these values are wrong; so we always
// recompute ourselves to be safe:
internal virtual void Add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, bool includeOrig)
{
    // first convert to UTF-8
    if (numInputWords <= 0)
    {
        throw new System.ArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
    }
    if (input.Length <= 0)
    {
        throw new System.ArgumentException("input.length must be > 0 (got " + input.Length + ")");
    }
    if (numOutputWords <= 0)
    {
        throw new System.ArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
    }
    if (output.Length <= 0)
    {
        throw new System.ArgumentException("output.length must be > 0 (got " + output.Length + ")");
    }

    Debug.Assert(!HasHoles(input), "input has holes: " + input);
    Debug.Assert(!HasHoles(output), "output has holes: " + output);

    //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
    UnicodeUtil.UTF16toUTF8(output.Chars, output.Offset, output.Length, utf8Scratch);
    // lookup in hash
    int ord = words.Add(utf8Scratch);
    if (ord < 0)
    {
        // already exists in our hash
        ord = (-ord) - 1;
        //System.out.println(" output=" + output + " old ord=" + ord);
    }
    else
    {
        //System.out.println(" output=" + output + " new ord=" + ord);
    }

    MapEntry e = workingSet.ContainsKey(input) ? workingSet[input] : null;
    if (e == null)
    {
        e = new MapEntry();
        workingSet[CharsRef.DeepCopyOf(input)] = e; // make a copy, since we will keep around in our map
    }

    e.ords.Add(ord);
    e.includeOrig |= includeOrig;
    maxHorizontalContext = Math.Max(maxHorizontalContext, numInputWords);
    maxHorizontalContext = Math.Max(maxHorizontalContext, numOutputWords);
}
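// The internal Add above is normally reached through the public builder API.
// A hedged usage sketch (Lucene.Net.Analysis.Synonym; the Builder/Join/Add
// signatures are assumed from the Lucene 4.x line, so verify against your version):
private static SynonymMap BuildSynonymMapExample()
{
    var builder = new SynonymMap.Builder(dedup: true);
    builder.Add(new CharsRef("dns"),
                SynonymMap.Builder.Join(new[] { "domain", "name", "system" }, new CharsRef()),
                includeOrig: true);
    return builder.Build();
}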
private void AddOneValue(BytesRef value)
{
    int termID = hash.Add(value);
    if (termID < 0)
    {
        termID = -termID - 1;
    }
    else
    {
        // reserve additional space for each unique value:
        // 1. when indexing, when hash is 50% full, rehash() suddenly needs 2*size ints.
        //    TODO: can this same OOM happen in THPF?
        // 2. when flushing, we need 1 int per value (slot in the ordMap).
        iwBytesUsed.AddAndGet(2 * RamUsageEstimator.NUM_BYTES_INT32);
    }

    pending.Add(termID);
    UpdateBytesUsed();
}
public override bool Collect(BytesRef bytes)
{
    int e = Terms.Add(bytes);
    TermState state = TermsEnum.TermState();
    Debug.Assert(state != null);
    if (e < 0)
    {
        // duplicate term: update docFreq
        int pos = (-e) - 1;
        Array.TermState[pos].Register(state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        Debug.Assert(Array.Boost[pos] == BoostAtt.Boost, "boost should be equal in all segment TermsEnums");
    }
    else
    {
        // new entry: we populate the entry initially
        Array.Boost[e] = BoostAtt.Boost;
        Array.TermState[e] = new TermContext(TopReaderContext, state, ReaderContext.Ord, TermsEnum.DocFreq(), TermsEnum.TotalTermFreq());
        OuterInstance.CheckMaxClauseCount(Terms.Size());
    }
    return true;
}
// [Test] // LUCENENET NOTE: For now, we are overriding this test in every subclass to pull it into the right context for the subclass
public virtual void TestRandomSortedBytes()
{
    Directory dir = NewDirectory();
    IndexWriterConfig cfg = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random()));
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        cfg.SetMergePolicy(NewLogMergePolicy());
    }
    RandomIndexWriter w = new RandomIndexWriter(Random(), dir, cfg);
    int numDocs = AtLeast(100);
    BytesRefHash hash = new BytesRefHash();
    IDictionary<string, string> docToString = new Dictionary<string, string>();
    int maxLength = TestUtil.NextInt(Random(), 1, 50);
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "" + i, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        doc.Add(new SortedDocValuesField("field", br));
        hash.Add(br);
        docToString["" + i] = @string;
        w.AddDocument(doc);
    }
    if (Rarely())
    {
        w.Commit();
    }
    int numDocsNoValue = AtLeast(10);
    for (int i = 0; i < numDocsNoValue; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField("id", "noValue", Field.Store.YES));
        w.AddDocument(doc);
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        BytesRef bytesRef = new BytesRef();
        hash.Add(bytesRef); // add empty value for the gaps
    }
    if (Rarely())
    {
        w.Commit();
    }
    if (!DefaultCodecSupportsDocsWithField())
    {
        // if the codec doesn't support missing, we expect missing to be mapped to byte[]
        // by the impersonator, but we have to give it a chance to merge them to this
        w.ForceMerge(1);
    }
    for (int i = 0; i < numDocs; i++)
    {
        Document doc = new Document();
        string id = "" + i + numDocs;
        doc.Add(NewTextField("id", id, Field.Store.YES));
        string @string = TestUtil.RandomRealisticUnicodeString(Random(), 1, maxLength);
        BytesRef br = new BytesRef(@string);
        hash.Add(br);
        docToString[id] = @string;
        doc.Add(new SortedDocValuesField("field", br));
        w.AddDocument(doc);
    }
    w.Commit();
    IndexReader reader = w.Reader;
    SortedDocValues docValues = MultiDocValues.GetSortedValues(reader, "field");
    int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
    BytesRef expected = new BytesRef();
    BytesRef actual = new BytesRef();
    Assert.AreEqual(hash.Size(), docValues.ValueCount);
    for (int i = 0; i < hash.Size(); i++)
    {
        hash.Get(sort[i], expected);
        docValues.LookupOrd(i, actual);
        Assert.AreEqual(expected.Utf8ToString(), actual.Utf8ToString());
        int ord = docValues.LookupTerm(expected);
        Assert.AreEqual(i, ord);
    }
    AtomicReader slowR = SlowCompositeReaderWrapper.Wrap(reader);
    ISet<KeyValuePair<string, string>> entrySet = docToString.EntrySet();
    foreach (KeyValuePair<string, string> entry in entrySet)
    {
        // pk lookup
        DocsEnum termDocsEnum = slowR.TermDocsEnum(new Term("id", entry.Key));
        int docId = termDocsEnum.NextDoc();
        expected = new BytesRef(entry.Value);
        docValues.Get(docId, actual);
        Assert.AreEqual(expected, actual);
    }
    reader.Dispose();
    w.Dispose();
    dir.Dispose();
}