private void SetNewSuffixStart(BytesRef br1, BytesRef br2) { int limit = Math.Min(br1.Length, br2.Length); int lastStart = 0; for (int i = 0; i < limit; i++) { if ((br1.Bytes[br1.Offset + i] & 0xc0) == 0xc0 || (br1.Bytes[br1.Offset + i] & 0x80) == 0) { lastStart = i; } if (br1.Bytes[br1.Offset + i] != br2.Bytes[br2.Offset + i]) { newSuffixStart = lastStart; if (DEBUG_SURROGATES) { Console.WriteLine(" set newSuffixStart=" + newSuffixStart); } return; } } newSuffixStart = limit; if (DEBUG_SURROGATES) { Console.WriteLine(" set newSuffixStart=" + newSuffixStart); } }
public static void Main(string[] args) { if (args.Length != 2) { // LUCENENET specific - our wrapper console shows the correct usage throw new ArgumentException(); //Console.WriteLine("Usage: java Lucene.Net.Store.LockVerifyServer bindToIp clients\n"); //Environment.FailFast("1"); } int arg = 0; string hostname = args[arg++]; int maxClients = Convert.ToInt32(args[arg++], CultureInfo.InvariantCulture); IPAddress ipAddress = IPAddress.Parse(hostname); using (Socket s = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp)) { s.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReuseAddress, 1); s.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReceiveTimeout, 30000);// SoTimeout = 30000; // initially 30 secs to give clients enough time to startup s.Bind(new IPEndPoint(ipAddress, 0)); s.Listen(maxClients); Console.WriteLine("Listening on " + ((IPEndPoint)s.LocalEndPoint).Port.ToString() + "..."); // we set the port as a sysprop, so the ANT task can read it. For that to work, this server must run in-process: SystemProperties.SetProperty("lockverifyserver.port", ((IPEndPoint)s.LocalEndPoint).Port.ToString(CultureInfo.InvariantCulture)); object localLock = new object(); int[] lockedID = new int[1]; lockedID[0] = -1; CountdownEvent startingGun = new CountdownEvent(1); ThreadJob[] threads = new ThreadJob[maxClients]; for (int count = 0; count < maxClients; count++) { Socket cs = s.Accept(); threads[count] = new ThreadAnonymousInnerClassHelper(localLock, lockedID, startingGun, cs); threads[count].Start(); } // start Console.WriteLine("All clients started, fire gun..."); startingGun.Signal(); // wait for all threads to finish foreach (ThreadJob t in threads) { t.Join(); } // cleanup sysprop SystemProperties.SetProperty("lockverifyserver.port", null); Console.WriteLine("Server terminated."); } }
// Seek type 2 "continue" (back to the start of the // surrogates): scan the stripped suffix from the // prior term, backwards. If there was an E in that // part, then we try to seek back to S. If that // seek finds a matching term, we go there. private bool DoContinue() { if (DEBUG_SURROGATES) { Console.WriteLine(" try cont"); } int downTo = prevTerm.Length - 1; bool didSeek = false; int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1); while (downTo > limit) { if (IsHighBMPChar(prevTerm.Bytes, downTo)) { if (DEBUG_SURROGATES) { Console.WriteLine(" found E pos=" + downTo + " vs len=" + prevTerm.Length); } if (SeekToNonBMP(seekTermEnum, prevTerm, downTo)) { // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true); //newSuffixStart = downTo+4; newSuffixStart = downTo; scratchTerm.CopyBytes(termEnum.Term().Bytes); didSeek = true; if (DEBUG_SURROGATES) { Console.WriteLine(" seek!"); } break; } else { if (DEBUG_SURROGATES) { Console.WriteLine(" no seek"); } } } // Shorten prevTerm in place so that we don't redo // this loop if we come back here: if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0) { prevTerm.Length = downTo; } downTo--; } return(didSeek); }
public override SeekStatus SeekCeil(BytesRef term) { if (DEBUG_SURROGATES) { Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } skipNext = false; TermInfosReader tis = outerInstance.TermsDict; Term t0 = new Term(fieldInfo.Name, term); Debug.Assert(termEnum != null); tis.SeekEnum(termEnum, t0, false); Term t = termEnum.Term(); if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes)) { // If we found an exact match, no need to do the // surrogate dance if (DEBUG_SURROGATES) { Console.WriteLine(" seek exact match"); } current = t.Bytes; return(SeekStatus.FOUND); } else if (t == null || t.Field != internedFieldName) { // TODO: maybe we can handle this like the next() // into null? set term as prevTerm then dance? if (DEBUG_SURROGATES) { Console.WriteLine(" seek hit EOF"); } // We hit EOF; try end-case surrogate dance: if we // find an E, try swapping in S, backwards: scratchTerm.CopyBytes(term); Debug.Assert(scratchTerm.Offset == 0); for (int i = scratchTerm.Length - 1; i >= 0; i--) { if (IsHighBMPChar(scratchTerm.Bytes, i)) { if (DEBUG_SURROGATES) { Console.WriteLine(" found E pos=" + i + "; try seek"); } if (SeekToNonBMP(seekTermEnum, scratchTerm, i)) { scratchTerm.CopyBytes(seekTermEnum.Term().Bytes); outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false); newSuffixStart = 1 + i; DoPushes(); // Found a match // TODO: faster seek? current = termEnum.Term().Bytes; return(SeekStatus.NOT_FOUND); } } } if (DEBUG_SURROGATES) { Console.WriteLine(" seek END"); } current = null; return(SeekStatus.END); } else { // We found a non-exact but non-null term; this one // is fun -- just treat it like next, by pretending // requested term was prev: prevTerm.CopyBytes(term); if (DEBUG_SURROGATES) { Console.WriteLine(" seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text())); } BytesRef br = t.Bytes; Debug.Assert(br.Offset == 0); SetNewSuffixStart(term, br); SurrogateDance(); Term t2 = termEnum.Term(); if (t2 == null || t2.Field != internedFieldName) { // PreFlex codec interns field names; verify: Debug.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal)); current = null; return(SeekStatus.END); } else { current = t2.Bytes; Debug.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString())); return(SeekStatus.NOT_FOUND); } } }
// Look for seek type 1 ("push"): if the newly added // suffix contains any S, we must try to seek to the // corresponding E. If we find a match, we go there; // else we keep looking for additional S's in the new // suffix. this "starts" the dance, at this character // position: private void DoPushes() { int upTo = newSuffixStart; if (DEBUG_SURROGATES) { Console.WriteLine(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length); } while (upTo < scratchTerm.Length) { if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo))))) { // A non-BMP char (4 bytes UTF8) starts here: Debug.Assert(scratchTerm.Length >= upTo + 4); int savLength = scratchTerm.Length; scratch[0] = (sbyte)scratchTerm.Bytes[upTo]; scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1]; scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2]; scratchTerm.Bytes[upTo] = (byte)UTF8_HIGH_BMP_LEAD; scratchTerm.Bytes[upTo + 1] = 0x80; scratchTerm.Bytes[upTo + 2] = 0x80; scratchTerm.Length = upTo + 3; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true); scratchTerm.Bytes[upTo] = (byte)scratch[0]; scratchTerm.Bytes[upTo + 1] = (byte)scratch[1]; scratchTerm.Bytes[upTo + 2] = (byte)scratch[2]; scratchTerm.Length = savLength; // Did we find a match? Term t2 = seekTermEnum.Term(); if (DEBUG_SURROGATES) { if (t2 == null) { Console.WriteLine(" hit term=null"); } else { Console.WriteLine(" hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes)); } } // Since this was a seek "forward", we could hit // EOF or a different field: bool matches; if (t2 != null && t2.Field == internedFieldName) { BytesRef b2 = t2.Bytes; Debug.Assert(b2.Offset == 0); if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo)) { matches = true; for (int i = 0; i < upTo; i++) { if (scratchTerm.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } } else { matches = false; } if (matches) { if (DEBUG_SURROGATES) { Console.WriteLine(" matches!"); } // OK seek "back" // TODO: more efficient seek? outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true); scratchTerm.CopyBytes(seekTermEnum.Term().Bytes); // +3 because we don't need to check the char // at upTo: we know it's > BMP upTo += 3; // NOTE: we keep iterating, now, since this // can easily "recurse". Ie, after seeking // forward at a certain char position, we may // find another surrogate in our [new] suffix // and must then do another seek (recurse) } else { upTo++; } } else { upTo++; } } }
// Pre-flex indices store terms in UTF16 sort order, but // certain queries require Unicode codepoint order; this // method carefully seeks around surrogates to handle // this impedance mismatch private void SurrogateDance() { if (!unicodeSortOrder) { return; } // We are invoked after TIS.next() (by UTF16 order) to // possibly seek to a different "next" (by unicode // order) term. // We scan only the "delta" from the last term to the // current term, in UTF8 bytes. We look at 1) the bytes // stripped from the prior term, and then 2) the bytes // appended to that prior term's prefix. // We don't care about specific UTF8 sequences, just // the "category" of the UTF16 character. Category S // is a high/low surrogate pair (it non-BMP). // Category E is any BMP char > UNI_SUR_LOW_END (and < // U+FFFF). Category A is the rest (any unicode char // <= UNI_SUR_HIGH_START). // The core issue is that pre-flex indices sort the // characters as ASE, while flex must sort as AES. So // when scanning, when we hit S, we must 1) seek // forward to E and enum the terms there, then 2) seek // back to S and enum all terms there, then 3) seek to // after E. Three different seek points (1, 2, 3). // We can easily detect S in UTF8: if a byte has // prefix 11110 (0xf0), then that byte and the // following 3 bytes encode a single unicode codepoint // in S. Similarly, we can detect E: if a byte has // prefix 1110111 (0xee), then that byte and the // following 2 bytes encode a single unicode codepoint // in E. // Note that this is really a recursive process -- // maybe the char at pos 2 needs to dance, but any // point in its dance, suddenly pos 4 needs to dance // so you must finish pos 4 before returning to pos // 2. But then during pos 4's dance maybe pos 7 needs // to dance, etc. However, despite being recursive, // we don't need to hold any state because the state // can always be derived by looking at prior term & // current term. // TODO: can we avoid this copy? if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName) { scratchTerm.Length = 0; } else { scratchTerm.CopyBytes(termEnum.Term().Bytes); } if (DEBUG_SURROGATES) { Console.WriteLine(" dance"); Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString())); Console.WriteLine(" " + prevTerm.ToString()); Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString())); Console.WriteLine(" " + scratchTerm.ToString()); } // this code assumes TermInfosReader/SegmentTermEnum // always use BytesRef.offset == 0 Debug.Assert(prevTerm.Offset == 0); Debug.Assert(scratchTerm.Offset == 0); // Need to loop here because we may need to do multiple // pops, and possibly a continue in the end, ie: // // cont // pop, cont // pop, pop, cont // <nothing> // while (true) { if (DoContinue()) { break; } else { if (!DoPop()) { break; } } } if (DEBUG_SURROGATES) { Console.WriteLine(" finish bmp ends"); } DoPushes(); }
// Look for seek type 3 ("pop"): if the delta from // prev -> current was replacing an S with an E, // we must now seek to beyond that E. this seek // "finishes" the dance at this character // position. private bool DoPop() { if (DEBUG_SURROGATES) { Console.WriteLine(" try pop"); } Debug.Assert(newSuffixStart <= prevTerm.Length); Debug.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0); if (prevTerm.Length > newSuffixStart && IsNonBMPChar(prevTerm.Bytes, newSuffixStart) && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart)) { // Seek type 2 -- put 0xFF at this position: scratchTerm.Bytes[newSuffixStart] = 0xff; scratchTerm.Length = newSuffixStart + 1; if (DEBUG_SURROGATES) { Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? can we simply swap // the enums? outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true); Term t2 = termEnum.Term(); // We could hit EOF or different field since this // was a seek "forward": if (t2 != null && t2.Field == internedFieldName) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + t2.Bytes); } BytesRef b2 = t2.Bytes; Debug.Assert(b2.Offset == 0); // Set newSuffixStart -- we can't use // termEnum's since the above seek may have // done no scanning (eg, term was precisely // and index term, or, was in the term seek // cache): scratchTerm.CopyBytes(b2); SetNewSuffixStart(prevTerm, scratchTerm); return(true); } else if (newSuffixStart != 0 || scratchTerm.Length != 0) { if (DEBUG_SURROGATES) { Console.WriteLine(" got term=null (or next field)"); } newSuffixStart = 0; scratchTerm.Length = 0; return(true); } } return(false); }
// Swap in S, in place of E: private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) { int savLength = term.Length; Debug.Assert(term.Offset == 0); // The 3 bytes starting at downTo make up 1 // unicode character: Debug.Assert(IsHighBMPChar(term.Bytes, pos)); // NOTE: we cannot make this assert, because // AutomatonQuery legitimately sends us malformed UTF8 // (eg the UTF8 bytes with just 0xee) // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); // Save the bytes && length, since we need to // restore this if seek "back" finds no matching // terms if (term.Bytes.Length < 4 + pos) { term.Grow(4 + pos); } scratch[0] = (sbyte)term.Bytes[pos]; scratch[1] = (sbyte)term.Bytes[pos + 1]; scratch[2] = (sbyte)term.Bytes[pos + 2]; term.Bytes[pos] = 0xf0; term.Bytes[pos + 1] = 0x90; term.Bytes[pos + 2] = 0x80; term.Bytes[pos + 3] = 0x80; term.Length = 4 + pos; if (DEBUG_SURROGATES) { Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); } // Seek "back": outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true); // Test if the term we seek'd to in fact found a // surrogate pair at the same position as the E: Term t2 = te.Term(); // Cannot be null (or move to next field) because at // "worst" it'd seek to the same term we are on now, // unless we are being called from seek if (t2 == null || t2.Field != internedFieldName) { return(false); } if (DEBUG_SURROGATES) { Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text())); } // Now test if prefix is identical and we found // a non-BMP char at the same position: BytesRef b2 = t2.Bytes; Debug.Assert(b2.Offset == 0); bool matches; if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos)) { matches = true; for (int i = 0; i < pos; i++) { if (term.Bytes[i] != b2.Bytes[i]) { matches = false; break; } } } else { matches = false; } // Restore term: term.Length = savLength; term.Bytes[pos] = (byte)scratch[0]; term.Bytes[pos + 1] = (byte)scratch[1]; term.Bytes[pos + 2] = (byte)scratch[2]; return(matches); }
public override BytesRef Next() { if (DEBUG_SURROGATES) { Console.WriteLine("TE.next()"); } if (skipNext) { if (DEBUG_SURROGATES) { Console.WriteLine(" skipNext=true"); } skipNext = false; if (termEnum.Term() == null) { return(null); // PreFlex codec interns field names: } else if (termEnum.Term().Field != internedFieldName) { return(null); } else { return(current = termEnum.Term().Bytes); } } // TODO: can we use STE's prevBuffer here? prevTerm.CopyBytes(termEnum.Term().Bytes); if (termEnum.Next() && termEnum.Term().Field == internedFieldName) { newSuffixStart = termEnum.newSuffixStart; if (DEBUG_SURROGATES) { Console.WriteLine(" newSuffixStart=" + newSuffixStart); } SurrogateDance(); Term t = termEnum.Term(); if (t == null || t.Field != internedFieldName) { // PreFlex codec interns field names; verify: Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal)); current = null; } else { current = t.Bytes; } return(current); } else { // this field is exhausted, but we have to give // surrogateDance a chance to seek back: if (DEBUG_SURROGATES) { Console.WriteLine(" force cont"); } //newSuffixStart = prevTerm.length; newSuffixStart = 0; SurrogateDance(); Term t = termEnum.Term(); if (t == null || t.Field != internedFieldName) { // PreFlex codec interns field names; verify: Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal)); return(null); } else { current = t.Bytes; return(current); } } }