Ejemplo n.º 1
0
        /// <summary>
        /// Deletes <paramref name="name"/> from <c>cache</c> when the cache reports
        /// that the file exists there; otherwise forwards the delete to
        /// <c>@delegate</c>. The existence check and the delete both run while the
        /// monitor on this instance is held.
        /// </summary>
        /// <param name="name">Name of the file to delete.</param>
        public override void DeleteFile(string name)
        {
            UninterruptableMonitor.Enter(this);
            try
            {
                if (VERBOSE)
                {
                    Console.WriteLine("nrtdir.deleteFile name=" + name);
                }

                // FileExists is flagged obsolete; the warning is suppressed only for this call.
#pragma warning disable 612, 618
                bool isCached = cache.FileExists(name);
#pragma warning restore 612, 618

                if (isCached)
                {
                    cache.DeleteFile(name);
                    return;
                }

                @delegate.DeleteFile(name);
            }
            finally
            {
                // Always release the monitor, including on the early return above.
                UninterruptableMonitor.Exit(this);
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates an output for <paramref name="name"/>, routed to <c>cache</c> when
 /// <c>DoCacheWrite</c> says so and to <c>@delegate</c> otherwise. Before
 /// creating, a best-effort delete of the same name is attempted on the side
 /// that will NOT receive the new file.
 /// </summary>
 /// <param name="name">Name of the file to create.</param>
 /// <param name="context">I/O context handed to the routing decision and to the chosen directory.</param>
 /// <returns>The output returned by whichever directory was chosen.</returns>
 public override IndexOutput CreateOutput(string name, IOContext context)
 {
     if (VERBOSE)
     {
         Console.WriteLine("nrtdir.createOutput name=" + name);
     }

     bool writeToCache = DoCacheWrite(name, context);
     if (writeToCache && VERBOSE)
     {
         Console.WriteLine("  to cache");
     }

     try
     {
         if (writeToCache)
         {
             @delegate.DeleteFile(name);
         }
         else
         {
             cache.DeleteFile(name);
         }
     }
     catch (IOException) // LUCENENET: IDE0059: Remove unnecessary value assignment
     {
         // this is fine: file may not exist
     }

     if (writeToCache)
     {
         return cache.CreateOutput(name, context);
     }

     return @delegate.CreateOutput(name, context);
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Opens <paramref name="name"/> for reading from <c>cache</c> when the cache
        /// reports that the file exists there, otherwise from <c>@delegate</c>. The
        /// check and the open run while the monitor on this instance is held.
        /// </summary>
        /// <param name="name">Name of the file to open.</param>
        /// <param name="context">I/O context passed through to the directory that serves the file.</param>
        /// <returns>The input returned by the directory that serves the file.</returns>
        public override IndexInput OpenInput(string name, IOContext context)
        {
            UninterruptableMonitor.Enter(this);
            try
            {
                if (VERBOSE)
                {
                    Console.WriteLine("nrtdir.openInput name=" + name);
                }

                // FileExists is flagged obsolete; the warning is suppressed only for this call.
#pragma warning disable 612, 618
                bool isCached = cache.FileExists(name);
#pragma warning restore 612, 618

                if (!isCached)
                {
                    return @delegate.OpenInput(name, context);
                }

                if (VERBOSE)
                {
                    Console.WriteLine("  from cache");
                }

                return cache.OpenInput(name, context);
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Creates a slicer for <paramref name="name"/> from <c>cache</c> when the
        /// cache reports that the file exists there, otherwise from <c>@delegate</c>.
        /// Runs under <c>lock (this)</c> and calls <c>EnsureOpen()</c> first.
        /// </summary>
        /// <param name="name">Name of the file to slice.</param>
        /// <param name="context">I/O context passed through to the directory that serves the file.</param>
        /// <returns>The slicer returned by the directory that serves the file.</returns>
        public override IndexInputSlicer CreateSlicer(string name, IOContext context)
        {
            lock (this)
            {
                EnsureOpen();

                if (VERBOSE)
                {
                    // NOTE(review): the label reads "openInput" although this is CreateSlicer;
                    // the text is kept verbatim so the verbose output does not change.
                    Console.WriteLine("nrtdir.openInput name=" + name);
                }

#pragma warning disable 612, 618
                bool isCached = cache.FileExists(name);
#pragma warning restore 612, 618

                if (!isCached)
                {
                    return @delegate.CreateSlicer(name, context);
                }

                if (VERBOSE)
                {
                    Console.WriteLine("  from cache");
                }

                return cache.CreateSlicer(name, context);
            }
        }
Ejemplo n.º 5
0
            // Sets newSuffixStart from the first byte at which br1 and br2
            // differ.  The value stored is the index (relative to each
            // ref's Offset) of the most recent byte in br1 that is either
            // a single-byte value (high bit clear) or has its two top bits
            // set, i.e. it is backed up so it does not point at a 10xxxxxx
            // byte.  When no byte differs within the shorter of the two
            // lengths, that shorter length is stored instead.
            private void SetNewSuffixStart(BytesRef br1, BytesRef br2)
            {
                int sharedLength = Math.Min(br1.Length, br2.Length);

                // Result when the common prefix covers the whole shared length.
                int suffixStart = sharedLength;
                int charStart = 0;

                for (int pos = 0; pos < sharedLength; pos++)
                {
                    int b1 = br1.Bytes[br1.Offset + pos];

                    bool startsChar = (b1 & 0xc0) == 0xc0 || (b1 & 0x80) == 0;
                    if (startsChar)
                    {
                        charStart = pos;
                    }

                    if (b1 != br2.Bytes[br2.Offset + pos])
                    {
                        suffixStart = charStart;
                        break;
                    }
                }

                newSuffixStart = suffixStart;
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("    set newSuffixStart=" + newSuffixStart);
                }
            }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates an output for <paramref name="name"/>, routed to <c>cache</c> when
 /// <c>DoCacheWrite</c> says so and to <c>@delegate</c> otherwise. Before
 /// creating, a best-effort delete of the same name is attempted on the other
 /// side; exceptions for which <c>IsIOException()</c> is true are ignored.
 /// </summary>
 /// <param name="name">Name of the file to create.</param>
 /// <param name="context">I/O context handed to the routing decision and to the chosen directory.</param>
 /// <returns>The output returned by whichever directory was chosen.</returns>
 public override IndexOutput CreateOutput(string name, IOContext context)
 {
     if (VERBOSE)
     {
         Console.WriteLine("nrtdir.createOutput name=" + name);
     }

     bool writeToCache = DoCacheWrite(name, context);
     if (writeToCache && VERBOSE)
     {
         Console.WriteLine("  to cache");
     }

     try
     {
         if (writeToCache)
         {
             @delegate.DeleteFile(name);
         }
         else
         {
             cache.DeleteFile(name);
         }
     }
     catch (Exception ioe) when(ioe.IsIOException())
     {
         // this is fine: file may not exist
     }

     if (writeToCache)
     {
         return cache.CreateOutput(name, context);
     }

     return @delegate.CreateOutput(name, context);
 }
Ejemplo n.º 7
0
            // Seek type 2 "continue" (back to the start of the
            // surrogates): scan the stripped suffix from the
            // prior term, backwards. If there was an E in that
            // part, then we try to seek back to S.  If that
            // seek finds a matching term, we go there.
            //
            // Returns true when SeekToNonBMP succeeded and termEnum
            // was repositioned (newSuffixStart and scratchTerm are
            // updated to match); returns false when the scan ended
            // without a seek.  Side effect: prevTerm.Length may be
            // shortened while scanning.
            private bool DoContinue()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try cont");
                }

                // Start at the last byte of the previous term and walk backwards.
                int downTo = prevTerm.Length - 1;

                bool didSeek = false;

                // Positions at or below this index are not examined by the loop.
                int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

                while (downTo > limit)
                {
                    if (IsHighBMPChar(prevTerm.Bytes, downTo))
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    found E pos=" + downTo + " vs len=" + prevTerm.Length);
                        }

                        if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
                        {
                            // TODO: more efficient seek?
                            // Reposition the main enum on the term the helper enum found.
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                            //newSuffixStart = downTo+4;
                            newSuffixStart = downTo;
                            scratchTerm.CopyBytes(termEnum.Term().Bytes);
                            didSeek = true;
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      seek!");
                            }
                            break;
                        }
                        else
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      no seek");
                            }
                        }
                    }

                    // Shorten prevTerm in place so that we don't redo
                    // this loop if we come back here:
                    // (only truncate at a byte that is not 10xxxxxx: top two bits
                    // set, or high bit clear)
                    if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
                    {
                        prevTerm.Length = downTo;
                    }

                    downTo--;
                }

                return(didSeek);
            }
Ejemplo n.º 8
0
 /// <summary>
 /// Calls <c>UnCache</c> for every name in <paramref name="fileNames"/> and then
 /// asks <c>@delegate</c> to sync the same collection.
 /// </summary>
 /// <param name="fileNames">Names of the files to sync.</param>
 public override void Sync(ICollection <string> fileNames)
 {
     if (VERBOSE)
     {
         Console.WriteLine("nrtdir.sync files=" + fileNames);
     }

     // Uncache first, so the delegate is only asked to sync afterwards.
     foreach (string candidate in fileNames)
     {
         UnCache(candidate);
     }

     @delegate.Sync(fileNames);
 }
Ejemplo n.º 9
0
        /// <summary>
        /// Copies <paramref name="fileName"/> from <c>cache</c> into <c>@delegate</c>
        /// and then deletes it from the cache. Does nothing when the cache no
        /// longer reports the file as existing.
        /// </summary>
        /// <param name="fileName">Name of the file to move out of the cache.</param>
        private void UnCache(string fileName)
        {
            // Only let one thread uncache at a time; this only
            // happens during commit() or close():
            UninterruptableMonitor.Enter(uncacheLock);
            try
            {
                if (VERBOSE)
                {
                    Console.WriteLine("nrtdir.unCache name=" + fileName);
                }

#pragma warning disable 612, 618
                bool stillCached = cache.FileExists(fileName);
#pragma warning restore 612, 618
                if (!stillCached)
                {
                    // Another thread beat us...
                    return;
                }

                IOContext ioContext = IOContext.DEFAULT;
                IndexOutput destination = @delegate.CreateOutput(fileName, ioContext);
                IndexInput source = null;
                try
                {
                    source = cache.OpenInput(fileName, ioContext);
                    destination.CopyBytes(source, source.Length);
                }
                finally
                {
                    // source is still null here if opening the cached input threw.
                    IOUtils.Dispose(source, destination);
                }

                // Lock order: uncacheLock -> this
                UninterruptableMonitor.Enter(this);
                try
                {
                    // Must sync here because other sync methods have
                    // if (cache.fileExists(name)) { ... } else { ... }:
                    cache.DeleteFile(fileName);
                }
                finally
                {
                    UninterruptableMonitor.Exit(this);
                }
            }
            finally
            {
                UninterruptableMonitor.Exit(uncacheLock);
            }
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Deletes <paramref name="name"/> from <c>cache</c> when the cache reports
        /// that the file exists there; otherwise forwards the delete to
        /// <c>@delegate</c>. Runs entirely under <c>lock (this)</c>.
        /// </summary>
        /// <param name="name">Name of the file to delete.</param>
        public override void DeleteFile(string name)
        {
            lock (this)
            {
                if (VERBOSE)
                {
                    Console.WriteLine("nrtdir.deleteFile name=" + name);
                }

                // FileExists is flagged obsolete; the warning is suppressed only for this call.
#pragma warning disable 612, 618
                bool isCached = cache.FileExists(name);
#pragma warning restore 612, 618

                if (isCached)
                {
                    cache.DeleteFile(name);
                    return;
                }

                @delegate.DeleteFile(name);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Creates an output for <paramref name="name"/>, routed to <c>cache</c> when
        /// <c>DoCacheWrite</c> says so and to <c>@delegate</c> otherwise. Before
        /// creating, a best-effort delete of the same name is attempted on the side
        /// that will NOT receive the new file; an <see cref="IOException"/> from
        /// that delete is ignored.
        /// </summary>
        /// <param name="name">Name of the file to create.</param>
        /// <param name="context">I/O context handed to the routing decision and to the chosen directory.</param>
        /// <returns>The output returned by whichever directory was chosen.</returns>
        public override IndexOutput CreateOutput(string name, IOContext context)
        {
            if (VERBOSE)
            {
                Console.WriteLine("nrtdir.createOutput name=" + name);
            }
            if (DoCacheWrite(name, context))
            {
                if (VERBOSE)
                {
                    Console.WriteLine("  to cache");
                }
                try
                {
                    @delegate.DeleteFile(name);
                }
                catch (IOException) // no variable: nothing is read from the exception
                {
                    // this is fine: file may not exist
                }
                return cache.CreateOutput(name, context);
            }
            else
            {
                try
                {
                    cache.DeleteFile(name);
                }
                catch (IOException) // no variable: nothing is read from the exception
                {
                    // this is fine: file may not exist
                }
                return @delegate.CreateOutput(name, context);
            }
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Lock stress-test client: connects to a verifier server, registers with an
        /// ID, then repeatedly tries to obtain and release the lock "test.lock"
        /// through a <c>VerifyingLockFactory</c> wrapped around the requested lock
        /// factory.
        /// </summary>
        /// <param name="args">
        /// Exactly seven values, in this order:
        /// <c>myID</c> (int 0..255, should be unique per test process),
        /// <c>verifierHost</c> (hostname the verify server is listening on),
        /// <c>verifierPort</c> (port the verify server is listening on),
        /// <c>lockFactoryClassName</c> (LockFactory type name; a bare name is retried
        /// with the <c>Lucene.Net.Store.</c> prefix),
        /// <c>lockDirName</c> (path to the lock directory; applied only when the
        /// factory is an <c>FSLockFactory</c>),
        /// <c>sleepTimeMS</c> (milliseconds to pause between each lock obtain/release),
        /// <c>count</c> (number of locking tries).
        /// </param>
        /// <exception cref="ArgumentException">
        /// The argument count is not 7, or <c>myID</c> is outside 0..255.
        /// </exception>
        /// <exception cref="IOException">
        /// The lock factory type cannot be found or instantiated, or the server does
        /// not send the expected start value (43).
        /// </exception>
        public static void Main(string[] args)
        {
            if (args.Length != 7)
            {
                // LUCENENET specific - our wrapper console shows the correct usage
                throw new ArgumentException();
            }

            int arg = 0;
            int myID = Convert.ToInt32(args[arg++], CultureInfo.InvariantCulture);

            if (myID < 0 || myID > 255)
            {
                throw new ArgumentException("ID must be a unique int 0..255");
            }

            string verifierHost = args[arg++];
            int verifierPort = Convert.ToInt32(args[arg++], CultureInfo.InvariantCulture);
            string lockFactoryClassName = args[arg++];
            string lockDirName = args[arg++];
            int sleepTimeMS = Convert.ToInt32(args[arg++], CultureInfo.InvariantCulture);
            int count = Convert.ToInt32(args[arg++], CultureInfo.InvariantCulture);

            // The resolved address is only used for the status message below;
            // the socket connects by host name.
            IPAddress[] addresses = Dns.GetHostAddressesAsync(verifierHost).Result;
            IPAddress addr = addresses.Length > 0 ? addresses[0] : null;

            Type c;

            try
            {
                // LUCENENET: try the name as given, then again with the Store namespace
                c = Type.GetType(lockFactoryClassName)
                    ?? Type.GetType("Lucene.Net.Store." + lockFactoryClassName);
            }
            catch (Exception e)
            {
                // Keep the original failure as the inner exception.
                throw new IOException("unable to find LockClass " + lockFactoryClassName, e);
            }

            // Type.GetType reports an unknown type by returning null, not by throwing,
            // so the "not found" case has to be detected explicitly.
            if (c == null)
            {
                throw new IOException("unable to find LockClass " + lockFactoryClassName);
            }

            LockFactory lockFactory;

            try
            {
                lockFactory = (LockFactory)Activator.CreateInstance(c);
            }
            catch (UnauthorizedAccessException e)
            {
                throw new IOException("Cannot instantiate lock factory " + lockFactoryClassName, e);
            }
            catch (InvalidCastException e)
            {
                throw new IOException("unable to cast LockClass " + lockFactoryClassName + " instance to a LockFactory", e);
            }
            catch (Exception e)
            {
                throw new IOException("InstantiationException when instantiating LockClass " + lockFactoryClassName, e);
            }

            DirectoryInfo lockDir = new DirectoryInfo(lockDirName);

            FSLockFactory fsLockFactory = lockFactory as FSLockFactory;
            if (fsLockFactory != null)
            {
                fsLockFactory.SetLockDir(lockDir);
            }

            Console.WriteLine("Connecting to server " + addr + " and registering as client " + myID + "...");
            using (Socket socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
            {
                socket.SetSocketOption(SocketOptionLevel.Socket, SocketOptionName.ReuseAddress, 1);
                socket.Connect(verifierHost, verifierPort);

                using (Stream stream = new NetworkStream(socket))
                {
                    // The reader and writer share the stream, which is disposed by the
                    // enclosing using block.
                    BinaryReader intReader = new BinaryReader(stream);
                    BinaryWriter intWriter = new BinaryWriter(stream);

                    intWriter.Write(myID);
                    stream.Flush();

                    lockFactory.LockPrefix = "test";
                    LockFactory verifyLF = new VerifyingLockFactory(lockFactory, stream);
                    Lock l = verifyLF.MakeLock("test.lock");
                    Random rnd = new Random();

                    // wait for starting gun
                    if (intReader.ReadInt32() != 43)
                    {
                        throw new IOException("Protocol violation");
                    }

                    for (int i = 0; i < count; i++)
                    {
                        bool obtained = false;

                        try
                        {
                            obtained = l.Obtain(rnd.Next(100) + 10);
                        }
                        catch (LockObtainFailedException)
                        {
                            // Failing to obtain is an expected outcome here; treat it as "not obtained".
                        }

                        if (obtained)
                        {
                            Thread.Sleep(sleepTimeMS);
                            l.Dispose();
                        }

                        if (i % 500 == 0)
                        {
                            Console.WriteLine((i * 100.0 / count) + "% done.");
                        }

                        Thread.Sleep(sleepTimeMS);
                    }
                }
            }

            Console.WriteLine("Finished " + count + " tries.");
        }
Ejemplo n.º 13
0
            // Advances the enum and returns the bytes of the term it lands
            // on, or null when that term is missing or belongs to another
            // field.  When skipNext is set, the flag is cleared and the
            // term termEnum is already positioned on is reported instead
            // of advancing.
            public override BytesRef Next()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.next()");
                }

                if (skipNext)
                {
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  skipNext=true");
                    }
                    skipNext = false;

                    // PreFlex codec interns field names:
                    if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
                    {
                        return null;
                    }

                    current = termEnum.Term().Bytes;
                    return current;
                }

                // TODO: can we use STE's prevBuffer here?
                prevTerm.CopyBytes(termEnum.Term().Bytes);

                bool advancedInField = termEnum.Next() && termEnum.Term().Field == internedFieldName;
                if (advancedInField)
                {
                    newSuffixStart = termEnum.newSuffixStart;
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  newSuffixStart=" + newSuffixStart);
                    }
                }
                else
                {
                    // this field is exhausted, but we have to give
                    // surrogateDance a chance to seek back:
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  force cont");
                    }
                    //newSuffixStart = prevTerm.length;
                    newSuffixStart = 0;
                }

                SurrogateDance();

                Term landed = termEnum.Term();
                if (landed != null && landed.Field == internedFieldName)
                {
                    current = landed.Bytes;
                    return current;
                }

                // PreFlex codec interns field names; verify:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(landed == null || !landed.Field.Equals(internedFieldName, StringComparison.Ordinal));
                }

                // current is reset only on the path where the enum advanced within
                // this field; on the exhausted path it keeps its previous value.
                if (advancedInField)
                {
                    current = null;
                }

                return null;
            }
Ejemplo n.º 14
0
            // Seeks termEnum using the given target term and reports the outcome:
            //   FOUND     - the enum landed on a term of this field whose bytes
            //               equal the target; current is set to it.
            //   NOT_FOUND - the enum ended on some other term of this field
            //               (directly, after SurrogateDance, or after a
            //               successful SeekToNonBMP in the EOF case); current
            //               is set to it.
            //   END       - no term of this field is available; current is null.
            // Always clears skipNext.
            public override SeekStatus SeekCeil(BytesRef term)
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }
                skipNext = false;
                TermInfosReader tis = outerInstance.TermsDict;
                Term            t0  = new Term(fieldInfo.Name, term);

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(termEnum != null);
                }

                tis.SeekEnum(termEnum, t0, false);

                // Term the enum is positioned on after the seek (may be null).
                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName && term.BytesEquals(t.Bytes))
                {
                    // If we found an exact match, no need to do the
                    // surrogate dance
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek exact match");
                    }
                    current = t.Bytes;
                    return(SeekStatus.FOUND);
                }
                else if (t == null || t.Field != internedFieldName)
                {
                    // TODO: maybe we can handle this like the next()
                    // into null?  set term as prevTerm then dance?

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit EOF");
                    }

                    // We hit EOF; try end-case surrogate dance: if we
                    // find an E, try swapping in S, backwards:
                    scratchTerm.CopyBytes(term);

                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(scratchTerm.Offset == 0);
                    }

                    // Walk the target's bytes from the last position down to 0.
                    for (int i = scratchTerm.Length - 1; i >= 0; i--)
                    {
                        if (IsHighBMPChar(scratchTerm.Bytes, i))
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("    found E pos=" + i + "; try seek");
                            }

                            if (SeekToNonBMP(seekTermEnum, scratchTerm, i))
                            {
                                // Adopt the term the helper enum found and move the main enum onto it.
                                scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);
                                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), false);

                                newSuffixStart = 1 + i;

                                DoPushes();

                                // Found a match
                                // TODO: faster seek?
                                current = termEnum.Term().Bytes;
                                return(SeekStatus.NOT_FOUND);
                            }
                        }
                    }

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek END");
                    }

                    current = null;
                    return(SeekStatus.END);
                }
                else
                {
                    // We found a non-exact but non-null term; this one
                    // is fun -- just treat it like next, by pretending
                    // requested term was prev:
                    prevTerm.CopyBytes(term);

                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("  seek hit non-exact term=" + UnicodeUtil.ToHexString(t.Text()));
                    }

                    BytesRef br = t.Bytes;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(br.Offset == 0);
                    }

                    // Derive newSuffixStart from where the target and the found term diverge.
                    SetNewSuffixStart(term, br);

                    SurrogateDance();

                    // Re-read the position: SurrogateDance is called before this and the
                    // term is checked again afterwards.
                    Term t2 = termEnum.Term();
                    if (t2 == null || t2.Field != internedFieldName)
                    {
                        // PreFlex codec interns field names; verify:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(t2 == null || !t2.Field.Equals(internedFieldName, StringComparison.Ordinal));
                        }
                        current = null;
                        return(SeekStatus.END);
                    }
                    else
                    {
                        current = t2.Bytes;
                        if (Debugging.AssertsEnabled)
                        {
                            // In unicode sort order the reported term must compare greater than the target.
                            Debugging.Assert(!unicodeSortOrder || term.CompareTo(current) < 0, () => "term=" + UnicodeUtil.ToHexString(term.Utf8ToString()) + " vs current=" + UnicodeUtil.ToHexString(current.Utf8ToString()));
                        }
                        return(SeekStatus.NOT_FOUND);
                    }
                }
            }
Ejemplo n.º 15
0
            // Look for seek type 1 ("push"): if the newly added
            // suffix contains any S, we must try to seek to the
            // corresponding E.  If we find a match, we go there;
            // else we keep looking for additional S's in the new
            // suffix.  this "starts" the dance, at this character
            // position:
            //
            // Scans scratchTerm from newSuffixStart to its end.  On a
            // successful match termEnum is repositioned and scratchTerm
            // is replaced by the matched term; scanning then continues
            // over the new bytes.
            private void DoPushes()
            {
                int upTo = newSuffixStart;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.Length);
                }

                while (upTo < scratchTerm.Length)
                {
                    // Act on a non-BMP char at upTo when it lies past newSuffixStart, or
                    // when prevTerm has no non-BMP / high-BMP char at that same position
                    // (including prevTerm being too short to reach it).
                    if (IsNonBMPChar(scratchTerm.Bytes, upTo) && (upTo > newSuffixStart || (upTo >= prevTerm.Length || (!IsNonBMPChar(prevTerm.Bytes, upTo) && !IsHighBMPChar(prevTerm.Bytes, upTo)))))
                    {
                        // A non-BMP char (4 bytes UTF8) starts here:
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(scratchTerm.Length >= upTo + 4);
                        }

                        // Save the length and the three bytes that are about to be overwritten.
                        int savLength = scratchTerm.Length;
                        scratch[0] = (sbyte)scratchTerm.Bytes[upTo];
                        scratch[1] = (sbyte)scratchTerm.Bytes[upTo + 1];
                        scratch[2] = (sbyte)scratchTerm.Bytes[upTo + 2];

                        // Temporarily turn scratchTerm into a probe: UTF8_HIGH_BMP_LEAD followed
                        // by 0x80 0x80, truncated right after those three bytes.
                        scratchTerm.Bytes[upTo]     = (byte)UTF8_HIGH_BMP_LEAD;
                        scratchTerm.Bytes[upTo + 1] = 0x80;
                        scratchTerm.Bytes[upTo + 2] = 0x80;
                        scratchTerm.Length          = upTo + 3;

                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                        }

                        // Seek "forward":
                        // TODO: more efficient seek?
                        outerInstance.TermsDict.SeekEnum(seekTermEnum, new Term(fieldInfo.Name, scratchTerm), true);

                        // Undo the temporary patch: restore the saved bytes and length.
                        scratchTerm.Bytes[upTo]     = (byte)scratch[0];
                        scratchTerm.Bytes[upTo + 1] = (byte)scratch[1];
                        scratchTerm.Bytes[upTo + 2] = (byte)scratch[2];
                        scratchTerm.Length          = savLength;

                        // Did we find a match?
                        Term t2 = seekTermEnum.Term();

                        if (DEBUG_SURROGATES)
                        {
                            if (t2 == null)
                            {
                                Console.WriteLine("      hit term=null");
                            }
                            else
                            {
                                Console.WriteLine("      hit term=" + UnicodeUtil.ToHexString(t2.Text()) + " " + (t2 == null ? null : t2.Bytes));
                            }
                        }

                        // Since this was a seek "forward", we could hit
                        // EOF or a different field:
                        bool matches;

                        if (t2 != null && t2.Field == internedFieldName)
                        {
                            BytesRef b2 = t2.Bytes;
                            if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(b2.Offset == 0);
                            }
                            // A match needs a high-BMP char at upTo in the found term and an
                            // identical prefix [0, upTo) in both terms.
                            if (b2.Length >= upTo + 3 && IsHighBMPChar(b2.Bytes, upTo))
                            {
                                matches = true;
                                for (int i = 0; i < upTo; i++)
                                {
                                    if (scratchTerm.Bytes[i] != b2.Bytes[i])
                                    {
                                        matches = false;
                                        break;
                                    }
                                }
                            }
                            else
                            {
                                matches = false;
                            }
                        }
                        else
                        {
                            matches = false;
                        }

                        if (matches)
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      matches!");
                            }

                            // OK seek "back"
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);

                            scratchTerm.CopyBytes(seekTermEnum.Term().Bytes);

                            // +3 because we don't need to check the char
                            // at upTo: we know it's > BMP
                            upTo += 3;

                            // NOTE: we keep iterating, now, since this
                            // can easily "recurse".  Ie, after seeking
                            // forward at a certain char position, we may
                            // find another surrogate in our [new] suffix
                            // and must then do another seek (recurse)
                        }
                        else
                        {
                            upTo++;
                        }
                    }
                    else
                    {
                        upTo++;
                    }
                }
            }
Ejemplo n.º 16
0
            // Pre-flex indices store terms in UTF16 sort order, but
            // certain queries require Unicode codepoint order; this
            // method carefully seeks around surrogates to handle
            // this impedance mismatch

            private void SurrogateDance()
            {
                // Only needed when the caller wants Unicode code point order.
                if (!unicodeSortOrder)
                {
                    return;
                }

                // This runs right after TIS.next() (which advances in UTF16
                // order) and may seek to a different "next" term, namely the
                // next one in Unicode code point order.
                //
                // Only the "delta" between the previous term and the current
                // term is scanned, in UTF8 bytes: first the bytes that were
                // stripped from the previous term, then the bytes appended to
                // the previous term's prefix.
                //
                // The exact UTF8 sequences do not matter, only the "category"
                // of each UTF16 character:
                //   S -- a high/low surrogate pair (i.e. a non-BMP char)
                //   E -- any BMP char > UNI_SUR_LOW_END (and < U+FFFF)
                //   A -- everything else (any char <= UNI_SUR_HIGH_START)
                //
                // Pre-flex indices sort these categories as ASE, whereas flex
                // must sort them as AES.  So whenever the scan hits an S we
                // must 1) seek forward to E and enumerate the terms there,
                // 2) seek back to S and enumerate all terms there, and
                // 3) seek to after E.  That is three distinct seek points
                // (1, 2, 3).
                //
                // S is easy to spot in UTF8: a byte with prefix 11110 (0xf0)
                // plus the 3 bytes after it encode one code point in S.
                // E is equally easy: a byte with prefix 1110111 (0xee) plus
                // the 2 bytes after it encode one code point in E.
                //
                // The process is really recursive: the char at pos 2 may need
                // to dance, and midway through its dance pos 4 may need to
                // dance, which must finish before returning to pos 2; and
                // during pos 4's dance pos 7 may need to dance, and so on.
                // No state has to be kept for this, though, because it can
                // always be re-derived from the prior term and current term.

                // TODO: can we avoid this copy?
                if (termEnum.Term() != null && termEnum.Term().Field == internedFieldName)
                {
                    scratchTerm.CopyBytes(termEnum.Term().Bytes);
                }
                else
                {
                    // Exhausted, or moved on to another field: empty scratch term.
                    scratchTerm.Length = 0;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  dance");
                    Console.WriteLine("    prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
                    Console.WriteLine("         " + prevTerm.ToString());
                    Console.WriteLine("    term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
                    Console.WriteLine("         " + scratchTerm.ToString());
                }

                // The logic below relies on TermInfosReader/SegmentTermEnum
                // always using BytesRef.offset == 0.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(prevTerm.Offset == 0);
                    Debugging.Assert(scratchTerm.Offset == 0);
                }

                // Several pops may be required, optionally ended by a
                // continue.  The possible sequences are:
                //
                //  cont
                //  pop, cont
                //  pop, pop, cont
                //  <nothing>
                //
                // So: stop as soon as a continue succeeds, otherwise keep
                // going for as long as a pop succeeds.
                while (!DoContinue() && DoPop())
                {
                    // All of the work happens in the loop condition.
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  finish bmp ends");
                }

                DoPushes();
            }
Ejemplo n.º 17
0
            // Look for seek type 3 ("pop"): if the delta from
            // prev -> current was replacing an S with an E,
            // we must now seek to beyond that E.  This seek
            // "finishes" the dance at this character
            // position.
            private bool DoPop()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try pop");
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(newSuffixStart <= prevTerm.Length);
                    Debugging.Assert(newSuffixStart < scratchTerm.Length || newSuffixStart == 0);
                }

                // A pop applies only when, at newSuffixStart, the previous
                // term holds a non-BMP char and the current term holds a
                // high BMP char.
                bool nonBmpReplacedByHighBmp =
                    prevTerm.Length > newSuffixStart
                    && IsNonBMPChar(prevTerm.Bytes, newSuffixStart)
                    && IsHighBMPChar(scratchTerm.Bytes, newSuffixStart);

                if (!nonBmpReplacedByHighBmp)
                {
                    return false;
                }

                // Build the seek target: put 0xFF at this position and
                // truncate the term right after it.
                // NOTE(review): the original comment here labelled this
                // "seek type 2", while the method header describes the pop
                // as seek type 3 -- confirm which numbering is intended.
                scratchTerm.Bytes[newSuffixStart] = 0xff;
                scratchTerm.Length = newSuffixStart + 1;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
                }

                // TODO: more efficient seek?  can we simply swap
                // the enums?
                outerInstance.TermsDict.SeekEnum(termEnum, new Term(fieldInfo.Name, scratchTerm), true);

                // This was a "forward" seek, so it may have run into EOF or
                // into a different field.
                Term landed = termEnum.Term();
                bool stillInField = landed != null && landed.Field == internedFieldName;

                if (stillInField)
                {
                    if (DEBUG_SURROGATES)
                    {
                        Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(landed.Text()) + " " + landed.Bytes);
                    }

                    BytesRef landedBytes = landed.Bytes;
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(landedBytes.Offset == 0);
                    }

                    // Recompute newSuffixStart ourselves.  termEnum's value
                    // cannot be trusted because the seek above may have done
                    // no scanning at all (e.g. the term was precisely an
                    // index term, or was in the term seek cache).
                    scratchTerm.CopyBytes(landedBytes);
                    SetNewSuffixStart(prevTerm, scratchTerm);

                    return true;
                }

                // No term in this field was found; if the state is already
                // fully reset there is nothing left to do.
                if (newSuffixStart == 0 && scratchTerm.Length == 0)
                {
                    return false;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=null (or next field)");
                }

                newSuffixStart = 0;
                scratchTerm.Length = 0;
                return true;
            }
Ejemplo n.º 18
0
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                // Remembered so the term can be put back the way it was.
                int originalLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);

                    // The 3 bytes starting at pos make up 1 unicode character:
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: it is not possible to also assert that
                // term.Length >= pos + 3, because AutomatonQuery legitimately
                // sends us malformed UTF8 (eg the UTF8 bytes with just 0xee).

                // Make sure there is room for the 4 byte sequence written below.
                int substitutedLength = pos + 4;
                if (term.Bytes.Length < substitutedLength)
                {
                    term.Grow(substitutedLength);
                }

                // Stash the 3 bytes about to be overwritten; they are put back
                // once the seek "back" has been examined.
                for (int k = 0; k < 3; k++)
                {
                    scratch[k] = (sbyte)term.Bytes[pos + k];
                }

                // Overwrite the E char at pos with the 4 byte sequence
                // F0 90 80 80 and cut the term off right after it.
                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = substitutedLength;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Check whether the term we landed on really has a surrogate
                // pair at the same position as the E.
                Term found = te.Term();

                // This cannot be null (or belong to the next field), since at
                // "worst" the seek lands on the term we are on now -- unless
                // we are being called from seek.
                // NOTE(review): this early return skips the restore at the end
                // of the method, so term keeps the substituted bytes and
                // length -- confirm that callers on this path do not rely on
                // term being unchanged.
                if (found == null || found.Field != internedFieldName)
                {
                    return false;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(found.Text()));
                }

                BytesRef foundBytes = found.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(foundBytes.Offset == 0);
                }

                // It is a match when the found term is at least as long as
                // the substituted term, has a non-BMP char at pos, and shares
                // the identical prefix [0, pos).
                bool matches = foundBytes.Length >= term.Length && IsNonBMPChar(foundBytes.Bytes, pos);
                for (int i = 0; matches && i < pos; i++)
                {
                    matches = term.Bytes[i] == foundBytes.Bytes[i];
                }

                // Restore term:
                for (int k = 0; k < 3; k++)
                {
                    term.Bytes[pos + k] = (byte)scratch[k];
                }
                term.Length = originalLength;

                return matches;
            }