Beispiel #1
0
        /// <summary>
        /// Sets up this logger by obtaining its execute pipe, logging a debug message
        /// before and an informational message after the assignment.
        /// </summary>
        private void Initialize()
        {
            LoggerBundle.Debug($"Initializing logger of type '{Type}'...");

            // The execute pipe is what Log() later hands the fully assembled message to.
            _executeLogPipe = GetExecuteLogPipe();

            LoggerBundle.Inform($"Logger of type '{Type}' successfully initialized");
        }
Beispiel #2
0
        /// <summary>
        /// Runs every argument through the prepare pipe (extended by a <c>LogFlagPipe</c> built from
        /// this logger's type and the given flags), concatenates the results and passes the
        /// combined text to the execute pipe.
        /// </summary>
        /// <param name="flags">Flags handed to the <c>LogFlagPipe</c> and, on failure, to <c>BubbleException</c>.</param>
        /// <param name="args">Values to log; each one is processed on its own.</param>
        public void Log(LogFlags flags, params Object[] args)
        {
            ProcessPipe <Object, String> formatter = _prepareLogPipe.Connect(new LogFlagPipe(Type, flags));
            StringBuilder message = new StringBuilder();

            for (int index = 0; index < args.Length; index++)
            {
                try
                {
                    String rendered = formatter.Process(args[index]);
                    message.Append(rendered);
                }
                catch (Exception ex)
                {
                    // A failing argument is reported via BubbleException; the loop then
                    // continues with the next argument (unless BubbleException itself throws).
                    BubbleException(flags, ex);
                }
            }

            // The execute pipe is invoked exactly once, even if nothing was appended.
            _executeLogPipe.Process(message.ToString());
        }
        // @TODO: get destruction right, delete the temp files
        //public sealed override void Dispose()
        //{
        //    //Dispose(true);
        //    GC.SuppressFinalize(this);
        //}
        //[HandleProcessCorruptedStateExceptions]
        //protected virtual void Dispose(bool A_0) { }

        // This is the only interface into SPM used by FactoredSegmenter.
        // It determines the split points where SPM would split.
        // @TODO: change return type to IList type, which will save one operation in this build, while costing nothing in MTMAIN
        /// <summary>
        /// Determines the character offsets at which SentencePiece (spm_encode) would split
        /// <paramref name="segmentMe"/>.
        /// </summary>
        /// <param name="segmentMe">
        /// The word to segment. Strings of length 0 or 1 are never sent to spm_encode.
        /// </param>
        /// <returns>
        /// The split points including 0 and the total length, e.g. (0, 2, 5) if "abcde" was broken
        /// into "ab cde"; or <c>null</c> if the input has at most one character, if the pieces
        /// returned by spm_encode do not concatenate back to the input, or if the word was kept
        /// as a single unbroken token.
        /// </returns>
        public int[] GetSplitPoints(string segmentMe)
        {
            if (segmentMe.Length <= 1) // nothing to split. This includes space, which is SPM's break symbol, and should not be sent.
            {
                return null;
            }

            // obtain a server process if available, or create a new one if all are in use
            if (!m_serverPool.TryDequeue(out var processPipe))
            {
                var argv = new List<string> { SpmBinaryDirPath + "spm_encode", "--model", m_tempModelPath };
                if (m_vocabulary != null)
                {
                    argv.AddRange(new List<string> { "--vocabulary", m_tempVocabPath });
                }
                Logger.WriteLine($"starting SentencePiece instance as: {" ".JoinItems(argv)}");
                processPipe = new ProcessPipe(argv, envirVariables: new Dictionary<string, string> {
                    { "LC_ALL", "en_US.UTF-8" }
                });
                // @TODO: do we need the environment variable for spm_encode?
            }

            //Logger.WriteLine($"SPM-encoding word {segmentMe}");
            processPipe.process.StandardInput.WriteLine(segmentMe); // @TODO: how do we know/ensure this is UTF-8?
            var encodedWord = processPipe.process.StandardOutput.ReadLine();

            // A null line means the child's stdout was closed.
            // NOTE(review): Sanity.Requires presumably throws here, so a dead process is not re-pooled - confirm.
            Sanity.Requires(encodedWord != null, "spm_encode unexpectedly terminated");
            // return the process back into the pool
            m_serverPool.Enqueue(processPipe);

            var pieces = encodedWord.Split(' ', options: StringSplitOptions.RemoveEmptyEntries);

            // The offsets computed below are only meaningful if the pieces are an exact
            // partition of the input string.
            if ("".JoinItems(pieces) != segmentMe)
            {
                Logger.WriteLine($"ignoring word: SentencePiece did not just split the word ('{segmentMe}', -> '{" ".JoinItems(pieces)}')");
                return null;
            }

            // create array of segmentation points
            // E.g. if "abcde" got broken into "ab cde", then we return the split points (0, 2, 5).
            // This code handles the special case of OOV pieces.
            // E.g. if there is no '+' in the SentencePiece vocab, then spm_encode will keep
            // it as '++++'. We must break those up into individual pieces.
            List<int> res = null; // (created lazily)
            int n = 0;            // accumulator for split points

            // Records the current offset n as a split point. The list is only created once a
            // split strictly inside the word is seen, so that the frequent special case of an
            // unbroken single token yields null for efficiency.
            void RecordSplitPoint()
            {
                if (res != null)
                {
                    res.Add(n);
                }
                else if (n < segmentMe.Length)
                {
                    res = new List<int> { 0, n };
                }
            }

            foreach (var piece in pieces)
            {
                if (m_vocabulary == null || m_vocabulary.Contains(piece))
                {
                    n += piece.Length;
                    RecordSplitPoint();
                }
                else // special case: OOV. Break at each character.
                {
                    for (int j = 0; j < piece.Length; /* j is advanced inside the loop body */)
                    {
                        // length of this piece is 1 Unicode character. Surrogate pairs are 2 characters in C#'s UCS-2 encoding.
                        var ucs2Len = (char.IsHighSurrogate(piece[j]) && j + 2 <= piece.Length) ? 2 : 1;
                        n += ucs2Len;
                        j += ucs2Len;
                        RecordSplitPoint();
                    }
                }
            }
            return res?.ToArray();
        }
 /// <summary>
 /// Creates a pipe that, for each input, evaluates <paramref name="condition"/> and forwards the
 /// input to <paramref name="onTrue"/> when it returns true, otherwise to <paramref name="onFalse"/>.
 /// </summary>
 /// <param name="condition">Predicate evaluated on every processed input.</param>
 /// <param name="onTrue">Pipe used when the predicate holds; may be null, in which case the null-conditional call yields no result.</param>
 /// <param name="onFalse">Pipe used when the predicate does not hold; may be null, in which case the null-conditional call yields no result.</param>
 public ConditionalPipe(Func <TIn, Boolean> condition, ProcessPipe <TIn, TOut> onTrue, ProcessPipe <TIn, TOut> onFalse) :
     base(o => condition(o) ? onTrue?.Process(o) : onFalse?.Process(o))
 {
 }