/// <summary>
/// Sets up this logger: obtains the execute pipe from <c>GetExecuteLogPipe</c> and
/// stores it in <c>_executeLogPipe</c>, emitting a debug message before and an
/// informational message after.
/// </summary>
private void Initialize()
{
    LoggerBundle.Debug($"Initializing logger of type '{Type}'...");

    // Build the pipe first, then publish it to the field in a separate step.
    var executePipe = GetExecuteLogPipe();
    _executeLogPipe = executePipe;

    LoggerBundle.Inform($"Logger of type '{Type}' successfully initialized");
}
/// <summary>
/// Renders every argument through the prepare pipe (extended with a
/// <c>LogFlagPipe</c> for this logger's type and the given flags), concatenates
/// the results and hands the combined text to the execute pipe.
/// </summary>
/// <param name="flags">Flags passed to the <c>LogFlagPipe</c> and to <c>BubbleException</c>.</param>
/// <param name="args">Objects to render, in order, into a single log message.</param>
public void Log(LogFlags flags, params Object[] args)
{
    StringBuilder message = new StringBuilder();
    ProcessPipe<Object, String> renderPipe = _prepareLogPipe.Connect(new LogFlagPipe(Type, flags));

    for (int index = 0; index < args.Length; index++)
    {
        Object current = args[index];
        try
        {
            String fragment = renderPipe.Process(current);
            message.Append(fragment);
        }
        catch (Exception ex)
        {
            // A failure on one argument is reported via BubbleException; the loop
            // then moves on to the next argument unless that call itself throws.
            BubbleException(flags, ex);
        }
    }

    String text = message.ToString();
    _executeLogPipe.Process(text);
}
// @TODO: get destruction right, delete the temp files
//public sealed override void Dispose()
//{
//    //Dispose(true);
//    GC.SuppressFinalize(this);
//}
//[HandleProcessCorruptedStateExceptions]
//protected virtual void Dispose(bool A_0) { }

// This is the only interface into SPM used by FactoredSegmenter.
// It determines the split points where SPM would split.
// @TODO: change return type to IList type, which will save one operation in this build, while costing nothing in MTMAIN
/// <summary>
/// Sends <paramref name="segmentMe"/> to an spm_encode server process and converts the
/// returned pieces into character offsets at which the word is split.
/// </summary>
/// <param name="segmentMe">The word to segment. Must not be null.</param>
/// <returns>
/// The split points including the start (0) and the end (the word's length), e.g.
/// "abcde" broken into "ab cde" yields (0, 2, 5). Returns <c>null</c> when the word has
/// at most one character, when the pieces returned by spm_encode do not concatenate
/// back to the input, or when the word was not broken at all.
/// </returns>
public int[] GetSplitPoints(string segmentMe)
{
    if (segmentMe.Length <= 1) // nothing to split. This includes space, which is SPM's break symbol, and should not be sent.
    {
        return null;
    }

    // obtain a server process if available, or create a new one if all are in use
    if (!m_serverPool.TryDequeue(out var processPipe))
    {
        var argv = new List<string> { SpmBinaryDirPath + "spm_encode", "--model", m_tempModelPath };
        if (m_vocabulary != null)
        {
            argv.AddRange(new List<string> { "--vocabulary", m_tempVocabPath });
        }
        Logger.WriteLine($"starting SentencePiece instance as: {" ".JoinItems(argv)}");
        processPipe = new ProcessPipe(argv, envirVariables: new Dictionary<string, string> { { "LC_ALL", "en_US.UTF-8" } }); // @TODO: do we need the environment variable for spm_encode?
    }

    //Logger.WriteLine($"SPM-encoding word {segmentMe}");
    processPipe.process.StandardInput.WriteLine(segmentMe); // @TODO: how do we know/ensure this is UTF-8?
    var encodedWord = processPipe.process.StandardOutput.ReadLine();
    Sanity.Requires(encodedWord != null, "spm_encode unexpectedly terminated");

    // return the process back into the pool
    m_serverPool.Enqueue(processPipe);

    var pieces = encodedWord.Split(' ', options: StringSplitOptions.RemoveEmptyEntries);
    if ("".JoinItems(pieces) != segmentMe)
    {
        Logger.WriteLine($"ignoring word: SentencePiece did not just split the word ('{segmentMe}', -> '{" ".JoinItems(pieces)}')");
        return null;
    }

    // create array of segmentation points
    // E.g. if "abcde" got broken into "ab cde", then we return the split points (0, 2, 5).
    // This code handles the special case of OOV pieces.
    // E.g. if there is no '+' in the SentencePiece vocab, then spm_encode will keep
    // it as '++++'. We must break those up into individual pieces.
    List<int> res = null; // (created lazily)
    int n = 0;            // accumulator for split points

    // Records the current offset n as a split point. The list is only created once a
    // split strictly inside the word is seen, so that the frequent special case of an
    // unbroken single token leaves res null (and null is returned) for efficiency.
    void RecordSplitPoint()
    {
        if (n < segmentMe.Length || res != null)
        {
            if (res == null)
            {
                res = new List<int> { 0, n };
            }
            else
            {
                res.Add(n);
            }
        }
    }

    foreach (var piece in pieces)
    {
        if (m_vocabulary == null || m_vocabulary.Contains(piece))
        {
            n += piece.Length;
            RecordSplitPoint();
        }
        else // special case: OOV. Break at each character.
        {
            int j = 0;
            while (j < piece.Length)
            {
                // length of this piece is 1 Unicode character. Surrogate pairs are 2 characters in C#'s UCS-2 encoding.
                var ucs2Len = (char.IsHighSurrogate(piece[j]) && j + 2 <= piece.Length) ? 2 : 1;
                n += ucs2Len;
                j += ucs2Len;
                RecordSplitPoint();
            }
        }
    }

    return res?.ToArray();
}
/// <summary>
/// Creates a pipe that evaluates <paramref name="condition"/> on each input and forwards
/// the input to <paramref name="onTrue"/> or <paramref name="onFalse"/> accordingly.
/// </summary>
/// <param name="condition">Predicate deciding which branch processes the input.</param>
/// <param name="onTrue">Pipe used when the predicate holds; a null pipe is skipped via <c>?.</c> instead of throwing.</param>
/// <param name="onFalse">Pipe used when the predicate does not hold; a null pipe is skipped via <c>?.</c> instead of throwing.</param>
public ConditionalPipe(
    Func<TIn, Boolean> condition,
    ProcessPipe<TIn, TOut> onTrue,
    ProcessPipe<TIn, TOut> onFalse)
    : base(input => condition(input)
        ? onTrue?.Process(input)
        : onFalse?.Process(input))
{
}