// Fills term_buf with the raw per-term result (via termbuf_build) and then
// groups consecutive terms into tokens appended to term_buf.tokenList.
//
// A token ends at the first term whose tag starts with neither "B_" nor "M_",
// or at the last term of the input. The token's tag is the text after the
// first '_' of that closing term's tag; a tag without '_' is kept verbatim,
// except "NOR", which becomes the empty string.
//
// Returns the error code of termbuf_build when it fails, otherwise
// Utils.ERROR_SUCCESS.
int seg_termbuf_build(crf_seg_out term_buf)
{
    term_buf.Clear();

    //build raw result at first
    var iRet = termbuf_build(term_buf);
    if (iRet != Utils.ERROR_SUCCESS)
    {
        return iRet;
    }

    //Then build token result
    var term_len = 0;
    var weight = 0.0;
    var num = 0;
    var lastIndex = x_.Count - 1;
    for (var i = 0; i < x_.Count; i++)
    {
        //Accumulate the length and weight of the current term
        var strTag = term_buf.result_[i];
        term_len += x_[i][0].Length;
        weight += term_buf.weight_[i];
        num++;

        //Tags are machine labels, so compare them ordinally instead of
        //with the culture-sensitive default of StartsWith(string).
        var insideToken = strTag.StartsWith("B_", System.StringComparison.Ordinal)
                          || strTag.StartsWith("M_", System.StringComparison.Ordinal);
        if (insideToken && i != lastIndex)
        {
            //Current term is not the end of a token yet
            continue;
        }

        var tkn = new SegToken();
        tkn.length = term_len;
        tkn.offset = term_buf.termTotalLength;

        var spos = strTag.IndexOf('_');
        if (spos >= 0)
        {
            tkn.strTag = strTag.Substring(spos + 1);
        }
        else if (strTag == "NOR")
        {
            tkn.strTag = "";
        }
        else
        {
            tkn.strTag = strTag;
        }

        term_buf.termTotalLength += term_len;

        //Calculate each token's weight. Only levels 0 and 2 assign fWeight;
        //any other level leaves it at the value SegToken was created with.
        switch (vlevel_)
        {
            case 0:
                tkn.fWeight = 0.0;
                break;
            case 2:
                //Average weight of the terms that make up this token
                tkn.fWeight = weight / num;
                weight = 0.0;
                num = 0;
                break;
        }

        term_buf.tokenList.Add(tkn);
        term_len = 0;
    }

    return Utils.ERROR_SUCCESS;
}
//Segment given text.
//  pout   - segment result, filled by tagger.output
//  tagger - tagger per thread
//  inbuf  - feature set for segment
//Runs reset -> add -> parse -> output on the tagger and returns the first
//negative status any of those steps reports. An empty inbuf, or a run in
//which every step succeeds, returns Utils.ERROR_SUCCESS.
public int Segment(crf_seg_out[] pout, SegDecoderTagger tagger, List<List<string>> inbuf)
{
    //Empty input string: nothing to do
    if (inbuf.Count == 0)
    {
        return Utils.ERROR_SUCCESS;
    }

    var status = tagger.reset();
    if (status < 0)
    {
        return status;
    }

    status = tagger.add(inbuf);
    if (status < 0)
    {
        return status;
    }

    //parse
    status = tagger.parse();
    if (status < 0)
    {
        return status;
    }

    //wrap result
    status = tagger.output(pout);
    if (status < 0)
    {
        return status;
    }

    return Utils.ERROR_SUCCESS;
}
//Convert CRFSharp output format to string list.
//The input text is rebuilt by concatenating the first column of every row in
//inbuf. Each non-null entry of crf_out then yields one line: every token is
//the slice of that text given by the token's offset/length, followed by
//"[tag]" when the tag is not empty, with tokens separated by a single space.
//Conversion stops at the first null entry of crf_out.
private List<string> ConvertCRFTermOutToStringList(List<List<string>> inbuf, crf_seg_out[] crf_out)
{
    var textBuilder = new StringBuilder();
    foreach (var row in inbuf)
    {
        textBuilder.Append(row[0]);
    }
    var fullText = textBuilder.ToString();

    var lines = new List<string>();
    foreach (var candidate in crf_out)
    {
        if (candidate == null)
        {
            //No more result
            break;
        }

        var line = new StringBuilder();
        for (var k = 0; k < candidate.Count; k++)
        {
            var token = candidate.tokenList[k];
            line.Append(fullText.Substring(token.offset, token.length));
            if (token.strTag.Length > 0)
            {
                line.Append('[').Append(token.strTag).Append(']');
            }
            line.Append(' ');
        }

        //Trim drops the separator appended after the last token
        lines.Add(line.ToString().Trim());
    }

    return lines;
}
// Decodes every record of options.strInputFileName with the model in
// options.strModelFileName, using options.thread parallel workers.
// Raw tagged output goes to options.strOutputFileName and segmented output to
// options.strOutputSegFileName; either is skipped when its name is null or empty.
// Returns false when the input or model file does not exist or the model
// fails to load, true otherwise.
bool Decode(CRFSharpWrapper.DecoderArgs options)
{
    if (File.Exists(options.strInputFileName) == false)
    {
        Console.WriteLine("FAILED: Open {0} file failed.", options.strInputFileName);
        return false;
    }

    if (File.Exists(options.strModelFileName) == false)
    {
        Console.WriteLine("FAILED: Open {0} file failed.", options.strModelFileName);
        return false;
    }

    // The using blocks close the reader and both writers on every exit path,
    // including the early return on a failed model load and any exception
    // thrown by a worker. A null resource is allowed and simply skipped.
    using (var sr = new StreamReader(options.strInputFileName))
    using (StreamWriter sw = string.IsNullOrEmpty(options.strOutputFileName)
        ? null
        : new StreamWriter(options.strOutputFileName))
    using (StreamWriter swSeg = string.IsNullOrEmpty(options.strOutputSegFileName)
        ? null
        : new StreamWriter(options.strOutputSegFileName))
    {
        //Create CRFSharp wrapper instance. It's a global instance
        var crfWrapper = new CRFSharpWrapper.Decoder();

        //Load model from file
        if (crfWrapper.LoadModel(options.strModelFileName) == false)
        {
            return false;
        }

        // Records are enqueued in read order; a worker may only write once its
        // own record is at the head of the queue, which keeps the output in
        // the same order as the input.
        var queueRecords = new ConcurrentQueue<List<List<string>>>();
        var queueSegRecords = new ConcurrentQueue<List<List<string>>>();

        var parallelOption = new ParallelOptions();
        parallelOption.MaxDegreeOfParallelism = options.thread;
        Parallel.For(0, options.thread, parallelOption, t =>
        {
            //Create decoder tagger instance. If the running environment is multi-threads,
            //each thread needs a separated instance
            var tagger = crfWrapper.CreateTagger(options.nBest, options.maxword);
            tagger.set_vlevel(options.probLevel);

            //Initialize result
            var crf_out = new crf_seg_out[options.nBest];
            for (var i = 0; i < options.nBest; i++)
            {
                crf_out[i] = new crf_seg_out(tagger.crf_max_word_num);
            }

            // One buffer per worker, reused for every record this worker reads.
            var inbuf = new List<List<string>>();
            while (true)
            {
                lock (rdLocker)
                {
                    if (ReadRecord(inbuf, sr) == false)
                    {
                        break;
                    }

                    // Only queue for writers that exist: a queue whose writer is
                    // absent is never dequeued and would otherwise grow forever.
                    if (sw != null)
                    {
                        queueRecords.Enqueue(inbuf);
                    }
                    if (swSeg != null)
                    {
                        queueSegRecords.Enqueue(inbuf);
                    }
                }

                //Call CRFSharp wrapper to predict given string's tags
                if (swSeg != null)
                {
                    crfWrapper.Segment(crf_out, tagger, inbuf);
                }
                else
                {
                    crfWrapper.Segment((crf_term_out[])crf_out, (DecoderTagger)tagger, inbuf);
                }

                List<List<string>> peek = null;

                //Save segmented tagged result into file
                if (swSeg != null)
                {
                    var rstList = ConvertCRFTermOutToStringList(inbuf, crf_out);

                    // Spin until this worker's record is next in line
                    while (peek != inbuf)
                    {
                        queueSegRecords.TryPeek(out peek);
                    }

                    for (int index = 0; index < rstList.Count; index++)
                    {
                        swSeg.WriteLine(rstList[index]);
                    }

                    queueSegRecords.TryDequeue(out peek);
                    peek = null;
                }

                //Save raw tagged result (with probability) into file
                if (sw != null)
                {
                    // Spin until this worker's record is next in line
                    while (peek != inbuf)
                    {
                        queueRecords.TryPeek(out peek);
                    }

                    OutputRawResultToFile(inbuf, crf_out, tagger, sw);
                    queueRecords.TryDequeue(out peek);
                }
            }
        });
    }

    return true;
}
// Decodes every record of options.strInputFileName with the model in
// options.strModelFileName, using options.thread parallel workers, and logs
// the elapsed time.
// Raw tagged output goes to options.strOutputFileName and segmented output to
// options.strOutputSegFileName; either is skipped when its name is null or empty.
// Returns false when the input or model file does not exist, true otherwise.
public bool Decode(CRFSharpWrapper.DecoderArgs options)
{
    var watch = Stopwatch.StartNew();

    if (File.Exists(options.strInputFileName) == false)
    {
        Logger.WriteLine("FAILED: Open {0} file failed.", options.strInputFileName);
        return false;
    }

    if (File.Exists(options.strModelFileName) == false)
    {
        Logger.WriteLine("FAILED: Open {0} file failed.", options.strModelFileName);
        return false;
    }

    // The using blocks close the reader and both writers on every exit path,
    // including an exception thrown by model loading or by a worker.
    // A null resource is allowed and simply skipped.
    using (var sr = new StreamReader(options.strInputFileName))
    using (StreamWriter sw = string.IsNullOrEmpty(options.strOutputFileName)
        ? null
        : new StreamWriter(options.strOutputFileName))
    using (StreamWriter swSeg = string.IsNullOrEmpty(options.strOutputSegFileName)
        ? null
        : new StreamWriter(options.strOutputSegFileName))
    {
        //Create CRFSharp wrapper instance. It's a global instance
        var crfWrapper = new CRFSharpWrapper.Decoder();

        //Load encoded model from file
        // NOTE(review): the outcome of LoadModel is not checked here - confirm
        // that a failed load surfaces as an exception rather than a return value.
        Logger.WriteLine("Loading model from {0}", options.strModelFileName);
        crfWrapper.LoadModel(options.strModelFileName);

        // Records are enqueued in read order; a worker may only write once its
        // own record is at the head of the queue, which keeps the output in
        // the same order as the input.
        var queueRecords = new ConcurrentQueue<List<List<string>>>();
        var queueSegRecords = new ConcurrentQueue<List<List<string>>>();

        var parallelOption = new ParallelOptions();
        parallelOption.MaxDegreeOfParallelism = options.thread;
        Parallel.For(0, options.thread, parallelOption, t =>
        {
            //Create decoder tagger instance. If the running environment is multi-threads,
            //each thread needs a separated instance
            var tagger = crfWrapper.CreateTagger(options.nBest, options.maxword);
            tagger.set_vlevel(options.probLevel);

            //Initialize result
            var crf_out = new crf_seg_out[options.nBest];
            for (var i = 0; i < options.nBest; i++)
            {
                crf_out[i] = new crf_seg_out(tagger.crf_max_word_num);
            }

            // One buffer per worker, reused for every record this worker reads.
            var inbuf = new List<List<string>>();
            while (true)
            {
                lock (rdLocker)
                {
                    if (ReadRecord(inbuf, sr) == false)
                    {
                        break;
                    }

                    // Only queue for writers that exist: a queue whose writer is
                    // absent is never dequeued and would otherwise grow forever.
                    if (sw != null)
                    {
                        queueRecords.Enqueue(inbuf);
                    }
                    if (swSeg != null)
                    {
                        queueSegRecords.Enqueue(inbuf);
                    }
                }

                //Call CRFSharp wrapper to predict given string's tags
                if (swSeg != null)
                {
                    crfWrapper.Segment(crf_out, tagger, inbuf);
                }
                else
                {
                    crfWrapper.Segment(crf_out, (DecoderTagger)tagger, inbuf);
                }

                List<List<string>> peek = null;

                //Save segmented tagged result into file
                if (swSeg != null)
                {
                    List<string> rstList = ConvertCRFTermOutToStringList(inbuf, crf_out);

                    // Spin until this worker's record is next in line
                    while (peek != inbuf)
                    {
                        queueSegRecords.TryPeek(out peek);
                    }

                    foreach (var item in rstList)
                    {
                        swSeg.WriteLine(item);
                    }

                    queueSegRecords.TryDequeue(out peek);
                    peek = null;
                }

                //Save raw tagged result (with probability) into file
                if (sw != null)
                {
                    // Spin until this worker's record is next in line
                    while (peek != inbuf)
                    {
                        queueRecords.TryPeek(out peek);
                    }

                    OutputRawResultToFile(inbuf, crf_out, tagger, sw);
                    queueRecords.TryDequeue(out peek);
                }
            }
        });
    }

    watch.Stop();
    Logger.WriteLine("Elapsed: {0} ms", watch.ElapsedMilliseconds);

    return true;
}
// Writes the decoded result(s) into pout.
// With nbest_ == 1 only pout[0] is built and next() is not used.
// Otherwise up to min(nbest_, pout.Length) slots are filled, calling next()
// before each one; a negative result from next() stops the filling early.
// Returns the negative code from seg_termbuf_build when building a slot
// fails, otherwise Utils.ERROR_SUCCESS.
public int output(crf_seg_out[] pout)
{
    if (nbest_ == 1)
    {
        //If only best result and no need probability, "next" is not to be used
        var single = seg_termbuf_build(pout[0]);
        if (single < 0)
        {
            return single;
        }

        return Utils.ERROR_SUCCESS;
    }

    //Fill the n best result, bounded by the number of slots the caller supplied
    var limit = pout.Length < nbest_ ? pout.Length : nbest_;
    for (var slot = 0; slot < limit; ++slot)
    {
        // NOTE(review): when next() fails the remaining slots are left untouched
        // and success is still reported - confirm callers expect that.
        if (next() < 0)
        {
            break;
        }

        var built = seg_termbuf_build(pout[slot]);
        if (built < 0)
        {
            return built;
        }
    }

    return Utils.ERROR_SUCCESS;
}
// Fills term_buf with the raw per-term result (via termbuf_build) and then
// groups consecutive terms into tokens appended to term_buf.tokenList.
//
// A token ends at the first term whose tag starts with neither "B_" nor "M_",
// or at the last term of the input. The token's tag is the text after the
// first '_' of that closing term's tag; a tag without '_' is kept verbatim,
// except "NOR", which becomes the empty string.
//
// Returns the error code of termbuf_build when it fails, otherwise
// Utils.ERROR_SUCCESS.
int seg_termbuf_build(crf_seg_out term_buf)
{
    term_buf.Clear();

    //build raw result at first
    var iRet = termbuf_build(term_buf);
    if (iRet != Utils.ERROR_SUCCESS)
    {
        return iRet;
    }

    //Then build token result
    var term_len = 0;
    var weight = 0.0;
    var num = 0;
    var lastIndex = x_.Count - 1;
    for (var i = 0; i < x_.Count; i++)
    {
        //Accumulate the length and weight of the current term
        var strTag = term_buf.result_[i];
        term_len += x_[i][0].Length;
        weight += term_buf.weight_[i];
        num++;

        //Tags are machine labels, so compare them ordinally instead of
        //with the culture-sensitive default of StartsWith(string).
        var insideToken = strTag.StartsWith("B_", System.StringComparison.Ordinal)
                          || strTag.StartsWith("M_", System.StringComparison.Ordinal);
        if (insideToken && i != lastIndex)
        {
            //Current term is not the end of a token yet
            continue;
        }

        var tkn = new SegToken();
        tkn.length = term_len;
        tkn.offset = term_buf.termTotalLength;

        var spos = strTag.IndexOf('_');
        if (spos >= 0)
        {
            tkn.strTag = strTag.Substring(spos + 1);
        }
        else if (strTag == "NOR")
        {
            tkn.strTag = "";
        }
        else
        {
            tkn.strTag = strTag;
        }

        term_buf.termTotalLength += term_len;

        //Calculate each token's weight. Only levels 0 and 2 assign fWeight;
        //any other level leaves it at the value SegToken was created with.
        switch (vlevel_)
        {
            case 0:
                tkn.fWeight = 0.0;
                break;
            case 2:
                //Average weight of the terms that make up this token
                tkn.fWeight = weight / num;
                weight = 0.0;
                num = 0;
                break;
        }

        term_buf.tokenList.Add(tkn);
        term_len = 0;
    }

    return Utils.ERROR_SUCCESS;
}