Пример #1
0
        /// <summary>
        /// Renders every segmentation result in <paramref name="crf_out"/> as one text line:
        /// the tokens separated by single spaces, each followed by "[tag]" when its tag is non-empty.
        /// </summary>
        /// <param name="inbuf">Input rows; the first column of each row is concatenated to rebuild the text that token offsets refer to.</param>
        /// <param name="crf_out">Segmentation results; conversion stops at the first null entry.</param>
        /// <returns>One trimmed line per converted result, in array order.</returns>
        private List <string> ConvertCRFTermOutToStringList(List <List <string> > inbuf, crf_seg_out[] crf_out)
        {
            // Rebuild the source text from column 0 of every row; token offset/length index into it.
            StringBuilder textBuilder = new StringBuilder();
            foreach (List <string> row in inbuf)
            {
                textBuilder.Append(row[0]);
            }
            string fullText = textBuilder.ToString();

            List <string> lines       = new List <string>();
            StringBuilder lineBuilder = new StringBuilder();

            foreach (crf_seg_out candidate in crf_out)
            {
                if (candidate == null)
                {
                    //A null slot means there are no further results
                    break;
                }

                lineBuilder.Clear();
                for (int k = 0; k < candidate.Count; k++)
                {
                    var    token   = candidate.tokenList[k];
                    string surface = fullText.Substring(token.offset, token.length);
                    string tag     = token.strTag;

                    lineBuilder.Append(surface);
                    if (tag.Length > 0)
                    {
                        lineBuilder.Append('[').Append(tag).Append(']');
                    }
                    lineBuilder.Append(' ');
                }

                // Trim removes the separator appended after the last token.
                lines.Add(lineBuilder.ToString().Trim());
            }

            return(lines);
        }
Пример #2
0
        /// <summary>
        /// Decodes the text in <c>_property.predictstring</c> with <c>_property.thread</c> parallel workers
        /// and returns everything the workers wrote as a single string.
        /// </summary>
        /// <remarks>
        /// Each worker reads one record at a time under <c>rdLocker</c> and, inside the same lock,
        /// registers its buffer in a FIFO queue. A worker writes its result only once its buffer is at
        /// the head of that queue, so output is emitted in the order the records were read.
        /// </remarks>
        /// <param name="_property">
        /// Decoder settings. <c>outputstyle == 0</c> selects the raw output written by <c>OutputRawResult</c>;
        /// any other value selects the segmented lines produced by <c>ConvertCRFTermOutToStringList</c>.
        /// <c>nbest</c> and <c>probLevel</c> are forwarded to the tagger.
        /// </param>
        /// <returns>The accumulated output text (empty when no record was processed).</returns>
        public string Predict(DecoderArgs _property)
        {
            ParallelOptions parallelOption = new ParallelOptions();
            bool            rawOutput      = _property.outputstyle == 0;

            // Single ordering queue: only one output path is active per call, so one queue is enough.
            ConcurrentQueue <List <List <string> > > pendingRecords = new ConcurrentQueue <List <List <string> > >();

            using (StringWriter output = new StringWriter())
            using (StringReader sr = new StringReader(_property.predictstring))
            {
                parallelOption.MaxDegreeOfParallelism = _property.thread;
                Parallel.For(0, parallelOption.MaxDegreeOfParallelism, parallelOption, t =>
                {
                    //Create decoder tagger instance. Each worker needs its own separate instance
                    SegDecoderTagger tagger = _crfWrapper.CreateTagger();
                    tagger.set_nbest(_property.nbest);
                    tagger.set_vlevel(_property.probLevel);

                    //Initialize result slots, one per requested n-best entry
                    crf_seg_out[] crf_out = new crf_seg_out[_property.nbest];
                    for (int i = 0; i < _property.nbest; i++)
                    {
                        crf_out[i] = new crf_seg_out();
                    }

                    // One buffer per worker, refilled by ReadRecord for every record this worker takes.
                    List <List <string> > inbuf = new List <List <string> >();
                    while (true)
                    {
                        // Reading and enqueueing happen under the same lock so queue order == read order.
                        lock (rdLocker)
                        {
                            if (ReadRecord(inbuf, sr) == false)
                            {
                                break;
                            }

                            pendingRecords.Enqueue(inbuf);
                        }

                        try
                        {
                            if (rawOutput)
                            {
                                //Call CRFSharp wrapper to predict given string's tags
                                _crfWrapper.Segment((crf_term_out[])crf_out, (DecoderTagger)tagger, inbuf);

                                //Write raw tagged result once it is this record's turn
                                WaitForOutputTurn(pendingRecords, inbuf);
                                OutputRawResult(inbuf, crf_out, tagger, output);
                            }
                            else
                            {
                                _crfWrapper.Segment(crf_out, tagger, inbuf);
                                List <string> rstList = ConvertCRFTermOutToStringList(inbuf, crf_out);

                                //Write segmented tagged result once it is this record's turn
                                WaitForOutputTurn(pendingRecords, inbuf);
                                foreach (string item in rstList)
                                {
                                    output.WriteLine(item);
                                }
                            }
                        }
                        finally
                        {
                            // Always release this record's slot, even when decoding or writing threw;
                            // otherwise every worker queued behind it would wait forever.
                            WaitForOutputTurn(pendingRecords, inbuf);
                            List <List <string> > finished;
                            pendingRecords.TryDequeue(out finished);
                        }
                    }
                });

                return(output.ToString());
            }
        }

        /// <summary>
        /// Blocks the calling worker until <paramref name="record"/> is the head of <paramref name="pending"/>.
        /// </summary>
        /// <param name="pending">Queue holding the buffers of all records that have been read but not yet written.</param>
        /// <param name="record">The caller's own buffer; it must already have been enqueued.</param>
        private static void WaitForOutputTurn(ConcurrentQueue <List <List <string> > > pending, List <List <string> > record)
        {
            // SpinWait backs off (yield/sleep) after a few iterations instead of burning a core.
            System.Threading.SpinWait spinner = new System.Threading.SpinWait();
            List <List <string> >     head;

            while (pending.TryPeek(out head) == false || object.ReferenceEquals(head, record) == false)
            {
                spinner.SpinOnce();
            }
        }
Пример #3
0
        /// <summary>
        /// Fills <paramref name="term_buf"/> with the raw per-term result (via <c>termbuf_build</c>) and then
        /// merges consecutive terms into tokens appended to <c>term_buf.tokenList</c>.
        /// </summary>
        /// <remarks>
        /// A term whose tag starts with "B_" or "M_" keeps the current token open; any other tag, or the
        /// last term of the input, closes it. The token takes its tag from the closing term: the text
        /// after the first '_' when there is one, an empty string for "NOR", otherwise the tag itself.
        /// </remarks>
        /// <param name="term_buf">Output buffer; it is cleared before being rebuilt.</param>
        /// <returns><c>Utils.ERROR_SUCCESS</c> on success, otherwise the error code returned by <c>termbuf_build</c>.</returns>
        int seg_termbuf_build(crf_seg_out term_buf)
        {
            term_buf.Clear();

            //build raw result at first
            int iRet = termbuf_build(term_buf);

            if (iRet != Utils.ERROR_SUCCESS)
            {
                return(iRet);
            }

            //Then build token result
            int    term_len = 0;   // accumulated length of the token being built
            double weight   = 0.0; // accumulated term weights since the last reset
            int    num      = 0;   // number of terms accumulated since the last reset

            for (int i = 0; i < x_.Count; i++)
            {
                //Adding the length of current token
                string strTag = term_buf.result_[i];
                term_len += x_[i][0].Length;
                weight   += term_buf.weight_[i];
                num++;

                // Tags are machine identifiers, so compare ordinally rather than with the current culture.
                bool tokenContinues = strTag.StartsWith("B_", System.StringComparison.Ordinal) ||
                                      strTag.StartsWith("M_", System.StringComparison.Ordinal);

                //Check if current term is the end of a token
                if (tokenContinues == false || i == x_.Count - 1)
                {
                    SegToken tkn = new SegToken();
                    tkn.length = term_len;
                    // The token starts where the previously emitted tokens ended.
                    tkn.offset = term_buf.termTotalLength;

                    int spos = strTag.IndexOf('_');
                    if (spos < 0)
                    {
                        // No prefix: "NOR" is reported as an untagged token, anything else verbatim.
                        if (strTag == "NOR")
                        {
                            tkn.strTag = "";
                        }
                        else
                        {
                            tkn.strTag = strTag;
                        }
                    }
                    else
                    {
                        // Strip the positional prefix (everything up to and including the first '_').
                        tkn.strTag = strTag.Substring(spos + 1);
                    }

                    term_buf.termTotalLength += term_len;

                    //Calculate each token's weight
                    // Only levels 0 and 2 are handled; for any other vlevel_ the token keeps the
                    // fWeight that SegToken's construction gave it and weight/num are not reset.
                    switch (vlevel_)
                    {
                    case 0:
                        tkn.fWeight = 0.0;
                        break;

                    case 2:
                        // Average of the term weights that make up this token.
                        tkn.fWeight = weight / num;
                        weight      = 0.0;
                        num         = 0;
                        break;
                    }

                    term_buf.tokenList.Add(tkn);
                    term_len = 0;
                }
            }


            return(Utils.ERROR_SUCCESS);
        }