Example #1
0
        /// <summary>
        /// Manual smoke test: loads the core and bigram dictionaries, builds the word net for a
        /// fixed sample sentence, generates the bigram graph and prints it to the console.
        /// Prints an error message and returns early if either dictionary fails to load.
        /// </summary>
        public static void TestBiGraphGenerate()
        {
            // Core (unigram) dictionary.
            WordDictionary coreDict = new WordDictionary();
            bool coreLoaded = coreDict.Load(coreDictFile);
            if (!coreLoaded)
            {
                Console.WriteLine("coreDict 字典装入错误!");
                return;
            }

            // Bigram dictionary.
            WordDictionary biDict = new WordDictionary();
            bool biLoaded = biDict.Load(biDictFile);
            if (!biLoaded)
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            // Wrap the sample sentence in the begin/end markers.
            string sentence = Predefine.SENTENCE_BEGIN + @"他说的确实在理" + Predefine.SENTENCE_END;

            // Step 1: atom segmentation.
            List<AtomNode> atoms = Segment.AtomSegment(sentence);

            // Step 2: look every candidate word up in the dictionary and store them in the word net.
            RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, coreDict);

            // Step 3: build the graph of all adjacent word pairs.
            ColumnFirstDynamicArray<ChainContent> biGraph = Segment.BiGraphGenerate(wordNet, 0.1, biDict, coreDict);

            Console.WriteLine(biGraph.ToString());
        }
Example #2
0
 /// <summary>
 /// Creates a segmenter that uses the supplied dictionaries and two fresh
 /// <c>NShortPath</c> instances (one "raw", one "opt").
 /// </summary>
 /// <param name="biDict">Bigram dictionary stored on the instance.</param>
 /// <param name="coreDict">Core dictionary stored on the instance.</param>
 public Segment(WordDictionary biDict, WordDictionary coreDict)
 {
    // Keep references to the caller's dictionaries.
    this.coreDict = coreDict;
    this.biDict = biDict;

    // Each path finder starts out as an independent, default-constructed instance.
    this.optNShortPath = new NShortPath();
    this.rawNShortPath = new NShortPath();
 }
      /// <summary>
      /// Creates a word segmenter with empty dictionaries, a POS tagger, three unknown-word
      /// recognizers and a <c>Segment</c> instance wired to this object's segment-event handler.
      /// </summary>
      /// <param name="SmoothingParameter">Smoothing parameter stored for later use.</param>
      public WordSegment(double SmoothingParameter)
      {
         m_dSmoothingPara = SmoothingParameter;

         // Dictionaries are created empty here; nothing is loaded in this constructor.
         m_dictCore = new WordDictionary();
         m_dictBigram = new WordDictionary();

         m_POSTagger = new Span();

         // Unknown-word recognizers.
         m_uPerson = new UnknowWord();
         m_uTransPerson = new UnknowWord();
         m_uPlace = new UnknowWord();

         // The segmenter shares the two dictionaries created above.
         m_Seg = new Segment(m_dictBigram, m_dictCore);
         m_Seg.OnSegmentEvent += OnSegmentEventHandler;
      }
      /// <summary>
      /// Unknown-word recognition: role-tags the segmentation result, then writes every
      /// recognized unknown word into the optimized segmentation graph when it is cheaper
      /// than the element already stored for the same atom span.
      /// </summary>
      /// <param name="pWordSegResult">Word segmentation result to tag.</param>
      /// <param name="graphOptimum">The optimized segmentation graph; updated in place.</param>
      /// <param name="atomSegment">Atoms of the sentence, used to convert character positions to atom indices.</param>
      /// <param name="dictCore">Core dictionary passed to the role tagger.</param>
      /// <returns>Always <c>true</c>.</returns>
      public bool Recognition(WordResult[] pWordSegResult, RowFirstDynamicArray<ChainContent> graphOptimum,
         List<AtomNode> atomSegment, WordDictionary dictCore)
      {
         // Running character offset and atom index; both advance monotonically across
         // all unknown words and are never reset inside the loop.
         int nStartPos = 0, j = 0;

         // Tag the segmentation with unknown-recognition roles according to the core
         // dictionary and the unknown-recognition dictionary.
         m_roleTag.POSTagging(pWordSegResult, dictCore, m_dict);

         for (int i = 0; i < m_roleTag.m_nUnknownWordsCount; i++)
         {
            // Advance to the atom at which the unknown word starts ...
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 0])
               nStartPos += atomSegment[j++].sWord.Length;
            int nAtomStart = j;

            // ... and on to the atom at which it ends.
            while (j < atomSegment.Count && nStartPos < m_roleTag.m_nUnknownWords[i, 1])
               nStartPos += atomSegment[j++].sWord.Length;
            int nAtomEnd = j;

            // An empty atom span has nothing to record.
            if (nAtomStart >= nAtomEnd)
               continue;

            // Weight of whatever currently occupies this span; a missing element counts as infinite.
            ChainItem<ChainContent> item = graphOptimum.GetElement(nAtomStart, nAtomEnd);
            double dValue;
            if (item != null)
               dValue = item.Content.eWeight;
            else
               dValue = Predefine.INFINITE_VALUE;

            // Replace the element only when the recognized word has the smaller value.
            if (dValue > m_roleTag.m_dWordsPossibility[i])
               graphOptimum.SetElement(nAtomStart, nAtomEnd, new ChainContent(m_roleTag.m_dWordsPossibility[i], m_nPOS, m_sUnknownFlags));
         }
         return true;
      }
Example #5
0
        /// <summary>
        /// Builds the bigram graph for a word net: one element for every pair of words in
        /// which the second word starts at the column where the first one ends.
        /// </summary>
        /// <param name="aWord">Word net (row-first) whose elements are the candidate words.</param>
        /// <param name="smoothPara">Smoothing factor a in -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)}, 0 &lt; a &lt; 1.</param>
        /// <param name="biDict">Dictionary queried for the frequency of the joined word pair.</param>
        /// <param name="coreDict">Dictionary queried for the frequency of words whose POS is negative.</param>
        /// <returns>The column-first graph of word pairs and their computed values.</returns>
        public static ColumnFirstDynamicArray<ChainContent> BiGraphGenerate(
         RowFirstDynamicArray<ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
        {
            ColumnFirstDynamicArray<ChainContent> biWordNet = new ColumnFirstDynamicArray<ChainContent>();
            StringBuilder pairBuilder = new StringBuilder();

            // Record the position map of possible words.
            int[] positionMap = PreparePositionMap(aWord);

            // Smoothing floor; it does not depend on the words, so compute it once.
            double floorProb = 1.0 / Predefine.MAX_FREQUENCE;

            for (ChainItem<ChainContent> current = aWord.GetHead(); current != null; current = current.next)
            {
                // A non-negative POS means a known word whose weight is its frequency;
                // otherwise the frequency is looked up in the core dictionary.
                double currentFreq;
                if (current.Content.nPOS >= 0)
                {
                    currentFreq = current.Content.eWeight;
                }
                else
                {
                    currentFreq = coreDict.GetFrequency(current.Content.sWord, 2);
                }

                // Successors are the words whose row equals the current word's column.
                for (ChainItem<ChainContent> successor = aWord.GetFirstElementOfRow(current.col);
                     successor != null && successor.row == current.col;
                     successor = successor.next)
                {
                    // Key for the bigram dictionary: "<current><segmenter><successor>".
                    pairBuilder.Length = 0;
                    pairBuilder.Append(current.Content.sWord)
                               .Append(Predefine.WORD_SEGMENTER)
                               .Append(successor.Content.sWord);
                    string pairKey = pairBuilder.ToString();

                    // Frequency of the two linked words.
                    int pairFreq = biDict.GetFrequency(pairKey, 3);

                    // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
                    double cost = -Math.Log(smoothPara * (1.0 + currentFreq) / (Predefine.MAX_FREQUENCE + 80000.0)
                      + (1.0 - smoothPara) * ((1.0 - floorProb) * pairFreq / (1.0 + currentFreq) +
                      floorProb));

                    // Unknown words: P(Wi|Ci); known words: 1.
                    // NOTE(review): this adds the (negative) POS code itself to the cost although the
                    // remark above speaks of a probability term; it may have been meant to add the
                    // node weight instead. Confirm against the reference implementation before changing.
                    if (current.Content.nPOS < 0)
                    {
                        cost += current.Content.nPOS;
                    }

                    // Translate both words' (row, col) into indices of the position map.
                    int currentIndex = Utility.BinarySearch(current.row * Predefine.MAX_SENTENCE_LEN + current.col, positionMap);
                    int successorIndex = Utility.BinarySearch(successor.row * Predefine.MAX_SENTENCE_LEN + successor.col, positionMap);

                    biWordNet.SetElement(currentIndex, successorIndex, new ChainContent(cost, current.Content.nPOS, pairKey));
                }
            }

            return biWordNet;
        }
Example #6
0
        /// <summary>
        /// Imports an external word list into a dictionary. Each new word's frequency is divided
        /// by a rate before it is added: <paramref name="ImportFrqRate"/> when that is positive,
        /// otherwise the average rate reported for the overlapping words.
        /// </summary>
        /// <param name="ImportDicFile">File name of the external word list.</param>
        /// <param name="ImportEncoding">Encoding of the external word list.</param>
        /// <param name="SourceDicFile">File name of the source dct file.</param>
        /// <param name="DestDicFile">File name of the destination dct file.</param>
        /// <param name="DicFormat">Format of the external word list.</param>
        /// <param name="OddLines">Lines of the import file that are invalid and not present in the source dictionary.</param>
        /// <param name="AvgFrqRate">Average frequency rate of the import file, as reported by <c>FindDifferent</c>.</param>
        /// <param name="ImportFrqRate">Fixed frequency rate for the import file (frequencies are divided by it);
        /// when less than or equal to 0, <paramref name="AvgFrqRate"/> is used instead.</param>
        /// <returns>The number of imported words.</returns>
        /// <exception cref="Exception">The source dictionary could not be loaded.</exception>
        public static int ImportDictionary(string ImportDicFile, Encoding ImportEncoding, string SourceDicFile, string DestDicFile, DictionaryFormat DicFormat, out string[] OddLines, out double AvgFrqRate, double ImportFrqRate = 0)
        {
            // Split the import file into new words, existing words and odd lines.
            double MaxFrqRate, MinFrqRate;
            WordDictionary.DicWordInfo[] NewWords;
            WordDictionary.DicWordInfo[] ExistWords;
            FindDifferent(ImportDicFile, ImportEncoding, DicFormat, SourceDicFile, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);

            // Load the source dictionary.
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(SourceDicFile))
                throw new Exception("load source dic file fail");

            try
            {
                // The divisor is the same for every word, so pick it once.
                double rate = ImportFrqRate <= 0 ? AvgFrqRate : ImportFrqRate;

                // A zero or NaN divisor yields Infinity/NaN, which Convert.ToInt32 rejects with an
                // OverflowException. With no usable rate, import the frequencies unscaled instead.
                if (rate == 0 || double.IsNaN(rate))
                    rate = 1.0;

                // Add the new words.
                foreach (WordDictionary.DicWordInfo Word in NewWords)
                {
                    int Frq = Convert.ToInt32(Word.Frequence / rate);
                    dict.AddWord(Word.Word, Word.Pos, Frq);
                }

                // Save to the destination file.
                dict.Save(DestDicFile);
            }
            finally
            {
                // Release the dictionary even when adding or saving fails.
                dict.ReleaseDict();
            }
            return NewWords.Length;
        }
        //====================================================================
        // Merges dict2 into the current dictionary. nRatio is the weight of the current
        // dictionary's frequencies relative to dict2's (nRatio : 1).
        //
        // For every index-table slot the two word lists are walked in parallel, ordered by
        // Utility.CCStringCompare and then by nPOS:
        //   - same word and POS   : frequency = (nRatio * current + dict2) / (nRatio + 1)
        //   - only in current     : frequency = (nRatio * current) / (nRatio + 1)
        //   - only in dict2       : added with frequency / (nRatio + 1), but only when its
        //                           frequency exceeds (nRatio + 1) / 10
        //
        // Returns false, without changing anything, when either dictionary has a modify
        // table; true otherwise.
        //
        // NOTE(review): nRatio == -1 makes every divisor zero - confirm callers never pass it.
        // NOTE(review): the arithmetic is integer arithmetic if nFrequency is an int (not
        // visible here), so results would be truncated - confirm.
        //====================================================================
        public bool Merge(WordDictionary dict2, int nRatio)
        {
            int    i, j, k, nCmpValue;
            string sWord;

            // Merging is refused while either dictionary still has a modify table.
            if (modifyTable != null || dict2.modifyTable != null)
            {
                return(false);
            }

            for (i = 0; i < Predefine.CC_NUM; i++)
            {
                // j walks this dictionary's slot i, k walks dict2's slot i.
                j = 0;
                k = 0;
                while (j < indexTable[i].nCount && k < dict2.indexTable[i].nCount)
                {
                    nCmpValue = Utility.CCStringCompare(indexTable[i].WordItems[j].sWord, dict2.indexTable[i].WordItems[k].sWord);
                    if (nCmpValue == 0)
                    // Same word: break the tie on the POS value.
                    {
                        if (indexTable[i].WordItems[j].nPOS < dict2.indexTable[i].WordItems[k].nPOS)
                        {
                            nCmpValue = -1;
                        }
                        else if (indexTable[i].WordItems[j].nPOS > dict2.indexTable[i].WordItems[k].nPOS)
                        {
                            nCmpValue = 1;
                        }
                    }

                    if (nCmpValue == 0)
                    {
                        // Same word and POS in both: weighted average, then advance both sides.
                        indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency + dict2.indexTable[i].WordItems[k].nFrequency) / (nRatio + 1);
                        j += 1;
                        k += 1;
                    }
                    // The current dictionary's item sorts first: scale it down and move on.
                    else if (nCmpValue < 0)
                    {
                        indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
                        j += 1;
                    }
                    else
                    // dict2's item sorts first: it is missing here, so add it if frequent enough.
                    {
                        if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
                        {
                            // The added word is the character for slot i followed by the stored word text.
                            sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                            // NOTE(review): AddItem is called on this dictionary while j indexes the same
                            // slot; whether it shifts indexTable[i].WordItems is not visible here - confirm.
                            AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
                        }
                        k += 1;
                    }
                }

                // Items left over in the current dictionary: scale them down.
                while (j < indexTable[i].nCount)
                {
                    indexTable[i].WordItems[j].nFrequency = (nRatio * indexTable[i].WordItems[j].nFrequency) / (nRatio + 1);
                    j += 1;
                }

                // Items left over in dict2: add the ones that pass the frequency threshold.
                while (k < dict2.indexTable[i].nCount)
                {
                    if (dict2.indexTable[i].WordItems[k].nFrequency > (nRatio + 1) / 10)
                    {
                        sWord = string.Format("{0}{1}", Utility.CC_ID2Char(i).ToString(), dict2.indexTable[i].WordItems[k].sWord);
                        AddItem(sWord, dict2.indexTable[i].WordItems[k].nPOS, dict2.indexTable[i].WordItems[k].nFrequency / (nRatio + 1));
                    }
                    k += 1;
                }
            }
            return(true);
        }
Example #8
0
        /// <summary>
        /// Scans the best tag sequence (starting at index 1, stopping at the first tag that is
        /// not greater than -1) for place-name candidates and records each one in
        /// m_nUnknownWords / m_dWordsPossibility.
        /// A candidate is triggered by tag 1 (a run of 1s, then 2s, then 3s) or by tag 2
        /// (a run of 2s, then 3s).
        /// </summary>
        /// <param name="dictCore">Not used by this method body.</param>
        /// <param name="placeDict">Dictionary passed to ComputePossibility.</param>
        /// <returns>Always <c>true</c>.</returns>
        public bool PlaceRecognize(WordDictionary dictCore, WordDictionary placeDict)
        {
            int nStart = 1, nEnd = 1, i = 1, nTemp;
            // Penalty value. It only ever grows: it is not reset between candidates in one call.
            // NOTE(review): confirm the carry-over between candidates is intended.
            double dPanelty = 1.0;
            while (m_nBestTag[i] > -1)
            {
                if (m_nBestTag[i] == 1)
                // Tag 1 triggers the recognition process
                {
                    nStart = i;
                    nEnd = nStart + 1;
                    // by zhenyulu: forcing nEnd = nStart + 1 here is somewhat heavy-handed,
                    // so it is compensated for further down.
                    while (m_nBestTag[nEnd] == 1)
                    // Consume the following 1s; each one after the first adds to the penalty
                    {
                        if (nEnd > nStart + 1)
                            dPanelty += 1.0;
                        nEnd++;
                    }
                    while (m_nBestTag[nEnd] == 2)
                        // Consume the run of 2s (original note: 2,12,22)
                        nEnd++;
                    nTemp = nEnd;
                    // Consume the run of 3s; each one after the first adds to the penalty
                    while (m_nBestTag[nEnd] == 3)
                    {
                        if (nEnd > nTemp)
                            dPanelty += 1.0;
                        nEnd++;
                    }
                }
                else if (m_nBestTag[i] == 2)
                // Tag 2 also triggers the recognition, at the cost of one penalty point
                // (original note: 1,11,21)
                {
                    dPanelty += 1.0;
                    nStart = i;
                    nEnd = nStart + 1;
                    while (m_nBestTag[nEnd] == 2)
                        // Consume the run of 2s
                        nEnd++;
                    nTemp = nEnd;
                    while (m_nBestTag[nEnd] == 3)
                    // Consume the run of 3s; each one after the first adds to the penalty
                    {
                        if (nEnd > nTemp)
                            dPanelty += 1.0;
                        nEnd++;
                    }
                }
                if (nEnd > nStart)
                {
                    // by zhenyulu: undo the side effect of the forced nEnd above when it
                    // points at a slot that holds no word.
                    if (m_sWords[nEnd] == null)
                        nEnd--;

                    // Record the candidate's start/end positions and its value:
                    // ComputePossibility over the span plus log(penalty).
                    m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[nStart];
                    m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[nEnd];
                    m_dWordsPossibility[m_nUnknownWordsCount++] = ComputePossibility(nStart, nEnd - nStart + 1, placeDict) +
                       Math.Log(dPanelty);
                    // Collapse the span so the same candidate is not recorded again.
                    nStart = nEnd;
                }
                // Continue after the consumed span, or simply at the next tag.
                if (i < nEnd)
                    i = nEnd;
                else
                    i = i + 1;
            }
            return true;
        }
Example #9
0
        /// <summary>
        /// Reads words from <paramref name="pWordItems"/> starting at <paramref name="nIndex"/> and,
        /// for each one, fills the candidate tag list (m_nTags), the matching values (m_dFrequency),
        /// the word text (m_sWords) and the position table (m_nWordPosition). Reading stops at the
        /// first word that ends up with exactly one candidate tag, at the end of the input, or
        /// at Predefine.MAX_WORDS_PER_SENTENCE. m_nCurLength is set to the number of slots used.
        /// </summary>
        /// <param name="pWordItems">Segmented words to read from.</param>
        /// <param name="nIndex">Index of the first word to read.</param>
        /// <param name="dictCore">Core dictionary.</param>
        /// <param name="dictUnknown">Unknown-recognition dictionary; consulted when the tag type is not TT_NORMAL.</param>
        /// <returns>The next start position (nWordsIndex + 1), or -1 when the end of the input was reached.</returns>
        private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            WordInfo info;
            // NOTE(review): aPOS and aFreq are allocated but never used in this method body;
            // they look like leftovers of the commented-out GetHandle calls below.
            int[] aPOS = new int[Predefine.MAX_POS_PER_WORD];
            int[] aFreq = new int[Predefine.MAX_POS_PER_WORD];
            int nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
            bool bSplit = false; //Set while the first half of a split word has been emitted and the second half is pending
            int i = 1, nPOSCount;
            string sCurWord; //Current word

            // i is 1 here, so this is simply nIndex.
            nWordsIndex = i + nIndex - 1;
            for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
            {
                if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
                {
                    m_sWords[i] = pWordItems[nWordsIndex].sWord; //store current word
                    m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                }
                else
                {
                    // The word exists in the unknown dictionary with handle 44: emit it as two
                    // slots over two iterations, first character first and the remainder second.
                    if (!bSplit)
                    {
                        m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                        //store the first character
                        bSplit = true;
                    }
                    else
                    {
                        m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                        //store the remainder
                        bSplit = false;
                    }
                    m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                }
                //Record the position just past the current word
                m_nStartPos = m_nWordPosition[i + 1];
                //Move the Start POS to the ending
                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    //Get the POSs from the unknown recognition dictionary
                    sCurWord = m_sWords[i];
                    // NOTE(review): i > 0 always holds here because the loop starts at 1.
                    if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                       Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
                    {
                        // NOTE(review): both branches assign the same literal that was just compared
                        // against, so they have no visible effect; possibly a character mapping was
                        // lost here - confirm.
                        if (m_sWords[i] == ".")
                            sCurWord = ".";
                        else if (m_sWords[i] == "-")
                            sCurWord = "-";
                    }

                    info = dictUnknown.GetWordInfo(sCurWord);
                    if (info != null)
                    {
                        nPOSCount = info.Count + 1;
                        for (j = 0; j < info.Count; j++)
                        {
                            //Get the POS set of sCurWord in the unknown dictionary
                            m_nTags[i, j] = info.POSs[j];
                            m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                               Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                        }
                    }
                    else
                    {
                        // Not in the unknown dictionary: no candidate tags so far.
                        nPOSCount = 1;
                        j = 0;
                    }

                    //Get the POS set of sCurWord in the core dictionary
                    //We ignore the POS in the core dictionary and recognize them as other (0).
                    //We add their frequency to get the possibility as POS 0
                    if (string.Compare(m_sWords[i], "始##始") == 0)
                    {
                        // Sentence-begin marker: fixed tag 100 with value 0.
                        m_nTags[i, j] = 100;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else if (string.Compare(m_sWords[i], "末##末") == 0)
                    {
                        // Sentence-end marker: fixed tag 101 with value 0.
                        m_nTags[i, j] = 101;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        // Legacy call: dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                        info = dictCore.GetWordInfo(m_sWords[i]);
                        nFreq = 0;
                        if (info != null)
                        {
                            // Sum the frequencies of all POS entries of the word.
                            for (int k = 0; k < info.Count; k++)
                            {
                                nFreq += info.Frequencies[k];
                            }
                            if (info.Count > 0)
                            {
                                // Append tag 0 ("other") valued from the summed frequency.
                                m_nTags[i, j] = 0;
                                // Legacy formula: m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                                m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                                j++;
                            }
                        }
                    }
                }
                else
                //For normal POS tagging
                {
                    j = 0;
                    //Use the POS carried by the word item when it has one
                    if (pWordItems[nWordsIndex].nPOS > 0)
                    {
                        //The word has only one POS value
                        //Its POS and value are already recorded in the item.
                        m_nTags[i, j] = pWordItems[nWordsIndex].nPOS;
                        m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                        if (m_dFrequency[i, j] < 0)
                            //Values below 0 are not permitted
                            m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        //Otherwise the POS candidates are retrieved from the core dictionary
                        if (pWordItems[nWordsIndex].nPOS < 0)
                        {
                            //A negative POS is stored negated, with the item's value taken as is.
                            m_nTags[i, j] = -pWordItems[nWordsIndex].nPOS;
                            m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                        }
                        // Legacy call: dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                        info = dictCore.GetWordInfo(m_sWords[i]);
                        if (info != null)
                        {
                            nPOSCount = info.Count;
                            // NOTE(review): j may already be 1 here (negative-POS case above), in which
                            // case the dictionary entry at index 0 is skipped - confirm this is intended.
                            for (; j < info.Count; j++)
                            {
                                //Get the POS set of the word from the core dictionary
                                m_nTags[i, j] = info.POSs[j];
                                m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                            }
                        }
                    }
                }
                if (j == 0)
                {
                    //We do not know the POS, so we have to guess it from lexical knowledge
                    GuessPOS(i, out j); //Guess the POS of current word
                }
                m_nTags[i, j] = -1; //Set the ending POS 
                // NOTE(review): m_nTags[i, j] was set to -1 on the line above, so the second operand
                // compares -1 with CT_SENTENCE_BEGIN; index 0 may have been intended - confirm.
                if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
                //No ambiguity
                {
                    //No ambiguity, so we can break from the loop
                    i++;
                    m_sWords[i] = null;
                    break;
                }
                // While the second half of a split word is pending, stay on the same input word.
                if (!bSplit)
                    nWordsIndex++;
            }
            if (nWordsIndex == pWordItems.Length)
                nRetPos = -1;
            //Reaching ending

            // The last word has more than one candidate tag: append a virtual ending slot.
            if (m_nTags[i - 1, 1] != -1)
            //Legacy alternative condition: ||m_sWords[i][0]==0
            {
                //Set end for words like "张/华/平"
                if (m_tagType != TAG_TYPE.TT_NORMAL)
                    m_nTags[i, 0] = 101;
                else
                    m_nTags[i, 0] = 1;

                m_dFrequency[i, 0] = 0;
                m_sWords[i] = null; //Set virtual ending
                m_nTags[i++, 1] = -1;
            }
            m_nCurLength = i; //The current word count
            if (nRetPos != -1)
                return nWordsIndex + 1;
            //Next start position
            return -1; //Reaching ending
        }
      /// <summary>
      /// Generates the segmentation word net for a sentence that has already been split
      /// into atoms: first one element per atom, then one element for every run of
      /// consecutive atoms that the core dictionary knows as a word.
      /// </summary>
      /// <param name="atomSegment">Atoms of the sentence, in order.</param>
      /// <param name="coreDict">Core dictionary used for matching and frequencies.</param>
      /// <returns>The populated word net.</returns>
      public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
      {
         RowFirstDynamicArray<ChainContent> wordNet = new RowFirstDynamicArray<ChainContent>();
         wordNet.SetEmpty();

         // Pass 1: store every atom as an element spanning (index, index + 1).
         for (int index = 0; index < atomSegment.Count; index++)
         {
            AtomNode atom = atomSegment[index];

            if (atom.nPOS == Predefine.CT_CHINESE)
            {
               wordNet.SetElement(index, index + 1, new ChainContent(0, 0, atom.sWord));
               continue;
            }

            // Non-Chinese atoms: defaults first, then per-type overrides.
            string nodeWord = atom.sWord;
            double nodeWeight = Predefine.MAX_FREQUENCE;
            int nodePos;
            switch (atom.nPOS)
            {
               case Predefine.CT_INDEX:
               case Predefine.CT_NUM:
                  nodePos = -27904; // 'm' * 256, negated
                  nodeWord = "未##数";
                  nodeWeight = 0;
                  break;

               case Predefine.CT_DELIMITER:
                  nodePos = 30464; // 'w' * 256
                  break;

               case Predefine.CT_LETTER:
                  nodePos = -28280; // -'n' * 256 - 'x'
                  nodeWord = "未##串";
                  nodeWeight = 0;
                  break;

               case Predefine.CT_SINGLE: // e.g. 12021-2129-3121
                  // An optionally signed integer or decimal is treated as a number,
                  // anything else as a string.
                  if (Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$"))
                  {
                     nodePos = -27904; // 'm' * 256, negated
                     nodeWord = "未##数";
                  }
                  else
                  {
                     nodePos = -28280; // -'n' * 256 - 'x'
                     nodeWord = "未##串";
                  }
                  nodeWeight = 0;
                  break;

               default:
                  nodePos = atom.nPOS;
                  break;
            }

            wordNet.SetElement(index, index + 1, new ChainContent(nodeWeight, nodePos, nodeWord));
         }

         // Pass 2: store every multi-atom candidate the dictionary can match.
         for (int start = 0; start < atomSegment.Count; start++)
         {
            string candidate = atomSegment[start].sWord; // begins as the atom itself
            int end = start + 1;
            string longestMatch;
            int matchedPos;

            // Keep extending the candidate by one atom while the dictionary still matches it.
            while (end < atomSegment.Count && coreDict.GetMaxMatch(candidate, out longestMatch, out matchedPos))
            {
               if (longestMatch == candidate) // the candidate itself is a dictionary word
               {
                  WordInfo info = coreDict.GetWordInfo(candidate); // the word may have several POS entries

                  // Sum the frequencies over all POS entries.
                  int totalFreq = 0;
                  for (int k = 0; k < info.Count; k++)
                  {
                     totalFreq += info.Frequencies[k];
                  }

                  // Suppress some special words: a two-character word starting with 年 or 月
                  // directly after a numeric atom (e.g. "1年内", "1999年末") ends the search
                  // for this start position.
                  if (candidate.Length == 2 && (candidate.StartsWith("年") || candidate.StartsWith("月")) && start >= 1)
                  {
                     string previousAtom = atomSegment[start - 1].sWord;
                     if ((Utility.IsAllNum(previousAtom) || Utility.IsAllChineseNum(previousAtom))
                        && "末内中底前间初".IndexOf(candidate.Substring(1)) >= 0)
                     {
                        break;
                     }
                  }

                  // Keep the POS only when it is unambiguous; otherwise record 0.
                  if (info.Count == 1)
                  {
                     wordNet.SetElement(start, end, new ChainContent(totalFreq, info.POSs[0], candidate));
                  }
                  else
                  {
                     wordNet.SetElement(start, end, new ChainContent(totalFreq, 0, candidate));
                  }
               }

               candidate += atomSegment[end++].sWord;
            }
         }
         return wordNet;
      }
Example #11
0
        /// <summary>
        /// Converts the best tag sequence into a string of role letters ('A' + tag) and scans it
        /// for the person-name patterns listed in sPatterns. Every accepted match is recorded in
        /// m_nUnknownWords / m_dWordsPossibility.
        /// </summary>
        /// <param name="personDict">Person-name dictionary used for frequency look-ups and ComputePossibility.</param>
        /// <returns>Always <c>true</c>.</returns>
        public bool PersonRecognize(WordDictionary personDict)
        {
            StringBuilder sb = new StringBuilder();

            int i;
            string sPOS = "z", sPersonName;
            // Patterns, their factors and their lengths are parallel arrays; the empty pattern
            // with length 0 terminates the list. Patterns are tried in array order.
            string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
            double[] dFactor =   { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136, 
            0.000042, 0.038971, 0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };

            /*------------------------------------
            About the parameters (pattern, count, factor):
         
            BBCD  343      0.003606
            BBC   2        0.000021
            BBE   125      0.001314
            BBZ   30       0.000315
            BCD   62460    0.656624
            BEE   0        0.000000
            BE    13899    0.146116
            BG    869      0.009136
            BXD   4        0.000042
            BZ    3707     0.038971
            CD    8596     0.090367
            EE    26       0.000273
            FB    871      0.009157
            Y     3265     0.034324
            XD    926      0.009735

            The person recognition pattern set
            BBCD: surname + surname + given-name char 1 + given-name char 2
            BBE:  surname + surname + single-character given name
            BBZ:  surname + surname + two-character given name that forms a word
            BCD:  surname + given-name char 1 + given-name char 2
            BE:   surname + single-character given name
            BEE:  surname + single character + single character (e.g. 韩磊磊)
            BG:   surname + suffix
            BXD:  surname + (surname and first given-name char forming a word) + last given-name char
            BZ:   surname + two-character given name that forms a word
            B:    surname
            CD:   given-name char 1 + given-name char 2
            EE:   single character + single character
            FB:   prefix + surname
            XD:   (surname and first given-name char forming a word) + last given-name char
            Y:    surname and single-character given name forming a word
            ------------------------------------*/

            int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

            //Convert the tags to a string; index 0 is the placeholder 'z' so string positions line up with tag indices
            sb.Append('z');
            for (i = 1; m_nBestTag[i] > -1; i++)
                sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));

            sPOS = sb.ToString();

            int j = 1, k, nPos; //Find the proper pattern from the first POS
            int nLittleFreqCount; //Counter for the person name roles with little frequency (only read by commented-out code)
            bool bMatched = false;
            while (j < i)
            {
                bMatched = false;
                for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
                {
                    // The pattern must match at position j, and neither the word before the match
                    // nor the word after it may be "·".
                    // NOTE(review): m_sWords[j + nPatternLen[k]] is read without a bounds check - confirm.
                    if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 &&
                       string.Compare(m_sWords[j - 1], "·") != 0 && string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
                    {
                        //Found the proper pattern k
                        // NOTE(review): sPOS[j + 2] is read without a bounds check; this assumes at least
                        // one tag follows an FB match - confirm.
                        if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                        {
                            //Exclusion rule 1: for prefix + surname + given-name char(s),
                            //the (prefix + surname) rule does not apply.
                            continue;
                        }

                        /*			
                        if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                        {//Exclusion rule 2: surname + single + single, or single + single: if the two E characters differ, the rule does not apply (e.g. 韩磊磊)
                        continue;
                        }

                        if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                        {//Exclusion rule 3: if the surname is not followed by a suffix, the rule does not apply (e.g. 江主席, 刘大娘)
                        continue;
                        }
                         */
                        //Get the possible name

                        nPos = j; //Record the person position in the tag sequence
                        sPersonName = null;
                        nLittleFreqCount = 0; //Record the number of roles with little frequency
                        while (nPos < j + nPatternLen[k])
                        {
                            //Collect the candidate name word by word
                            //
                            if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                                nLittleFreqCount++;
                            //The counter increases
                            sPersonName += m_sWords[nPos];
                            nPos += 1;
                        }
                        /*
                        if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                        {//Exclude foreign names
                        //Exclusion rule 2: if every character is a foreign-name character, the rule (given 1 + given 2) does not apply
                        j+=nPatternLen[k]-1;
                        continue;
                        }
                         */
                        if (string.Compare(sPatterns[k], "CDCD") == 0)
                        {
                            //Rule for exclusion
                            //The pattern (given 1 + given 2 + given 1 + given 2) is itself an exclusion rule,
                            //e.g. 女高音歌唱家迪里拜尔演唱
                            //Exclusion rule 3: it applies when the candidate contains foreign-name characters;
                            //otherwise the exclusion does not apply, e.g. 黑妞白妞姐俩拔了头筹。
                            // Either way nothing is recorded for CDCD; with foreign characters j is also
                            // advanced before the remaining patterns are tried.
                            if (Utility.GetForeignCharCount(sPersonName) > 0)
                                j += nPatternLen[k] - 1;
                            continue;
                        }
                        /*
                        if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                        {//
                        j+=nPatternLen[k]-1;
                        continue;
                        }
                        if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                        //e.g. 马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                        //All roles appear with low frequency, so they are ignored
                        continue;
                         */
                        // Record the match: start/end positions and its value,
                        // -log(pattern factor) plus ComputePossibility over the matched words.
                        m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[j];
                        m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[j + nPatternLen[k]];
                        m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict);
                        //Apply the pattern factor
                        m_nUnknownWordsCount += 1;
                        // Skip past the matched words and stop trying patterns at this position.
                        j += nPatternLen[k];
                        bMatched = true;
                    }
                }
                if (!bMatched)
                    //Not matched, add j by 1
                    j += 1;
            }
            return true;
        }
Example #12
0
        // Walks the best-tag sequence (terminated by a negative tag) and records every
        // place-name candidate span as an unknown word.
        //
        // A span is opened by an item tagged 1 or 2:
        //   - after a tag-1 trigger, further tag-1 items are absorbed first;
        //   - a tag-2 trigger costs one penalty point straight away;
        //   - in both cases a run of tag-2 items and then a run of tag-3 items follows.
        // The penalty starts at 1.0 and is never reset, so it keeps growing over all
        // spans found during one call.
        //
        // dictCore  : not used by this method
        // placeDict : dictionary handed to ComputePossibility to score each span
        // Returns   : always true
        public bool PlaceRecognize(WordDictionary dictCore, WordDictionary placeDict)
        {
            int    spanStart = 1;
            int    spanEnd   = 1;
            int    cursor    = 1;
            double penalty   = 1.0;

            while (m_nBestTag[cursor] > -1)
            {
                int trigger = m_nBestTag[cursor];

                if (trigger == 1 || trigger == 2)
                {
                    spanStart = cursor;
                    spanEnd   = cursor + 1;

                    if (trigger == 1)
                    {
                        // Absorb the following tag-1 items; each one beyond the first absorbed is penalised.
                        for (; m_nBestTag[spanEnd] == 1; spanEnd++)
                        {
                            if (spanEnd > spanStart + 1)
                            {
                                penalty += 1.0;
                            }
                        }
                    }
                    else
                    {
                        penalty += 1.0;
                    }

                    // Absorb the run of tag-2 items (no penalty).
                    while (m_nBestTag[spanEnd] == 2)
                    {
                        spanEnd++;
                    }

                    // Absorb the run of tag-3 items; each one beyond the first is penalised.
                    int firstTagThree = spanEnd;
                    for (; m_nBestTag[spanEnd] == 3; spanEnd++)
                    {
                        if (spanEnd > firstTagThree)
                        {
                            penalty += 1.0;
                        }
                    }
                }

                if (spanEnd > spanStart)
                {
                    // spanEnd was forced to at least spanStart + 1; step back when there is no word there.
                    if (m_sWords[spanEnd] == null)
                    {
                        spanEnd--;
                    }

                    m_nUnknownWords[m_nUnknownWordsCount, 0] = m_nWordPosition[spanStart];
                    m_nUnknownWords[m_nUnknownWordsCount, 1] = m_nWordPosition[spanEnd];
                    m_dWordsPossibility[m_nUnknownWordsCount++] =
                        ComputePossibility(spanStart, spanEnd - spanStart + 1, placeDict) + Math.Log(penalty);

                    // Collapse the span so it is not recorded again on the next iteration.
                    spanStart = spanEnd;
                }

                // Jump past the consumed span, or advance by one when nothing was consumed.
                cursor = (cursor < spanEnd) ? spanEnd : cursor + 1;
            }

            return true;
        }
Example #13
0
        // Runs BiSegment over a fixed set of sample sentences. For each sample it prints
        // the description, the atom segmentation and every segmentation result.
        // Prints an error and returns early when either dictionary fails to load.
        public static void TestBiSegment()
        {
            // One row per sample: { sentence, description }.
            string[,] samples =
            {
                { @"他说的的确实在理", @"普通分词测试" },
                { @"张华平3-4月份来北京开会", @"数字切分" },
                { @"1.加强管理", @"剔除多余的“.”" },
                { @"他出生于1980年1月1日10点", @"日期合并" },
                { @"他出生于甲子年", @"年份识别" },
                { @"馆内陈列周恩来和邓颖超生前使用过的物品", @"姓名识别" }
            };

            WordDictionary coreDict = new WordDictionary();
            if (!coreDict.Load(coreDictFile))
            {
                Console.WriteLine("coreDict 字典装入错误!");
                return;
            }

            WordDictionary biDict = new WordDictionary();
            if (!biDict.Load(biDictFile))
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            int sampleCount = samples.GetLength(0);
            for (int s = 0; s < sampleCount; s++)
            {
                Console.WriteLine("\r\n============ {0} ============", samples[s, 1]);

                // Wrap the raw sentence in the begin/end markers before segmenting.
                string wrapped = Predefine.SENTENCE_BEGIN + samples[s, 0] + Predefine.SENTENCE_END;

                List<AtomNode> atoms = Segment.AtomSegment(wrapped);
                Console.WriteLine("原子切分:");
                foreach (AtomNode atom in atoms)
                {
                    Console.Write("{0}, ", atom.sWord);
                }

                Console.WriteLine("\r\n\r\n实际切分:");
                Segment segment = new Segment(biDict, coreDict);
                segment.BiSegment(wrapped, 0.1, 1);

                // One output line per segmentation result.
                for (int r = 0; r < segment.m_pWordSeg.Count; r++)
                {
                    for (int w = 0; w < segment.m_pWordSeg[r].Length; w++)
                    {
                        Console.Write("{0}, ", segment.m_pWordSeg[r][w].sWord);
                    }
                    Console.WriteLine();
                }
            }
        }
Example #14
0
        // Builds the word net for one fixed sample sentence and prints it to the console.
        // Prints an error and returns early when the core dictionary fails to load.
        public static void TestGenerateWordNet()
        {
            WordDictionary dictionary = new WordDictionary();
            if (!dictionary.Load(coreDictFile))
            {
                Console.WriteLine("字典装入错误!");
                return;
            }

            // Wrap the sample in the sentence begin/end markers before splitting it into atoms.
            string wrapped = Predefine.SENTENCE_BEGIN + @"人民币现在很值钱" + Predefine.SENTENCE_END;

            List<AtomNode> atoms = Segment.AtomSegment(wrapped);
            RowFirstDynamicArray<ChainContent> wordNet = Segment.GenerateWordNet(atoms, dictionary);

            Console.WriteLine(wordNet.ToString());
        }
Example #15
0
        // Loads the core dictionary and dumps the word items stored under
        // index-table entries 2 through 5. Prints "Wrong!" when loading fails.
        public static void TestDictionary()
        {
            WordDictionary dict = new WordDictionary();
            if (!dict.Load(coreDictFile, false))
            {
                Console.WriteLine("Wrong!");
                return;
            }

            for (int id = 2; id <= 5; id++)
            {
                Console.WriteLine("====================================\r\n汉字:{0}, ID :{1}\r\n", Utility.CC_ID2Char(id), id);
                Console.WriteLine("  词长  频率  词性   词");

                // One line per word item: length, frequency, POS string, (leading character) rest of word.
                int item = 0;
                while (item < dict.indexTable[id].nCount)
                {
                    Console.WriteLine("{0,5} {1,6} {2,5}  ({3}){4}",
                                      dict.indexTable[id].WordItems[item].nWordLen,
                                      dict.indexTable[id].WordItems[item].nFrequency,
                                      Utility.GetPOSString(dict.indexTable[id].WordItems[item].nPOS),
                                      Utility.CC_ID2Char(id),
                                      dict.indexTable[id].WordItems[item].sWord);
                    item++;
                }
            }
        }
      //====================================================================
      // Merge dict2 into this dictionary, weighting this dictionary's
      // frequencies nRatio : 1 against those of dict2.
      //
      // For every index-table entry the two word lists are walked side by
      // side, comparing items by Utility.CCStringCompare on the word and,
      // for equal words, by nPOS:
      //   - item in both       : freq = (nRatio * own + other) / (nRatio + 1)
      //   - item only here     : freq = (nRatio * own) / (nRatio + 1)
      //   - item only in dict2 : added through AddItem with other / (nRatio + 1),
      //                          but only when other > (nRatio + 1) / 10
      //
      // Returns false, without touching anything, when either dictionary has
      // a modify table; true otherwise.
      //====================================================================
      public bool Merge(WordDictionary dict2, int nRatio)
      {
         if (modifyTable != null || dict2.modifyTable != null)
            return false;

         int weightSum = nRatio + 1;        // divisor shared by every rescaled frequency
         int minOtherFreq = weightSum / 10; // dict2-only items at or below this are dropped

         for (int entry = 0; entry < Predefine.CC_NUM; entry++)
         {
            int own = 0;
            int other = 0;

            // Both lists still have items: advance whichever side compares lower.
            while (own < indexTable[entry].nCount && other < dict2.indexTable[entry].nCount)
            {
               int order = Utility.CCStringCompare(indexTable[entry].WordItems[own].sWord, dict2.indexTable[entry].WordItems[other].sWord);

               // Same word: fall back to the POS to decide the order.
               if (order == 0 && indexTable[entry].WordItems[own].nPOS != dict2.indexTable[entry].WordItems[other].nPOS)
               {
                  order = indexTable[entry].WordItems[own].nPOS < dict2.indexTable[entry].WordItems[other].nPOS ? -1 : 1;
               }

               if (order < 0)
               {
                  // Only in this dictionary: scale its frequency down.
                  indexTable[entry].WordItems[own].nFrequency = nRatio * indexTable[entry].WordItems[own].nFrequency / weightSum;
                  own++;
               }
               else if (order > 0)
               {
                  // Only in dict2: adopt it when it is frequent enough.
                  if (dict2.indexTable[entry].WordItems[other].nFrequency > minOtherFreq)
                  {
                     string fullWord = string.Format("{0}{1}", Utility.CC_ID2Char(entry).ToString(), dict2.indexTable[entry].WordItems[other].sWord);
                     AddItem(fullWord, dict2.indexTable[entry].WordItems[other].nPOS, dict2.indexTable[entry].WordItems[other].nFrequency / weightSum);
                  }
                  other++;
               }
               else
               {
                  // Same word and POS: weighted average of the two frequencies.
                  indexTable[entry].WordItems[own].nFrequency = (nRatio * indexTable[entry].WordItems[own].nFrequency + dict2.indexTable[entry].WordItems[other].nFrequency) / weightSum;
                  own++;
                  other++;
               }
            }

            // Items left over in this dictionary.
            for (; own < indexTable[entry].nCount; own++)
            {
               indexTable[entry].WordItems[own].nFrequency = nRatio * indexTable[entry].WordItems[own].nFrequency / weightSum;
            }

            // Items left over in dict2.
            for (; other < dict2.indexTable[entry].nCount; other++)
            {
               if (dict2.indexTable[entry].WordItems[other].nFrequency > minOtherFreq)
               {
                  string fullWord = string.Format("{0}{1}", Utility.CC_ID2Char(entry).ToString(), dict2.indexTable[entry].WordItems[other].sWord);
                  AddItem(fullWord, dict2.indexTable[entry].WordItems[other].nPOS, dict2.indexTable[entry].WordItems[other].nFrequency / weightSum);
               }
            }
         }

         return true;
      }
Example #17
0
        //====================================================================
        // Func Name  : GenerateWordNet
        // Description: Build the segmentation word net for an atom sequence.
        //              Pass 1 stores every atom as the edge (i, i + 1).
        //              Pass 2 stores, for every start atom i, each run of
        //              atoms i..j-1 that coreDict contains as a word, as the
        //              edge (i, j).
        // Parameters : atomSegment: the atoms of the sentence, in order
        //              coreDict   : core dictionary used for the word lookup
        // Returns    : the word net
        //====================================================================
        public static RowFirstDynamicArray<ChainContent> GenerateWordNet(List<AtomNode> atomSegment, WordDictionary coreDict)
        {
            string sWord = "", sMaxMatchWord;
            int nPOSRet, nPOS, nTotalFreq;
            double dValue = 0;

            RowFirstDynamicArray<ChainContent> m_segGraph = new RowFirstDynamicArray<ChainContent>();
            m_segGraph.SetEmpty();

            // Pass 1: store every atom in m_segGraph.
            for (int i = 0; i < atomSegment.Count; i++)
            {
                if (atomSegment[i].nPOS == Predefine.CT_CHINESE)
                {
                    m_segGraph.SetElement(i, i + 1, new ChainContent(0, 0, atomSegment[i].sWord));
                }
                else
                {
                    // Non-Chinese atoms: numbers and letter strings are replaced by a
                    // placeholder word with weight 0 and a negative POS code.
                    sWord = atomSegment[i].sWord;
                    dValue = Predefine.MAX_FREQUENCE;
                    switch (atomSegment[i].nPOS)
                    {
                        case Predefine.CT_INDEX:
                        case Predefine.CT_NUM:
                            nPOS = -27904; // -('m' * 256)
                            sWord = "未##数";
                            dValue = 0;
                            break;
                        case Predefine.CT_DELIMITER:
                            nPOS = 30464; // 'w' * 256
                            break;
                        case Predefine.CT_LETTER:
                            nPOS = -28280; // -('n' * 256 + 'x')
                            dValue = 0;
                            sWord = "未##串";
                            break;
                        case Predefine.CT_SINGLE: // e.g. 12021-2129-3121
                            // An optionally signed integer or decimal counts as a number,
                            // anything else as a string.
                            if (Regex.IsMatch(atomSegment[i].sWord, @"^(-?\d+)(\.\d+)?$"))
                            {
                                nPOS = -27904; // -('m' * 256)
                                sWord = "未##数";
                            }
                            else
                            {
                                nPOS = -28280; // -('n' * 256 + 'x')
                                sWord = "未##串";
                            }
                            dValue = 0;
                            break;
                        default:
                            nPOS = atomSegment[i].nPOS;
                            break;
                    }
                    m_segGraph.SetElement(i, i + 1, new ChainContent(dValue, nPOS, sWord));
                }
            }

            // Pass 2: store every dictionary word that can be built from consecutive atoms.
            for (int i = 0; i < atomSegment.Count; i++)
            {
                sWord = atomSegment[i].sWord; // start from the current atom
                int j = i + 1;

                // Keep extending while the dictionary still reports a match for the prefix built so far.
                while (j < atomSegment.Count && coreDict.GetMaxMatch(sWord, out sMaxMatchWord, out nPOSRet))
                {
                    if (sMaxMatchWord == sWord) // the prefix itself is a dictionary word
                    {
                        WordInfo info = coreDict.GetWordInfo(sWord); // a word may carry several POS entries

                        // Total frequency over all POS entries of the word.
                        nTotalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                        {
                            nTotalFreq += info.Frequencies[k];
                        }

                        // Do not build words such as "年内" / "年末" directly after a number
                        // (e.g. "1年内", "1999年末"); stop extending from this start atom.
                        if (sWord.Length == 2 && (sWord.StartsWith("年") || sWord.StartsWith("月")) && i >= 1 &&
                            (Utility.IsAllNum(atomSegment[i - 1].sWord) ||
                             Utility.IsAllChineseNum(atomSegment[i - 1].sWord)))
                        {
                            if ("末内中底前间初".IndexOf(sWord.Substring(1)) >= 0)
                            {
                                break;
                            }
                        }

                        // Keep the POS only when the word has exactly one; otherwise record 0.
                        if (info.Count == 1)
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, info.POSs[0], sWord));
                        }
                        else
                        {
                            m_segGraph.SetElement(i, j, new ChainContent(nTotalFreq, 0, sWord));
                        }
                    }

                    sWord += atomSegment[j++].sWord;
                }
            }
            return m_segGraph;
        }
Example #18
0
        //====================================================================
        // Func Name  : GenerateWordNet
        // Description: Build the segmentation word net for an atom sequence.
        //              First every atom is stored as the edge (i, i + 1);
        //              then, for every start atom i, each run of atoms that
        //              coreDict contains as a word is stored as the edge (i, j).
        // Parameters : atomSegment: the atoms of the sentence, in order
        //              coreDict   : core dictionary used for the word lookup
        // Returns    : the word net
        //====================================================================
        public static RowFirstDynamicArray <ChainContent> GenerateWordNet(List <AtomNode> atomSegment, WordDictionary coreDict)
        {
            RowFirstDynamicArray <ChainContent> wordNet = new RowFirstDynamicArray <ChainContent>();

            wordNet.SetEmpty();

            int atomCount = atomSegment.Count;

            // Pass 1: one edge per atom.
            for (int pos = 0; pos < atomCount; pos++)
            {
                AtomNode atom = atomSegment[pos];

                if (atom.nPOS == Predefine.CT_CHINESE)
                {
                    wordNet.SetElement(pos, pos + 1, new ChainContent(0, 0, atom.sWord));
                    continue;
                }

                // Non-Chinese atom: defaults first, then per-type overrides. Numbers and
                // letter strings get a placeholder word, weight 0 and a negative POS code.
                string label  = atom.sWord;
                double weight = Predefine.MAX_FREQUENCE;
                int    tag;

                switch (atom.nPOS)
                {
                case Predefine.CT_INDEX:
                case Predefine.CT_NUM:
                    tag    = -27904; // -('m' * 256)
                    label  = "未##数";
                    weight = 0;
                    break;

                case Predefine.CT_DELIMITER:
                    tag = 30464; // 'w' * 256
                    break;

                case Predefine.CT_LETTER:
                    tag    = -28280; // -('n' * 256 + 'x')
                    label  = "未##串";
                    weight = 0;
                    break;

                case Predefine.CT_SINGLE: // e.g. 12021-2129-3121
                    // An optionally signed integer or decimal is a number, anything else a string.
                    bool looksNumeric = Regex.IsMatch(atom.sWord, @"^(-?\d+)(\.\d+)?$");
                    if (looksNumeric)
                    {
                        tag   = -27904; // -('m' * 256)
                        label = "未##数";
                    }
                    else
                    {
                        tag   = -28280; // -('n' * 256 + 'x')
                        label = "未##串";
                    }
                    weight = 0;
                    break;

                default:
                    tag = atom.nPOS;
                    break;
                }

                wordNet.SetElement(pos, pos + 1, new ChainContent(weight, tag, label));
            }

            // Pass 2: every dictionary word that can be built from consecutive atoms.
            for (int start = 0; start < atomCount; start++)
            {
                string candidate = atomSegment[start].sWord;
                string matched;
                int    matchedPos;

                // "end" is the index just past the last atom already in candidate.
                // Keep extending while the dictionary still reports a match for it.
                for (int end = start + 1;
                     end < atomCount && coreDict.GetMaxMatch(candidate, out matched, out matchedPos);
                     end++)
                {
                    if (matched == candidate)
                    {
                        WordInfo info = coreDict.GetWordInfo(candidate); // may hold several POS entries

                        int totalFreq = 0;
                        for (int k = 0; k < info.Count; k++)
                        {
                            totalFreq += info.Frequencies[k];
                        }

                        // Do not build words such as "年内" / "年末" directly after a number
                        // (e.g. "1年内", "1999年末"); stop extending from this start atom.
                        bool yearOrMonthPair = candidate.Length == 2 &&
                                               (candidate.StartsWith("年") || candidate.StartsWith("月"));
                        if (yearOrMonthPair && start >= 1 &&
                            (Utility.IsAllNum(atomSegment[start - 1].sWord) ||
                             Utility.IsAllChineseNum(atomSegment[start - 1].sWord)) &&
                            "末内中底前间初".IndexOf(candidate.Substring(1)) >= 0)
                        {
                            break;
                        }

                        // Keep the POS only when the word has exactly one; otherwise record 0.
                        if (info.Count != 1)
                        {
                            wordNet.SetElement(start, end, new ChainContent(totalFreq, 0, candidate));
                        }
                        else
                        {
                            wordNet.SetElement(start, end, new ChainContent(totalFreq, info.POSs[0], candidate));
                        }
                    }

                    candidate += atomSegment[end].sWord;
                }
            }

            return wordNet;
        }
Example #19
0
 // Stores the two dictionaries passed in on the new instance.
 public Segment(WordDictionary biDict, WordDictionary coreDict)
 {
     this.coreDict = coreDict;
     this.biDict = biDict;
 }
Example #20
0
        //====================================================================
        // Build the bigram graph of a word net: one edge for every pair
        // (cur, next) in which next starts at the column where cur ends.
        //
        // aWord      : the word net whose words are paired up
        // smoothPara : smoothing weight a in the cost formula below, 0 < a < 1
        // biDict     : queried for the frequency of "cur<WORD_SEGMENTER>next"
        // coreDict   : queried for the frequency of unknown words (nPOS < 0)
        // Returns    : the bigram graph; its row/column indices are positions in
        //              the map returned by PreparePositionMap(aWord)
        //====================================================================
        public static ColumnFirstDynamicArray <ChainContent> BiGraphGenerate(
            RowFirstDynamicArray <ChainContent> aWord, double smoothPara, WordDictionary biDict, WordDictionary coreDict)
        {
            ColumnFirstDynamicArray <ChainContent> aBiWordNet = new ColumnFirstDynamicArray <ChainContent>();

            ChainItem <ChainContent> pCur, pNextWords;
            int           nTwoWordsFreq, nCurWordIndex, nNextWordIndex;
            double        dCurFreqency, dValue;
            string        sTwoWords;
            StringBuilder sb = new StringBuilder();

            // Smoothing floor; it does not depend on the words, so compute it once.
            double dTemp = 1.0 / Predefine.MAX_FREQUENCE;

            // Record the position map of possible words
            int[] m_npWordPosMapTable = PreparePositionMap(aWord);

            pCur = aWord.GetHead();
            while (pCur != null)
            {
                if (pCur.Content.nPOS >= 0)
                {
                    // Known word: its weight is its frequency.
                    dCurFreqency = pCur.Content.eWeight;
                }
                else
                {
                    // Unknown word: its weight is not a frequency, so look one up.
                    dCurFreqency = coreDict.GetFrequency(pCur.Content.sWord, 2);
                }

                // Words that begin where pCur ends: row of next == col of pCur.
                pNextWords = aWord.GetFirstElementOfRow(pCur.col);

                while (pNextWords != null && pNextWords.row == pCur.col)
                {
                    sb.Length = 0;
                    sb.Append(pCur.Content.sWord);
                    sb.Append(Predefine.WORD_SEGMENTER);
                    sb.Append(pNextWords.Content.sWord);

                    sTwoWords = sb.ToString();

                    // Frequency of the two words appearing next to each other.
                    nTwoWordsFreq = biDict.GetFrequency(sTwoWords, 3);

                    // -log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)}, 0 < a < 1
                    dValue = -Math.Log(smoothPara * (1.0 + dCurFreqency) / (Predefine.MAX_FREQUENCE + 80000.0)
                                       + (1.0 - smoothPara) * ((1.0 - dTemp) * nTwoWordsFreq / (1.0 + dCurFreqency) +
                                                               dTemp));

                    // Unknown words additionally contribute P(Wi|Ci); known words contribute 1 (cost 0).
                    // The contribution is the node's own weight. The POS field is only a tag code
                    // (e.g. -27904) and must not be mixed into the cost.
                    // NOTE(review): assumes eWeight of an unknown-word node already holds its
                    // -log possibility - confirm against the code that inserts unknown words.
                    if (pCur.Content.nPOS < 0)
                    {
                        dValue += pCur.Content.eWeight;
                    }

                    // Translate both words' (row, col) into their index in the position map.
                    nCurWordIndex  = Utility.BinarySearch(pCur.row * Predefine.MAX_SENTENCE_LEN + pCur.col, m_npWordPosMapTable);
                    nNextWordIndex = Utility.BinarySearch(pNextWords.row * Predefine.MAX_SENTENCE_LEN + pNextWords.col, m_npWordPosMapTable);

                    aBiWordNet.SetElement(nCurWordIndex, nNextWordIndex, new ChainContent(dValue, pCur.Content.nPOS, sTwoWords));

                    pNextWords = pNextWords.next;
                }
                pCur = pCur.next;
            }

            return aBiWordNet;
        }
Example #21
0
 // Scores the nLength words starting at nStartPos.
 // For each position i the term
 //     log(m_context.GetFrequency(0, tag[i]) + 1) - log(dict.GetFrequency(word[i], tag[i]) + 1)
 // is added, where tag[i] = m_nBestTag[i] and word[i] = m_sWords[i].
 // Returns the sum of those terms (0 when nLength is not positive).
 private double ComputePossibility(int nStartPos, int nLength, WordDictionary dict)
 {
     double total = 0;
     int stop = nStartPos + nLength;
     int idx = nStartPos;

     while (idx < stop)
     {
         // Frequency of the word under its best tag, taken from the supplied dictionary.
         int wordFreq = dict.GetFrequency(m_sWords[idx], m_nBestTag[idx]);

         total += Math.Log((double)(m_context.GetFrequency(0, m_nBestTag[idx]) + 1)) - Math.Log((double)(wordFreq + 1));
         idx++;
     }

     return total;
 }
Example #22
0
 // Stores the two dictionaries passed in on the new instance.
 public Segment(WordDictionary biDict, WordDictionary coreDict)
 {
     this.coreDict = coreDict;
     this.biDict   = biDict;
 }
Example #23
0
 // Tags pWordItems piece by piece (the original comment describes the tagger as a
 // Hidden Markov Model). Each round GetFrom loads the next piece and returns where
 // the following one starts, GetBestPOS picks the tags, and the result is handled
 // according to m_tagType:
 //   TT_NORMAL                  - write the tags (and, for positive-valued words found
 //                                in dictCore, the tag-specific frequency) back into pWordItems
 //   TT_PERSON                  - PersonRecognize(dictUnknown)
 //   TT_PLACE / TT_TRANS_PERSON - PlaceRecognize(dictCore, dictUnknown)
 // The loop ends when GetFrom returns a negative index, the array is exhausted,
 // or an item without a word is reached. Always returns true.
 public bool POSTagging(WordResult[] pWordItems, WordDictionary dictCore, WordDictionary dictUnknown)
 {
     Reset(false);

     int next = 0;
     while (next > -1 && next < pWordItems.Length && pWordItems[next].sWord != null)
     {
         int pieceStart = next;
         next = GetFrom(pWordItems, pieceStart, dictCore, dictUnknown);
         GetBestPOS();

         switch (m_tagType)
         {
             case TAG_TYPE.TT_NORMAL:
                 // Tag slot t corresponds to item (t + pieceStart - 1).
                 for (int t = 1; m_nBestTag[t] != -1 && t < m_nCurLength; t++)
                 {
                     int item = t + pieceStart - 1;

                     pWordItems[item].nPOS = m_nBestTag[t];

                     // Items whose value is not positive are left as they are.
                     if (pWordItems[item].dValue > 0 && dictCore.IsExist(pWordItems[item].sWord, -1))
                     {
                         pWordItems[item].dValue = dictCore.GetFrequency(pWordItems[item].sWord, m_nBestTag[t]);
                     }
                 }
                 break;

             case TAG_TYPE.TT_PERSON:
                 PersonRecognize(dictUnknown);
                 break;

             case TAG_TYPE.TT_PLACE:
             case TAG_TYPE.TT_TRANS_PERSON:
                 PlaceRecognize(dictCore, dictUnknown);
                 break;
         }

         Reset();
     }

     return true;
 }
Example #24
0
        public bool PersonRecognize(WordDictionary personDict)
        {
            StringBuilder sb = new StringBuilder();

            int    i;
            string sPOS = "z", sPersonName;

            string[] sPatterns = { "BBCD", "BBC", "BBE", "BBZ", "BCD", "BEE", "BE", "BG", "BXD", "BZ", "CDCD", "CD", "EE", "FB", "Y", "XD", "" };
            double[] dFactor   = { 0.003606, 0.000021, 0.001314, 0.000315, 0.656624, 0.000021, 0.146116, 0.009136,
                                   0.000042,   0.038971,        0, 0.090367, 0.000273, 0.009157, 0.034324, 0.009735, 0 };

            /*------------------------------------
            *  About parameter:
            *
            *  BBCD  343      0.003606
            *  BBC   2        0.000021
            *  BBE   125      0.001314
            *  BBZ   30       0.000315
            *  BCD   62460    0.656624
            *  BEE   0        0.000000
            *  BE    13899    0.146116
            *  BG    869      0.009136
            *  BXD   4        0.000042
            *  BZ    3707     0.038971
            *  CD    8596     0.090367
            *  EE    26       0.000273
            *  FB    871      0.009157
            *  Y     3265     0.034324
            *  XD    926      0.009735
            *
            *  The person recognition patterns set
            *  BBCD:姓+姓+名1+名2;
            *  BBE: 姓+姓+单名;
            *  BBZ: 姓+姓+双名成词;
            *  BCD: 姓+名1+名2;
            *  BE:  姓+单名;
            *  BEE: 姓+单名+单名;韩磊磊
            *  BG:  姓+后缀
            *  BXD: 姓+姓双名首字成词+双名末字
            *  BZ:  姓+双名成词;
            *  B:   姓
            *  CD:  名1+名2;
            *  EE:  单名+单名;
            *  FB:  前缀+姓
            *  XD:  姓双名首字成词+双名末字
            *  Y:   姓单名成词
            *  ------------------------------------*/

            int[] nPatternLen = { 4, 3, 3, 3, 3, 3, 2, 2, 3, 2, 4, 2, 2, 2, 1, 2, 0 };

            //Convert to string from POS
            sb.Append('z');
            for (i = 1; m_nBestTag[i] > -1; i++)
            {
                sb.Append(Convert.ToChar(m_nBestTag[i] + Convert.ToInt32('A')));
            }

            sPOS = sb.ToString();

            int  j = 1, k, nPos;   //Find the proper pattern from the first POS
            int  nLittleFreqCount; //Counter for the person name role with little frequecy
            bool bMatched = false;

            while (j < i)
            {
                bMatched = false;
                for (k = 0; !bMatched && nPatternLen[k] > 0; k++)
                {
                    if (string.Compare(sPatterns[k], 0, sPOS, j, nPatternLen[k]) == 0 &&
                        string.Compare(m_sWords[j - 1], "·") != 0 && string.Compare(m_sWords[j + nPatternLen[k]], "·") != 0)
                    {
                        //Find the proper pattern k
                        if (string.Compare(sPatterns[k], "FB") == 0 && (sPOS[j + 2] == 'E' || sPOS[j + 2] == 'C' || sPOS[j + 2] == 'G'))
                        {
                            //Rule 1 for exclusion:前缀+姓+名1(名2): 规则(前缀+姓)失效;
                            continue;
                        }

                        /*
                         * if((strcmp(sPatterns[k],"BEE")==0||strcmp(sPatterns[k],"EE")==0)&&strcmp(m_sWords[j+nPatternLen[k]-1],m_sWords[j+nPatternLen[k]-2])!=0)
                         * {//Rule 2 for exclusion:姓+单名+单名:单名+单名 若EE对应的字不同,规则失效.如:韩磊磊
                         * continue;
                         * }
                         *
                         * if(strcmp(sPatterns[k],"B")==0&&m_nBestTag[j+1]!=12)
                         * {//Rule 3 for exclusion: 若姓后不是后缀,规则失效.如:江主席、刘大娘
                         * continue;
                         * }
                         */
                        //Get the possible name

                        nPos             = j; //Record the person position in the tag sequence
                        sPersonName      = null;
                        nLittleFreqCount = 0; //Record the number of role with little frequency
                        while (nPos < j + nPatternLen[k])
                        {
                            //Get the possible person name
                            //
                            if (m_nBestTag[nPos] < 4 && personDict.GetFrequency(m_sWords[nPos], m_nBestTag[nPos]) < Predefine.LITTLE_FREQUENCY)
                            {
                                nLittleFreqCount++;
                            }
                            //The counter increase
                            sPersonName += m_sWords[nPos];
                            nPos        += 1;
                        }

                        /*
                         * if(IsAllForeign(sPersonName)&&personDict.GetFrequency(m_sWords[j],1)<LITTLE_FREQUENCY)
                         * {//Exclusion foreign name
                         * //Rule 2 for exclusion:若均为外国人名用字 规则(名1+名2)失效
                         * j+=nPatternLen[k]-1;
                         * continue;
                         * }
                         */
                        if (string.Compare(sPatterns[k], "CDCD") == 0)
                        {
                            //Rule for exclusion
                            //规则(名1+名2+名1+名2)本身是排除规则:女高音歌唱家迪里拜尔演唱
                            //Rule 3 for exclusion:含外国人名用字 规则适用
                            //否则,排除规则失效:黑妞白妞姐俩拔了头筹。
                            if (Utility.GetForeignCharCount(sPersonName) > 0)
                            {
                                j += nPatternLen[k] - 1;
                            }
                            continue;
                        }

                        /*
                         * if(strcmp(sPatterns[k],"CD")==0&&IsAllForeign(sPersonName))
                         * {//
                         * j+=nPatternLen[k]-1;
                         * continue;
                         * }
                         * if(nLittleFreqCount==nPatternLen[k]||nLittleFreqCount==3)
                         * //马哈蒂尔;小扎耶德与他的中国阿姨胡彩玲受华黎明大使之邀,
                         * //The all roles appear with two lower frequecy,we will ignore them
                         * continue;
                         */
                        m_nUnknownWords[m_nUnknownWordsCount, 0]  = m_nWordPosition[j];
                        m_nUnknownWords[m_nUnknownWordsCount, 1]  = m_nWordPosition[j + nPatternLen[k]];
                        m_dWordsPossibility[m_nUnknownWordsCount] = -Math.Log(dFactor[k]) + ComputePossibility(j, nPatternLen[k], personDict);
                        //Mutiply the factor
                        m_nUnknownWordsCount += 1;
                        j       += nPatternLen[k];
                        bMatched = true;
                    }
                }
                if (!bMatched)
                {
                    //Not matched, add j by 1
                    j += 1;
                }
            }
            return(true);
        }
Example #25
0
        /// <summary>
        /// Builds the word net that holds every candidate segmentation of a sentence.
        /// </summary>
        /// <param name="sSentence">Sentence text; the begin/end markers are added here.</param>
        /// <returns>
        /// The result of <c>Segment.GenerateWordNet</c> for the sentence, or <c>null</c>
        /// when the core dictionary could not be loaded.
        /// </returns>
        public RowFirstDynamicArray<ChainContent> GetSegGraph(string sSentence)
        {
            WordDictionary dictionary = new WordDictionary();
            bool loaded = dictionary.Load(coreDictFile);
            if (loaded)
            {
                // Surround the text with the sentence begin/end markers, cut it into atoms,
                // then look the atoms up in the core dictionary to build the word net.
                string wrapped = Predefine.SENTENCE_BEGIN + sSentence + Predefine.SENTENCE_END;
                return Segment.GenerateWordNet(Segment.AtomSegment(wrapped), dictionary);
            }

            // Loading failed: report it on the console and hand back nothing.
            Console.WriteLine("字典装入错误!");
            return null;
        }
Example #26
0
        /// <summary>
        /// Compares a dictionary file that is about to be imported with an existing dictionary and
        /// sorts every (word, part-of-speech) entry of the import file into new, existing or odd.
        /// </summary>
        /// <param name="NewDicFile">Path of the dictionary file to import.</param>
        /// <param name="Encoding">Text encoding of the import file.</param>
        /// <param name="DicFormat">Line format of the import file.</param>
        /// <param name="SourceDict">The existing dictionary to compare against.</param>
        /// <param name="OddLines">Receives the raw lines that have no part-of-speech tag and whose word is not in the existing dictionary.</param>
        /// <param name="NewWords">Receives new words, and new parts of speech of existing words.</param>
        /// <param name="ExistWords">Receives the entries whose word and part of speech are both already present.</param>
        /// <param name="MaxFrqRate">Largest import/existing frequency ratio among the duplicated entries; stays <c>double.MinValue</c> when there are none.</param>
        /// <param name="MinFrqRate">Smallest import/existing frequency ratio among the duplicated entries; stays <c>double.MaxValue</c> when there are none.</param>
        /// <param name="AvgFrqRate">Average import/existing frequency ratio among the duplicated entries; 0 when there are none.</param>
        public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, WordDictionary SourceDict,
            out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
            out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
        {
            // Initialise the running statistics.
            MaxFrqRate = double.MinValue; MinFrqRate = double.MaxValue; decimal SumFrqRate = 0;

            // Mapping from the import format's part-of-speech tags to the ones used here.
            Dictionary<string, string> PosTrans = getPosTransformMap(DicFormat);

            // Snapshot of the existing dictionary; looked up below with lower-cased words.
            Dictionary<string, WordDictionary.DicWordInfo> OldWords = SourceDict.ToWordDictionary();

            // In-memory result buckets.
            List<string> Odds = new List<string>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> Exists = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);
            List<WordDictionary.DicWordInfo> News = new List<WordDictionary.DicWordInfo>(OldWords.Count / 2);

            // Read the import file and collect the frequency ratio of every entry that already
            // exists, so the caller can estimate how to convert frequencies between dictionaries.
            foreach (string Line in File.ReadAllLines(NewDicFile, Encoding))
            {
                string Word;
                int Frq;
                string Poses;

                switch (DicFormat)
                {
                    case DictionaryFormat.SogouW2006:
                        // Fields: word [frequency [pos,pos,...]] separated by tab or space.
                        string[] s = Line.Split('\t', ' ');
                        Word = s[0];
                        Frq = s.Length < 2 ? -1 : int.Parse(s[1]);
                        // The POS list is the THIRD field; a two-field line has none.
                        // (Guarding with "< 2" indexed past the end of a two-field line.)
                        Poses = s.Length < 3 ? null : s[2];
                        break;

                    case DictionaryFormat.ExcelCSV:
                    default:
                        // Fields: word,frequency,"pos,pos,..." - only the first two commas delimit.
                        int p1 = Line.IndexOf(',');
                        int p2 = Line.IndexOf(',', p1 + 1);
                        Word = Line.Substring(0, p1);
                        Frq = int.Parse(Line.Substring(p1 + 1, p2 - p1 - 1));
                        Poses = Line.Substring(p2 + 1).Trim('"').Trim();
                        break;
                }

                string Key = Word.ToLower();

                if (string.IsNullOrEmpty(Poses))
                {
                    // No part of speech given: only report the line when the word is unknown.
                    if (!OldWords.ContainsKey(Key)) Odds.Add(Line);
                    continue;
                }

                foreach (string InputPos in Poses.TrimEnd(',').Split(','))
                {
                    if (string.IsNullOrEmpty(InputPos)) continue;

                    // Keep the original (lower-cased) tag when the mapping table has no entry for it.
                    string LowerPos = InputPos.ToLower();
                    string Pos;
                    if (!PosTrans.TryGetValue(LowerPos, out Pos)) Pos = LowerPos;

                    // Does the existing dictionary already hold this word with this part of speech?
                    WordDictionary.DicWordInfo OldInfo;
                    if (OldWords.TryGetValue(Key, out OldInfo) && OldInfo.Pos.Contains(Pos))
                    {
                        int SourceFrq = OldInfo.Frequence;
                        // Avoid dividing by zero: fall back to the raw import frequency.
                        double FrqR = SourceFrq == 0 ? Frq : (double)Frq / SourceFrq;
                        if (FrqR > MaxFrqRate) MaxFrqRate = FrqR;
                        if (FrqR < MinFrqRate) MinFrqRate = FrqR;
                        SumFrqRate += (decimal)FrqR;
                        Exists.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                    else // A new word, or a new part of speech of a known word.
                    {
                        News.Add(new WordDictionary.DicWordInfo(Word, Pos, Frq));
                    }
                }
            }

            // Average frequency conversion factor over the duplicated entries.
            AvgFrqRate = Exists.Count > 0 ? Convert.ToDouble(SumFrqRate / Exists.Count) : 0;

            OddLines = Odds.ToArray();
            NewWords = News.ToArray();
            ExistWords = Exists.ToArray();
        }
Example #27
0
 /// <summary>
 /// Loads the existing dictionary from a file and compares a dictionary file that is about
 /// to be imported against it. The loaded dictionary is released before returning, also
 /// when the comparison throws.
 /// </summary>
 /// <param name="NewDicFile">Path of the dictionary file to import.</param>
 /// <param name="Encoding">Text encoding of the import file.</param>
 /// <param name="DicFormat">Line format of the import file.</param>
 /// <param name="SourceDictFileName">Path of the existing dictionary file.</param>
 /// <param name="OddLines">Receives the raw lines that have no part-of-speech tag and whose word is not in the existing dictionary.</param>
 /// <param name="NewWords">Receives new words, and new parts of speech of existing words.</param>
 /// <param name="ExistWords">Receives the entries whose word and part of speech are both already present.</param>
 /// <param name="MaxFrqRate">Largest frequency ratio among the duplicated entries.</param>
 /// <param name="MinFrqRate">Smallest frequency ratio among the duplicated entries.</param>
 /// <param name="AvgFrqRate">Average frequency ratio among the duplicated entries.</param>
 /// <exception cref="Exception">The existing dictionary file could not be loaded.</exception>
 public static void FindDifferent(string NewDicFile, Encoding Encoding, DictionaryFormat DicFormat, string SourceDictFileName,
     out string[] OddLines, out WordDictionary.DicWordInfo[] NewWords, out WordDictionary.DicWordInfo[] ExistWords,
     out double MaxFrqRate, out double MinFrqRate, out double AvgFrqRate)
 {
     WordDictionary SourceDict = new WordDictionary();
     if (!SourceDict.Load(SourceDictFileName))
         throw new Exception("load source dic file fail");

     try
     {
         FindDifferent(NewDicFile, Encoding, DicFormat, SourceDict, out OddLines, out NewWords, out ExistWords, out MaxFrqRate, out MinFrqRate, out AvgFrqRate);
     }
     finally
     {
         // Release the loaded dictionary even when reading or parsing the import file fails.
         SourceDict.ReleaseDict();
     }
 }
Example #28
0
        /// <summary>
        /// Copies a run of words from <paramref name="pWordItems"/>, starting at <paramref name="nIndex"/>,
        /// into the working arrays <c>m_sWords</c>, <c>m_nWordPosition</c>, <c>m_nTags</c> and
        /// <c>m_dFrequency</c> (slots are filled from index 1), and records the number of filled
        /// slots in <c>m_nCurLength</c>. The run ends when a word ends up with exactly one
        /// candidate tag, when the items are exhausted, or when the slot limit is reached.
        /// </summary>
        /// <param name="pWordItems">Segmentation result the words are read from.</param>
        /// <param name="nIndex">Index in <paramref name="pWordItems"/> of the first word to read.</param>
        /// <param name="dictCore">Core dictionary, queried through <c>GetWordInfo</c>.</param>
        /// <param name="dictUnknown">Unknown-word dictionary; only consulted when <c>m_tagType</c> is not <c>TT_NORMAL</c>.</param>
        /// <returns>
        /// <c>nWordsIndex + 1</c>, i.e. one past the index of the item the scan stopped on, or -1
        /// when <c>nWordsIndex</c> reached <c>pWordItems.Length</c>.
        /// </returns>
        private int GetFrom(WordResult[] pWordItems, int nIndex, WordDictionary dictCore, WordDictionary dictUnknown)
        {
            WordInfo info;

            // NOTE(review): aPOS and aFreq are only mentioned in the commented-out GetHandle
            // calls below; within this method they appear to be unused.
            int[]  aPOS = new int[Predefine.MAX_POS_PER_WORD];
            int[]  aFreq = new int[Predefine.MAX_POS_PER_WORD];
            int    nFreq = 0, j, nRetPos = 0, nWordsIndex = 0;
            bool   bSplit = false; //Need to split in Transliteration recognition
            int    i = 1, nPOSCount;
            string sCurWord;       //Current word

            // i is 1 at this point, so the scan starts at pWordItems[nIndex].
            nWordsIndex = i + nIndex - 1;
            for (i = 1; i < Predefine.MAX_WORDS_PER_SENTENCE && nWordsIndex < pWordItems.Length; i++)
            {
                // In normal mode, or when dictUnknown does not list the word under handle 44,
                // the word is stored whole. Otherwise it is stored in two pieces over two
                // consecutive iterations: first character, then the remainder.
                if (m_tagType == TAG_TYPE.TT_NORMAL || !dictUnknown.IsExist(pWordItems[nWordsIndex].sWord, 44))
                {
                    m_sWords[i]            = pWordItems[nWordsIndex].sWord; //store current word
                    m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                }
                else
                {
                    if (!bSplit)
                    {
                        m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(0, 1);
                        //store current word
                        bSplit = true;
                    }
                    else
                    {
                        m_sWords[i] = pWordItems[nWordsIndex].sWord.Substring(1);
                        //store current word
                        bSplit = false;
                    }
                    m_nWordPosition[i + 1] = m_nWordPosition[i] + m_sWords[i].Length;
                }
                //Record the position of current word
                m_nStartPos = m_nWordPosition[i + 1];
                //Move the Start POS to the ending
                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    //Get the POSs from the unknown recognition dictionary
                    sCurWord = m_sWords[i];
                    // i starts at 1, so the "i > 0" test below always holds.
                    if (m_tagType == TAG_TYPE.TT_TRANS_PERSON && i > 0 && m_sWords[i - 1] != null &&
                        Utility.charType(m_sWords[i - 1].ToCharArray()[0]) == Predefine.CT_CHINESE)
                    {
                        // NOTE(review): sCurWord already equals m_sWords[i], so both assignments
                        // below leave it unchanged. Possibly one side of each pair was meant to
                        // be a different (e.g. full-width) character - confirm.
                        if (m_sWords[i] == ".")
                        {
                            sCurWord = ".";
                        }
                        else if (m_sWords[i] == "-")
                        {
                            sCurWord = "-";
                        }
                    }

                    info = dictUnknown.GetWordInfo(sCurWord);
                    if (info != null)
                    {
                        // One more than the entry count; used as the additive term in the logs below.
                        nPOSCount = info.Count + 1;
                        for (j = 0; j < info.Count; j++)
                        {
                            //Get the POS set of sCurWord in the unknown dictionary
                            m_nTags[i, j]      = info.POSs[j];
                            m_dFrequency[i, j] = -Math.Log((double)(1 + info.Frequencies[j])) +
                                                 Math.Log((double)(m_context.GetFrequency(0, info.POSs[j]) + nPOSCount));
                        }
                    }
                    else
                    {
                        nPOSCount = 1;
                        j         = 0;
                    }

                    //Get the POS set of sCurWord in the core dictionary
                    //We ignore the POS in the core dictionary and recognize them as other (0).
                    //We add their frequency to get the possibility as POS 0
                    if (string.Compare(m_sWords[i], "始##始") == 0)
                    {
                        // Sentence-begin marker: tag 100 with cost 0.
                        m_nTags[i, j]      = 100;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else if (string.Compare(m_sWords[i], "末##末") == 0)
                    {
                        // Sentence-end marker: tag 101 with cost 0.
                        m_nTags[i, j]      = 101;
                        m_dFrequency[i, j] = 0;
                        j++;
                    }
                    else
                    {
                        //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                        info  = dictCore.GetWordInfo(m_sWords[i]);
                        nFreq = 0;
                        if (info != null)
                        {
                            // Sum the frequencies of all core-dictionary entries of the word.
                            for (int k = 0; k < info.Count; k++)
                            {
                                nFreq += info.Frequencies[k];
                            }
                            if (info.Count > 0)
                            {
                                m_nTags[i, j] = 0;
                                //m_dFrequency[i][j]=(double)(1+nFreq)/(double)(m_context.GetFrequency(0,0)+1);
                                m_dFrequency[i, j] = -Math.Log((double)(1 + nFreq)) + Math.Log((double)(m_context.GetFrequency(0, 0) + nPOSCount));
                                j++;
                            }
                        }
                    }
                }
                else
                //For normal POS tagging
                {
                    j = 0;
                    //A positive nPOS is stored directly as the word's only tag
                    if (pWordItems[nWordsIndex].nPOS > 0)
                    {
                        //The word has only one POS value
                        //Its POS and value are already recorded in the item.
                        m_nTags[i, j]      = pWordItems[nWordsIndex].nPOS;
                        m_dFrequency[i, j] = -Math.Log(pWordItems[nWordsIndex].dValue) + Math.Log((double)(m_context.GetFrequency(0, m_nTags[i, j]) + 1));
                        if (m_dFrequency[i, j] < 0)
                        {
                            //Not permit the value less than 0
                            m_dFrequency[i, j] = 0;
                        }
                        j++;
                    }
                    else
                    {
                        //The word has multiple POSs, we should retrieve the information from Core Dictionary
                        if (pWordItems[nWordsIndex].nPOS < 0)
                        {
                            //A negative nPOS is stored negated as the first tag, with the item's
                            //dValue taken as-is for its cost.
                            m_nTags[i, j]        = -pWordItems[nWordsIndex].nPOS;
                            m_dFrequency[i, j++] = pWordItems[nWordsIndex].dValue;
                        }
                        //dictCore.GetHandle(m_sWords[i], &nCount, aPOS, aFreq);
                        info = dictCore.GetWordInfo(m_sWords[i]);
                        if (info != null)
                        {
                            nPOSCount = info.Count;
                            // NOTE(review): the loop continues from the current j. When the
                            // negative-nPOS branch above ran, j is 1 here, so info.POSs[0] is
                            // not copied - confirm this is intended.
                            for (; j < info.Count; j++)
                            {
                                //Get the POS set of the word in the core dictionary
                                m_nTags[i, j]      = info.POSs[j];
                                m_dFrequency[i, j] = -Math.Log(1 + info.Frequencies[j]) + Math.Log(m_context.GetFrequency(0, m_nTags[i, j]) + nPOSCount);
                            }
                        }
                    }
                }
                if (j == 0)
                {
                    //We do not know the POS, so we have to guess them according lexical knowledge
                    GuessPOS(i, out j); //Guess the POS of current word
                }
                m_nTags[i, j] = -1;     //Set the ending POS
                // NOTE(review): m_nTags[i, j] was set to -1 on the previous line, so the second
                // operand only matters if CT_SENTENCE_BEGIN equals -1. The check may have been
                // meant to read m_nTags[i, 0] - confirm against the reference implementation.
                if (j == 1 && m_nTags[i, j] != Predefine.CT_SENTENCE_BEGIN)
                //No ambiguity
                {
                    //No ambiguity, so we can break from the loop
                    i++;
                    m_sWords[i] = null;
                    break;
                }
                // While a split is in progress the same item is visited again for its second piece.
                if (!bSplit)
                {
                    nWordsIndex++;
                }
            }
            if (nWordsIndex == pWordItems.Length)
            {
                nRetPos = -1;
            }
            //Reaching ending

            // Slot 1 of the previous word is not the terminator, i.e. that word kept more than
            // one candidate tag; append a virtual ending word after it.
            if (m_nTags[i - 1, 1] != -1)
            //||m_sWords[i][0]==0
            {
                //Set end for words like "张/华/平"
                if (m_tagType != TAG_TYPE.TT_NORMAL)
                {
                    m_nTags[i, 0] = 101;
                }
                else
                {
                    m_nTags[i, 0] = 1;
                }

                m_dFrequency[i, 0] = 0;
                m_sWords[i]        = null; //Set virtual ending
                m_nTags[i++, 1]    = -1;
            }
            m_nCurLength = i; //The current word count
            if (nRetPos != -1)
            {
                return(nWordsIndex + 1);
            }
            //Next start position
            return(-1); //Reaching ending
        }